1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  fs/ext4/extents_status.c
4  *
5  * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
6  * Modified by
7  *	Allison Henderson <achender@linux.vnet.ibm.com>
8  *	Hugh Dickins <hughd@google.com>
9  *	Zheng Liu <wenqing.lz@taobao.com>
10  *
11  * Ext4 extents status tree core functions.
12  */
13 #include <linux/list_sort.h>
14 #include <linux/proc_fs.h>
15 #include <linux/seq_file.h>
16 #include "ext4.h"
17 
18 #include <trace/events/ext4.h>
19 
20 /*
21  * According to previous discussion in Ext4 Developer Workshop, we
22  * will introduce a new structure called io tree to track all extent
23  * status in order to solve some problems that we have met
24  * (e.g. Reservation space warning), and provide extent-level locking.
25  * Delay extent tree is the first step to achieve this goal.  It was
26  * originally built by Yongqiang Yang.  At that time it was called the
27  * delay extent tree, whose only goal was to track delayed extents in
28  * memory to simplify the implementation of fiemap and bigalloc, and to
29  * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still
30  * called the delay extent tree in the first commit.  But to better
31  * reflect what it does, it has been renamed to the extent status tree.
32  *
33  * Step1:
34  * Currently the first step has been done.  All delayed extents are
35  * tracked in the tree.  It maintains a delayed extent when a delayed
36  * allocation is issued, and updates the tree when the delayed extent is
37  * written out or invalidated.  Therefore the implementations of fiemap
38  * and bigalloc are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
39  *
40  * The following comment describes the implementation of the extent
41  * status tree and future work.
42  *
43  * Step2:
44  * In this step all extent statuses are tracked by the extent status tree.
45  * Thus, we can first try to look up a block mapping in this tree before
46  * finding it in the extent tree.  Hence, the single extent cache can be
47  * removed because the extent status tree can do a better job.  Extents in
48  * the status tree are loaded on-demand.  Therefore, the extent status tree
49  * may not contain all of the extents in a file.  Meanwhile we define a
50  * shrinker to reclaim memory from the extent status tree because a
51  * fragmented extent tree will make the status tree cost too much memory.
52  * Written/unwritten/hole extents in the tree will be reclaimed by this
53  * shrinker when we are under high memory pressure.  Delayed extents will
54  * not be reclaimed because fiemap, bigalloc, and seek_data/hole need them.
55  */
56 
57 /*
58  * Extent status tree implementation for ext4.
59  *
60  *
61  * ==========================================================================
62  * Extent status tree tracks all extent status.
63  *
64  * 1. Why we need to implement extent status tree?
65  *
66  * Without the extent status tree, ext4 identifies a delayed extent by
67  * looking up the page cache, which has several deficiencies: complicated,
68  * buggy, and inefficient code.
69  *
70  * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know whether
71  * a block or a range of blocks belongs to a delayed extent.
72  *
73  * Let us have a look at how they work without the extent status tree.
74  *   --	FIEMAP
75  *	FIEMAP looks up the page cache to distinguish delayed allocations from holes.
76  *
77  *   --	SEEK_HOLE/DATA
78  *	SEEK_HOLE/DATA has the same problem as FIEMAP.
79  *
80  *   --	bigalloc
81  *	bigalloc looks up the page cache to figure out whether a block is
82  *	already under delayed allocation or not, to determine whether
83  *	quota reservation is needed for the cluster.
84  *
85  *   --	writeout
86  *	Writeout looks up the whole page cache to see if a buffer is
87  *	mapped.  If there are not very many delayed buffers, then it is
88  *	time consuming.
89  *
90  * With the extent status tree, FIEMAP, SEEK_HOLE/DATA, bigalloc and
91  * writeout can figure out whether a block or a range of blocks is under
92  * delayed allocation (i.e. belongs to a delayed extent) simply by
93  * searching the tree.
94  *
95  *
96  * ==========================================================================
97  * 2. Ext4 extent status tree implementation
98  *
99  *   --	extent
100  *	An extent is a range of blocks which are logically and physically
101  *	contiguous.  Unlike an extent in the extent tree, this extent is an
102  *	in-memory struct; there is no corresponding on-disk data.  There
103  *	is no limit on the length of an extent, so an extent can contain as
104  *	many blocks as are logically and physically contiguous.
105  *
106  *   --	extent status tree
107  *	Every inode has an extent status tree and all allocated blocks
108  *	are added to the tree with their status.  The extents in the
109  *	tree are ordered by logical block number.
110  *
111  *   --	operations on an extent status tree
112  *	There are three important operations on an extent status tree: finding
113  *	the next extent, adding an extent (a range of blocks) and removing one.
114  *
115  *   --	race on an extent status tree
116  *	The extent status tree is protected by inode->i_es_lock.
117  *
118  *   --	memory consumption
119  *      A fragmented extent tree will make the extent status tree cost too
120  *      much memory.  Hence, we will reclaim written/unwritten/hole extents
121  *      from the tree under heavy memory pressure.
122  *
123  *
124  * ==========================================================================
125  * 3. Performance analysis
126  *
127  *   --	overhead
128  *	1. There is a cached extent for write access, so if writes are
129  *	not very random, adding-space operations run in O(1) time.
130  *
131  *   --	gain
132  *	2. Code is much simpler, more readable, more maintainable and
133  *	more efficient.
134  *
135  *
136  * ==========================================================================
137  * 4. TODO list
138  *
139  *   -- Refactor delayed space reservation
140  *
141  *   -- Extent-level locking
142  */
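
/*
 * Illustrative sketch (not lifted verbatim from ext4_map_blocks) of how a
 * caller typically uses this tree: consult the cache first and only read
 * the on-disk extent tree on a miss, then record the result:
 *
 *	struct extent_status es;
 *
 *	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
 *		... use es.es_pblk / es.es_len without touching the disk ...
 *	} else {
 *		... read the extent tree, then cache the result with
 *		    ext4_es_insert_extent(inode, lblk, len, pblk, status) ...
 *	}
 */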
143 
144 static struct kmem_cache *ext4_es_cachep;
145 static struct kmem_cache *ext4_pending_cachep;
146 
147 static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
148 			      struct extent_status *prealloc);
149 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
150 			      ext4_lblk_t end, int *reserved,
151 			      struct extent_status *prealloc);
152 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
153 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
154 		       struct ext4_inode_info *locked_ei);
155 static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
156 			    ext4_lblk_t len,
157 			    struct pending_reservation **prealloc);
158 
159 int __init ext4_init_es(void)
160 {
161 	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
162 					   sizeof(struct extent_status),
163 					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
164 	if (ext4_es_cachep == NULL)
165 		return -ENOMEM;
166 	return 0;
167 }
168 
169 void ext4_exit_es(void)
170 {
171 	kmem_cache_destroy(ext4_es_cachep);
172 }
173 
174 void ext4_es_init_tree(struct ext4_es_tree *tree)
175 {
176 	tree->root = RB_ROOT;
177 	tree->cache_es = NULL;
178 }
179 
180 #ifdef ES_DEBUG__
181 static void ext4_es_print_tree(struct inode *inode)
182 {
183 	struct ext4_es_tree *tree;
184 	struct rb_node *node;
185 
186 	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
187 	tree = &EXT4_I(inode)->i_es_tree;
188 	node = rb_first(&tree->root);
189 	while (node) {
190 		struct extent_status *es;
191 		es = rb_entry(node, struct extent_status, rb_node);
192 		printk(KERN_DEBUG " [%u/%u) %llu %x",
193 		       es->es_lblk, es->es_len,
194 		       ext4_es_pblock(es), ext4_es_status(es));
195 		node = rb_next(node);
196 	}
197 	printk(KERN_DEBUG "\n");
198 }
199 #else
200 #define ext4_es_print_tree(inode)
201 #endif
202 
203 static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
204 {
205 	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
206 	return es->es_lblk + es->es_len - 1;
207 }
208 
209 /*
210  * search through the tree for a delayed extent with a given offset.  If
211  * it can't be found, try to find the next extent.
212  */
213 static struct extent_status *__es_tree_search(struct rb_root *root,
214 					      ext4_lblk_t lblk)
215 {
216 	struct rb_node *node = root->rb_node;
217 	struct extent_status *es = NULL;
218 
219 	while (node) {
220 		es = rb_entry(node, struct extent_status, rb_node);
221 		if (lblk < es->es_lblk)
222 			node = node->rb_left;
223 		else if (lblk > ext4_es_end(es))
224 			node = node->rb_right;
225 		else
226 			return es;
227 	}
228 
229 	if (es && lblk < es->es_lblk)
230 		return es;
231 
232 	if (es && lblk > ext4_es_end(es)) {
233 		node = rb_next(&es->rb_node);
234 		return node ? rb_entry(node, struct extent_status, rb_node) :
235 			      NULL;
236 	}
237 
238 	return NULL;
239 }
240 
241 /*
242  * ext4_es_find_extent_range - find extent with specified status within block
243  *                             range or next extent following block range in
244  *                             extents status tree
245  *
246  * @inode - file containing the range
247  * @matching_fn - pointer to function that matches extents with desired status
248  * @lblk - logical block defining start of range
249  * @end - logical block defining end of range
250  * @es - extent found, if any
251  *
252  * Find the first extent within the block range specified by @lblk and @end
253  * in the extents status tree that satisfies @matching_fn.  If a match
254  * is found, it's returned in @es.  If not, and a matching extent is found
255  * beyond the block range, it's returned in @es.  If no match is found, an
256  * extent is returned in @es whose es_lblk, es_len, and es_pblk components
257  * are 0.
258  */
259 static void __es_find_extent_range(struct inode *inode,
260 				   int (*matching_fn)(struct extent_status *es),
261 				   ext4_lblk_t lblk, ext4_lblk_t end,
262 				   struct extent_status *es)
263 {
264 	struct ext4_es_tree *tree = NULL;
265 	struct extent_status *es1 = NULL;
266 	struct rb_node *node;
267 
268 	WARN_ON(es == NULL);
269 	WARN_ON(end < lblk);
270 
271 	tree = &EXT4_I(inode)->i_es_tree;
272 
273 	/* see if the extent has been cached */
274 	es->es_lblk = es->es_len = es->es_pblk = 0;
275 	es1 = READ_ONCE(tree->cache_es);
276 	if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
277 		es_debug("%u cached by [%u/%u) %llu %x\n",
278 			 lblk, es1->es_lblk, es1->es_len,
279 			 ext4_es_pblock(es1), ext4_es_status(es1));
280 		goto out;
281 	}
282 
283 	es1 = __es_tree_search(&tree->root, lblk);
284 
285 out:
286 	if (es1 && !matching_fn(es1)) {
287 		while ((node = rb_next(&es1->rb_node)) != NULL) {
288 			es1 = rb_entry(node, struct extent_status, rb_node);
289 			if (es1->es_lblk > end) {
290 				es1 = NULL;
291 				break;
292 			}
293 			if (matching_fn(es1))
294 				break;
295 		}
296 	}
297 
298 	if (es1 && matching_fn(es1)) {
299 		WRITE_ONCE(tree->cache_es, es1);
300 		es->es_lblk = es1->es_lblk;
301 		es->es_len = es1->es_len;
302 		es->es_pblk = es1->es_pblk;
303 	}
304 
305 }
306 
307 /*
308  * Locking for __es_find_extent_range() for external use
309  */
310 void ext4_es_find_extent_range(struct inode *inode,
311 			       int (*matching_fn)(struct extent_status *es),
312 			       ext4_lblk_t lblk, ext4_lblk_t end,
313 			       struct extent_status *es)
314 {
315 	trace_ext4_es_find_extent_range_enter(inode, lblk);
316 
317 	read_lock(&EXT4_I(inode)->i_es_lock);
318 	__es_find_extent_range(inode, matching_fn, lblk, end, es);
319 	read_unlock(&EXT4_I(inode)->i_es_lock);
320 
321 	trace_ext4_es_find_extent_range_exit(inode, es);
322 }
323 
324 /*
325  * __es_scan_range - search block range for block with specified status
326  *                   in extents status tree
327  *
328  * @inode - file containing the range
329  * @matching_fn - pointer to function that matches extents with desired status
330  * @lblk - logical block defining start of range
331  * @end - logical block defining end of range
332  *
333  * Returns true if at least one block in the specified block range satisfies
334  * the criterion specified by @matching_fn, and false if not.  If at least
335  * one extent has the specified status, then there is at least one block
336  * in the range with that status.  Should only be called by code that has
337  * taken i_es_lock.
338  */
339 static bool __es_scan_range(struct inode *inode,
340 			    int (*matching_fn)(struct extent_status *es),
341 			    ext4_lblk_t start, ext4_lblk_t end)
342 {
343 	struct extent_status es;
344 
345 	__es_find_extent_range(inode, matching_fn, start, end, &es);
346 	if (es.es_len == 0)
347 		return false;   /* no matching extent in the tree */
348 	else if (es.es_lblk <= start &&
349 		 start < es.es_lblk + es.es_len)
350 		return true;
351 	else if (start <= es.es_lblk && es.es_lblk <= end)
352 		return true;
353 	else
354 		return false;
355 }
356 /*
357  * Locking for __es_scan_range() for external use
358  */
359 bool ext4_es_scan_range(struct inode *inode,
360 			int (*matching_fn)(struct extent_status *es),
361 			ext4_lblk_t lblk, ext4_lblk_t end)
362 {
363 	bool ret;
364 
365 	read_lock(&EXT4_I(inode)->i_es_lock);
366 	ret = __es_scan_range(inode, matching_fn, lblk, end);
367 	read_unlock(&EXT4_I(inode)->i_es_lock);
368 
369 	return ret;
370 }
371 
372 /*
373  * __es_scan_clu - search cluster for block with specified status in
374  *                 extents status tree
375  *
376  * @inode - file containing the cluster
377  * @matching_fn - pointer to function that matches extents with desired status
378  * @lblk - logical block in cluster to be searched
379  *
380  * Returns true if at least one extent in the cluster containing @lblk
381  * satisfies the criterion specified by @matching_fn, and false if not.  If at
382  * least one extent has the specified status, then there is at least one block
383  * in the cluster with that status.  Should only be called by code that has
384  * taken i_es_lock.
385  */
386 static bool __es_scan_clu(struct inode *inode,
387 			  int (*matching_fn)(struct extent_status *es),
388 			  ext4_lblk_t lblk)
389 {
390 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
391 	ext4_lblk_t lblk_start, lblk_end;
392 
393 	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
394 	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
395 
396 	return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
397 }
398 
399 /*
400  * Locking for __es_scan_clu() for external use
401  */
402 bool ext4_es_scan_clu(struct inode *inode,
403 		      int (*matching_fn)(struct extent_status *es),
404 		      ext4_lblk_t lblk)
405 {
406 	bool ret;
407 
408 	read_lock(&EXT4_I(inode)->i_es_lock);
409 	ret = __es_scan_clu(inode, matching_fn, lblk);
410 	read_unlock(&EXT4_I(inode)->i_es_lock);
411 
412 	return ret;
413 }
414 
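/*
 * ext4_es_list_add()/ext4_es_list_del() maintain the per-superblock list
 * (sbi->s_es_list) of inodes holding reclaimable (non-delayed) entries;
 * the shrinker walks this list in __es_shrink().
 */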
415 static void ext4_es_list_add(struct inode *inode)
416 {
417 	struct ext4_inode_info *ei = EXT4_I(inode);
418 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
419 
420 	if (!list_empty(&ei->i_es_list))
421 		return;
422 
423 	spin_lock(&sbi->s_es_lock);
424 	if (list_empty(&ei->i_es_list)) {
425 		list_add_tail(&ei->i_es_list, &sbi->s_es_list);
426 		sbi->s_es_nr_inode++;
427 	}
428 	spin_unlock(&sbi->s_es_lock);
429 }
430 
431 static void ext4_es_list_del(struct inode *inode)
432 {
433 	struct ext4_inode_info *ei = EXT4_I(inode);
434 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
435 
436 	spin_lock(&sbi->s_es_lock);
437 	if (!list_empty(&ei->i_es_list)) {
438 		list_del_init(&ei->i_es_list);
439 		sbi->s_es_nr_inode--;
440 		WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
441 	}
442 	spin_unlock(&sbi->s_es_lock);
443 }
444 
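/*
 * The GFP_ATOMIC variant never sleeps and so is safe with i_es_lock held;
 * the nofail variant may sleep (__GFP_NOFAIL) and is used to preallocate
 * entries before the lock is taken, as in the retry loop of
 * ext4_es_insert_extent().
 */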
445 static inline struct pending_reservation *__alloc_pending(bool nofail)
446 {
447 	if (!nofail)
448 		return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
449 
450 	return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
451 }
452 
453 static inline void __free_pending(struct pending_reservation *pr)
454 {
455 	kmem_cache_free(ext4_pending_cachep, pr);
456 }
457 
458 /*
459  * Returns true if this extent_status entry must be kept in memory: its
460  * allocation must not fail and it cannot be reclaimed until its status changes.
461  */
462 static inline bool ext4_es_must_keep(struct extent_status *es)
463 {
464 	/* fiemap, bigalloc, and seek_data/hole need to use it. */
465 	if (ext4_es_is_delayed(es))
466 		return true;
467 
468 	return false;
469 }
470 
471 static inline struct extent_status *__es_alloc_extent(bool nofail)
472 {
473 	if (!nofail)
474 		return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
475 
476 	return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
477 }
478 
479 static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
480 		ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
481 {
482 	es->es_lblk = lblk;
483 	es->es_len = len;
484 	es->es_pblk = pblk;
485 
486 	/* We never try to reclaim a must-keep extent, so we don't count it. */
487 	if (!ext4_es_must_keep(es)) {
488 		if (!EXT4_I(inode)->i_es_shk_nr++)
489 			ext4_es_list_add(inode);
490 		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
491 					s_es_stats.es_stats_shk_cnt);
492 	}
493 
494 	EXT4_I(inode)->i_es_all_nr++;
495 	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
496 }
497 
498 static inline void __es_free_extent(struct extent_status *es)
499 {
500 	kmem_cache_free(ext4_es_cachep, es);
501 }
502 
503 static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
504 {
505 	EXT4_I(inode)->i_es_all_nr--;
506 	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
507 
508 	/* Decrease the shrink counter when we can reclaim the extent. */
509 	if (!ext4_es_must_keep(es)) {
510 		BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
511 		if (!--EXT4_I(inode)->i_es_shk_nr)
512 			ext4_es_list_del(inode);
513 		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
514 					s_es_stats.es_stats_shk_cnt);
515 	}
516 
517 	__es_free_extent(es);
518 }
519 
520 /*
521  * Check whether or not two extents can be merged.
522  * Conditions:
523  *  - logical block numbers are contiguous
524  *  - physical block numbers are contiguous
525  *  - status is equal
526  */
527 static int ext4_es_can_be_merged(struct extent_status *es1,
528 				 struct extent_status *es2)
529 {
530 	if (ext4_es_type(es1) != ext4_es_type(es2))
531 		return 0;
532 
533 	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
534 		pr_warn("ES assertion failed when merging extents. "
535 			"The sum of lengths of es1 (%d) and es2 (%d) "
536 			"is bigger than allowed file size (%d)\n",
537 			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
538 		WARN_ON(1);
539 		return 0;
540 	}
541 
542 	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
543 		return 0;
544 
545 	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
546 	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
547 		return 1;
548 
549 	if (ext4_es_is_hole(es1))
550 		return 1;
551 
552 	/* we need to check that the delayed extent is without unwritten status */
553 	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
554 		return 1;
555 
556 	return 0;
557 }
558 
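/*
 * Try to merge @es with its neighbour in the tree.  When
 * ext4_es_can_be_merged() allows it, the two entries are collapsed into
 * one: the surviving node's length is extended, the other node is erased
 * and freed, and the surviving extent is returned.
 */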
559 static struct extent_status *
560 ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
561 {
562 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
563 	struct extent_status *es1;
564 	struct rb_node *node;
565 
566 	node = rb_prev(&es->rb_node);
567 	if (!node)
568 		return es;
569 
570 	es1 = rb_entry(node, struct extent_status, rb_node);
571 	if (ext4_es_can_be_merged(es1, es)) {
572 		es1->es_len += es->es_len;
573 		if (ext4_es_is_referenced(es))
574 			ext4_es_set_referenced(es1);
575 		rb_erase(&es->rb_node, &tree->root);
576 		ext4_es_free_extent(inode, es);
577 		es = es1;
578 	}
579 
580 	return es;
581 }
582 
583 static struct extent_status *
584 ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
585 {
586 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
587 	struct extent_status *es1;
588 	struct rb_node *node;
589 
590 	node = rb_next(&es->rb_node);
591 	if (!node)
592 		return es;
593 
594 	es1 = rb_entry(node, struct extent_status, rb_node);
595 	if (ext4_es_can_be_merged(es, es1)) {
596 		es->es_len += es1->es_len;
597 		if (ext4_es_is_referenced(es1))
598 			ext4_es_set_referenced(es);
599 		rb_erase(node, &tree->root);
600 		ext4_es_free_extent(inode, es1);
601 	}
602 
603 	return es;
604 }
605 
606 #ifdef ES_AGGRESSIVE_TEST
607 #include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */
608 
609 static void ext4_es_insert_extent_ext_check(struct inode *inode,
610 					    struct extent_status *es)
611 {
612 	struct ext4_ext_path *path = NULL;
613 	struct ext4_extent *ex;
614 	ext4_lblk_t ee_block;
615 	ext4_fsblk_t ee_start;
616 	unsigned short ee_len;
617 	int depth, ee_status, es_status;
618 
619 	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
620 	if (IS_ERR(path))
621 		return;
622 
623 	depth = ext_depth(inode);
624 	ex = path[depth].p_ext;
625 
626 	if (ex) {
627 
628 		ee_block = le32_to_cpu(ex->ee_block);
629 		ee_start = ext4_ext_pblock(ex);
630 		ee_len = ext4_ext_get_actual_len(ex);
631 
632 		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
633 		es_status = ext4_es_is_unwritten(es) ? 1 : 0;
634 
635 		/*
636 		 * Make sure ex and es do not overlap when we try to insert
637 		 * a delayed/hole extent.
638 		 */
639 		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
640 			if (in_range(es->es_lblk, ee_block, ee_len)) {
641 				pr_warn("ES insert assertion failed for "
642 					"inode: %lu we can find an extent "
643 					"at block [%d/%d/%llu/%c], but we "
644 					"want to add a delayed/hole extent "
645 					"[%d/%d/%llu/%x]\n",
646 					inode->i_ino, ee_block, ee_len,
647 					ee_start, ee_status ? 'u' : 'w',
648 					es->es_lblk, es->es_len,
649 					ext4_es_pblock(es), ext4_es_status(es));
650 			}
651 			goto out;
652 		}
653 
654 		/*
655 		 * We don't check ee_block == es->es_lblk, etc. because es
656 		 * might be a part of a whole extent, and vice versa.
657 		 */
658 		if (es->es_lblk < ee_block ||
659 		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
660 			pr_warn("ES insert assertion failed for inode: %lu "
661 				"ex_status [%d/%d/%llu/%c] != "
662 				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
663 				ee_block, ee_len, ee_start,
664 				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
665 				ext4_es_pblock(es), es_status ? 'u' : 'w');
666 			goto out;
667 		}
668 
669 		if (ee_status ^ es_status) {
670 			pr_warn("ES insert assertion failed for inode: %lu "
671 				"ex_status [%d/%d/%llu/%c] != "
672 				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
673 				ee_block, ee_len, ee_start,
674 				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
675 				ext4_es_pblock(es), es_status ? 'u' : 'w');
676 		}
677 	} else {
678 		/*
679 		 * We can't find an extent on disk.  So we need to make sure
680 		 * that we are not trying to add a written/unwritten extent.
681 		 */
682 		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
683 			pr_warn("ES insert assertion failed for inode: %lu "
684 				"can't find an extent at block %d but we want "
685 				"to add a written/unwritten extent "
686 				"[%d/%d/%llu/%x]\n", inode->i_ino,
687 				es->es_lblk, es->es_lblk, es->es_len,
688 				ext4_es_pblock(es), ext4_es_status(es));
689 		}
690 	}
691 out:
692 	ext4_ext_drop_refs(path);
693 	kfree(path);
694 }
695 
696 static void ext4_es_insert_extent_ind_check(struct inode *inode,
697 					    struct extent_status *es)
698 {
699 	struct ext4_map_blocks map;
700 	int retval;
701 
702 	/*
703 	 * Here we call ext4_ind_map_blocks to look up a block mapping because
704 	 * the 'Indirect' structure is defined in indirect.c.  So we can't
705 	 * access the direct/indirect tree from outside.  It would be too ugly
706 	 * to define this function in indirect.c.
707 	 */
708 
709 	map.m_lblk = es->es_lblk;
710 	map.m_len = es->es_len;
711 
712 	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
713 	if (retval > 0) {
714 		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
715 			/*
716 			 * We want to add a delayed/hole extent but this
717 			 * block has been allocated.
718 			 */
719 			pr_warn("ES insert assertion failed for inode: %lu "
720 				"We can find blocks but we want to add a "
721 				"delayed/hole extent [%d/%d/%llu/%x]\n",
722 				inode->i_ino, es->es_lblk, es->es_len,
723 				ext4_es_pblock(es), ext4_es_status(es));
724 			return;
725 		} else if (ext4_es_is_written(es)) {
726 			if (retval != es->es_len) {
727 				pr_warn("ES insert assertion failed for "
728 					"inode: %lu retval %d != es_len %d\n",
729 					inode->i_ino, retval, es->es_len);
730 				return;
731 			}
732 			if (map.m_pblk != ext4_es_pblock(es)) {
733 				pr_warn("ES insert assertion failed for "
734 					"inode: %lu m_pblk %llu != "
735 					"es_pblk %llu\n",
736 					inode->i_ino, map.m_pblk,
737 					ext4_es_pblock(es));
738 				return;
739 			}
740 		} else {
741 			/*
742 			 * We don't need to check the unwritten case because
743 			 * indirect-based files don't have unwritten extents.
744 			 */
745 			BUG();
746 		}
747 	} else if (retval == 0) {
748 		if (ext4_es_is_written(es)) {
749 			pr_warn("ES insert assertion failed for inode: %lu "
750 				"We can't find the block but we want to add "
751 				"a written extent [%d/%d/%llu/%x]\n",
752 				inode->i_ino, es->es_lblk, es->es_len,
753 				ext4_es_pblock(es), ext4_es_status(es));
754 			return;
755 		}
756 	}
757 }
758 
759 static inline void ext4_es_insert_extent_check(struct inode *inode,
760 					       struct extent_status *es)
761 {
762 	/*
763 	 * We don't need to worry about races because the
764 	 * caller holds i_data_sem.
765 	 */
766 	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
767 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
768 		ext4_es_insert_extent_ext_check(inode, es);
769 	else
770 		ext4_es_insert_extent_ind_check(inode, es);
771 }
772 #else
773 static inline void ext4_es_insert_extent_check(struct inode *inode,
774 					       struct extent_status *es)
775 {
776 }
777 #endif
778 
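/*
 * __es_insert_extent() walks the rbtree looking for a place to put @newes.
 * If @newes can be merged with an existing neighbour it is absorbed there;
 * otherwise a new node is allocated (using @prealloc if the caller supplied
 * one) and linked into the tree.  Called with i_es_lock held for writing.
 */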
779 static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
780 			      struct extent_status *prealloc)
781 {
782 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
783 	struct rb_node **p = &tree->root.rb_node;
784 	struct rb_node *parent = NULL;
785 	struct extent_status *es;
786 
787 	while (*p) {
788 		parent = *p;
789 		es = rb_entry(parent, struct extent_status, rb_node);
790 
791 		if (newes->es_lblk < es->es_lblk) {
792 			if (ext4_es_can_be_merged(newes, es)) {
793 				/*
794 				 * Here we can modify es_lblk directly
795 				 * because the ranges do not overlap.
796 				 */
797 				es->es_lblk = newes->es_lblk;
798 				es->es_len += newes->es_len;
799 				if (ext4_es_is_written(es) ||
800 				    ext4_es_is_unwritten(es))
801 					ext4_es_store_pblock(es,
802 							     newes->es_pblk);
803 				es = ext4_es_try_to_merge_left(inode, es);
804 				goto out;
805 			}
806 			p = &(*p)->rb_left;
807 		} else if (newes->es_lblk > ext4_es_end(es)) {
808 			if (ext4_es_can_be_merged(es, newes)) {
809 				es->es_len += newes->es_len;
810 				es = ext4_es_try_to_merge_right(inode, es);
811 				goto out;
812 			}
813 			p = &(*p)->rb_right;
814 		} else {
815 			BUG();
816 			return -EINVAL;
817 		}
818 	}
819 
820 	if (prealloc)
821 		es = prealloc;
822 	else
823 		es = __es_alloc_extent(false);
824 	if (!es)
825 		return -ENOMEM;
826 	ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
827 			    newes->es_pblk);
828 
829 	rb_link_node(&es->rb_node, parent, p);
830 	rb_insert_color(&es->rb_node, &tree->root);
831 
832 out:
833 	tree->cache_es = es;
834 	return 0;
835 }
836 
837 /*
838  * ext4_es_insert_extent() adds information to an inode's extent
839  * status tree.
840  *
841  * Return 0 on success, error code on failure.
842  */
843 int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
844 			  ext4_lblk_t len, ext4_fsblk_t pblk,
845 			  unsigned int status)
846 {
847 	struct extent_status newes;
848 	ext4_lblk_t end = lblk + len - 1;
849 	int err1 = 0, err2 = 0, err3 = 0;
850 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
851 	struct extent_status *es1 = NULL;
852 	struct extent_status *es2 = NULL;
853 	struct pending_reservation *pr = NULL;
854 	bool revise_pending = false;
855 
856 	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
857 		 lblk, len, pblk, status, inode->i_ino);
858 
859 	if (!len)
860 		return 0;
861 
862 	BUG_ON(end < lblk);
863 
864 	if ((status & EXTENT_STATUS_DELAYED) &&
865 	    (status & EXTENT_STATUS_WRITTEN)) {
866 		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
867 				"delayed and written which can potentially "
868 				"cause data loss.", lblk, len);
869 		WARN_ON(1);
870 	}
871 
872 	newes.es_lblk = lblk;
873 	newes.es_len = len;
874 	ext4_es_store_pblock_status(&newes, pblk, status);
875 	trace_ext4_es_insert_extent(inode, &newes);
876 
877 	ext4_es_insert_extent_check(inode, &newes);
878 
879 	revise_pending = sbi->s_cluster_ratio > 1 &&
880 			 test_opt(inode->i_sb, DELALLOC) &&
881 			 (status & (EXTENT_STATUS_WRITTEN |
882 				    EXTENT_STATUS_UNWRITTEN));
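	/*
	 * On the first pass no preallocation is done (all errN are zero).
	 * If an allocation fails while i_es_lock is held, the lock is
	 * dropped, the needed objects are preallocated with __GFP_NOFAIL
	 * and the whole operation is retried.
	 */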
883 retry:
884 	if (err1 && !es1)
885 		es1 = __es_alloc_extent(true);
886 	if ((err1 || err2) && !es2)
887 		es2 = __es_alloc_extent(true);
888 	if ((err1 || err2 || err3) && revise_pending && !pr)
889 		pr = __alloc_pending(true);
890 	write_lock(&EXT4_I(inode)->i_es_lock);
891 
892 	err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
893 	if (err1 != 0)
894 		goto error;
895 	/* Free preallocated extent if it didn't get used. */
896 	if (es1) {
897 		if (!es1->es_len)
898 			__es_free_extent(es1);
899 		es1 = NULL;
900 	}
901 
902 	err2 = __es_insert_extent(inode, &newes, es2);
903 	if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
904 		err2 = 0;
905 	if (err2 != 0)
906 		goto error;
907 	/* Free preallocated extent if it didn't get used. */
908 	if (es2) {
909 		if (!es2->es_len)
910 			__es_free_extent(es2);
911 		es2 = NULL;
912 	}
913 
914 	if (revise_pending) {
915 		err3 = __revise_pending(inode, lblk, len, &pr);
916 		if (err3 != 0)
917 			goto error;
918 		if (pr) {
919 			__free_pending(pr);
920 			pr = NULL;
921 		}
922 	}
923 error:
924 	write_unlock(&EXT4_I(inode)->i_es_lock);
925 	if (err1 || err2 || err3)
926 		goto retry;
927 
928 	ext4_es_print_tree(inode);
929 	return 0;
930 }
931 
932 /*
933  * ext4_es_cache_extent() inserts information into the extent status
934  * tree if and only if there isn't information about the range in
935  * question already.
936  */
937 void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
938 			  ext4_lblk_t len, ext4_fsblk_t pblk,
939 			  unsigned int status)
940 {
941 	struct extent_status *es;
942 	struct extent_status newes;
943 	ext4_lblk_t end = lblk + len - 1;
944 
945 	newes.es_lblk = lblk;
946 	newes.es_len = len;
947 	ext4_es_store_pblock_status(&newes, pblk, status);
948 	trace_ext4_es_cache_extent(inode, &newes);
949 
950 	if (!len)
951 		return;
952 
953 	BUG_ON(end < lblk);
954 
955 	write_lock(&EXT4_I(inode)->i_es_lock);
956 
957 	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
958 	if (!es || es->es_lblk > end)
959 		__es_insert_extent(inode, &newes, NULL);
960 	write_unlock(&EXT4_I(inode)->i_es_lock);
961 }
962 
963 /*
964  * ext4_es_lookup_extent() looks up an extent in extent status tree.
965  *
966  * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
967  *
968  * Return: 1 if found, 0 if not
969  */
970 int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
971 			  ext4_lblk_t *next_lblk,
972 			  struct extent_status *es)
973 {
974 	struct ext4_es_tree *tree;
975 	struct ext4_es_stats *stats;
976 	struct extent_status *es1 = NULL;
977 	struct rb_node *node;
978 	int found = 0;
979 
980 	trace_ext4_es_lookup_extent_enter(inode, lblk);
981 	es_debug("lookup extent in block %u\n", lblk);
982 
983 	tree = &EXT4_I(inode)->i_es_tree;
984 	read_lock(&EXT4_I(inode)->i_es_lock);
985 
986 	/* first see if the extent is in the cache */
987 	es->es_lblk = es->es_len = es->es_pblk = 0;
988 	es1 = READ_ONCE(tree->cache_es);
989 	if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
990 		es_debug("%u cached by [%u/%u)\n",
991 			 lblk, es1->es_lblk, es1->es_len);
992 		found = 1;
993 		goto out;
994 	}
995 
996 	node = tree->root.rb_node;
997 	while (node) {
998 		es1 = rb_entry(node, struct extent_status, rb_node);
999 		if (lblk < es1->es_lblk)
1000 			node = node->rb_left;
1001 		else if (lblk > ext4_es_end(es1))
1002 			node = node->rb_right;
1003 		else {
1004 			found = 1;
1005 			break;
1006 		}
1007 	}
1008 
1009 out:
1010 	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
1011 	if (found) {
1012 		BUG_ON(!es1);
1013 		es->es_lblk = es1->es_lblk;
1014 		es->es_len = es1->es_len;
1015 		es->es_pblk = es1->es_pblk;
1016 		if (!ext4_es_is_referenced(es1))
1017 			ext4_es_set_referenced(es1);
1018 		percpu_counter_inc(&stats->es_stats_cache_hits);
1019 		if (next_lblk) {
1020 			node = rb_next(&es1->rb_node);
1021 			if (node) {
1022 				es1 = rb_entry(node, struct extent_status,
1023 					       rb_node);
1024 				*next_lblk = es1->es_lblk;
1025 			} else
1026 				*next_lblk = 0;
1027 		}
1028 	} else {
1029 		percpu_counter_inc(&stats->es_stats_cache_misses);
1030 	}
1031 
1032 	read_unlock(&EXT4_I(inode)->i_es_lock);
1033 
1034 	trace_ext4_es_lookup_extent_exit(inode, es, found);
1035 	return found;
1036 }
1037 
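/*
 * Book-keeping used while removing a block range from the tree when
 * reservation counting is requested: ndelonly counts clusters containing
 * delayed-and-not-unwritten (delonly) blocks whose reservations may be
 * released, and the remaining fields track partial clusters at the edges
 * of the range so they are not counted twice.
 */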
1038 struct rsvd_count {
1039 	int ndelonly;
1040 	bool first_do_lblk_found;
1041 	ext4_lblk_t first_do_lblk;
1042 	ext4_lblk_t last_do_lblk;
1043 	struct extent_status *left_es;
1044 	bool partial;
1045 	ext4_lblk_t lclu;
1046 };
1047 
1048 /*
1049  * init_rsvd - initialize reserved count data before removing block range
1050  *	       in file from extent status tree
1051  *
1052  * @inode - file containing range
1053  * @lblk - first block in range
1054  * @es - pointer to first extent in range
1055  * @rc - pointer to reserved count data
1056  *
1057  * Assumes es is not NULL
1058  */
1059 static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
1060 		      struct extent_status *es, struct rsvd_count *rc)
1061 {
1062 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1063 	struct rb_node *node;
1064 
1065 	rc->ndelonly = 0;
1066 
1067 	/*
1068 	 * for bigalloc, note the first delonly block in the range has not
1069 	 * been found, record the extent containing the block to the left of
1070 	 * the region to be removed, if any, and note that there's no partial
1071 	 * cluster to track
1072 	 */
1073 	if (sbi->s_cluster_ratio > 1) {
1074 		rc->first_do_lblk_found = false;
1075 		if (lblk > es->es_lblk) {
1076 			rc->left_es = es;
1077 		} else {
1078 			node = rb_prev(&es->rb_node);
1079 			rc->left_es = node ? rb_entry(node,
1080 						      struct extent_status,
1081 						      rb_node) : NULL;
1082 		}
1083 		rc->partial = false;
1084 	}
1085 }
1086 
1087 /*
1088  * count_rsvd - count the clusters containing delayed and not unwritten
1089  *		(delonly) blocks in a range within an extent and add to
1090  *	        the running tally in rsvd_count
1091  *
1092  * @inode - file containing extent
1093  * @lblk - first block in range
1094  * @len - length of range in blocks
1095  * @es - pointer to extent containing clusters to be counted
1096  * @rc - pointer to reserved count data
1097  *
1098  * Tracks partial clusters found at the beginning and end of extents so
1099  * they aren't overcounted when they span adjacent extents
1100  */
1101 static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
1102 		       struct extent_status *es, struct rsvd_count *rc)
1103 {
1104 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1105 	ext4_lblk_t i, end, nclu;
1106 
1107 	if (!ext4_es_is_delonly(es))
1108 		return;
1109 
1110 	WARN_ON(len <= 0);
1111 
1112 	if (sbi->s_cluster_ratio == 1) {
1113 		rc->ndelonly += (int) len;
1114 		return;
1115 	}
1116 
1117 	/* bigalloc */
1118 
1119 	i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
1120 	end = lblk + (ext4_lblk_t) len - 1;
1121 	end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
1122 
1123 	/* record the first block of the first delonly extent seen */
1124 	if (rc->first_do_lblk_found == false) {
1125 		rc->first_do_lblk = i;
1126 		rc->first_do_lblk_found = true;
1127 	}
1128 
1129 	/* update the last lblk in the region seen so far */
1130 	rc->last_do_lblk = end;
1131 
1132 	/*
1133 	 * if we're tracking a partial cluster and the current extent
1134 	 * doesn't start with it, count it and stop tracking
1135 	 */
1136 	if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
1137 		rc->ndelonly++;
1138 		rc->partial = false;
1139 	}
1140 
1141 	/*
1142 	 * if the first cluster doesn't start on a cluster boundary but
1143 	 * ends on one, count it
1144 	 */
1145 	if (EXT4_LBLK_COFF(sbi, i) != 0) {
1146 		if (end >= EXT4_LBLK_CFILL(sbi, i)) {
1147 			rc->ndelonly++;
1148 			rc->partial = false;
1149 			i = EXT4_LBLK_CFILL(sbi, i) + 1;
1150 		}
1151 	}
1152 
1153 	/*
1154 	 * if the current cluster starts on a cluster boundary, count the
1155 	 * number of whole delonly clusters in the extent
1156 	 */
1157 	if ((i + sbi->s_cluster_ratio - 1) <= end) {
1158 		nclu = (end - i + 1) >> sbi->s_cluster_bits;
1159 		rc->ndelonly += nclu;
1160 		i += nclu << sbi->s_cluster_bits;
1161 	}
1162 
1163 	/*
1164 	 * start tracking a partial cluster if there's a partial at the end
1165 	 * of the current extent and we're not already tracking one
1166 	 */
1167 	if (!rc->partial && i <= end) {
1168 		rc->partial = true;
1169 		rc->lclu = EXT4_B2C(sbi, i);
1170 	}
1171 }
1172 
1173 /*
1174  * __pr_tree_search - search for a pending cluster reservation
1175  *
1176  * @root - root of pending reservation tree
1177  * @lclu - logical cluster to search for
1178  *
1179  * Returns the pending reservation for the cluster identified by @lclu
1180  * if found.  If not, returns a reservation for the next cluster if any,
1181  * and if not, returns NULL.
1182  */
1183 static struct pending_reservation *__pr_tree_search(struct rb_root *root,
1184 						    ext4_lblk_t lclu)
1185 {
1186 	struct rb_node *node = root->rb_node;
1187 	struct pending_reservation *pr = NULL;
1188 
1189 	while (node) {
1190 		pr = rb_entry(node, struct pending_reservation, rb_node);
1191 		if (lclu < pr->lclu)
1192 			node = node->rb_left;
1193 		else if (lclu > pr->lclu)
1194 			node = node->rb_right;
1195 		else
1196 			return pr;
1197 	}
1198 	if (pr && lclu < pr->lclu)
1199 		return pr;
1200 	if (pr && lclu > pr->lclu) {
1201 		node = rb_next(&pr->rb_node);
1202 		return node ? rb_entry(node, struct pending_reservation,
1203 				       rb_node) : NULL;
1204 	}
1205 	return NULL;
1206 }
1207 
1208 /*
1209  * get_rsvd - calculates and returns the number of cluster reservations to be
1210  *	      released when removing a block range from the extent status tree
1211  *	      and releases any pending reservations within the range
1212  *
1213  * @inode - file containing block range
1214  * @end - last block in range
1215  * @right_es - pointer to extent containing next block beyond end or NULL
1216  * @rc - pointer to reserved count data
1217  *
1218  * The number of reservations to be released is equal to the number of
1219  * clusters containing delayed and not unwritten (delonly) blocks within
1220  * the range, minus the number of clusters still containing delonly blocks
1221  * at the ends of the range, and minus the number of pending reservations
1222  * within the range.
1223  */
1224 static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
1225 			     struct extent_status *right_es,
1226 			     struct rsvd_count *rc)
1227 {
1228 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1229 	struct pending_reservation *pr;
1230 	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1231 	struct rb_node *node;
1232 	ext4_lblk_t first_lclu, last_lclu;
1233 	bool left_delonly, right_delonly, count_pending;
1234 	struct extent_status *es;
1235 
1236 	if (sbi->s_cluster_ratio > 1) {
1237 		/* count any remaining partial cluster */
1238 		if (rc->partial)
1239 			rc->ndelonly++;
1240 
1241 		if (rc->ndelonly == 0)
1242 			return 0;
1243 
1244 		first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
1245 		last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
1246 
1247 		/*
1248 		 * decrease the delonly count by the number of clusters at the
1249 		 * ends of the range that still contain delonly blocks -
1250 		 * these clusters still need to be reserved
1251 		 */
1252 		left_delonly = right_delonly = false;
1253 
1254 		es = rc->left_es;
1255 		while (es && ext4_es_end(es) >=
1256 		       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
1257 			if (ext4_es_is_delonly(es)) {
1258 				rc->ndelonly--;
1259 				left_delonly = true;
1260 				break;
1261 			}
1262 			node = rb_prev(&es->rb_node);
1263 			if (!node)
1264 				break;
1265 			es = rb_entry(node, struct extent_status, rb_node);
1266 		}
1267 		if (right_es && (!left_delonly || first_lclu != last_lclu)) {
1268 			if (end < ext4_es_end(right_es)) {
1269 				es = right_es;
1270 			} else {
1271 				node = rb_next(&right_es->rb_node);
1272 				es = node ? rb_entry(node, struct extent_status,
1273 						     rb_node) : NULL;
1274 			}
1275 			while (es && es->es_lblk <=
1276 			       EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
1277 				if (ext4_es_is_delonly(es)) {
1278 					rc->ndelonly--;
1279 					right_delonly = true;
1280 					break;
1281 				}
1282 				node = rb_next(&es->rb_node);
1283 				if (!node)
1284 					break;
1285 				es = rb_entry(node, struct extent_status,
1286 					      rb_node);
1287 			}
1288 		}
1289 
1290 		/*
1291 		 * Determine the block range that should be searched for
1292 		 * pending reservations, if any.  Clusters on the ends of the
1293 		 * original removed range containing delonly blocks are
1294 		 * excluded.  They've already been accounted for and it's not
1295 		 * possible to determine if an associated pending reservation
1296 		 * should be released with the information available in the
1297 		 * extents status tree.
1298 		 */
1299 		if (first_lclu == last_lclu) {
1300 			if (left_delonly | right_delonly)
1301 				count_pending = false;
1302 			else
1303 				count_pending = true;
1304 		} else {
1305 			if (left_delonly)
1306 				first_lclu++;
1307 			if (right_delonly)
1308 				last_lclu--;
1309 			if (first_lclu <= last_lclu)
1310 				count_pending = true;
1311 			else
1312 				count_pending = false;
1313 		}
1314 
1315 		/*
1316 		 * a pending reservation found between first_lclu and last_lclu
1317 		 * represents an allocated cluster that contained at least one
1318 		 * delonly block, so the delonly total must be reduced by one
1319 		 * for each pending reservation found and released
1320 		 */
1321 		if (count_pending) {
1322 			pr = __pr_tree_search(&tree->root, first_lclu);
1323 			while (pr && pr->lclu <= last_lclu) {
1324 				rc->ndelonly--;
1325 				node = rb_next(&pr->rb_node);
1326 				rb_erase(&pr->rb_node, &tree->root);
1327 				__free_pending(pr);
1328 				if (!node)
1329 					break;
1330 				pr = rb_entry(node, struct pending_reservation,
1331 					      rb_node);
1332 			}
1333 		}
1334 	}
1335 	return rc->ndelonly;
1336 }
1337 
1338 
1339 /*
1340  * __es_remove_extent - removes block range from extent status tree
1341  *
1342  * @inode - file containing range
1343  * @lblk - first block in range
1344  * @end - last block in range
1345  * @reserved - number of cluster reservations released
1346  * @prealloc - pre-allocated es to avoid memory allocation failures
1347  *
1348  * If @reserved is not NULL and delayed allocation is enabled, counts
1349  * block/cluster reservations freed by removing range and if bigalloc
1350  * enabled cancels pending reservations as needed. Returns 0 on success,
1351  * error code on failure.
1352  */
1353 static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1354 			      ext4_lblk_t end, int *reserved,
1355 			      struct extent_status *prealloc)
1356 {
1357 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
1358 	struct rb_node *node;
1359 	struct extent_status *es;
1360 	struct extent_status orig_es;
1361 	ext4_lblk_t len1, len2;
1362 	ext4_fsblk_t block;
1363 	int err = 0;
1364 	bool count_reserved = true;
1365 	struct rsvd_count rc;
1366 
1367 	if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
1368 		count_reserved = false;
1369 
1370 	es = __es_tree_search(&tree->root, lblk);
1371 	if (!es)
1372 		goto out;
1373 	if (es->es_lblk > end)
1374 		goto out;
1375 
1376 	/* Simply invalidate cache_es. */
1377 	tree->cache_es = NULL;
1378 	if (count_reserved)
1379 		init_rsvd(inode, lblk, es, &rc);
1380 
1381 	orig_es.es_lblk = es->es_lblk;
1382 	orig_es.es_len = es->es_len;
1383 	orig_es.es_pblk = es->es_pblk;
1384 
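	/*
	 * len1 is the number of blocks of this extent lying before the
	 * removed range, len2 the number lying after it: a nonzero len1
	 * shrinks the extent in place, and a nonzero len2 either splits
	 * off a new extent (if len1 is also nonzero) or shifts this one
	 * past the removed range.
	 */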
1385 	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
1386 	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
1387 	if (len1 > 0)
1388 		es->es_len = len1;
1389 	if (len2 > 0) {
1390 		if (len1 > 0) {
1391 			struct extent_status newes;
1392 
1393 			newes.es_lblk = end + 1;
1394 			newes.es_len = len2;
1395 			block = 0x7FDEADBEEFULL;
1396 			if (ext4_es_is_written(&orig_es) ||
1397 			    ext4_es_is_unwritten(&orig_es))
1398 				block = ext4_es_pblock(&orig_es) +
1399 					orig_es.es_len - len2;
1400 			ext4_es_store_pblock_status(&newes, block,
1401 						    ext4_es_status(&orig_es));
1402 			err = __es_insert_extent(inode, &newes, prealloc);
1403 			if (err) {
1404 				if (!ext4_es_must_keep(&newes))
1405 					return 0;
1406 
1407 				es->es_lblk = orig_es.es_lblk;
1408 				es->es_len = orig_es.es_len;
1409 				goto out;
1410 			}
1411 		} else {
1412 			es->es_lblk = end + 1;
1413 			es->es_len = len2;
1414 			if (ext4_es_is_written(es) ||
1415 			    ext4_es_is_unwritten(es)) {
1416 				block = orig_es.es_pblk + orig_es.es_len - len2;
1417 				ext4_es_store_pblock(es, block);
1418 			}
1419 		}
1420 		if (count_reserved)
1421 			count_rsvd(inode, orig_es.es_lblk + len1,
1422 				   orig_es.es_len - len1 - len2, &orig_es, &rc);
1423 		goto out_get_reserved;
1424 	}
1425 
1426 	if (len1 > 0) {
1427 		if (count_reserved)
1428 			count_rsvd(inode, lblk, orig_es.es_len - len1,
1429 				   &orig_es, &rc);
1430 		node = rb_next(&es->rb_node);
1431 		if (node)
1432 			es = rb_entry(node, struct extent_status, rb_node);
1433 		else
1434 			es = NULL;
1435 	}
1436 
1437 	while (es && ext4_es_end(es) <= end) {
1438 		if (count_reserved)
1439 			count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
1440 		node = rb_next(&es->rb_node);
1441 		rb_erase(&es->rb_node, &tree->root);
1442 		ext4_es_free_extent(inode, es);
1443 		if (!node) {
1444 			es = NULL;
1445 			break;
1446 		}
1447 		es = rb_entry(node, struct extent_status, rb_node);
1448 	}
1449 
1450 	if (es && es->es_lblk < end + 1) {
1451 		ext4_lblk_t orig_len = es->es_len;
1452 
1453 		len1 = ext4_es_end(es) - end;
1454 		if (count_reserved)
1455 			count_rsvd(inode, es->es_lblk, orig_len - len1,
1456 				   es, &rc);
1457 		es->es_lblk = end + 1;
1458 		es->es_len = len1;
1459 		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
1460 			block = es->es_pblk + orig_len - len1;
1461 			ext4_es_store_pblock(es, block);
1462 		}
1463 	}
1464 
1465 out_get_reserved:
1466 	if (count_reserved)
1467 		*reserved = get_rsvd(inode, end, es, &rc);
1468 out:
1469 	return err;
1470 }
1471 
1472 /*
1473  * ext4_es_remove_extent - removes block range from extent status tree
1474  *
1475  * @inode - file containing range
1476  * @lblk - first block in range
1477  * @len - number of blocks to remove
1478  *
1479  * Reduces block/cluster reservation count and for bigalloc cancels pending
1480  * reservations as needed. Returns 0 on success, error code on failure.
1481  */
1482 int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
1483 			  ext4_lblk_t len)
1484 {
1485 	ext4_lblk_t end;
1486 	int err = 0;
1487 	int reserved = 0;
1488 	struct extent_status *es = NULL;
1489 
1490 	trace_ext4_es_remove_extent(inode, lblk, len);
1491 	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
1492 		 lblk, len, inode->i_ino);
1493 
1494 	if (!len)
1495 		return err;
1496 
1497 	end = lblk + len - 1;
1498 	BUG_ON(end < lblk);
1499 
1500 retry:
1501 	if (err && !es)
1502 		es = __es_alloc_extent(true);
1503 	/*
1504 	 * ext4_clear_inode() depends on us taking i_es_lock unconditionally
1505 	 * so that we are sure __es_shrink() is done with the inode before it
1506 	 * is reclaimed.
1507 	 */
1508 	write_lock(&EXT4_I(inode)->i_es_lock);
1509 	err = __es_remove_extent(inode, lblk, end, &reserved, es);
1510 	/* Free preallocated extent if it didn't get used. */
1511 	if (es) {
1512 		if (!es->es_len)
1513 			__es_free_extent(es);
1514 		es = NULL;
1515 	}
1516 	write_unlock(&EXT4_I(inode)->i_es_lock);
1517 	if (err)
1518 		goto retry;
1519 
1520 	ext4_es_print_tree(inode);
1521 	ext4_da_release_space(inode, reserved);
1522 	return 0;
1523 }
1524 
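/*
 * Core of the extent status shrinker: walk up to sbi->s_es_nr_inode
 * inodes on sbi->s_es_list, reclaiming extents from each until
 * nr_to_scan is exhausted.  Precached inodes and inodes whose i_es_lock
 * cannot be taken are skipped; a second pass that also scans precached
 * inodes is attempted only if no progress was made.
 */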
1525 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
1526 		       struct ext4_inode_info *locked_ei)
1527 {
1528 	struct ext4_inode_info *ei;
1529 	struct ext4_es_stats *es_stats;
1530 	ktime_t start_time;
1531 	u64 scan_time;
1532 	int nr_to_walk;
1533 	int nr_shrunk = 0;
1534 	int retried = 0, nr_skipped = 0;
1535 
1536 	es_stats = &sbi->s_es_stats;
1537 	start_time = ktime_get();
1538 
1539 retry:
1540 	spin_lock(&sbi->s_es_lock);
1541 	nr_to_walk = sbi->s_es_nr_inode;
1542 	while (nr_to_walk-- > 0) {
1543 		if (list_empty(&sbi->s_es_list)) {
1544 			spin_unlock(&sbi->s_es_lock);
1545 			goto out;
1546 		}
1547 		ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
1548 				      i_es_list);
1549 		/* Move the inode to the tail */
1550 		list_move_tail(&ei->i_es_list, &sbi->s_es_list);
1551 
1552 		/*
1553 		 * Normally we try hard to avoid shrinking precached inodes,
1554 		 * but we will as a last resort.
1555 		 */
1556 		if (!retried && ext4_test_inode_state(&ei->vfs_inode,
1557 						EXT4_STATE_EXT_PRECACHED)) {
1558 			nr_skipped++;
1559 			continue;
1560 		}
1561 
1562 		if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
1563 			nr_skipped++;
1564 			continue;
1565 		}
1566 		/*
1567 		 * Now we hold i_es_lock which protects us from inode reclaim
1568 		 * freeing inode under us
1569 		 */
1570 		spin_unlock(&sbi->s_es_lock);
1571 
1572 		nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
1573 		write_unlock(&ei->i_es_lock);
1574 
1575 		if (nr_to_scan <= 0)
1576 			goto out;
1577 		spin_lock(&sbi->s_es_lock);
1578 	}
1579 	spin_unlock(&sbi->s_es_lock);
1580 
1581 	/*
1582 	 * If we skipped any inodes, and we weren't able to make any
1583 	 * forward progress, try again to scan precached inodes.
1584 	 */
1585 	if ((nr_shrunk == 0) && nr_skipped && !retried) {
1586 		retried++;
1587 		goto retry;
1588 	}
1589 
1590 	if (locked_ei && nr_shrunk == 0)
1591 		nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
1592 
1593 out:
1594 	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1595 	if (likely(es_stats->es_stats_scan_time))
1596 		es_stats->es_stats_scan_time = (scan_time +
1597 				es_stats->es_stats_scan_time*3) / 4;
1598 	else
1599 		es_stats->es_stats_scan_time = scan_time;
1600 	if (scan_time > es_stats->es_stats_max_scan_time)
1601 		es_stats->es_stats_max_scan_time = scan_time;
1602 	if (likely(es_stats->es_stats_shrunk))
1603 		es_stats->es_stats_shrunk = (nr_shrunk +
1604 				es_stats->es_stats_shrunk*3) / 4;
1605 	else
1606 		es_stats->es_stats_shrunk = nr_shrunk;
1607 
1608 	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
1609 			     nr_skipped, retried);
1610 	return nr_shrunk;
1611 }
1612 
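/*
 * count_objects/scan_objects callbacks for the shrinker registered in
 * ext4_es_register_shrinker(): ext4_es_count() reports the number of
 * reclaimable objects and ext4_es_scan() asks __es_shrink() to reclaim
 * up to sc->nr_to_scan of them.
 */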
1613 static unsigned long ext4_es_count(struct shrinker *shrink,
1614 				   struct shrink_control *sc)
1615 {
1616 	unsigned long nr;
1617 	struct ext4_sb_info *sbi;
1618 
1619 	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1620 	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1621 	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1622 	return nr;
1623 }
1624 
1625 static unsigned long ext4_es_scan(struct shrinker *shrink,
1626 				  struct shrink_control *sc)
1627 {
1628 	struct ext4_sb_info *sbi = container_of(shrink,
1629 					struct ext4_sb_info, s_es_shrinker);
1630 	int nr_to_scan = sc->nr_to_scan;
1631 	int ret, nr_shrunk;
1632 
1633 	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1634 	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1635 
1636 	nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
1637 
1638 	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
1639 	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1640 	return nr_shrunk;
1641 }
1642 
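/*
 * seq_file show callback (hooked up elsewhere, typically under
 * /proc/fs/ext4) that dumps the shrinker statistics kept in
 * sbi->s_es_stats plus the inode on s_es_list holding the most objects.
 */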
1643 int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
1644 {
1645 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
1646 	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1647 	struct ext4_inode_info *ei, *max = NULL;
1648 	unsigned int inode_cnt = 0;
1649 
1650 	if (v != SEQ_START_TOKEN)
1651 		return 0;
1652 
1653 	/* here we just find an inode that has the max nr. of objects */
1654 	spin_lock(&sbi->s_es_lock);
1655 	list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
1656 		inode_cnt++;
1657 		if (max && max->i_es_all_nr < ei->i_es_all_nr)
1658 			max = ei;
1659 		else if (!max)
1660 			max = ei;
1661 	}
1662 	spin_unlock(&sbi->s_es_lock);
1663 
1664 	seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
1665 		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1666 		   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
1667 	seq_printf(seq, "  %lld/%lld cache hits/misses\n",
1668 		   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
1669 		   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
1670 	if (inode_cnt)
1671 		seq_printf(seq, "  %d inodes on list\n", inode_cnt);
1672 
1673 	seq_printf(seq, "average:\n  %llu us scan time\n",
1674 	    div_u64(es_stats->es_stats_scan_time, 1000));
1675 	seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
1676 	if (inode_cnt)
1677 		seq_printf(seq,
1678 		    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
1679 		    "  %llu us max scan time\n",
1680 		    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
1681 		    div_u64(es_stats->es_stats_max_scan_time, 1000));
1682 
1683 	return 0;
1684 }
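
/*
 * For reference, the seq_file output above renders roughly as follows.
 * The numbers are invented; the file is typically exposed as
 * es_shrinker_info in the per-filesystem ext4 procfs directory:
 *
 *	stats:
 *	  1228 objects
 *	  967 reclaimable objects
 *	  5341/212 cache hits/misses
 *	  3 inodes on list
 *	average:
 *	  43 us scan time
 *	  12 shrunk objects
 *	maximum:
 *	  131 inode (512 objects, 480 reclaimable)
 *	  188 us max scan time
 */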
1685 
1686 int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1687 {
1688 	int err;
1689 
1690 	/* Make sure we have enough bits for physical block number */
1691 	BUILD_BUG_ON(ES_SHIFT < 48);
1692 	INIT_LIST_HEAD(&sbi->s_es_list);
1693 	sbi->s_es_nr_inode = 0;
1694 	spin_lock_init(&sbi->s_es_lock);
1695 	sbi->s_es_stats.es_stats_shrunk = 0;
1696 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
1697 				  GFP_KERNEL);
1698 	if (err)
1699 		return err;
1700 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
1701 				  GFP_KERNEL);
1702 	if (err)
1703 		goto err1;
1704 	sbi->s_es_stats.es_stats_scan_time = 0;
1705 	sbi->s_es_stats.es_stats_max_scan_time = 0;
1706 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1707 	if (err)
1708 		goto err2;
1709 	err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
1710 	if (err)
1711 		goto err3;
1712 
1713 	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1714 	sbi->s_es_shrinker.count_objects = ext4_es_count;
1715 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1716 	err = register_shrinker(&sbi->s_es_shrinker);
1717 	if (err)
1718 		goto err4;
1719 
1720 	return 0;
1721 err4:
1722 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1723 err3:
1724 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1725 err2:
1726 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1727 err1:
1728 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1729 	return err;
1730 }
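
/*
 * The error labels above unwind strictly in reverse order of setup, so each
 * label frees only what was initialized before the failure.  A compressed
 * trace, for illustration: if percpu_counter_init() for es_stats_shk_cnt
 * fails, the jump to err3 destroys es_stats_all_cnt, falls through to err2
 * (es_stats_cache_misses) and err1 (es_stats_cache_hits), and the error is
 * returned without touching the never-initialized shk_cnt counter or
 * registering the shrinker.
 */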
1731 
1732 void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1733 {
1734 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
1735 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
1736 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1737 	percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
1738 	unregister_shrinker(&sbi->s_es_shrinker);
1739 }
1740 
1741 /*
1742  * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
1743  * most *nr_to_scan extents, update *nr_to_scan accordingly.
1744  *
1745  * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
1746  * Increment *nr_shrunk by the number of reclaimed extents. Also update
1747  * ei->i_es_shrink_lblk to where we should continue scanning.
1748  */
1749 static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
1750 				 int *nr_to_scan, int *nr_shrunk)
1751 {
1752 	struct inode *inode = &ei->vfs_inode;
1753 	struct ext4_es_tree *tree = &ei->i_es_tree;
1754 	struct extent_status *es;
1755 	struct rb_node *node;
1756 
1757 	es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
1758 	if (!es)
1759 		goto out_wrap;
1760 
1761 	while (*nr_to_scan > 0) {
1762 		if (es->es_lblk > end) {
1763 			ei->i_es_shrink_lblk = end + 1;
1764 			return 0;
1765 		}
1766 
1767 		(*nr_to_scan)--;
1768 		node = rb_next(&es->rb_node);
1769 
1770 		if (ext4_es_must_keep(es))
1771 			goto next;
1772 		if (ext4_es_is_referenced(es)) {
1773 			ext4_es_clear_referenced(es);
1774 			goto next;
1775 		}
1776 
1777 		rb_erase(&es->rb_node, &tree->root);
1778 		ext4_es_free_extent(inode, es);
1779 		(*nr_shrunk)++;
1780 next:
1781 		if (!node)
1782 			goto out_wrap;
1783 		es = rb_entry(node, struct extent_status, rb_node);
1784 	}
1785 	ei->i_es_shrink_lblk = es->es_lblk;
1786 	return 1;
1787 out_wrap:
1788 	ei->i_es_shrink_lblk = 0;
1789 	return 0;
1790 }
1791 
1792 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
1793 {
1794 	struct inode *inode = &ei->vfs_inode;
1795 	int nr_shrunk = 0;
1796 	ext4_lblk_t start = ei->i_es_shrink_lblk;
1797 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1798 				      DEFAULT_RATELIMIT_BURST);
1799 
1800 	if (ei->i_es_shk_nr == 0)
1801 		return 0;
1802 
1803 	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
1804 	    __ratelimit(&_rs))
1805 		ext4_warning(inode->i_sb, "forced shrink of precached extents");
1806 
1807 	if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
1808 	    start != 0)
1809 		es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
1810 
1811 	ei->i_es_tree.cache_es = NULL;
1812 	return nr_shrunk;
1813 }
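
/*
 * A worked example of the wrap-around above (values are hypothetical):
 * suppose a previous shrink pass stopped with i_es_shrink_lblk == 1000.
 * This pass first reclaims from [1000, EXT_MAX_BLOCKS).  If that walk
 * reaches the end of the tree before *nr_to_scan is exhausted,
 * es_do_reclaim_extents() returns 0 and the scan wraps to cover [0, 999]
 * as well, so the whole tree is eventually visited even when individual
 * passes are budget-limited.  Extents that must be kept
 * (ext4_es_must_keep()) are never reclaimed, and recently referenced
 * extents only lose their referenced bit, becoming candidates on a later
 * pass at the earliest.
 */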
1814 
1815 /*
1816  * Called to support EXT4_IOC_CLEAR_ES_CACHE.  We can only remove
1817  * discretionary entries from the extent status cache.  (Some entries
1818  * must be present for proper operation.)
1819  */
1820 void ext4_clear_inode_es(struct inode *inode)
1821 {
1822 	struct ext4_inode_info *ei = EXT4_I(inode);
1823 	struct extent_status *es;
1824 	struct ext4_es_tree *tree;
1825 	struct rb_node *node;
1826 
1827 	write_lock(&ei->i_es_lock);
1828 	tree = &EXT4_I(inode)->i_es_tree;
1829 	tree->cache_es = NULL;
1830 	node = rb_first(&tree->root);
1831 	while (node) {
1832 		es = rb_entry(node, struct extent_status, rb_node);
1833 		node = rb_next(node);
1834 		if (!ext4_es_must_keep(es)) {
1835 			rb_erase(&es->rb_node, &tree->root);
1836 			ext4_es_free_extent(inode, es);
1837 		}
1838 	}
1839 	ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
1840 	write_unlock(&ei->i_es_lock);
1841 }
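
/*
 * A minimal userspace sketch of exercising this path, assuming the
 * EXT4_IOC_CLEAR_ES_CACHE ioctl number is visible to the program (it is
 * not part of the exported uapi headers, so test tools normally carry
 * their own definition):
 *
 *	int fd = open("/mnt/ext4/file", O_RDONLY);
 *
 *	if (fd >= 0 && ioctl(fd, EXT4_IOC_CLEAR_ES_CACHE) != 0)
 *		perror("EXT4_IOC_CLEAR_ES_CACHE");
 *
 * Only discretionary entries are dropped; a later lookup simply
 * repopulates the cache from the on-disk extent tree.
 */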
1842 
1843 #ifdef ES_DEBUG__
1844 static void ext4_print_pending_tree(struct inode *inode)
1845 {
1846 	struct ext4_pending_tree *tree;
1847 	struct rb_node *node;
1848 	struct pending_reservation *pr;
1849 
1850 	printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
1851 	tree = &EXT4_I(inode)->i_pending_tree;
1852 	node = rb_first(&tree->root);
1853 	while (node) {
1854 		pr = rb_entry(node, struct pending_reservation, rb_node);
1855 		printk(KERN_DEBUG " %u", pr->lclu);
1856 		node = rb_next(node);
1857 	}
1858 	printk(KERN_DEBUG "\n");
1859 }
1860 #else
1861 #define ext4_print_pending_tree(inode)
1862 #endif
1863 
1864 int __init ext4_init_pending(void)
1865 {
1866 	ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
1867 					   sizeof(struct pending_reservation),
1868 					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
1869 	if (ext4_pending_cachep == NULL)
1870 		return -ENOMEM;
1871 	return 0;
1872 }
1873 
1874 void ext4_exit_pending(void)
1875 {
1876 	kmem_cache_destroy(ext4_pending_cachep);
1877 }
1878 
1879 void ext4_init_pending_tree(struct ext4_pending_tree *tree)
1880 {
1881 	tree->root = RB_ROOT;
1882 }
1883 
1884 /*
1885  * __get_pending - retrieve a pointer to a pending reservation
1886  *
1887  * @inode - file containing the pending cluster reservation
1888  * @lclu - logical cluster of interest
1889  *
1890  * Returns a pointer to a pending reservation if it's a member of
1891  * the set, and NULL if not.  Must be called holding i_es_lock.
1892  */
1893 static struct pending_reservation *__get_pending(struct inode *inode,
1894 						 ext4_lblk_t lclu)
1895 {
1896 	struct ext4_pending_tree *tree;
1897 	struct rb_node *node;
1898 	struct pending_reservation *pr = NULL;
1899 
1900 	tree = &EXT4_I(inode)->i_pending_tree;
1901 	node = (&tree->root)->rb_node;
1902 
1903 	while (node) {
1904 		pr = rb_entry(node, struct pending_reservation, rb_node);
1905 		if (lclu < pr->lclu)
1906 			node = node->rb_left;
1907 		else if (lclu > pr->lclu)
1908 			node = node->rb_right;
1909 		else if (lclu == pr->lclu)
1910 			return pr;
1911 	}
1912 	return NULL;
1913 }
1914 
1915 /*
1916  * __insert_pending - adds a pending cluster reservation to the set of
1917  *                    pending reservations
1918  *
1919  * @inode - file containing the cluster
1920  * @lblk - logical block in the cluster to be added
1921  * @prealloc - preallocated pending entry
1922  *
1923  * Returns 0 on successful insertion and -ENOMEM on failure.  If the
1924  * pending reservation is already in the set, returns successfully.
1925  */
1926 static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
1927 			    struct pending_reservation **prealloc)
1928 {
1929 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1930 	struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
1931 	struct rb_node **p = &tree->root.rb_node;
1932 	struct rb_node *parent = NULL;
1933 	struct pending_reservation *pr;
1934 	ext4_lblk_t lclu;
1935 	int ret = 0;
1936 
1937 	lclu = EXT4_B2C(sbi, lblk);
1938 	/* search to find parent for insertion */
1939 	while (*p) {
1940 		parent = *p;
1941 		pr = rb_entry(parent, struct pending_reservation, rb_node);
1942 
1943 		if (lclu < pr->lclu) {
1944 			p = &(*p)->rb_left;
1945 		} else if (lclu > pr->lclu) {
1946 			p = &(*p)->rb_right;
1947 		} else {
1948 			/* pending reservation already inserted */
1949 			goto out;
1950 		}
1951 	}
1952 
1953 	if (likely(*prealloc == NULL)) {
1954 		pr = __alloc_pending(false);
1955 		if (!pr) {
1956 			ret = -ENOMEM;
1957 			goto out;
1958 		}
1959 	} else {
1960 		pr = *prealloc;
1961 		*prealloc = NULL;
1962 	}
1963 	pr->lclu = lclu;
1964 
1965 	rb_link_node(&pr->rb_node, parent, p);
1966 	rb_insert_color(&pr->rb_node, &tree->root);
1967 
1968 out:
1969 	return ret;
1970 }
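
/*
 * The **prealloc argument lets a caller allocate the rb-tree node outside
 * i_es_lock and hand it in, so the insert itself cannot fail for lack of
 * memory.  A sketch of the expected calling pattern (mirroring what
 * ext4_es_insert_delayed_block() does further down; inode and lblk are
 * assumed to be in scope):
 *
 *	struct pending_reservation *pr = __alloc_pending(true);
 *	int err;
 *
 *	write_lock(&EXT4_I(inode)->i_es_lock);
 *	err = __insert_pending(inode, lblk, &pr);
 *	write_unlock(&EXT4_I(inode)->i_es_lock);
 *	if (pr)			// insert did not consume the preallocation
 *		__free_pending(pr);
 *
 * When the preallocated entry is consumed, *prealloc is cleared to NULL so
 * the caller knows not to free it.
 */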
1971 
1972 /*
1973  * __remove_pending - removes a pending cluster reservation from the set
1974  *                    of pending reservations
1975  *
1976  * @inode - file containing the cluster
1977  * @lblk - logical block in the pending cluster reservation to be removed
1978  *
1979  * If the pending reservation is not a member of the set, the call is a no-op.
1980  */
1981 static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
1982 {
1983 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1984 	struct pending_reservation *pr;
1985 	struct ext4_pending_tree *tree;
1986 
1987 	pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
1988 	if (pr != NULL) {
1989 		tree = &EXT4_I(inode)->i_pending_tree;
1990 		rb_erase(&pr->rb_node, &tree->root);
1991 		__free_pending(pr);
1992 	}
1993 }
1994 
1995 /*
1996  * ext4_remove_pending - removes a pending cluster reservation from the set
1997  *                       of pending reservations
1998  *
1999  * @inode - file containing the cluster
2000  * @lblk - logical block in the pending cluster reservation to be removed
2001  *
2002  * Locking for external use of __remove_pending.
2003  */
2004 void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
2005 {
2006 	struct ext4_inode_info *ei = EXT4_I(inode);
2007 
2008 	write_lock(&ei->i_es_lock);
2009 	__remove_pending(inode, lblk);
2010 	write_unlock(&ei->i_es_lock);
2011 }
2012 
2013 /*
2014  * ext4_is_pending - determine whether a cluster has a pending reservation
2015  *                   on it
2016  *
2017  * @inode - file containing the cluster
2018  * @lblk - logical block in the cluster
2019  *
2020  * Returns true if there's a pending reservation for the cluster in the
2021  * set of pending reservations, and false if not.
2022  */
2023 bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
2024 {
2025 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2026 	struct ext4_inode_info *ei = EXT4_I(inode);
2027 	bool ret;
2028 
2029 	read_lock(&ei->i_es_lock);
2030 	ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
2031 	read_unlock(&ei->i_es_lock);
2032 
2033 	return ret;
2034 }
2035 
2036 /*
2037  * ext4_es_insert_delayed_block - adds a delayed block to the extents status
2038  *                                tree, adding a pending reservation where
2039  *                                needed
2040  *
2041  * @inode - file containing the newly added block
2042  * @lblk - logical block to be added
2043  * @allocated - indicates whether a physical cluster has been allocated for
2044  *              the logical cluster that contains the block
2045  *
2046  * Returns 0 on success, negative error code on failure.
2047  */
2048 int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
2049 				 bool allocated)
2050 {
2051 	struct extent_status newes;
2052 	int err1 = 0, err2 = 0, err3 = 0;
2053 	struct extent_status *es1 = NULL;
2054 	struct extent_status *es2 = NULL;
2055 	struct pending_reservation *pr = NULL;
2056 
2057 	es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
2058 		 lblk, inode->i_ino);
2059 
2060 	newes.es_lblk = lblk;
2061 	newes.es_len = 1;
2062 	ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
2063 	trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
2064 
2065 	ext4_es_insert_extent_check(inode, &newes);
2066 
2067 retry:
2068 	if (err1 && !es1)
2069 		es1 = __es_alloc_extent(true);
2070 	if ((err1 || err2) && !es2)
2071 		es2 = __es_alloc_extent(true);
2072 	if ((err1 || err2 || err3) && allocated && !pr)
2073 		pr = __alloc_pending(true);
2074 	write_lock(&EXT4_I(inode)->i_es_lock);
2075 
2076 	err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
2077 	if (err1 != 0)
2078 		goto error;
2079 	/* Free preallocated extent if it didn't get used. */
2080 	if (es1) {
2081 		if (!es1->es_len)
2082 			__es_free_extent(es1);
2083 		es1 = NULL;
2084 	}
2085 
2086 	err2 = __es_insert_extent(inode, &newes, es2);
2087 	if (err2 != 0)
2088 		goto error;
2089 	/* Free preallocated extent if it didn't get used. */
2090 	if (es2) {
2091 		if (!es2->es_len)
2092 			__es_free_extent(es2);
2093 		es2 = NULL;
2094 	}
2095 
2096 	if (allocated) {
2097 		err3 = __insert_pending(inode, lblk, &pr);
2098 		if (err3 != 0)
2099 			goto error;
2100 		if (pr) {
2101 			__free_pending(pr);
2102 			pr = NULL;
2103 		}
2104 	}
2105 error:
2106 	write_unlock(&EXT4_I(inode)->i_es_lock);
2107 	if (err1 || err2 || err3)
2108 		goto retry;
2109 
2110 	ext4_es_print_tree(inode);
2111 	ext4_print_pending_tree(inode);
2112 	return 0;
2113 }
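
/*
 * The retry loop above implements a "try, then preallocate what failed"
 * protocol: the first pass takes i_es_lock with no preallocated structures
 * (err1/err2/err3 all start at zero), and if any step returns -ENOMEM the
 * lock is dropped, the memory needed by the failed step and the steps
 * after it is preallocated with the nofail variants
 * (__es_alloc_extent(true), __alloc_pending(true)), and the whole sequence
 * is retried.  Because the retry cannot fail on allocation, the loop
 * terminates after at most one extra pass in practice.
 */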
2114 
2115 /*
2116  * __es_delayed_clu - count number of clusters containing blocks that
2117  *                    are delayed only
2118  *
2119  * @inode - file containing block range
2120  * @start - logical block defining start of range
2121  * @end - logical block defining end of range
2122  *
2123  * Returns the number of clusters containing only delayed (not delayed
2124  * and unwritten) blocks in the range specified by @start and @end.  Any
2125  * cluster or part of a cluster within the range and containing a delayed
2126  * and not unwritten block within the range is counted as a whole cluster.
2127  */
2128 static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
2129 				     ext4_lblk_t end)
2130 {
2131 	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
2132 	struct extent_status *es;
2133 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2134 	struct rb_node *node;
2135 	ext4_lblk_t first_lclu, last_lclu;
2136 	unsigned long long last_counted_lclu;
2137 	unsigned int n = 0;
2138 
2139 	/* guaranteed to be unequal to any ext4_lblk_t value */
2140 	last_counted_lclu = ~0ULL;
2141 
2142 	es = __es_tree_search(&tree->root, start);
2143 
2144 	while (es && (es->es_lblk <= end)) {
2145 		if (ext4_es_is_delonly(es)) {
2146 			if (es->es_lblk <= start)
2147 				first_lclu = EXT4_B2C(sbi, start);
2148 			else
2149 				first_lclu = EXT4_B2C(sbi, es->es_lblk);
2150 
2151 			if (ext4_es_end(es) >= end)
2152 				last_lclu = EXT4_B2C(sbi, end);
2153 			else
2154 				last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
2155 
2156 			if (first_lclu == last_counted_lclu)
2157 				n += last_lclu - first_lclu;
2158 			else
2159 				n += last_lclu - first_lclu + 1;
2160 			last_counted_lclu = last_lclu;
2161 		}
2162 		node = rb_next(&es->rb_node);
2163 		if (!node)
2164 			break;
2165 		es = rb_entry(node, struct extent_status, rb_node);
2166 	}
2167 
2168 	return n;
2169 }
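
/*
 * A worked example, assuming a bigalloc cluster ratio of 16 blocks per
 * cluster: for a query range of [0, 127], a delayed-only extent covering
 * blocks 20..70 maps to clusters 1..4, so n += 4 and last_counted_lclu
 * becomes 4.  If the next delayed-only extent starts at block 75 (also in
 * cluster 4) and ends at block 100 (cluster 6), first_lclu equals
 * last_counted_lclu, so only the two additional clusters (5 and 6) are
 * added.  That is how a cluster shared by adjacent extents avoids being
 * counted twice.
 */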
2170 
2171 /*
2172  * ext4_es_delayed_clu - count number of clusters containing blocks that
2173  *                       are delayed only (delayed and not unwritten)
2174  *
2175  * @inode - file containing block range
2176  * @lblk - logical block defining start of range
2177  * @len - number of blocks in range
2178  *
2179  * Locking for external use of __es_delayed_clu().
2180  */
2181 unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
2182 				 ext4_lblk_t len)
2183 {
2184 	struct ext4_inode_info *ei = EXT4_I(inode);
2185 	ext4_lblk_t end;
2186 	unsigned int n;
2187 
2188 	if (len == 0)
2189 		return 0;
2190 
2191 	end = lblk + len - 1;
2192 	WARN_ON(end < lblk);
2193 
2194 	read_lock(&ei->i_es_lock);
2195 
2196 	n = __es_delayed_clu(inode, lblk, end);
2197 
2198 	read_unlock(&ei->i_es_lock);
2199 
2200 	return n;
2201 }
2202 
2203 /*
2204  * __revise_pending - makes, cancels, or leaves unchanged pending cluster
2205  *                    reservations for a specified block range depending
2206  *                    upon the presence or absence of delayed blocks
2207  *                    outside the range within clusters at the ends of the
2208  *                    range
2209  *
2210  * @inode - file containing the range
2211  * @lblk - logical block defining the start of range
2212  * @len  - length of range in blocks
2213  * @prealloc - preallocated pending entry
2214  *
2215  * Used after a newly allocated extent is added to the extents status tree.
2216  * Requires that the extents in the range have either written or unwritten
2217  * status.  Must be called while holding i_es_lock.
2218  */
2219 static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
2220 			    ext4_lblk_t len,
2221 			    struct pending_reservation **prealloc)
2222 {
2223 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2224 	ext4_lblk_t end = lblk + len - 1;
2225 	ext4_lblk_t first, last;
2226 	bool f_del = false, l_del = false;
2227 	int ret = 0;
2228 
2229 	if (len == 0)
2230 		return 0;
2231 
2232 	/*
2233 	 * Two cases - block range within single cluster and block range
2234 	 * spanning two or more clusters.  Note that a cluster belonging
2235 	 * to a range starting and/or ending on a cluster boundary is treated
2236 	 * as if it does not contain a delayed extent.  The new range may
2237 	 * have allocated space for previously delayed blocks out to the
2238 	 * cluster boundary, requiring that any pre-existing pending
2239 	 * reservation be canceled.  Because this code only looks at blocks
2240 	 * outside the range, it should revise pending reservations
2241 	 * correctly even if the extent represented by the range can't be
2242 	 * inserted in the extents status tree due to ENOSPC.
2243 	 */
2244 
2245 	if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
2246 		first = EXT4_LBLK_CMASK(sbi, lblk);
2247 		if (first != lblk)
2248 			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2249 						first, lblk - 1);
2250 		if (f_del) {
2251 			ret = __insert_pending(inode, first, prealloc);
2252 			if (ret < 0)
2253 				goto out;
2254 		} else {
2255 			last = EXT4_LBLK_CMASK(sbi, end) +
2256 			       sbi->s_cluster_ratio - 1;
2257 			if (last != end)
2258 				l_del = __es_scan_range(inode,
2259 							&ext4_es_is_delonly,
2260 							end + 1, last);
2261 			if (l_del) {
2262 				ret = __insert_pending(inode, last, prealloc);
2263 				if (ret < 0)
2264 					goto out;
2265 			} else
2266 				__remove_pending(inode, last);
2267 		}
2268 	} else {
2269 		first = EXT4_LBLK_CMASK(sbi, lblk);
2270 		if (first != lblk)
2271 			f_del = __es_scan_range(inode, &ext4_es_is_delonly,
2272 						first, lblk - 1);
2273 		if (f_del) {
2274 			ret = __insert_pending(inode, first, prealloc);
2275 			if (ret < 0)
2276 				goto out;
2277 		} else
2278 			__remove_pending(inode, first);
2279 
2280 		last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
2281 		if (last != end)
2282 			l_del = __es_scan_range(inode, &ext4_es_is_delonly,
2283 						end + 1, last);
2284 		if (l_del) {
2285 			ret = __insert_pending(inode, last, prealloc);
2286 			if (ret < 0)
2287 				goto out;
2288 		} else
2289 			__remove_pending(inode, last);
2290 	}
2291 out:
2292 	return ret;
2293 }
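
/*
 * A worked example, assuming a cluster ratio of 4 blocks per cluster: when
 * a written/unwritten extent for blocks [5, 6] is added, both ends fall in
 * cluster 1 (blocks 4..7).  first is 4, so the block range [4, 4] is
 * scanned for delayed-only blocks; if one is found, the cluster keeps (or
 * gains) a pending reservation.  Otherwise last is 7 and [7, 7] is
 * scanned; if that is also free of delayed-only blocks, any pending
 * reservation for the cluster is cancelled, because the newly mapped
 * extent now accounts for the cluster's allocation.  For a range spanning
 * several clusters, the same check is applied independently to the partial
 * cluster at each end of the range.
 */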
2294