1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4   * All Rights Reserved.
5   */
6  #include "xfs.h"
7  #include "xfs_fs.h"
8  #include "xfs_shared.h"
9  #include "xfs_format.h"
10  #include "xfs_log_format.h"
11  #include "xfs_trans_resv.h"
12  #include "xfs_mount.h"
13  #include "xfs_inode.h"
14  #include "xfs_trans.h"
15  #include "xfs_trans_priv.h"
16  #include "xfs_inode_item.h"
17  #include "xfs_quota.h"
18  #include "xfs_trace.h"
19  #include "xfs_icache.h"
20  #include "xfs_bmap_util.h"
21  #include "xfs_dquot_item.h"
22  #include "xfs_dquot.h"
23  #include "xfs_reflink.h"
24  #include "xfs_ialloc.h"
25  #include "xfs_ag.h"
26  
27  #include <linux/iversion.h>
28  
29  /* Radix tree tags for incore inode tree. */
30  
31  /* inode is to be reclaimed */
32  #define XFS_ICI_RECLAIM_TAG	0
33  /* Inode has speculative preallocations (posteof or cow) to clean. */
34  #define XFS_ICI_BLOCKGC_TAG	1
35  
36  /*
37   * The goal for walking incore inodes.  These can correspond with incore inode
38   * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
39   */
40  enum xfs_icwalk_goal {
41  	/* Goals directly associated with tagged inodes. */
42  	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
43  	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
44  };
45  
46  static int xfs_icwalk(struct xfs_mount *mp,
47  		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
48  static int xfs_icwalk_ag(struct xfs_perag *pag,
49  		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
50  
51  /*
52   * Private inode cache walk flags for struct xfs_icwalk.  Must not
53   * coincide with XFS_ICWALK_FLAGS_VALID.
54   */
55  
56  /* Stop scanning after icw_scan_limit inodes. */
57  #define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)
58  
59  #define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
60  #define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */
61  
62  #define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
63  					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
64  					 XFS_ICWALK_FLAG_UNION)
65  
66  /*
67   * Allocate and initialise an xfs_inode.
68   */
69  struct xfs_inode *
70  xfs_inode_alloc(
71  	struct xfs_mount	*mp,
72  	xfs_ino_t		ino)
73  {
74  	struct xfs_inode	*ip;
75  
76  	/*
77  	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
78  	 * and return NULL here on ENOMEM.
79  	 */
80  	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
81  
82  	if (inode_init_always(mp->m_super, VFS_I(ip))) {
83  		kmem_cache_free(xfs_inode_zone, ip);
84  		return NULL;
85  	}
86  
87  	/* VFS doesn't initialise i_mode or i_state! */
88  	VFS_I(ip)->i_mode = 0;
89  	VFS_I(ip)->i_state = 0;
90  
91  	XFS_STATS_INC(mp, vn_active);
92  	ASSERT(atomic_read(&ip->i_pincount) == 0);
93  	ASSERT(ip->i_ino == 0);
94  
95  	/* initialise the xfs inode */
96  	ip->i_ino = ino;
97  	ip->i_mount = mp;
98  	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
99  	ip->i_afp = NULL;
100  	ip->i_cowfp = NULL;
101  	memset(&ip->i_df, 0, sizeof(ip->i_df));
102  	ip->i_flags = 0;
103  	ip->i_delayed_blks = 0;
104  	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
105  	ip->i_nblocks = 0;
106  	ip->i_forkoff = 0;
107  	ip->i_sick = 0;
108  	ip->i_checked = 0;
109  	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
110  	INIT_LIST_HEAD(&ip->i_ioend_list);
111  	spin_lock_init(&ip->i_ioend_lock);
112  
113  	return ip;
114  }
115  
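/*
 * RCU callback to tear down the inode's forks and log item, then return the
 * incore inode to the slab cache.
 */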
116  STATIC void
117  xfs_inode_free_callback(
118  	struct rcu_head		*head)
119  {
120  	struct inode		*inode = container_of(head, struct inode, i_rcu);
121  	struct xfs_inode	*ip = XFS_I(inode);
122  
123  	switch (VFS_I(ip)->i_mode & S_IFMT) {
124  	case S_IFREG:
125  	case S_IFDIR:
126  	case S_IFLNK:
127  		xfs_idestroy_fork(&ip->i_df);
128  		break;
129  	}
130  
131  	if (ip->i_afp) {
132  		xfs_idestroy_fork(ip->i_afp);
133  		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
134  	}
135  	if (ip->i_cowfp) {
136  		xfs_idestroy_fork(ip->i_cowfp);
137  		kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
138  	}
139  	if (ip->i_itemp) {
140  		ASSERT(!test_bit(XFS_LI_IN_AIL,
141  				 &ip->i_itemp->ili_item.li_flags));
142  		xfs_inode_item_destroy(ip);
143  		ip->i_itemp = NULL;
144  	}
145  
146  	kmem_cache_free(xfs_inode_zone, ip);
147  }
148  
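/* Drop the active inode count and defer the actual free to an RCU callback. */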
149  static void
150  __xfs_inode_free(
151  	struct xfs_inode	*ip)
152  {
153  	/* asserts to verify all state is correct here */
154  	ASSERT(atomic_read(&ip->i_pincount) == 0);
155  	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
156  	XFS_STATS_DEC(ip->i_mount, vn_active);
157  
158  	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
159  }
160  
161  void
162  xfs_inode_free(
163  	struct xfs_inode	*ip)
164  {
165  	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
166  
167  	/*
168  	 * Because we use RCU freeing we need to ensure the inode always
169  	 * appears to be reclaimed with an invalid inode number when in the
170  	 * free state. The ip->i_flags_lock provides the barrier against lookup
171  	 * races.
172  	 */
173  	spin_lock(&ip->i_flags_lock);
174  	ip->i_flags = XFS_IRECLAIM;
175  	ip->i_ino = 0;
176  	spin_unlock(&ip->i_flags_lock);
177  
178  	__xfs_inode_free(ip);
179  }
180  
181  /*
182   * Queue background inode reclaim work if there are reclaimable inodes and there
183   * isn't reclaim work already scheduled or in progress.
184   */
185  static void
186  xfs_reclaim_work_queue(
187  	struct xfs_mount        *mp)
188  {
189  
190  	rcu_read_lock();
191  	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
192  		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
193  			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
194  	}
195  	rcu_read_unlock();
196  }
197  
198  /*
199   * Background scanning to trim preallocated space. This is queued based on the
200   * 'speculative_prealloc_lifetime' tunable (5m by default).
201   */
202  static inline void
203  xfs_blockgc_queue(
204  	struct xfs_perag	*pag)
205  {
206  	struct xfs_mount	*mp = pag->pag_mount;
207  
208  	if (!xfs_is_blockgc_enabled(mp))
209  		return;
210  
211  	rcu_read_lock();
212  	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
213  		queue_delayed_work(pag->pag_mount->m_blockgc_wq,
214  				   &pag->pag_blockgc_work,
215  				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
216  	rcu_read_unlock();
217  }
218  
219  /* Set a tag on both the AG incore inode tree and the AG radix tree. */
220  static void
221  xfs_perag_set_inode_tag(
222  	struct xfs_perag	*pag,
223  	xfs_agino_t		agino,
224  	unsigned int		tag)
225  {
226  	struct xfs_mount	*mp = pag->pag_mount;
227  	bool			was_tagged;
228  
229  	lockdep_assert_held(&pag->pag_ici_lock);
230  
231  	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
232  	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
233  
234  	if (tag == XFS_ICI_RECLAIM_TAG)
235  		pag->pag_ici_reclaimable++;
236  
237  	if (was_tagged)
238  		return;
239  
240  	/* propagate the tag up into the perag radix tree */
241  	spin_lock(&mp->m_perag_lock);
242  	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
243  	spin_unlock(&mp->m_perag_lock);
244  
245  	/* start background work */
246  	switch (tag) {
247  	case XFS_ICI_RECLAIM_TAG:
248  		xfs_reclaim_work_queue(mp);
249  		break;
250  	case XFS_ICI_BLOCKGC_TAG:
251  		xfs_blockgc_queue(pag);
252  		break;
253  	}
254  
255  	trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
256  }
257  
258  /* Clear a tag on both the AG incore inode tree and the AG radix tree. */
259  static void
260  xfs_perag_clear_inode_tag(
261  	struct xfs_perag	*pag,
262  	xfs_agino_t		agino,
263  	unsigned int		tag)
264  {
265  	struct xfs_mount	*mp = pag->pag_mount;
266  
267  	lockdep_assert_held(&pag->pag_ici_lock);
268  
269  	/*
270  	 * Reclaim can signal (with a null agino) that it cleared its own tag
271  	 * by removing the inode from the radix tree.
272  	 */
273  	if (agino != NULLAGINO)
274  		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
275  	else
276  		ASSERT(tag == XFS_ICI_RECLAIM_TAG);
277  
278  	if (tag == XFS_ICI_RECLAIM_TAG)
279  		pag->pag_ici_reclaimable--;
280  
281  	if (radix_tree_tagged(&pag->pag_ici_root, tag))
282  		return;
283  
284  	/* clear the tag from the perag radix tree */
285  	spin_lock(&mp->m_perag_lock);
286  	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
287  	spin_unlock(&mp->m_perag_lock);
288  
289  	trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
290  }
291  
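/* Wait for the XFS_INEW flag to be cleared once inode setup completes. */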
292  static inline void
293  xfs_inew_wait(
294  	struct xfs_inode	*ip)
295  {
296  	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
297  	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
298  
299  	do {
300  		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
301  		if (!xfs_iflags_test(ip, XFS_INEW))
302  			break;
303  		schedule();
304  	} while (true);
305  	finish_wait(wq, &wait.wq_entry);
306  }
307  
308  /*
309   * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
310   * part of the structure. This is made more complex by the fact we store
311   * information about the on-disk values in the VFS inode and so we can't just
312   * overwrite the values unconditionally. Hence we save the parameters we
313   * need to retain across reinitialisation, and rewrite them into the VFS inode
314   * after reinitialisation even if it fails.
315   */
316  static int
317  xfs_reinit_inode(
318  	struct xfs_mount	*mp,
319  	struct inode		*inode)
320  {
321  	int			error;
322  	uint32_t		nlink = inode->i_nlink;
323  	uint32_t		generation = inode->i_generation;
324  	uint64_t		version = inode_peek_iversion(inode);
325  	umode_t			mode = inode->i_mode;
326  	dev_t			dev = inode->i_rdev;
327  	kuid_t			uid = inode->i_uid;
328  	kgid_t			gid = inode->i_gid;
329  
330  	error = inode_init_always(mp->m_super, inode);
331  
332  	set_nlink(inode, nlink);
333  	inode->i_generation = generation;
334  	inode_set_iversion_queried(inode, version);
335  	inode->i_mode = mode;
336  	inode->i_rdev = dev;
337  	inode->i_uid = uid;
338  	inode->i_gid = gid;
339  	return error;
340  }
341  
342  /*
343   * Carefully nudge an inode whose VFS state has been torn down back into a
344   * usable state.  Drops the i_flags_lock and the rcu read lock.
345   */
346  static int
347  xfs_iget_recycle(
348  	struct xfs_perag	*pag,
349  	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
350  {
351  	struct xfs_mount	*mp = ip->i_mount;
352  	struct inode		*inode = VFS_I(ip);
353  	int			error;
354  
355  	trace_xfs_iget_recycle(ip);
356  
357  	/*
358  	 * We need to make it look like the inode is being reclaimed to prevent
359  	 * the actual reclaim workers from stomping over us while we recycle
360  	 * the inode.  We can't clear the radix tree tag yet as it requires
361  	 * pag_ici_lock to be held exclusive.
362  	 */
363  	ip->i_flags |= XFS_IRECLAIM;
364  
365  	spin_unlock(&ip->i_flags_lock);
366  	rcu_read_unlock();
367  
368  	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
369  	error = xfs_reinit_inode(mp, inode);
370  	if (error) {
371  		bool	wake;
372  
373  		/*
374  		 * Re-initializing the inode failed, and we are in deep
375  		 * trouble.  Try to re-add it to the reclaim list.
376  		 */
377  		rcu_read_lock();
378  		spin_lock(&ip->i_flags_lock);
379  		wake = !!__xfs_iflags_test(ip, XFS_INEW);
380  		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
381  		if (wake)
382  			wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
383  		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
384  		spin_unlock(&ip->i_flags_lock);
385  		rcu_read_unlock();
386  
387  		trace_xfs_iget_recycle_fail(ip);
388  		return error;
389  	}
390  
391  	spin_lock(&pag->pag_ici_lock);
392  	spin_lock(&ip->i_flags_lock);
393  
394  	/*
395  	 * Clear the per-lifetime state in the inode as we are now effectively
396  	 * a new inode and need to return to the initial state before reuse
397  	 * occurs.
398  	 */
399  	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
400  	ip->i_flags |= XFS_INEW;
401  	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
402  			XFS_ICI_RECLAIM_TAG);
403  	inode->i_state = I_NEW;
404  	spin_unlock(&ip->i_flags_lock);
405  	spin_unlock(&pag->pag_ici_lock);
406  
407  	return 0;
408  }
409  
410  /*
411   * If we are allocating a new inode, then check what was returned is
412   * actually a free, empty inode. If we are not allocating an inode,
413   * then check we didn't find a free inode.
414   *
415   * Returns:
416   *	0		if the inode free state matches the lookup context
417   *	-ENOENT		if the inode is free and we are not allocating
418   *	-EFSCORRUPTED	if there is any state mismatch at all
419   */
420  static int
421  xfs_iget_check_free_state(
422  	struct xfs_inode	*ip,
423  	int			flags)
424  {
425  	if (flags & XFS_IGET_CREATE) {
426  		/* should be a free inode */
427  		if (VFS_I(ip)->i_mode != 0) {
428  			xfs_warn(ip->i_mount,
429  "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
430  				ip->i_ino, VFS_I(ip)->i_mode);
431  			return -EFSCORRUPTED;
432  		}
433  
434  		if (ip->i_nblocks != 0) {
435  			xfs_warn(ip->i_mount,
436  "Corruption detected! Free inode 0x%llx has blocks allocated!",
437  				ip->i_ino);
438  			return -EFSCORRUPTED;
439  		}
440  		return 0;
441  	}
442  
443  	/* should be an allocated inode */
444  	if (VFS_I(ip)->i_mode == 0)
445  		return -ENOENT;
446  
447  	return 0;
448  }
449  
450  /* Make all pending inactivation work start immediately. */
451  static bool
452  xfs_inodegc_queue_all(
453  	struct xfs_mount	*mp)
454  {
455  	struct xfs_inodegc	*gc;
456  	int			cpu;
457  	bool			ret = false;
458  
459  	for_each_online_cpu(cpu) {
460  		gc = per_cpu_ptr(mp->m_inodegc, cpu);
461  		if (!llist_empty(&gc->list)) {
462  			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
463  			ret = true;
464  		}
465  	}
466  
467  	return ret;
468  }
469  
470  /*
471   * Check the validity of the inode we just found in the cache
472   */
473  static int
474  xfs_iget_cache_hit(
475  	struct xfs_perag	*pag,
476  	struct xfs_inode	*ip,
477  	xfs_ino_t		ino,
478  	int			flags,
479  	int			lock_flags) __releases(RCU)
480  {
481  	struct inode		*inode = VFS_I(ip);
482  	struct xfs_mount	*mp = ip->i_mount;
483  	int			error;
484  
485  	/*
486  	 * check for re-use of an inode within an RCU grace period due to the
487  	 * radix tree nodes not being updated yet. We monitor for this by
488  	 * setting the inode number to zero before freeing the inode structure.
489  	 * If the inode has been reallocated and set up, then the inode number
490  	 * will not match, so check for that, too.
491  	 */
492  	spin_lock(&ip->i_flags_lock);
493  	if (ip->i_ino != ino)
494  		goto out_skip;
495  
496  	/*
497  	 * If we are racing with another cache hit that is currently
498  	 * instantiating this inode or currently recycling it out of
499  	 * reclaimable state, wait for the initialisation to complete
500  	 * before continuing.
501  	 *
502  	 * If we're racing with the inactivation worker we also want to wait.
503  	 * If we're creating a new file, it's possible that the worker
504  	 * previously marked the inode as free on disk but hasn't finished
505  	 * updating the incore state yet.  The AGI buffer will be dirty and
506  	 * locked to the icreate transaction, so a synchronous push of the
507  	 * inodegc workers would result in deadlock.  For a regular iget, the
508  	 * worker is running already, so we might as well wait.
509  	 *
510  	 * XXX(hch): eventually we should do something equivalent to
511  	 *	     wait_on_inode to wait for these flags to be cleared
512  	 *	     instead of polling for it.
513  	 */
514  	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
515  		goto out_skip;
516  
517  	if (ip->i_flags & XFS_NEED_INACTIVE) {
518  		/* Unlinked inodes cannot be re-grabbed. */
519  		if (VFS_I(ip)->i_nlink == 0) {
520  			error = -ENOENT;
521  			goto out_error;
522  		}
523  		goto out_inodegc_flush;
524  	}
525  
526  	/*
527  	 * Check the inode free state is valid. This also detects lookup
528  	 * racing with unlinks.
529  	 */
530  	error = xfs_iget_check_free_state(ip, flags);
531  	if (error)
532  		goto out_error;
533  
534  	/* Skip inodes that have no vfs state. */
535  	if ((flags & XFS_IGET_INCORE) &&
536  	    (ip->i_flags & XFS_IRECLAIMABLE))
537  		goto out_skip;
538  
539  	/* The inode fits the selection criteria; process it. */
540  	if (ip->i_flags & XFS_IRECLAIMABLE) {
541  		/* Drops i_flags_lock and RCU read lock. */
542  		error = xfs_iget_recycle(pag, ip);
543  		if (error)
544  			return error;
545  	} else {
546  		/* If the VFS inode is being torn down, pause and try again. */
547  		if (!igrab(inode))
548  			goto out_skip;
549  
550  		/* We've got a live one. */
551  		spin_unlock(&ip->i_flags_lock);
552  		rcu_read_unlock();
553  		trace_xfs_iget_hit(ip);
554  	}
555  
556  	if (lock_flags != 0)
557  		xfs_ilock(ip, lock_flags);
558  
559  	if (!(flags & XFS_IGET_INCORE))
560  		xfs_iflags_clear(ip, XFS_ISTALE);
561  	XFS_STATS_INC(mp, xs_ig_found);
562  
563  	return 0;
564  
565  out_skip:
566  	trace_xfs_iget_skip(ip);
567  	XFS_STATS_INC(mp, xs_ig_frecycle);
568  	error = -EAGAIN;
569  out_error:
570  	spin_unlock(&ip->i_flags_lock);
571  	rcu_read_unlock();
572  	return error;
573  
574  out_inodegc_flush:
575  	spin_unlock(&ip->i_flags_lock);
576  	rcu_read_unlock();
577  	/*
578  	 * Do not wait for the workers, because the caller could hold an AGI
579  	 * buffer lock.  We're just going to sleep in a loop anyway.
580  	 */
581  	if (xfs_is_inodegc_enabled(mp))
582  		xfs_inodegc_queue_all(mp);
583  	return -EAGAIN;
584  }
585  
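/*
 * Cache miss: allocate a new incore inode, read it from disk if required, and
 * insert it into the per-AG radix tree.
 */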
586  static int
587  xfs_iget_cache_miss(
588  	struct xfs_mount	*mp,
589  	struct xfs_perag	*pag,
590  	xfs_trans_t		*tp,
591  	xfs_ino_t		ino,
592  	struct xfs_inode	**ipp,
593  	int			flags,
594  	int			lock_flags)
595  {
596  	struct xfs_inode	*ip;
597  	int			error;
598  	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
599  	int			iflags;
600  
601  	ip = xfs_inode_alloc(mp, ino);
602  	if (!ip)
603  		return -ENOMEM;
604  
605  	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
606  	if (error)
607  		goto out_destroy;
608  
609  	/*
610  	 * For version 5 superblocks, if we are initialising a new inode and we
611  	 * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
612  	 * simply build the new inode core with a random generation number.
613  	 *
614  	 * For version 4 (and older) superblocks, log recovery is dependent on
615  	 * the i_flushiter field being initialised from the current on-disk
616  	 * value and hence we must also read the inode off disk even when
617  	 * initializing new inodes.
618  	 */
619  	if (xfs_has_v3inodes(mp) &&
620  	    (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
621  		VFS_I(ip)->i_generation = prandom_u32();
622  	} else {
623  		struct xfs_buf		*bp;
624  
625  		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
626  		if (error)
627  			goto out_destroy;
628  
629  		error = xfs_inode_from_disk(ip,
630  				xfs_buf_offset(bp, ip->i_imap.im_boffset));
631  		if (!error)
632  			xfs_buf_set_ref(bp, XFS_INO_REF);
633  		xfs_trans_brelse(tp, bp);
634  
635  		if (error)
636  			goto out_destroy;
637  	}
638  
639  	trace_xfs_iget_miss(ip);
640  
641  	/*
642  	 * Check the inode free state is valid. This also detects lookup
643  	 * racing with unlinks.
644  	 */
645  	error = xfs_iget_check_free_state(ip, flags);
646  	if (error)
647  		goto out_destroy;
648  
649  	/*
650  	 * Preload the radix tree so we can insert safely under the
651  	 * write spinlock. Note that we cannot sleep inside the preload
652  	 * region. Since we can be called from transaction context, don't
653  	 * recurse into the file system.
654  	 */
655  	if (radix_tree_preload(GFP_NOFS)) {
656  		error = -EAGAIN;
657  		goto out_destroy;
658  	}
659  
660  	/*
661  	 * Because the inode hasn't been added to the radix-tree yet it can't
662  	 * be found by another thread, so we can do the non-sleeping lock here.
663  	 */
664  	if (lock_flags) {
665  		if (!xfs_ilock_nowait(ip, lock_flags))
666  			BUG();
667  	}
668  
669  	/*
670  	 * These values must be set before inserting the inode into the radix
671  	 * tree as the moment it is inserted a concurrent lookup (allowed by the
672  	 * RCU locking mechanism) can find it and that lookup must see that this
673  	 * is an inode currently under construction (i.e. that XFS_INEW is set).
674  	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
675  	 * memory barrier that ensures this detection works correctly at lookup
676  	 * time.
677  	 */
678  	iflags = XFS_INEW;
679  	if (flags & XFS_IGET_DONTCACHE)
680  		d_mark_dontcache(VFS_I(ip));
681  	ip->i_udquot = NULL;
682  	ip->i_gdquot = NULL;
683  	ip->i_pdquot = NULL;
684  	xfs_iflags_set(ip, iflags);
685  
686  	/* insert the new inode */
687  	spin_lock(&pag->pag_ici_lock);
688  	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
689  	if (unlikely(error)) {
690  		WARN_ON(error != -EEXIST);
691  		XFS_STATS_INC(mp, xs_ig_dup);
692  		error = -EAGAIN;
693  		goto out_preload_end;
694  	}
695  	spin_unlock(&pag->pag_ici_lock);
696  	radix_tree_preload_end();
697  
698  	*ipp = ip;
699  	return 0;
700  
701  out_preload_end:
702  	spin_unlock(&pag->pag_ici_lock);
703  	radix_tree_preload_end();
704  	if (lock_flags)
705  		xfs_iunlock(ip, lock_flags);
706  out_destroy:
707  	__destroy_inode(VFS_I(ip));
708  	xfs_inode_free(ip);
709  	return error;
710  }
711  
712  /*
713   * Look up an inode by number in the given file system.  The inode is looked up
714   * in the cache held in each AG.  If the inode is found in the cache, initialise
715   * the vfs inode if necessary.
716   *
717   * If it is not in core, read it in from the file system's device, add it to the
718   * cache and initialise the vfs inode.
719   *
720   * The inode is locked according to the value of the lock_flags parameter.
721   * Inode lookup is only done during metadata operations and not as part of the
722   * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
723   */
724  int
725  xfs_iget(
726  	struct xfs_mount	*mp,
727  	struct xfs_trans	*tp,
728  	xfs_ino_t		ino,
729  	uint			flags,
730  	uint			lock_flags,
731  	struct xfs_inode	**ipp)
732  {
733  	struct xfs_inode	*ip;
734  	struct xfs_perag	*pag;
735  	xfs_agino_t		agino;
736  	int			error;
737  
738  	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
739  
740  	/* reject inode numbers outside existing AGs */
741  	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
742  		return -EINVAL;
743  
744  	XFS_STATS_INC(mp, xs_ig_attempts);
745  
746  	/* get the perag structure and ensure that it's inode capable */
747  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
748  	agino = XFS_INO_TO_AGINO(mp, ino);
749  
750  again:
751  	error = 0;
752  	rcu_read_lock();
753  	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
754  
755  	if (ip) {
756  		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
757  		if (error)
758  			goto out_error_or_again;
759  	} else {
760  		rcu_read_unlock();
761  		if (flags & XFS_IGET_INCORE) {
762  			error = -ENODATA;
763  			goto out_error_or_again;
764  		}
765  		XFS_STATS_INC(mp, xs_ig_missed);
766  
767  		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
768  							flags, lock_flags);
769  		if (error)
770  			goto out_error_or_again;
771  	}
772  	xfs_perag_put(pag);
773  
774  	*ipp = ip;
775  
776  	/*
777  	 * If we have a real type for an on-disk inode, we can setup the inode
778  	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
779  	 */
780  	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
781  		xfs_setup_existing_inode(ip);
782  	return 0;
783  
784  out_error_or_again:
785  	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
786  		delay(1);
787  		goto again;
788  	}
789  	xfs_perag_put(pag);
790  	return error;
791  }
792  
793  /*
794   * "Is this a cached inode that's also allocated?"
795   *
796   * Look up an inode by number in the given file system.  If the inode is
797   * in cache and isn't in purgatory, return 1 if the inode is allocated
798   * and 0 if it is not.  For all other cases (not in cache, being torn
799   * down, etc.), return a negative error code.
800   *
801   * The caller has to prevent inode allocation and freeing activity,
802   * presumably by locking the AGI buffer.   This is to ensure that an
803   * inode cannot transition from allocated to freed until the caller is
804   * ready to allow that.  If the inode is in an intermediate state (new,
805   * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
806   * inode is not in the cache, -ENOENT will be returned.  The caller must
807   * deal with these scenarios appropriately.
808   *
809   * This is a specialized use case for the online scrubber; if you're
810   * reading this, you probably want xfs_iget.
811   */
812  int
813  xfs_icache_inode_is_allocated(
814  	struct xfs_mount	*mp,
815  	struct xfs_trans	*tp,
816  	xfs_ino_t		ino,
817  	bool			*inuse)
818  {
819  	struct xfs_inode	*ip;
820  	int			error;
821  
822  	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
823  	if (error)
824  		return error;
825  
826  	*inuse = !!(VFS_I(ip)->i_mode);
827  	xfs_irele(ip);
828  	return 0;
829  }
830  
831  /*
832   * Grab the inode for reclaim exclusively.
833   *
834   * We have found this inode via a lookup under RCU, so the inode may have
835   * already been freed, or it may be in the process of being recycled by
836   * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
837   * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
838   * will not be set. Hence we need to check for both these flag conditions to
839   * avoid inodes that are no longer reclaim candidates.
840   *
841   * Note: checking for other state flags here, under the i_flags_lock or not, is
842   * racy and should be avoided. Those races should be resolved only after we have
843   * ensured that we are able to reclaim this inode and the world can see that we
844   * are going to reclaim it.
845   *
846   * Return true if we grabbed it, false otherwise.
847   */
848  static bool
849  xfs_reclaim_igrab(
850  	struct xfs_inode	*ip,
851  	struct xfs_icwalk	*icw)
852  {
853  	ASSERT(rcu_read_lock_held());
854  
855  	spin_lock(&ip->i_flags_lock);
856  	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
857  	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
858  		/* not a reclaim candidate. */
859  		spin_unlock(&ip->i_flags_lock);
860  		return false;
861  	}
862  
863  	/* Don't reclaim a sick inode unless the caller asked for it. */
864  	if (ip->i_sick &&
865  	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
866  		spin_unlock(&ip->i_flags_lock);
867  		return false;
868  	}
869  
870  	__xfs_iflags_set(ip, XFS_IRECLAIM);
871  	spin_unlock(&ip->i_flags_lock);
872  	return true;
873  }
874  
875  /*
876   * Inode reclaim is non-blocking, so the default action if progress cannot be
877   * made is to "requeue" the inode for reclaim by unlocking it and clearing the
878   * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
879   * blocking anymore and hence we can wait for the inode to be able to reclaim
880   * it.
881   *
882   * We do no IO here - if callers require inodes to be cleaned they must push the
883   * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
884   * done in the background in a non-blocking manner, and enables memory reclaim
885   * to make progress without blocking.
886   */
887  static void
888  xfs_reclaim_inode(
889  	struct xfs_inode	*ip,
890  	struct xfs_perag	*pag)
891  {
892  	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
893  
894  	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
895  		goto out;
896  	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
897  		goto out_iunlock;
898  
899  	if (xfs_is_shutdown(ip->i_mount)) {
900  		xfs_iunpin_wait(ip);
901  		xfs_iflush_abort(ip);
902  		goto reclaim;
903  	}
904  	if (xfs_ipincount(ip))
905  		goto out_clear_flush;
906  	if (!xfs_inode_clean(ip))
907  		goto out_clear_flush;
908  
909  	xfs_iflags_clear(ip, XFS_IFLUSHING);
910  reclaim:
911  	trace_xfs_inode_reclaiming(ip);
912  
913  	/*
914  	 * Because we use RCU freeing we need to ensure the inode always appears
915  	 * to be reclaimed with an invalid inode number when in the free state.
916  	 * We do this as early as possible under the ILOCK so that
917  	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
918  	 * detect races with us here. By doing this, we guarantee that once
919  	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
920  	 * it will see either a valid inode that will serialise correctly, or it
921  	 * will see an invalid inode that it can skip.
922  	 */
923  	spin_lock(&ip->i_flags_lock);
924  	ip->i_flags = XFS_IRECLAIM;
925  	ip->i_ino = 0;
926  	ip->i_sick = 0;
927  	ip->i_checked = 0;
928  	spin_unlock(&ip->i_flags_lock);
929  
930  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
931  
932  	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
933  	/*
934  	 * Remove the inode from the per-AG radix tree.
935  	 *
936  	 * Because radix_tree_delete won't complain even if the item was never
937  	 * added to the tree, assert that it's been there before to catch
938  	 * problems with the inode life time early on.
939  	 */
940  	spin_lock(&pag->pag_ici_lock);
941  	if (!radix_tree_delete(&pag->pag_ici_root,
942  				XFS_INO_TO_AGINO(ip->i_mount, ino)))
943  		ASSERT(0);
944  	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
945  	spin_unlock(&pag->pag_ici_lock);
946  
947  	/*
948  	 * Here we do an (almost) spurious inode lock in order to coordinate
949  	 * with inode cache radix tree lookups.  This is because the lookup
950  	 * can reference the inodes in the cache without taking references.
951  	 *
952  	 * We make that OK here by ensuring that we wait until the inode is
953  	 * unlocked after the lookup before we go ahead and free it.
954  	 */
955  	xfs_ilock(ip, XFS_ILOCK_EXCL);
956  	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
957  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
958  	ASSERT(xfs_inode_clean(ip));
959  
960  	__xfs_inode_free(ip);
961  	return;
962  
963  out_clear_flush:
964  	xfs_iflags_clear(ip, XFS_IFLUSHING);
965  out_iunlock:
966  	xfs_iunlock(ip, XFS_ILOCK_EXCL);
967  out:
968  	xfs_iflags_clear(ip, XFS_IRECLAIM);
969  }
970  
971  /* Reclaim sick inodes if we're unmounting or the fs went down. */
972  static inline bool
973  xfs_want_reclaim_sick(
974  	struct xfs_mount	*mp)
975  {
976  	return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
977  	       xfs_is_shutdown(mp);
978  }
979  
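/*
 * Reclaim all reclaimable inodes, pushing the AIL first so that dirty inodes
 * can be written back and then reclaimed.
 */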
980  void
981  xfs_reclaim_inodes(
982  	struct xfs_mount	*mp)
983  {
984  	struct xfs_icwalk	icw = {
985  		.icw_flags	= 0,
986  	};
987  
988  	if (xfs_want_reclaim_sick(mp))
989  		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
990  
991  	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
992  		xfs_ail_push_all_sync(mp->m_ail);
993  		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
994  	}
995  }
996  
997  /*
998   * The shrinker infrastructure determines how many inodes we should scan for
999   * reclaim. We want as many clean inodes ready to reclaim as possible, so we
1000   * push the AIL here. We also want to proactively free up memory if we can to
1001   * minimise the amount of work memory reclaim has to do so we kick the
1002   * background reclaim if it isn't already scheduled.
1003   */
1004  long
1005  xfs_reclaim_inodes_nr(
1006  	struct xfs_mount	*mp,
1007  	unsigned long		nr_to_scan)
1008  {
1009  	struct xfs_icwalk	icw = {
1010  		.icw_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
1011  		.icw_scan_limit	= min_t(unsigned long, LONG_MAX, nr_to_scan),
1012  	};
1013  
1014  	if (xfs_want_reclaim_sick(mp))
1015  		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
1016  
1017  	/* kick background reclaimer and push the AIL */
1018  	xfs_reclaim_work_queue(mp);
1019  	xfs_ail_push_all(mp->m_ail);
1020  
1021  	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
1022  	return 0;
1023  }
1024  
1025  /*
1026   * Return the number of reclaimable inodes in the filesystem for
1027   * the shrinker to determine how much to reclaim.
1028   */
1029  long
1030  xfs_reclaim_inodes_count(
1031  	struct xfs_mount	*mp)
1032  {
1033  	struct xfs_perag	*pag;
1034  	xfs_agnumber_t		ag = 0;
1035  	long			reclaimable = 0;
1036  
1037  	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1038  		ag = pag->pag_agno + 1;
1039  		reclaimable += pag->pag_ici_reclaimable;
1040  		xfs_perag_put(pag);
1041  	}
1042  	return reclaimable;
1043  }
1044  
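/*
 * An intersection-based inode filtering algorithm: the inode matches only if
 * every id criterion set in @icw matches.
 */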
1045  STATIC bool
1046  xfs_icwalk_match_id(
1047  	struct xfs_inode	*ip,
1048  	struct xfs_icwalk	*icw)
1049  {
1050  	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1051  	    !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1052  		return false;
1053  
1054  	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1055  	    !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1056  		return false;
1057  
1058  	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1059  	    ip->i_projid != icw->icw_prid)
1060  		return false;
1061  
1062  	return true;
1063  }
1064  
1065  /*
1066   * A union-based inode filtering algorithm. Process the inode if any of the
1067   * criteria match. This is for global/internal scans only.
1068   */
1069  STATIC bool
1070  xfs_icwalk_match_id_union(
1071  	struct xfs_inode	*ip,
1072  	struct xfs_icwalk	*icw)
1073  {
1074  	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
1075  	    uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
1076  		return true;
1077  
1078  	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
1079  	    gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
1080  		return true;
1081  
1082  	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
1083  	    ip->i_projid == icw->icw_prid)
1084  		return true;
1085  
1086  	return false;
1087  }
1088  
1089  /*
1090   * Is this inode @ip eligible for eof/cow block reclamation, given some
1091   * filtering parameters @icw?  The inode is eligible if @icw is null or
1092   * if the predicate functions match.
1093   */
1094  static bool
1095  xfs_icwalk_match(
1096  	struct xfs_inode	*ip,
1097  	struct xfs_icwalk	*icw)
1098  {
1099  	bool			match;
1100  
1101  	if (!icw)
1102  		return true;
1103  
1104  	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
1105  		match = xfs_icwalk_match_id_union(ip, icw);
1106  	else
1107  		match = xfs_icwalk_match_id(ip, icw);
1108  	if (!match)
1109  		return false;
1110  
1111  	/* skip the inode if the file size is too small */
1112  	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
1113  	    XFS_ISIZE(ip) < icw->icw_min_file_size)
1114  		return false;
1115  
1116  	return true;
1117  }
1118  
1119  /*
1120   * This is a fast pass over the inode cache to try to get reclaim moving on as
1121   * many inodes as possible in a short period of time. It kicks itself every few
1122   * seconds, as well as being kicked by the inode cache shrinker when memory
1123   * goes low.
1124   */
1125  void
1126  xfs_reclaim_worker(
1127  	struct work_struct *work)
1128  {
1129  	struct xfs_mount *mp = container_of(to_delayed_work(work),
1130  					struct xfs_mount, m_reclaim_work);
1131  
1132  	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
1133  	xfs_reclaim_work_queue(mp);
1134  }
1135  
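/*
 * Attempt to free post-EOF speculative preallocations on this inode.  Returns
 * -EAGAIN if a synchronous caller needs to retry because a lock was not
 * available.
 */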
1136  STATIC int
1137  xfs_inode_free_eofblocks(
1138  	struct xfs_inode	*ip,
1139  	struct xfs_icwalk	*icw,
1140  	unsigned int		*lockflags)
1141  {
1142  	bool			wait;
1143  
1144  	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1145  
1146  	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
1147  		return 0;
1148  
1149  	/*
1150  	 * If the mapping is dirty the operation can block and wait for some
1151  	 * time. Unless we are waiting, skip it.
1152  	 */
1153  	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1154  		return 0;
1155  
1156  	if (!xfs_icwalk_match(ip, icw))
1157  		return 0;
1158  
1159  	/*
1160  	 * If the caller is waiting, return -EAGAIN to keep the background
1161  	 * scanner moving and revisit the inode in a subsequent pass.
1162  	 */
1163  	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1164  		if (wait)
1165  			return -EAGAIN;
1166  		return 0;
1167  	}
1168  	*lockflags |= XFS_IOLOCK_EXCL;
1169  
1170  	if (xfs_can_free_eofblocks(ip, false))
1171  		return xfs_free_eofblocks(ip);
1172  
1173  	/* inode could be preallocated or append-only */
1174  	trace_xfs_inode_free_eofblocks_invalid(ip);
1175  	xfs_inode_clear_eofblocks_tag(ip);
1176  	return 0;
1177  }
1178  
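/*
 * Set a block gc inode flag and tag the inode in the per-AG tree so that the
 * background blockgc worker will find it.
 */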
1179  static void
1180  xfs_blockgc_set_iflag(
1181  	struct xfs_inode	*ip,
1182  	unsigned long		iflag)
1183  {
1184  	struct xfs_mount	*mp = ip->i_mount;
1185  	struct xfs_perag	*pag;
1186  
1187  	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1188  
1189  	/*
1190  	 * Don't bother locking the AG and looking up in the radix trees
1191  	 * if we already know that we have the tag set.
1192  	 */
1193  	if (ip->i_flags & iflag)
1194  		return;
1195  	spin_lock(&ip->i_flags_lock);
1196  	ip->i_flags |= iflag;
1197  	spin_unlock(&ip->i_flags_lock);
1198  
1199  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1200  	spin_lock(&pag->pag_ici_lock);
1201  
1202  	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1203  			XFS_ICI_BLOCKGC_TAG);
1204  
1205  	spin_unlock(&pag->pag_ici_lock);
1206  	xfs_perag_put(pag);
1207  }
1208  
1209  void
1210  xfs_inode_set_eofblocks_tag(
1211  	xfs_inode_t	*ip)
1212  {
1213  	trace_xfs_inode_set_eofblocks_tag(ip);
1214  	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
1215  }
1216  
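/*
 * Clear a block gc inode flag; if no eof/cow flags remain, also clear the
 * per-AG blockgc tag.
 */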
1217  static void
1218  xfs_blockgc_clear_iflag(
1219  	struct xfs_inode	*ip,
1220  	unsigned long		iflag)
1221  {
1222  	struct xfs_mount	*mp = ip->i_mount;
1223  	struct xfs_perag	*pag;
1224  	bool			clear_tag;
1225  
1226  	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1227  
1228  	spin_lock(&ip->i_flags_lock);
1229  	ip->i_flags &= ~iflag;
1230  	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
1231  	spin_unlock(&ip->i_flags_lock);
1232  
1233  	if (!clear_tag)
1234  		return;
1235  
1236  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1237  	spin_lock(&pag->pag_ici_lock);
1238  
1239  	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1240  			XFS_ICI_BLOCKGC_TAG);
1241  
1242  	spin_unlock(&pag->pag_ici_lock);
1243  	xfs_perag_put(pag);
1244  }
1245  
1246  void
1247  xfs_inode_clear_eofblocks_tag(
1248  	xfs_inode_t	*ip)
1249  {
1250  	trace_xfs_inode_clear_eofblocks_tag(ip);
1251  	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
1252  }
1253  
1254  /*
1255   * Set ourselves up to free CoW blocks from this file.  If it's already clean
1256   * then we can bail out quickly, but otherwise we must back off if the file
1257   * is undergoing some kind of write.
1258   */
1259  static bool
1260  xfs_prep_free_cowblocks(
1261  	struct xfs_inode	*ip)
1262  {
1263  	/*
1264  	 * Just clear the tag if we have an empty cow fork or none at all. It's
1265  	 * possible the inode was fully unshared since it was originally tagged.
1266  	 */
1267  	if (!xfs_inode_has_cow_data(ip)) {
1268  		trace_xfs_inode_free_cowblocks_invalid(ip);
1269  		xfs_inode_clear_cowblocks_tag(ip);
1270  		return false;
1271  	}
1272  
1273  	/*
1274  	 * If the mapping is dirty or under writeback we cannot touch the
1275  	 * CoW fork.  Leave it alone if we're in the midst of a directio.
1276  	 */
1277  	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
1278  	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
1279  	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
1280  	    atomic_read(&VFS_I(ip)->i_dio_count))
1281  		return false;
1282  
1283  	return true;
1284  }
1285  
1286  /*
1287   * Automatic CoW Reservation Freeing
1288   *
1289   * These functions automatically garbage collect leftover CoW reservations
1290   * that were made on behalf of a cowextsize hint when we start to run out
1291   * of quota or when the reservations sit around for too long.  If the file
1292   * has dirty pages or is undergoing writeback, its CoW reservations will
1293   * be retained.
1294   *
1295   * The actual garbage collection piggybacks off the same code that runs
1296   * the speculative EOF preallocation garbage collector.
1297   */
1298  STATIC int
1299  xfs_inode_free_cowblocks(
1300  	struct xfs_inode	*ip,
1301  	struct xfs_icwalk	*icw,
1302  	unsigned int		*lockflags)
1303  {
1304  	bool			wait;
1305  	int			ret = 0;
1306  
1307  	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1308  
1309  	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
1310  		return 0;
1311  
1312  	if (!xfs_prep_free_cowblocks(ip))
1313  		return 0;
1314  
1315  	if (!xfs_icwalk_match(ip, icw))
1316  		return 0;
1317  
1318  	/*
1319  	 * If the caller is waiting, return -EAGAIN to keep the background
1320  	 * scanner moving and revisit the inode in a subsequent pass.
1321  	 */
1322  	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
1323  	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1324  		if (wait)
1325  			return -EAGAIN;
1326  		return 0;
1327  	}
1328  	*lockflags |= XFS_IOLOCK_EXCL;
1329  
1330  	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
1331  		if (wait)
1332  			return -EAGAIN;
1333  		return 0;
1334  	}
1335  	*lockflags |= XFS_MMAPLOCK_EXCL;
1336  
1337  	/*
1338  	 * Check again, nobody else should be able to dirty blocks or change
1339  	 * the reflink iflag now that we have the first two locks held.
1340  	 */
1341  	if (xfs_prep_free_cowblocks(ip))
1342  		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
1343  	return ret;
1344  }
1345  
1346  void
1347  xfs_inode_set_cowblocks_tag(
1348  	xfs_inode_t	*ip)
1349  {
1350  	trace_xfs_inode_set_cowblocks_tag(ip);
1351  	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
1352  }
1353  
1354  void
1355  xfs_inode_clear_cowblocks_tag(
1356  	xfs_inode_t	*ip)
1357  {
1358  	trace_xfs_inode_clear_cowblocks_tag(ip);
1359  	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
1360  }
1361  
1362  /* Disable post-EOF and CoW block auto-reclamation. */
1363  void
1364  xfs_blockgc_stop(
1365  	struct xfs_mount	*mp)
1366  {
1367  	struct xfs_perag	*pag;
1368  	xfs_agnumber_t		agno;
1369  
1370  	if (!xfs_clear_blockgc_enabled(mp))
1371  		return;
1372  
1373  	for_each_perag(mp, agno, pag)
1374  		cancel_delayed_work_sync(&pag->pag_blockgc_work);
1375  	trace_xfs_blockgc_stop(mp, __return_address);
1376  }
1377  
1378  /* Enable post-EOF and CoW block auto-reclamation. */
1379  void
1380  xfs_blockgc_start(
1381  	struct xfs_mount	*mp)
1382  {
1383  	struct xfs_perag	*pag;
1384  	xfs_agnumber_t		agno;
1385  
1386  	if (xfs_set_blockgc_enabled(mp))
1387  		return;
1388  
1389  	trace_xfs_blockgc_start(mp, __return_address);
1390  	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1391  		xfs_blockgc_queue(pag);
1392  }
1393  
1394  /* Don't try to run block gc on an inode that's in any of these states. */
1395  #define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
1396  					 XFS_NEED_INACTIVE | \
1397  					 XFS_INACTIVATING | \
1398  					 XFS_IRECLAIMABLE | \
1399  					 XFS_IRECLAIM)
1400  /*
1401   * Decide if the given @ip is eligible for garbage collection of speculative
1402   * preallocations, and grab it if so.  Returns true if it's ready to go or
1403   * false if we should just ignore it.
1404   */
1405  static bool
1406  xfs_blockgc_igrab(
1407  	struct xfs_inode	*ip)
1408  {
1409  	struct inode		*inode = VFS_I(ip);
1410  
1411  	ASSERT(rcu_read_lock_held());
1412  
1413  	/* Check for stale RCU freed inode */
1414  	spin_lock(&ip->i_flags_lock);
1415  	if (!ip->i_ino)
1416  		goto out_unlock_noent;
1417  
1418  	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
1419  		goto out_unlock_noent;
1420  	spin_unlock(&ip->i_flags_lock);
1421  
1422  	/* nothing to sync during shutdown */
1423  	if (xfs_is_shutdown(ip->i_mount))
1424  		return false;
1425  
1426  	/* If we can't grab the inode, it must be on its way to reclaim. */
1427  	if (!igrab(inode))
1428  		return false;
1429  
1430  	/* inode is valid */
1431  	return true;
1432  
1433  out_unlock_noent:
1434  	spin_unlock(&ip->i_flags_lock);
1435  	return false;
1436  }
1437  
1438  /* Scan one incore inode for block preallocations that we can remove. */
1439  static int
1440  xfs_blockgc_scan_inode(
1441  	struct xfs_inode	*ip,
1442  	struct xfs_icwalk	*icw)
1443  {
1444  	unsigned int		lockflags = 0;
1445  	int			error;
1446  
1447  	error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
1448  	if (error)
1449  		goto unlock;
1450  
1451  	error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
1452  unlock:
1453  	if (lockflags)
1454  		xfs_iunlock(ip, lockflags);
1455  	xfs_irele(ip);
1456  	return error;
1457  }
1458  
1459  /* Background worker that trims preallocated space. */
1460  void
1461  xfs_blockgc_worker(
1462  	struct work_struct	*work)
1463  {
1464  	struct xfs_perag	*pag = container_of(to_delayed_work(work),
1465  					struct xfs_perag, pag_blockgc_work);
1466  	struct xfs_mount	*mp = pag->pag_mount;
1467  	int			error;
1468  
1469  	trace_xfs_blockgc_worker(mp, __return_address);
1470  
1471  	error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
1472  	if (error)
1473  		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
1474  				pag->pag_agno, error);
1475  	xfs_blockgc_queue(pag);
1476  }
1477  
1478  /*
1479   * Try to free space in the filesystem by purging inactive inodes, eofblocks
1480   * and cowblocks.
1481   */
1482  int
1483  xfs_blockgc_free_space(
1484  	struct xfs_mount	*mp,
1485  	struct xfs_icwalk	*icw)
1486  {
1487  	int			error;
1488  
1489  	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
1490  
1491  	error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
1492  	if (error)
1493  		return error;
1494  
1495  	xfs_inodegc_flush(mp);
1496  	return 0;
1497  }
1498  
1499  /*
1500   * Reclaim all the free space that we can by scheduling the background blockgc
1501   * and inodegc workers immediately and waiting for them all to clear.
1502   */
1503  void
1504  xfs_blockgc_flush_all(
1505  	struct xfs_mount	*mp)
1506  {
1507  	struct xfs_perag	*pag;
1508  	xfs_agnumber_t		agno;
1509  
1510  	trace_xfs_blockgc_flush_all(mp, __return_address);
1511  
1512  	/*
1513  	 * For each blockgc worker, move its queue time up to now.  If it
1514  	 * wasn't queued, it will not be requeued.  Then flush whatever's
1515  	 * left.
1516  	 */
1517  	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1518  		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
1519  				&pag->pag_blockgc_work, 0);
1520  
1521  	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1522  		flush_delayed_work(&pag->pag_blockgc_work);
1523  
1524  	xfs_inodegc_flush(mp);
1525  }
1526  
1527  /*
1528   * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
1529   * quota caused an allocation failure, so we make a best effort by including
1530   * each quota under low free space conditions (less than 1% free space) in the
1531   * scan.
1532   *
1533   * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
1534   * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
1535   * MMAPLOCK.
1536   */
1537  int
1538  xfs_blockgc_free_dquots(
1539  	struct xfs_mount	*mp,
1540  	struct xfs_dquot	*udqp,
1541  	struct xfs_dquot	*gdqp,
1542  	struct xfs_dquot	*pdqp,
1543  	unsigned int		iwalk_flags)
1544  {
1545  	struct xfs_icwalk	icw = {0};
1546  	bool			do_work = false;
1547  
1548  	if (!udqp && !gdqp && !pdqp)
1549  		return 0;
1550  
1551  	/*
1552  	 * Run a scan to free blocks using the union filter to cover all
1553  	 * applicable quotas in a single scan.
1554  	 */
1555  	icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
1556  
1557  	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
1558  		icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
1559  		icw.icw_flags |= XFS_ICWALK_FLAG_UID;
1560  		do_work = true;
1561  	}
1562  
1563  	if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
1564  		icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
1565  		icw.icw_flags |= XFS_ICWALK_FLAG_GID;
1566  		do_work = true;
1567  	}
1568  
1569  	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
1570  		icw.icw_prid = pdqp->q_id;
1571  		icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
1572  		do_work = true;
1573  	}
1574  
1575  	if (!do_work)
1576  		return 0;
1577  
1578  	return xfs_blockgc_free_space(mp, &icw);
1579  }
1580  
1581  /* Run cow/eofblocks scans on the quotas attached to the inode. */
1582  int
1583  xfs_blockgc_free_quota(
1584  	struct xfs_inode	*ip,
1585  	unsigned int		iwalk_flags)
1586  {
1587  	return xfs_blockgc_free_dquots(ip->i_mount,
1588  			xfs_inode_dquot(ip, XFS_DQTYPE_USER),
1589  			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1590  			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
1591  }
1592  
1593  /* XFS Inode Cache Walking Code */
1594  
1595  /*
1596   * The inode lookup is done in batches to keep the amount of lock traffic and
1597   * radix tree lookups to a minimum. The batch size is a trade off between
1598   * lookup reduction and stack usage. This is in the reclaim path, so we can't
1599   * be too greedy.
1600   */
1601  #define XFS_LOOKUP_BATCH	32
1602  
1603  
1604  /*
1605   * Decide if we want to grab this inode in anticipation of doing work towards
1606   * the goal.
1607   */
1608  static inline bool
1609  xfs_icwalk_igrab(
1610  	enum xfs_icwalk_goal	goal,
1611  	struct xfs_inode	*ip,
1612  	struct xfs_icwalk	*icw)
1613  {
1614  	switch (goal) {
1615  	case XFS_ICWALK_BLOCKGC:
1616  		return xfs_blockgc_igrab(ip);
1617  	case XFS_ICWALK_RECLAIM:
1618  		return xfs_reclaim_igrab(ip, icw);
1619  	default:
1620  		return false;
1621  	}
1622  }
1623  
1624  /*
1625   * Process an inode.  Each processing function must handle any state changes
1626   * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
1627   */
1628  static inline int
1629  xfs_icwalk_process_inode(
1630  	enum xfs_icwalk_goal	goal,
1631  	struct xfs_inode	*ip,
1632  	struct xfs_perag	*pag,
1633  	struct xfs_icwalk	*icw)
1634  {
1635  	int			error = 0;
1636  
1637  	switch (goal) {
1638  	case XFS_ICWALK_BLOCKGC:
1639  		error = xfs_blockgc_scan_inode(ip, icw);
1640  		break;
1641  	case XFS_ICWALK_RECLAIM:
1642  		xfs_reclaim_inode(ip, pag);
1643  		break;
1644  	}
1645  	return error;
1646  }
1647  
1648  /*
1649   * For a given per-AG structure @pag and a goal, grab qualifying inodes and
1650   * process them in some manner.
1651   */
1652  static int
1653  xfs_icwalk_ag(
1654  	struct xfs_perag	*pag,
1655  	enum xfs_icwalk_goal	goal,
1656  	struct xfs_icwalk	*icw)
1657  {
1658  	struct xfs_mount	*mp = pag->pag_mount;
1659  	uint32_t		first_index;
1660  	int			last_error = 0;
1661  	int			skipped;
1662  	bool			done;
1663  	int			nr_found;
1664  
1665  restart:
1666  	done = false;
1667  	skipped = 0;
1668  	if (goal == XFS_ICWALK_RECLAIM)
1669  		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
1670  	else
1671  		first_index = 0;
1672  	nr_found = 0;
1673  	do {
1674  		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1675  		int		error = 0;
1676  		int		i;
1677  
1678  		rcu_read_lock();
1679  
1680  		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
1681  				(void **) batch, first_index,
1682  				XFS_LOOKUP_BATCH, goal);
1683  		if (!nr_found) {
1684  			done = true;
1685  			rcu_read_unlock();
1686  			break;
1687  		}
1688  
1689  		/*
1690  		 * Grab the inodes before we drop the lock. If we found
1691  		 * nothing, nr == 0 and the loop will be skipped.
1692  		 */
1693  		for (i = 0; i < nr_found; i++) {
1694  			struct xfs_inode *ip = batch[i];
1695  
1696  			if (done || !xfs_icwalk_igrab(goal, ip, icw))
1697  				batch[i] = NULL;
1698  
1699  			/*
1700  			 * Update the index for the next lookup. Catch
1701  			 * overflows into the next AG range which can occur if
1702  			 * we have inodes in the last block of the AG and we
1703  			 * are currently pointing to the last inode.
1704  			 *
1705  			 * Because we may see inodes that are from the wrong AG
1706  			 * due to RCU freeing and reallocation, only update the
1707  			 * index if it lies in this AG. It was a race that led
1708  			 * us to see this inode, so another lookup from the
1709  			 * same index will not find it again.
1710  			 */
1711  			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1712  				continue;
1713  			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1714  			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1715  				done = true;
1716  		}
1717  
1718  		/* unlock now we've grabbed the inodes. */
1719  		rcu_read_unlock();
1720  
1721  		for (i = 0; i < nr_found; i++) {
1722  			if (!batch[i])
1723  				continue;
1724  			error = xfs_icwalk_process_inode(goal, batch[i], pag,
1725  					icw);
1726  			if (error == -EAGAIN) {
1727  				skipped++;
1728  				continue;
1729  			}
1730  			if (error && last_error != -EFSCORRUPTED)
1731  				last_error = error;
1732  		}
1733  
1734  		/* bail out if the filesystem is corrupted.  */
1735  		if (error == -EFSCORRUPTED)
1736  			break;
1737  
1738  		cond_resched();
1739  
1740  		if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
1741  			icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
1742  			if (icw->icw_scan_limit <= 0)
1743  				break;
1744  		}
1745  	} while (nr_found && !done);
1746  
1747  	if (goal == XFS_ICWALK_RECLAIM) {
1748  		if (done)
1749  			first_index = 0;
1750  		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1751  	}
1752  
1753  	if (skipped) {
1754  		delay(1);
1755  		goto restart;
1756  	}
1757  	return last_error;
1758  }
1759  
1760  /* Walk all incore inodes to achieve a given goal. */
1761  static int
1762  xfs_icwalk(
1763  	struct xfs_mount	*mp,
1764  	enum xfs_icwalk_goal	goal,
1765  	struct xfs_icwalk	*icw)
1766  {
1767  	struct xfs_perag	*pag;
1768  	int			error = 0;
1769  	int			last_error = 0;
1770  	xfs_agnumber_t		agno;
1771  
1772  	for_each_perag_tag(mp, agno, pag, goal) {
1773  		error = xfs_icwalk_ag(pag, goal, icw);
1774  		if (error) {
1775  			last_error = error;
1776  			if (error == -EFSCORRUPTED) {
1777  				xfs_perag_put(pag);
1778  				break;
1779  			}
1780  		}
1781  	}
1782  	return last_error;
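	/*
	 * Compile-time assertion only: BUILD_BUG_ON() emits no runtime code,
	 * so placing it after the return is harmless.  It ensures the private
	 * walk flags never overlap the caller-visible XFS_ICWALK_FLAGS_VALID
	 * namespace.
	 */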
1783  	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
1784  }
1785  
1786  #ifdef DEBUG
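/*
 * Debug-only check: walk the extent list of the given fork and warn about
 * any delayed allocation extents still attached to the inode.  By the time
 * an inode is being torn down there should be no delalloc blocks left, so
 * anything found here indicates a leak.
 */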
1787  static void
1788  xfs_check_delalloc(
1789  	struct xfs_inode	*ip,
1790  	int			whichfork)
1791  {
1792  	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
1793  	struct xfs_bmbt_irec	got;
1794  	struct xfs_iext_cursor	icur;
1795  
1796  	if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
1797  		return;
1798  	do {
1799  		if (isnullstartblock(got.br_startblock)) {
1800  			xfs_warn(ip->i_mount,
1801  	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
1802  				ip->i_ino,
1803  				whichfork == XFS_DATA_FORK ? "data" : "cow",
1804  				got.br_startoff, got.br_blockcount);
1805  		}
1806  	} while (xfs_iext_next_extent(ifp, &icur, &got));
1807  }
1808  #else
1809  #define xfs_check_delalloc(ip, whichfork)	do { } while (0)
1810  #endif
1811  
1812  /* Schedule the inode for reclaim. */
1813  static void
1814  xfs_inodegc_set_reclaimable(
1815  	struct xfs_inode	*ip)
1816  {
1817  	struct xfs_mount	*mp = ip->i_mount;
1818  	struct xfs_perag	*pag;
1819  
1820  	if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
1821  		xfs_check_delalloc(ip, XFS_DATA_FORK);
1822  		xfs_check_delalloc(ip, XFS_COW_FORK);
1823  		ASSERT(0);
1824  	}
1825  
1826  	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1827  	spin_lock(&pag->pag_ici_lock);
1828  	spin_lock(&ip->i_flags_lock);
1829  
1830  	trace_xfs_inode_set_reclaimable(ip);
1831  	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
1832  	ip->i_flags |= XFS_IRECLAIMABLE;
1833  	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1834  			XFS_ICI_RECLAIM_TAG);
1835  
1836  	spin_unlock(&ip->i_flags_lock);
1837  	spin_unlock(&pag->pag_ici_lock);
1838  	xfs_perag_put(pag);
1839  }
1840  
1841  /*
1842   * Free all speculative preallocations and possibly even the inode itself.
1843   * This is the last chance to make changes to an otherwise unreferenced file
1844   * before incore reclamation happens.
1845   */
1846  static void
1847  xfs_inodegc_inactivate(
1848  	struct xfs_inode	*ip)
1849  {
1850  	trace_xfs_inode_inactivating(ip);
1851  	xfs_inactive(ip);
1852  	xfs_inodegc_set_reclaimable(ip);
1853  }
1854  
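/*
 * Per-cpu background inactivation worker.  Detach everything queued on this
 * cpu's gc list, reset the item and shrinker-hit counters, then mark each
 * inode XFS_INACTIVATING and inactivate it.
 */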
1855  void
1856  xfs_inodegc_worker(
1857  	struct work_struct	*work)
1858  {
1859  	struct xfs_inodegc	*gc = container_of(to_delayed_work(work),
1860  						struct xfs_inodegc, work);
1861  	struct llist_node	*node = llist_del_all(&gc->list);
1862  	struct xfs_inode	*ip, *n;
1863  
1864  	ASSERT(gc->cpu == smp_processor_id());
1865  
1866  	WRITE_ONCE(gc->items, 0);
1867  
1868  	if (!node)
1869  		return;
1870  
1871  	ip = llist_entry(node, struct xfs_inode, i_gclist);
1872  	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
1873  
1874  	WRITE_ONCE(gc->shrinker_hits, 0);
1875  	llist_for_each_entry_safe(ip, n, node, i_gclist) {
1876  		xfs_iflags_set(ip, XFS_INACTIVATING);
1877  		xfs_inodegc_inactivate(ip);
1878  	}
1879  }
1880  
1881  /*
1882   * Expedite all pending inodegc work to run immediately. This does not wait for
1883   * completion of the work.
1884   */
1885  void
1886  xfs_inodegc_push(
1887  	struct xfs_mount	*mp)
1888  {
1889  	if (!xfs_is_inodegc_enabled(mp))
1890  		return;
1891  	trace_xfs_inodegc_push(mp, __return_address);
1892  	xfs_inodegc_queue_all(mp);
1893  }
1894  
1895  /*
1896   * Force all currently queued inode inactivation work to run immediately and
1897   * wait for the work to finish.
1898   */
1899  void
1900  xfs_inodegc_flush(
1901  	struct xfs_mount	*mp)
1902  {
1903  	xfs_inodegc_push(mp);
1904  	trace_xfs_inodegc_flush(mp, __return_address);
1905  	flush_workqueue(mp->m_inodegc_wq);
1906  }
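/*
 * Typical usage (the call sites live outside this file): callers that only
 * need queued inactivations to start soon and must not block, such as space
 * or quota reporting, are expected to use xfs_inodegc_push(); callers that
 * need the queues fully drained before proceeding, such as unmount or
 * freeze, use xfs_inodegc_flush().
 */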
1907  
1908  /*
1909   * Flush all the pending work and then disable the inode inactivation background
1910   * workers and wait for them to stop.  Caller must hold sb->s_umount to
1911   * coordinate changes in the inodegc_enabled state.
1912   */
1913  void
1914  xfs_inodegc_stop(
1915  	struct xfs_mount	*mp)
1916  {
1917  	bool			rerun;
1918  
1919  	if (!xfs_clear_inodegc_enabled(mp))
1920  		return;
1921  
1922  	/*
1923  	 * Drain all pending inodegc work, including inodes that could be
1924  	 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
1925  	 * threads that sample the inodegc state just prior to us clearing it.
1926  	 * The inodegc flag state prevents new threads from queuing more
1927  	 * inodes, so we queue pending work items and flush the workqueue until
1928  	 * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
1929  	 * here because it does not allow other unserialized mechanisms to
1930  	 * reschedule inodegc work while this draining is in progress.
1931  	 */
1932  	xfs_inodegc_queue_all(mp);
1933  	do {
1934  		flush_workqueue(mp->m_inodegc_wq);
1935  		rerun = xfs_inodegc_queue_all(mp);
1936  	} while (rerun);
1937  
1938  	trace_xfs_inodegc_stop(mp, __return_address);
1939  }
1940  
1941  /*
1942   * Enable the inode inactivation background workers and schedule deferred inode
1943   * inactivation work if there is any.  Caller must hold sb->s_umount to
1944   * coordinate changes in the inodegc_enabled state.
1945   */
1946  void
1947  xfs_inodegc_start(
1948  	struct xfs_mount	*mp)
1949  {
1950  	if (xfs_set_inodegc_enabled(mp))
1951  		return;
1952  
1953  	trace_xfs_inodegc_start(mp, __return_address);
1954  	xfs_inodegc_queue_all(mp);
1955  }
1956  
1957  #ifdef CONFIG_XFS_RT
1958  static inline bool
1959  xfs_inodegc_want_queue_rt_file(
1960  	struct xfs_inode	*ip)
1961  {
1962  	struct xfs_mount	*mp = ip->i_mount;
1963  	uint64_t		freertx;
1964  
1965  	if (!XFS_IS_REALTIME_INODE(ip))
1966  		return false;
1967  
1968  	freertx = READ_ONCE(mp->m_sb.sb_frextents);
1969  	return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
1970  }
1971  #else
1972  # define xfs_inodegc_want_queue_rt_file(ip)	(false)
1973  #endif /* CONFIG_XFS_RT */
1974  
1975  /*
1976   * Schedule the inactivation worker when:
1977   *
1978   *  - We've accumulated more than one inode cluster buffer's worth of inodes.
1979   *  - There is less than 5% free data (or realtime) space left.
1980   *  - Any of the quotas for this inode are near an enforcement limit.
1981   */
1982  static inline bool
1983  xfs_inodegc_want_queue_work(
1984  	struct xfs_inode	*ip,
1985  	unsigned int		items)
1986  {
1987  	struct xfs_mount	*mp = ip->i_mount;
1988  
1989  	if (items > mp->m_ino_geo.inodes_per_cluster)
1990  		return true;
1991  
1992  	if (__percpu_counter_compare(&mp->m_fdblocks,
1993  				mp->m_low_space[XFS_LOWSP_5_PCNT],
1994  				XFS_FDBLOCKS_BATCH) < 0)
1995  		return true;
1996  
1997  	if (xfs_inodegc_want_queue_rt_file(ip))
1998  		return true;
1999  
2000  	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
2001  		return true;
2002  
2003  	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
2004  		return true;
2005  
2006  	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
2007  		return true;
2008  
2009  	return false;
2010  }
2011  
2012  /*
2013   * Upper bound on the number of inodes in each per-cpu queue that can be
2014   * awaiting inactivation at any given time, to avoid monopolizing the workqueue.
2015   */
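/* With XFS_INODES_PER_CHUNK == 64 this works out to 256 inodes per queue. */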
2016  #define XFS_INODEGC_MAX_BACKLOG		(4 * XFS_INODES_PER_CHUNK)
2017  
2018  /*
2019   * Make the frontend wait for inactivations when:
2020   *
2021   *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
2022   *  - The queue depth exceeds the maximum allowable percpu backlog.
2023   *
2024   * Note: If the current thread is running a transaction, we don't ever want to
2025   * wait for other transactions because that could introduce a deadlock.
2026   */
2027  static inline bool
2028  xfs_inodegc_want_flush_work(
2029  	struct xfs_inode	*ip,
2030  	unsigned int		items,
2031  	unsigned int		shrinker_hits)
2032  {
2033  	if (current->journal_info)
2034  		return false;
2035  
2036  	if (shrinker_hits > 0)
2037  		return true;
2038  
2039  	if (items > XFS_INODEGC_MAX_BACKLOG)
2040  		return true;
2041  
2042  	return false;
2043  }
2044  
2045  /*
2046   * Queue a background inactivation worker if there are inodes that need to be
2047   * inactivated and higher level xfs code hasn't disabled the background
2048   * workers.
2049   */
2050  static void
2051  xfs_inodegc_queue(
2052  	struct xfs_inode	*ip)
2053  {
2054  	struct xfs_mount	*mp = ip->i_mount;
2055  	struct xfs_inodegc	*gc;
2056  	int			items;
2057  	unsigned int		shrinker_hits;
2058  	unsigned long		queue_delay = 1;
2059  
2060  	trace_xfs_inode_set_need_inactive(ip);
2061  	spin_lock(&ip->i_flags_lock);
2062  	ip->i_flags |= XFS_NEED_INACTIVE;
2063  	spin_unlock(&ip->i_flags_lock);
2064  
2065  	gc = get_cpu_ptr(mp->m_inodegc);
2066  	llist_add(&ip->i_gclist, &gc->list);
2067  	items = READ_ONCE(gc->items);
2068  	WRITE_ONCE(gc->items, items + 1);
2069  	shrinker_hits = READ_ONCE(gc->shrinker_hits);
2070  
2071  	/*
2072  	 * We queue the work while holding the current CPU so that the work
2073  	 * is scheduled to run on this CPU.
2074  	 */
2075  	if (!xfs_is_inodegc_enabled(mp)) {
2076  		put_cpu_ptr(gc);
2077  		return;
2078  	}
2079  
2080  	if (xfs_inodegc_want_queue_work(ip, items))
2081  		queue_delay = 0;
2082  
2083  	trace_xfs_inodegc_queue(mp, __return_address);
2084  	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
2085  			queue_delay);
2086  	put_cpu_ptr(gc);
2087  
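	/*
	 * Throttle the frontend: if the shrinker has already flagged this
	 * queue or the backlog has grown too deep, wait here for the worker
	 * we just scheduled rather than returning immediately.
	 */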
2088  	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
2089  		trace_xfs_inodegc_throttle(mp, __return_address);
2090  		flush_delayed_work(&gc->work);
2091  	}
2092  }
2093  
2094  /*
2095   * Fold the dead CPU's inodegc queue into the current CPU's queue.
2096   */
2097  void
2098  xfs_inodegc_cpu_dead(
2099  	struct xfs_mount	*mp,
2100  	unsigned int		dead_cpu)
2101  {
2102  	struct xfs_inodegc	*dead_gc, *gc;
2103  	struct llist_node	*first, *last;
2104  	unsigned int		count = 0;
2105  
2106  	dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
2107  	cancel_delayed_work_sync(&dead_gc->work);
2108  
2109  	if (llist_empty(&dead_gc->list))
2110  		return;
2111  
2112  	first = dead_gc->list.first;
2113  	last = first;
2114  	while (last->next) {
2115  		last = last->next;
2116  		count++;
2117  	}
2118  	dead_gc->list.first = NULL;
2119  	dead_gc->items = 0;
2120  
2121  	/* Add pending work to current CPU */
2122  	gc = get_cpu_ptr(mp->m_inodegc);
2123  	llist_add_batch(first, last, &gc->list);
2124  	count += READ_ONCE(gc->items);
2125  	WRITE_ONCE(gc->items, count);
2126  
2127  	if (xfs_is_inodegc_enabled(mp)) {
2128  		trace_xfs_inodegc_queue(mp, __return_address);
2129  		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
2130  				0);
2131  	}
2132  	put_cpu_ptr(gc);
2133  }
2134  
2135  /*
2136   * We set the inode flag atomically with the radix tree tag.  Once we get tag
2137   * lookups on the radix tree, this inode flag can go away.
2138   *
2139   * We always use background reclaim here because even if the inode is clean, it
2140   * still may be under IO and hence we have to wait for IO completion to occur
2141   * before we can reclaim the inode. The background reclaim path handles this
2142   * more efficiently than we can here, so simply let background reclaim tear down
2143   * all inodes.
2144   */
2145  void
2146  xfs_inode_mark_reclaimable(
2147  	struct xfs_inode	*ip)
2148  {
2149  	struct xfs_mount	*mp = ip->i_mount;
2150  	bool			need_inactive;
2151  
2152  	XFS_STATS_INC(mp, vn_reclaim);
2153  
2154  	/*
2155  	 * We should never get here with any of the reclaim flags already set.
2156  	 */
2157  	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
2158  
2159  	need_inactive = xfs_inode_needs_inactive(ip);
2160  	if (need_inactive) {
2161  		xfs_inodegc_queue(ip);
2162  		return;
2163  	}
2164  
2165  	/* Going straight to reclaim, so drop the dquots. */
2166  	xfs_qm_dqdetach(ip);
2167  	xfs_inodegc_set_reclaimable(ip);
2168  }
2169  
2170  /*
2171   * Register a phony shrinker so that we can run background inodegc sooner when
2172   * there's memory pressure.  Inactivation does not itself free any memory but
2173   * it does make inodes reclaimable, which eventually frees memory.
2174   *
2175   * The count function, seek value, and batch value are crafted to trigger the
2176   * scan function during the second round of scanning.  Hopefully this means
2177   * that we reclaimed enough memory that initiating metadata transactions won't
2178   * make things worse.
2179   */
2180  #define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
2181  #define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
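/* DEF_PRIORITY is 12, so COUNT evaluates to 4096 objects and BATCH to 2049. */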
2182  
2183  static unsigned long
2184  xfs_inodegc_shrinker_count(
2185  	struct shrinker		*shrink,
2186  	struct shrink_control	*sc)
2187  {
2188  	struct xfs_mount	*mp = container_of(shrink, struct xfs_mount,
2189  						   m_inodegc_shrinker);
2190  	struct xfs_inodegc	*gc;
2191  	int			cpu;
2192  
2193  	if (!xfs_is_inodegc_enabled(mp))
2194  		return 0;
2195  
2196  	for_each_online_cpu(cpu) {
2197  		gc = per_cpu_ptr(mp->m_inodegc, cpu);
2198  		if (!llist_empty(&gc->list))
2199  			return XFS_INODEGC_SHRINKER_COUNT;
2200  	}
2201  
2202  	return 0;
2203  }
2204  
2205  static unsigned long
2206  xfs_inodegc_shrinker_scan(
2207  	struct shrinker		*shrink,
2208  	struct shrink_control	*sc)
2209  {
2210  	struct xfs_mount	*mp = container_of(shrink, struct xfs_mount,
2211  						   m_inodegc_shrinker);
2212  	struct xfs_inodegc	*gc;
2213  	int			cpu;
2214  	bool			no_items = true;
2215  
2216  	if (!xfs_is_inodegc_enabled(mp))
2217  		return SHRINK_STOP;
2218  
2219  	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
2220  
2221  	for_each_online_cpu(cpu) {
2222  		gc = per_cpu_ptr(mp->m_inodegc, cpu);
2223  		if (!llist_empty(&gc->list)) {
2224  			unsigned int	h = READ_ONCE(gc->shrinker_hits);
2225  
2226  			WRITE_ONCE(gc->shrinker_hits, h + 1);
2227  			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
2228  			no_items = false;
2229  		}
2230  	}
2231  
2232  	/*
2233  	 * If there are no inodes to inactivate, we don't want the shrinker
2234  	 * to think there's deferred work to call us back about.
2235  	 */
2236  	if (no_items)
2237  		return LONG_MAX;
2238  
2239  	return SHRINK_STOP;
2240  }
2241  
2242  /* Register a shrinker so we can accelerate inodegc and throttle queuing. */
2243  int
2244  xfs_inodegc_register_shrinker(
2245  	struct xfs_mount	*mp)
2246  {
2247  	struct shrinker		*shrink = &mp->m_inodegc_shrinker;
2248  
2249  	shrink->count_objects = xfs_inodegc_shrinker_count;
2250  	shrink->scan_objects = xfs_inodegc_shrinker_scan;
2251  	shrink->seeks = 0;
2252  	shrink->flags = SHRINKER_NONSLAB;
2253  	shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
2254  
2255  	return register_shrinker(shrink);
2256  }
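/*
 * The shrinker registered here is expected to be torn down with
 * unregister_shrinker(&mp->m_inodegc_shrinker) when the mount is released;
 * that happens outside this file.
 */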
2257