// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"

#include <linux/iversion.h>

/* Radix tree tags for incore inode tree. */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1

/*
 * The goal for walking incore inodes.  These can correspond with incore inode
 * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
 */
enum xfs_icwalk_goal {
	/* Goals directly associated with tagged inodes. */
	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
};

static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);

/*
 * Private inode cache walk flags for struct xfs_icwalk.  Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode or i_state! */
	VFS_I(ip)->i_mode = 0;
	VFS_I(ip)->i_state = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
	ip->i_nblocks = 0;
	ip->i_forkoff = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	if (ip->i_afp) {
		xfs_idestroy_fork(ip->i_afp);
		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
	}
	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state.  The ip->i_flags_lock provides the barrier against
	 * lookup races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue background inode reclaim work if there are reclaimable inodes and
 * there isn't reclaim work already scheduled or in progress.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}
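/*
 * Worked example of the interval above: with the default xfs_syncd_centisecs
 * value of 3000 (30 seconds), the delay evaluates to 3000 / 6 * 10 = 5000ms,
 * so background reclaim reschedules itself roughly every 5 seconds for as
 * long as any AG still carries the reclaim tag.
 */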

/*
 * Background scanning to trim preallocated space.  This is queued based on the
 * 'speculative_prealloc_lifetime' tunable (5m by default).
 */
static inline void
xfs_blockgc_queue(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	if (!xfs_is_blockgc_enabled(mp))
		return;

	rcu_read_lock();
	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
		queue_delayed_work(pag->pag_mount->m_blockgc_wq,
				   &pag->pag_blockgc_work,
				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
	rcu_read_unlock();
}

/* Set a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_set_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	bool			was_tagged;

	lockdep_assert_held(&pag->pag_ici_lock);

	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable++;

	if (was_tagged)
		return;

	/* propagate the tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	/* start background work */
	switch (tag) {
	case XFS_ICI_RECLAIM_TAG:
		xfs_reclaim_work_queue(mp);
		break;
	case XFS_ICI_BLOCKGC_TAG:
		xfs_blockgc_queue(pag);
		break;
	}

	trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}

/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_clear_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);

	/*
	 * Reclaim can signal (with a null agino) that it cleared its own tag
	 * by removing the inode from the radix tree.
	 */
	if (agino != NULLAGINO)
		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
	else
		ASSERT(tag == XFS_ICI_RECLAIM_TAG);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable--;

	if (radix_tree_tagged(&pag->pag_ici_root, tag))
		return;

	/* clear the tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}

static inline void
xfs_inew_wait(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);

	do {
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (!xfs_iflags_test(ip, XFS_INEW))
			break;
		schedule();
	} while (true);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure.  This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally.  Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	return error;
}

/*
 * Carefully nudge an inode whose VFS state has been torn down back into a
 * usable state.  Drops the i_flags_lock and the rcu read lock.
 */
static int
xfs_iget_recycle(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			error;

	trace_xfs_iget_recycle(ip);

	/*
	 * We need to make it look like the inode is being reclaimed to prevent
	 * the actual reclaim workers from stomping over us while we recycle
	 * the inode.  We can't clear the radix tree tag yet as it requires
	 * pag_ici_lock to be held exclusive.
	 */
	ip->i_flags |= XFS_IRECLAIM;

	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
	error = xfs_reinit_inode(mp, inode);
	if (error) {
		bool	wake;

		/*
		 * Re-initializing the inode failed, and we are in deep
		 * trouble.  Try to re-add it to the reclaim list.
		 */
		rcu_read_lock();
		spin_lock(&ip->i_flags_lock);
		wake = !!__xfs_iflags_test(ip, XFS_INEW);
		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
		if (wake)
			wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		trace_xfs_iget_recycle_fail(ip);
		return error;
	}

	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	/*
	 * Clear the per-lifetime state in the inode as we are now effectively
	 * a new inode and need to return to the initial state before reuse
	 * occurs.
	 */
	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
	ip->i_flags |= XFS_INEW;
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	inode->i_state = I_NEW;
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);

	return 0;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode.  If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/* Make all pending inactivation work start immediately. */
static bool
xfs_inodegc_queue_all(
	struct xfs_mount	*mp)
{
	struct xfs_inodegc	*gc;
	int			cpu;
	bool			ret = false;

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list)) {
			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
			ret = true;
		}
	}

	return ret;
}

/*
 * Check the validity of the inode we just found in the cache.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet.  We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * If we're racing with the inactivation worker we also want to wait.
	 * If we're creating a new file, it's possible that the worker
	 * previously marked the inode as free on disk but hasn't finished
	 * updating the incore state yet.  The AGI buffer will be dirty and
	 * locked to the icreate transaction, so a synchronous push of the
	 * inodegc workers would result in deadlock.  For a regular iget, the
	 * worker is running already, so we might as well wait.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
		goto out_skip;

	if (ip->i_flags & XFS_NEED_INACTIVE) {
		/* Unlinked inodes cannot be re-grabbed. */
		if (VFS_I(ip)->i_nlink == 0) {
			error = -ENOENT;
			goto out_error;
		}
		goto out_inodegc_flush;
	}

	/*
	 * Check the inode free state is valid.  This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/* Skip inodes that have no vfs state. */
	if ((flags & XFS_IGET_INCORE) &&
	    (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* The inode fits the selection criteria; process it. */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		/* Drops i_flags_lock and RCU read lock. */
		error = xfs_iget_recycle(pag, ip);
		if (error)
			return error;
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode))
			goto out_skip;

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_skip:
	trace_xfs_iget_skip(ip);
	XFS_STATS_INC(mp, xs_ig_frecycle);
	error = -EAGAIN;
out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;

out_inodegc_flush:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	/*
	 * Do not wait for the workers, because the caller could hold an AGI
	 * buffer lock.  We're just going to sleep in a loop anyway.
	 */
	if (xfs_is_inodegc_enabled(mp))
		xfs_inodegc_queue_all(mp);
	return -EAGAIN;
}

static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the i_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_has_v3inodes(mp) &&
	    (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
		VFS_I(ip)->i_generation = prandom_u32();
	} else {
		struct xfs_buf	*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip,
				xfs_buf_offset(bp, ip->i_imap.im_boffset));
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid.  This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock.  Note that we cannot sleep inside the preload
	 * region.  Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.  The inode is looked up
 * in the cache held in each AG.  If the inode is found in the cache,
 * initialise the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to
 * the cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path.  Hence we only allow locking of the XFS_ILOCK during lookup.
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.  If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}
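/*
 * Illustrative sketch (not a caller in this file): a typical metadata-only
 * lookup pairs xfs_iget() with xfs_iunlock() and xfs_irele(), and may only
 * ask for the ILOCK:
 *
 *	struct xfs_inode	*ip;
 *	int			error;
 *
 *	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
 *	if (error)
 *		return error;
 *	... read the inode metadata ...
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 *	xfs_irele(ip);
 */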

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system.  If the inode is
 * in cache and isn't in purgatory, return 1 if the inode is allocated
 * and 0 if it is not.  For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer.  This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that.  If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned.  The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget().  In both cases, the inode will have XFS_IRECLAIM set.  If the
 * inode has been fully recycled by the time we get the i_flags_lock,
 * XFS_IRECLAIMABLE will not be set.  Hence we need to check for both these
 * flag conditions to avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided.  Those races should be resolved only after we
 * have ensured that we are able to reclaim this inode and the world can see
 * that we are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_igrab(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	/* Don't reclaim a sick inode unless the caller asked for it. */
	if (ip->i_sick &&
	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait for the inode to become reclaimable.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push
 * the AIL first to trigger writeback of dirty inodes.  This enables writeback
 * to be done in the background in a non-blocking manner, and enables memory
 * reclaim to make progress without blocking.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	if (xfs_is_shutdown(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
	trace_xfs_inode_reclaiming(ip);

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here.  By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode lifetime early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}

/* Reclaim sick inodes if we're unmounting or the fs went down. */
static inline bool
xfs_want_reclaim_sick(
	struct xfs_mount	*mp)
{
	return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
	       xfs_is_shutdown(mp);
}

void
xfs_reclaim_inodes(
	struct xfs_mount	*mp)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= 0,
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	}
}

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim.  We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here.  We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do so we kick the
 * background reclaim if it isn't already scheduled.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	unsigned long		nr_to_scan)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
		.icw_scan_limit	= min_t(unsigned long, LONG_MAX, nr_to_scan),
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
long
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	long			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

STATIC bool
xfs_icwalk_match_id(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid != icw->icw_prid)
		return false;

	return true;
}
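/*
 * Note that the filter above is an intersection: with both XFS_ICWALK_FLAG_UID
 * and XFS_ICWALK_FLAG_GID set, an inode is only processed when its uid and gid
 * both match.  The union variant below processes an inode when any one of the
 * selected ids matches, which is what the quota-driven scan in
 * xfs_blockgc_free_dquots() relies on.
 */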

/*
 * A union-based inode filtering algorithm.  Process the inode if any of the
 * criteria match.  This is for global/internal scans only.
 */
STATIC bool
xfs_icwalk_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid == icw->icw_prid)
		return true;

	return false;
}

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @icw?  The inode is eligible if @icw is null or
 * if the predicate functions match.
 */
static bool
xfs_icwalk_match(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	bool			match;

	if (!icw)
		return true;

	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
		match = xfs_icwalk_match_id_union(ip, icw);
	else
		match = xfs_icwalk_match_id(ip, icw);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
	    XFS_ISIZE(ip) < icw->icw_min_file_size)
		return false;

	return true;
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time.  It kicks itself every
 * few seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low.
 */
void
xfs_reclaim_worker(
	struct work_struct	*work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
	xfs_reclaim_work_queue(mp);
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
		return 0;

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time.  Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (xfs_can_free_eofblocks(ip, false))
		return xfs_free_eofblocks(ip);

	/* inode could be preallocated or append-only */
	trace_xfs_inode_free_eofblocks_invalid(ip);
	xfs_inode_clear_eofblocks_tag(ip);
	return 0;
}

static void
xfs_blockgc_set_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & iflag)
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= iflag;
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
}

static void
xfs_blockgc_clear_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	bool			clear_tag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~iflag;
	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
	spin_unlock(&ip->i_flags_lock);

	if (!clear_tag)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
}

/*
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all.  It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;
	int			ret = 0;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
		return 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_MMAPLOCK_EXCL;

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
	return ret;
}

void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_stop(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	if (!xfs_clear_blockgc_enabled(mp))
		return;

	for_each_perag(mp, agno, pag)
		cancel_delayed_work_sync(&pag->pag_blockgc_work);
	trace_xfs_blockgc_stop(mp, __return_address);
}
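/*
 * Note: xfs_clear_blockgc_enabled() and xfs_set_blockgc_enabled() return the
 * previous state of the flag, so the early returns above and below make
 * repeated stop or start calls harmless no-ops; only a real state transition
 * cancels or requeues the per-AG workers.
 */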

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_start(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	if (xfs_set_blockgc_enabled(mp))
		return;

	trace_xfs_blockgc_start(mp, __return_address);
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		xfs_blockgc_queue(pag);
}

/* Don't try to run block gc on an inode that's in any of these states. */
#define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
					 XFS_NEED_INACTIVE | \
					 XFS_INACTIVATING | \
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
/*
 * Decide if the given @ip is eligible for garbage collection of speculative
 * preallocations, and grab it if so.  Returns true if it's ready to go or
 * false if we should just ignore it.
 */
static bool
xfs_blockgc_igrab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (xfs_is_shutdown(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

/* Scan one incore inode for block preallocations that we can remove. */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	unsigned int		lockflags = 0;
	int			error;

	error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
	if (error)
		goto unlock;

	error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
unlock:
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	xfs_irele(ip);
	return error;
}

/* Background worker that trims preallocated space. */
void
xfs_blockgc_worker(
	struct work_struct	*work)
{
	struct xfs_perag	*pag = container_of(to_delayed_work(work),
					struct xfs_perag, pag_blockgc_work);
	struct xfs_mount	*mp = pag->pag_mount;
	int			error;

	trace_xfs_blockgc_worker(mp, __return_address);

	error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
	if (error)
		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
				pag->pag_agno, error);
	xfs_blockgc_queue(pag);
}

/*
 * Try to free space in the filesystem by purging inactive inodes, eofblocks
 * and cowblocks.
 */
int
xfs_blockgc_free_space(
	struct xfs_mount	*mp,
	struct xfs_icwalk	*icw)
{
	int			error;

	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);

	error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
	if (error)
		return error;

	xfs_inodegc_flush(mp);
	return 0;
}

/*
 * Reclaim all the free space that we can by scheduling the background blockgc
 * and inodegc workers immediately and waiting for them all to clear.
 */
void
xfs_blockgc_flush_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	trace_xfs_blockgc_flush_all(mp, __return_address);

	/*
	 * For each blockgc worker, move its queue time up to now.  If it
	 * wasn't queued, it will not be requeued.  Then flush whatever's
	 * left.
	 */
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
				&pag->pag_blockgc_work, 0);

	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		flush_delayed_work(&pag->pag_blockgc_work);

	xfs_inodegc_flush(mp);
}

/*
 * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
 * quota caused an allocation failure, so we make a best effort by including
 * each quota under low free space conditions (less than 1% free space) in the
 * scan.
 *
 * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
 * MMAPLOCK.
 */
int
xfs_blockgc_free_dquots(
	struct xfs_mount	*mp,
	struct xfs_dquot	*udqp,
	struct xfs_dquot	*gdqp,
	struct xfs_dquot	*pdqp,
	unsigned int		iwalk_flags)
{
	struct xfs_icwalk	icw = {0};
	bool			do_work = false;

	if (!udqp && !gdqp && !pdqp)
		return 0;

	/*
	 * Run a scan to free blocks using the union filter to cover all
	 * applicable quotas in a single scan.
	 */
	icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;

	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
		icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_UID;
		do_work = true;
	}

	if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
		icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_GID;
		do_work = true;
	}

	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
		icw.icw_prid = pdqp->q_id;
		icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
		do_work = true;
	}

	if (!do_work)
		return 0;

	return xfs_blockgc_free_space(mp, &icw);
}

/* Run cow/eofblocks scans on the quotas attached to the inode. */
int
xfs_blockgc_free_quota(
	struct xfs_inode	*ip,
	unsigned int		iwalk_flags)
{
	return xfs_blockgc_free_dquots(ip->i_mount,
			xfs_inode_dquot(ip, XFS_DQTYPE_USER),
			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
}
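/*
 * Illustrative sketch (not a caller in this file): an allocation path that
 * fails a quota reservation will typically retry once after kicking a scan
 * of the quotas attached to the inode, along the lines of:
 *
 *	if (error == -EDQUOT || error == -ENOSPC) {
 *		error = xfs_blockgc_free_quota(ip, 0);
 *		if (!error)
 *			goto retry;
 *	}
 */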

/* XFS Inode Cache Walking Code */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum.  The batch size is a trade off between
 * lookup reduction and stack usage.  This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32


/*
 * Decide if we want to grab this inode in anticipation of doing work towards
 * the goal.
 */
static inline bool
xfs_icwalk_igrab(
	enum xfs_icwalk_goal	goal,
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
		return xfs_blockgc_igrab(ip);
	case XFS_ICWALK_RECLAIM:
		return xfs_reclaim_igrab(ip, icw);
	default:
		return false;
	}
}

/*
 * Process an inode.  Each processing function must handle any state changes
 * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
 */
static inline int
xfs_icwalk_process_inode(
	enum xfs_icwalk_goal	goal,
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	struct xfs_icwalk	*icw)
{
	int			error = 0;

	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
		error = xfs_blockgc_scan_inode(ip, icw);
		break;
	case XFS_ICWALK_RECLAIM:
		xfs_reclaim_inode(ip, pag);
		break;
	}
	return error;
}

/*
 * For a given per-AG structure @pag and a goal, grab qualifying inodes and
 * process them in some manner.
 */
static int
xfs_icwalk_ag(
	struct xfs_perag	*pag,
	enum xfs_icwalk_goal	goal,
	struct xfs_icwalk	*icw)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	bool			done;
	int			nr_found;

restart:
	done = false;
	skipped = 0;
	if (goal == XFS_ICWALK_RECLAIM)
		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
	else
		first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
				(void **) batch, first_index,
				XFS_LOOKUP_BATCH, goal);
		if (!nr_found) {
			done = true;
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock.  If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || !xfs_icwalk_igrab(goal, ip, icw))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup.  Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG.  It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = true;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = xfs_icwalk_process_inode(goal, batch[i], pag,
					icw);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

		if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
			icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
			if (icw->icw_scan_limit <= 0)
				break;
		}
	} while (nr_found && !done);

	if (goal == XFS_ICWALK_RECLAIM) {
		if (done)
			first_index = 0;
		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
	}

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/* Walk all incore inodes to achieve a given goal. */
static int
xfs_icwalk(
	struct xfs_mount	*mp,
	enum xfs_icwalk_goal	goal,
	struct xfs_icwalk	*icw)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		agno;

	for_each_perag_tag(mp, agno, pag, goal) {
		error = xfs_icwalk_ag(pag, goal, icw);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED) {
				xfs_perag_put(pag);
				break;
			}
		}
	}
	return last_error;
	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
}

#ifdef DEBUG
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	icur;

	if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
		return;
	do {
		if (isnullstartblock(got.br_startblock)) {
			xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
				ip->i_ino,
				whichfork == XFS_DATA_FORK ? "data" : "cow",
				got.br_startoff, got.br_blockcount);
		}
	} while (xfs_iext_next_extent(ifp, &icur, &got));
}
#else
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif

/* Schedule the inode for reclaim. */
static void
xfs_inodegc_set_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
		xfs_check_delalloc(ip, XFS_DATA_FORK);
		xfs_check_delalloc(ip, XFS_COW_FORK);
		ASSERT(0);
	}

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	trace_xfs_inode_set_reclaimable(ip);
	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
	ip->i_flags |= XFS_IRECLAIMABLE;
	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}
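/*
 * Summary of the inactivation life cycle implemented here: an unreferenced
 * inode that needs work is queued with XFS_NEED_INACTIVE set, the per-cpu
 * worker marks it XFS_INACTIVATING while xfs_inactive() runs, and
 * xfs_inodegc_set_reclaimable() above clears both flags and sets
 * XFS_IRECLAIMABLE so that memory reclaim can finally tear the inode down.
 */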

/*
 * Free all speculative preallocations and possibly even the inode itself.
 * This is the last chance to make changes to an otherwise unreferenced file
 * before incore reclamation happens.
 */
static void
xfs_inodegc_inactivate(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_inactivating(ip);
	xfs_inactive(ip);
	xfs_inodegc_set_reclaimable(ip);
}

void
xfs_inodegc_worker(
	struct work_struct	*work)
{
	struct xfs_inodegc	*gc = container_of(to_delayed_work(work),
						struct xfs_inodegc, work);
	struct llist_node	*node = llist_del_all(&gc->list);
	struct xfs_inode	*ip, *n;

	ASSERT(gc->cpu == smp_processor_id());

	WRITE_ONCE(gc->items, 0);

	if (!node)
		return;

	ip = llist_entry(node, struct xfs_inode, i_gclist);
	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));

	WRITE_ONCE(gc->shrinker_hits, 0);
	llist_for_each_entry_safe(ip, n, node, i_gclist) {
		xfs_iflags_set(ip, XFS_INACTIVATING);
		xfs_inodegc_inactivate(ip);
	}
}

/*
 * Expedite all pending inodegc work to run immediately.  This does not wait
 * for completion of the work.
 */
void
xfs_inodegc_push(
	struct xfs_mount	*mp)
{
	if (!xfs_is_inodegc_enabled(mp))
		return;
	trace_xfs_inodegc_push(mp, __return_address);
	xfs_inodegc_queue_all(mp);
}

/*
 * Force all currently queued inode inactivation work to run immediately and
 * wait for the work to finish.
 */
void
xfs_inodegc_flush(
	struct xfs_mount	*mp)
{
	xfs_inodegc_push(mp);
	trace_xfs_inodegc_flush(mp, __return_address);
	flush_workqueue(mp->m_inodegc_wq);
}

/*
 * Flush all the pending work and then disable the inode inactivation
 * background workers and wait for them to stop.  Caller must hold sb->s_umount
 * to coordinate changes in the inodegc_enabled state.
 */
void
xfs_inodegc_stop(
	struct xfs_mount	*mp)
{
	bool			rerun;

	if (!xfs_clear_inodegc_enabled(mp))
		return;

	/*
	 * Drain all pending inodegc work, including inodes that could be
	 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
	 * threads that sample the inodegc state just prior to us clearing it.
	 * The inodegc flag state prevents new threads from queuing more
	 * inodes, so we queue pending work items and flush the workqueue until
	 * all inodegc lists are empty.  IOWs, we cannot use drain_workqueue
	 * here because it does not allow other unserialized mechanisms to
	 * reschedule inodegc work while this draining is in progress.
	 */
	xfs_inodegc_queue_all(mp);
	do {
		flush_workqueue(mp->m_inodegc_wq);
		rerun = xfs_inodegc_queue_all(mp);
	} while (rerun);

	trace_xfs_inodegc_stop(mp, __return_address);
}

/*
 * Enable the inode inactivation background workers and schedule deferred inode
 * inactivation work if there is any.  Caller must hold sb->s_umount to
 * coordinate changes in the inodegc_enabled state.
 */
/*
 * Enable the inode inactivation background workers and schedule deferred
 * inode inactivation work if there is any.  Caller must hold sb->s_umount to
 * coordinate changes in the inodegc_enabled state.
 */
void
xfs_inodegc_start(
    struct xfs_mount        *mp)
{
    if (xfs_set_inodegc_enabled(mp))
        return;

    trace_xfs_inodegc_start(mp, __return_address);
    xfs_inodegc_queue_all(mp);
}

#ifdef CONFIG_XFS_RT
static inline bool
xfs_inodegc_want_queue_rt_file(
    struct xfs_inode        *ip)
{
    struct xfs_mount        *mp = ip->i_mount;
    uint64_t                freertx;

    if (!XFS_IS_REALTIME_INODE(ip))
        return false;

    freertx = READ_ONCE(mp->m_sb.sb_frextents);
    return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
}
#else
# define xfs_inodegc_want_queue_rt_file(ip)     (false)
#endif /* CONFIG_XFS_RT */

/*
 * Schedule the inactivation worker when:
 *
 *  - We've accumulated more than one inode cluster buffer's worth of inodes.
 *  - There is less than 5% free space left.
 *  - Any of the quotas for this inode are near an enforcement limit.
 */
static inline bool
xfs_inodegc_want_queue_work(
    struct xfs_inode        *ip,
    unsigned int            items)
{
    struct xfs_mount        *mp = ip->i_mount;

    if (items > mp->m_ino_geo.inodes_per_cluster)
        return true;

    if (__percpu_counter_compare(&mp->m_fdblocks,
                mp->m_low_space[XFS_LOWSP_5_PCNT],
                XFS_FDBLOCKS_BATCH) < 0)
        return true;

    if (xfs_inodegc_want_queue_rt_file(ip))
        return true;

    if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
        return true;

    if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
        return true;

    if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
        return true;

    return false;
}

/*
 * Upper bound on the number of inodes on each CPU that can be queued for
 * inactivation at any given time, to avoid monopolizing the workqueue.
 */
#define XFS_INODEGC_MAX_BACKLOG     (4 * XFS_INODES_PER_CHUNK)

/*
 * Make the frontend wait for inactivations when:
 *
 *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
 *  - The queue depth exceeds the maximum allowable percpu backlog.
 *
 * Note: If the current thread is running a transaction, we don't ever want to
 * wait for other transactions because that could introduce a deadlock.
 */
static inline bool
xfs_inodegc_want_flush_work(
    struct xfs_inode        *ip,
    unsigned int            items,
    unsigned int            shrinker_hits)
{
    if (current->journal_info)
        return false;

    if (shrinker_hits > 0)
        return true;

    if (items > XFS_INODEGC_MAX_BACKLOG)
        return true;

    return false;
}
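
/*
 * Worked example of the thresholds above (illustrative; the exact numbers
 * depend on the filesystem geometry): XFS_INODES_PER_CHUNK is 64, so
 * XFS_INODEGC_MAX_BACKLOG permits up to 4 * 64 = 256 queued inodes per CPU
 * before xfs_inodegc_want_flush_work() throttles the frontend.  On a typical
 * v5 filesystem with 512-byte inodes, inodes_per_cluster works out to 32, so
 * xfs_inodegc_want_queue_work() expedites the worker once a CPU has more
 * than 32 inodes queued.
 */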
/*
 * Queue a background inactivation worker if there are inodes that need to be
 * inactivated and higher level xfs code hasn't disabled the background
 * workers.
 */
static void
xfs_inodegc_queue(
    struct xfs_inode        *ip)
{
    struct xfs_mount        *mp = ip->i_mount;
    struct xfs_inodegc      *gc;
    int                     items;
    unsigned int            shrinker_hits;
    unsigned long           queue_delay = 1;

    trace_xfs_inode_set_need_inactive(ip);
    spin_lock(&ip->i_flags_lock);
    ip->i_flags |= XFS_NEED_INACTIVE;
    spin_unlock(&ip->i_flags_lock);

    gc = get_cpu_ptr(mp->m_inodegc);
    llist_add(&ip->i_gclist, &gc->list);
    items = READ_ONCE(gc->items);
    WRITE_ONCE(gc->items, items + 1);
    shrinker_hits = READ_ONCE(gc->shrinker_hits);

    /*
     * We queue the work while holding the current CPU so that the work
     * is scheduled to run on this CPU.
     */
    if (!xfs_is_inodegc_enabled(mp)) {
        put_cpu_ptr(gc);
        return;
    }

    if (xfs_inodegc_want_queue_work(ip, items))
        queue_delay = 0;

    trace_xfs_inodegc_queue(mp, __return_address);
    mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
            queue_delay);
    put_cpu_ptr(gc);

    if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
        trace_xfs_inodegc_throttle(mp, __return_address);
        flush_delayed_work(&gc->work);
    }
}

/*
 * Fold the dead CPU inodegc queue into the current CPU's queue.
 */
void
xfs_inodegc_cpu_dead(
    struct xfs_mount        *mp,
    unsigned int            dead_cpu)
{
    struct xfs_inodegc      *dead_gc, *gc;
    struct llist_node       *first, *last;
    unsigned int            count = 0;

    dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
    cancel_delayed_work_sync(&dead_gc->work);

    if (llist_empty(&dead_gc->list))
        return;

    first = dead_gc->list.first;
    last = first;
    while (last->next) {
        last = last->next;
        count++;
    }
    dead_gc->list.first = NULL;
    dead_gc->items = 0;

    /* Add pending work to current CPU */
    gc = get_cpu_ptr(mp->m_inodegc);
    llist_add_batch(first, last, &gc->list);
    count += READ_ONCE(gc->items);
    WRITE_ONCE(gc->items, count);

    if (xfs_is_inodegc_enabled(mp)) {
        trace_xfs_inodegc_queue(mp, __return_address);
        mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
                0);
    }
    put_cpu_ptr(gc);
}
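
/*
 * Summary of the per-cpu queue handoff implemented above (an illustrative
 * restatement of the code in this file, not additional behaviour):
 *
 *   frontend (xfs_inodegc_queue):
 *	llist_add(&ip->i_gclist, &gc->list);
 *	WRITE_ONCE(gc->items, items + 1);
 *	mod_delayed_work_on(current_cpu(), ...);
 *
 *   worker (xfs_inodegc_worker):
 *	node = llist_del_all(&gc->list);
 *	WRITE_ONCE(gc->items, 0);
 *	llist_for_each_entry_safe(ip, n, node, i_gclist)
 *		xfs_inodegc_inactivate(ip);
 *
 * xfs_inodegc_cpu_dead() splices a dead CPU's list onto the current CPU's
 * queue with llist_add_batch() and reschedules the work there.
 */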
/*
 * We set the inode flag atomically with the radix tree tag.  Once we get tag
 * lookups on the radix tree, this inode flag can go away.
 *
 * We always use background reclaim here because even if the inode is clean,
 * it still may be under IO and hence we have to wait for IO completion to
 * occur before we can reclaim the inode.  The background reclaim path handles
 * this more efficiently than we can here, so simply let background reclaim
 * tear down all inodes.
 */
void
xfs_inode_mark_reclaimable(
    struct xfs_inode        *ip)
{
    struct xfs_mount        *mp = ip->i_mount;
    bool                    need_inactive;

    XFS_STATS_INC(mp, vn_reclaim);

    /*
     * We should never get here with any of the reclaim flags already set.
     */
    ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));

    need_inactive = xfs_inode_needs_inactive(ip);
    if (need_inactive) {
        xfs_inodegc_queue(ip);
        return;
    }

    /* Going straight to reclaim, so drop the dquots. */
    xfs_qm_dqdetach(ip);
    xfs_inodegc_set_reclaimable(ip);
}

/*
 * Register a phony shrinker so that we can run background inodegc sooner when
 * there's memory pressure.  Inactivation does not itself free any memory but
 * it does make inodes reclaimable, which eventually frees memory.
 *
 * The count function, seek value, and batch value are crafted to trigger the
 * scan function during the second round of scanning.  Hopefully this means
 * that we reclaimed enough memory that initiating metadata transactions won't
 * make things worse.
 */
#define XFS_INODEGC_SHRINKER_COUNT  (1UL << DEF_PRIORITY)
#define XFS_INODEGC_SHRINKER_BATCH  ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)

static unsigned long
xfs_inodegc_shrinker_count(
    struct shrinker         *shrink,
    struct shrink_control   *sc)
{
    struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
                                        m_inodegc_shrinker);
    struct xfs_inodegc      *gc;
    int                     cpu;

    if (!xfs_is_inodegc_enabled(mp))
        return 0;

    for_each_online_cpu(cpu) {
        gc = per_cpu_ptr(mp->m_inodegc, cpu);
        if (!llist_empty(&gc->list))
            return XFS_INODEGC_SHRINKER_COUNT;
    }

    return 0;
}

static unsigned long
xfs_inodegc_shrinker_scan(
    struct shrinker         *shrink,
    struct shrink_control   *sc)
{
    struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
                                        m_inodegc_shrinker);
    struct xfs_inodegc      *gc;
    int                     cpu;
    bool                    no_items = true;

    if (!xfs_is_inodegc_enabled(mp))
        return SHRINK_STOP;

    trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);

    for_each_online_cpu(cpu) {
        gc = per_cpu_ptr(mp->m_inodegc, cpu);
        if (!llist_empty(&gc->list)) {
            unsigned int h = READ_ONCE(gc->shrinker_hits);

            WRITE_ONCE(gc->shrinker_hits, h + 1);
            mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
            no_items = false;
        }
    }

    /*
     * If there are no inodes to inactivate, we don't want the shrinker
     * to think there's deferred work to call us back about.
     */
    if (no_items)
        return LONG_MAX;

    return SHRINK_STOP;
}

/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
int
xfs_inodegc_register_shrinker(
    struct xfs_mount        *mp)
{
    struct shrinker         *shrink = &mp->m_inodegc_shrinker;

    shrink->count_objects = xfs_inodegc_shrinker_count;
    shrink->scan_objects = xfs_inodegc_shrinker_scan;
    shrink->seeks = 0;
    shrink->flags = SHRINKER_NONSLAB;
    shrink->batch = XFS_INODEGC_SHRINKER_BATCH;

    return register_shrinker(shrink);
}
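
/*
 * Rough worked example of the shrinker tuning above (hedged; the exact
 * behaviour depends on the core shrinker heuristics, not on this file):
 * DEF_PRIORITY is 12, so XFS_INODEGC_SHRINKER_COUNT is 4096 and
 * XFS_INODEGC_SHRINKER_BATCH is 2049.  With ->seeks set to 0, reclaim
 * typically contributes about half the reported count (~2048) per pass,
 * which is just below the batch size, so ->scan_objects tends to fire on
 * the second pass, matching the intent described in the comment above.
 */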