// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/shmem_fs.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"

#undef CREATE_TRACE_POINTS
#include <trace/hooks/vmscan.h>

/*
 * Regular page slots are stabilized by the page lock even without the tree
 * itself locked.  These unlocked entries need verification under the tree
 * lock.
 */
static inline void __clear_shadow_entry(struct address_space *mapping,
				pgoff_t index, void *entry)
{
	XA_STATE(xas, &mapping->i_pages, index);

	xas_set_update(&xas, workingset_update_node);
	if (xas_load(&xas) != entry)
		return;
	xas_store(&xas, NULL);
}

static void clear_shadow_entries(struct address_space *mapping,
				 struct folio_batch *fbatch, pgoff_t *indices)
{
	int i;

	/* Handled by shmem itself, or for DAX we do nothing. */
	if (shmem_mapping(mapping) || dax_mapping(mapping))
		return;

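	/*
	 * Hold the inode lock across the update: if clearing the shadow
	 * entries leaves the mapping shrinkable, the inode is put back on
	 * the inode LRU so the inode shrinker can reclaim it.
	 */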
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		if (xa_is_value(folio))
			__clear_shadow_entry(mapping, indices[i], folio);
	}

	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);
}

/*
 * Unconditionally remove exceptional entries.  Usually called from truncate
 * path.  Note that the folio_batch may be altered by this function by removing
 * exceptional entries similar to what folio_batch_remove_exceptionals() does.
 */
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
				struct folio_batch *fbatch, pgoff_t *indices)
{
	int i, j;
	bool dax;

	/* Handled by shmem itself */
	if (shmem_mapping(mapping))
		return;

	for (j = 0; j < folio_batch_count(fbatch); j++)
		if (xa_is_value(fbatch->folios[j]))
			break;

	if (j == folio_batch_count(fbatch))
		return;

	dax = dax_mapping(mapping);
	if (!dax) {
		spin_lock(&mapping->host->i_lock);
		xa_lock_irq(&mapping->i_pages);
	}

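	/*
	 * Compact the batch in place: real folios are kept, DAX entries are
	 * deleted and shadow entries are cleared from the xarray.
	 */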
	for (i = j; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];
		pgoff_t index = indices[i];

		if (!xa_is_value(folio)) {
			fbatch->folios[j++] = folio;
			continue;
		}

		if (unlikely(dax)) {
			dax_delete_mapping_entry(mapping, index);
			continue;
		}

		__clear_shadow_entry(mapping, index, folio);
	}

	if (!dax) {
		xa_unlock_irq(&mapping->i_pages);
		if (mapping_shrinkable(mapping))
			inode_add_lru(mapping->host);
		spin_unlock(&mapping->host->i_lock);
	}
	fbatch->nr = j;
}

/**
 * folio_invalidate - Invalidate part or all of a folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * folio_invalidate() is called when all or part of the folio has become
 * invalidated by a truncate operation.
 *
 * folio_invalidate() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point, because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void folio_invalidate(struct folio *folio, size_t offset, size_t length)
{
	const struct address_space_operations *aops = folio->mapping->a_ops;

	if (aops->invalidate_folio)
		aops->invalidate_folio(folio, offset, length);
}
EXPORT_SYMBOL_GPL(folio_invalidate);

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static void truncate_cleanup_folio(struct folio *folio)
{
	if (folio_mapped(folio))
		unmap_mapping_folio(folio);

	if (folio_needs_release(folio))
		folio_invalidate(folio, 0, folio_size(folio));

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 * Hence dirty accounting check is placed after invalidation.
	 */
	folio_cancel_dirty(folio);
	folio_clear_mappedtodisk(folio);
}

int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
	if (folio->mapping != mapping)
		return -EIO;

	truncate_cleanup_folio(folio);
	filemap_remove_folio(folio);
	return 0;
}

/*
 * Handle partial folios.  The folio may be entirely within the
 * range if a split has raced with us.  If not, we zero the part of the
 * folio that's within the [start, end] range, and then split the folio if
 * it's large.  split_folio() will discard pages which now lie beyond
 * i_size, and we rely on the caller to discard pages which lie within a
 * newly created hole.
 *
 * Returns false if splitting failed so the caller can avoid
 * discarding the entire folio which is stubbornly unsplit.
 */
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
	loff_t pos = folio_pos(folio);
	unsigned int offset, length;

	if (pos < start)
		offset = start - pos;
	else
		offset = 0;
	length = folio_size(folio);
	if (pos + length <= (u64)end)
		length = length - offset;
	else
		length = end + 1 - pos - offset;

	folio_wait_writeback(folio);
	if (length == folio_size(folio)) {
		truncate_inode_folio(folio->mapping, folio);
		return true;
	}

	/*
	 * We may be zeroing pages we're about to discard, but it avoids
	 * doing a complex calculation here, and then doing the zeroing
	 * anyway if the page split fails.
	 */
	if (!mapping_inaccessible(folio->mapping))
		folio_zero_range(folio, offset, length);

	cleancache_invalidate_page(folio->mapping, &folio->page);
	if (folio_needs_release(folio))
		folio_invalidate(folio, offset, length);
	if (!folio_test_large(folio))
		return true;
	if (split_folio(folio) == 0)
		return true;
	if (folio_test_dirty(folio))
		return false;
	truncate_inode_folio(folio->mapping, folio);
	return true;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 */
int generic_error_remove_folio(struct address_space *mapping,
		struct folio *folio)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_folio(mapping, folio);
}
EXPORT_SYMBOL(generic_error_remove_folio);

/**
 * mapping_evict_folio() - Remove an unused folio from the page-cache.
 * @mapping: The mapping this folio belongs to.
 * @folio: The folio to remove.
 *
 * Safely remove one folio from the page cache.
 * It only drops clean, unused folios.
 *
 * Context: Folio must be locked.
 * Return: The number of pages successfully removed.
 */
long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
{
	/* The page may have been truncated before it was locked */
	if (!mapping)
		return 0;
	if (folio_test_dirty(folio) || folio_test_writeback(folio))
		return 0;
	/* The refcount will be elevated if any page in the folio is mapped */
	if (folio_ref_count(folio) >
			folio_nr_pages(folio) + folio_has_private(folio) + 1)
		return 0;
	if (!filemap_release_folio(folio, 0))
		return 0;

	return remove_mapping(mapping, folio);
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidate_folio() accepts a range to invalidate,
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * properly page aligned.
 */
void truncate_inode_pages_range(struct address_space *mapping,
		loff_t lstart, loff_t lend)
{
	pgoff_t start;		/* inclusive */
	pgoff_t end;		/* exclusive */
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index;
	int i;
	struct folio *folio;
	bool same_folio;

	if (mapping_empty(mapping))
		goto out;

	/*
	 * 'start' and 'end' always cover the range of pages to be fully
	 * truncated.  Partial folios at either end of the range are handled
	 * separately by truncate_inode_partial_folio() below.
	 * Note that 'end' is exclusive while 'lend' is inclusive.
	 */
	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (lend == -1)
		/*
		 * lend == -1 indicates end-of-file so we have to set 'end'
		 * to the highest possible pgoff_t and since the type is
		 * unsigned we're using -1.
		 */
		end = -1;
	else
		end = (lend + 1) >> PAGE_SHIFT;

	folio_batch_init(&fbatch);
	index = start;
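	/*
	 * First, non-blocking pass: find_lock_entries() only returns folios
	 * it could trylock and that are not under writeback, so anything
	 * busy is simply skipped here and picked up by the second pass.
	 */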
	while (index < end && find_lock_entries(mapping, &index, end - 1,
			&fbatch, indices)) {
		truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			truncate_cleanup_folio(fbatch.folios[i]);
		delete_from_page_cache_batch(mapping, &fbatch);
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			folio_unlock(fbatch.folios[i]);
		folio_batch_release(&fbatch);
		cond_resched();
	}

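	/*
	 * Zero (and, if possible, split) the folios straddling lstart and
	 * lend.  If both offsets fall within the same folio, only one
	 * lookup is needed.
	 */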
	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
	folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
	if (!IS_ERR(folio)) {
		same_folio = lend < folio_pos(folio) + folio_size(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
			start = folio_next_index(folio);
			if (same_folio)
				end = folio->index;
		}
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	if (!same_folio) {
		folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
						FGP_LOCK, 0);
		if (!IS_ERR(folio)) {
			if (!truncate_inode_partial_folio(folio, lstart, lend))
				end = folio->index;
			folio_unlock(folio);
			folio_put(folio);
		}
	}

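	/*
	 * Second, blocking pass: wait for page locks and writeback, and
	 * restart from 'start' until the whole range is verifiably empty.
	 */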
	index = start;
	while (index < end) {
		cond_resched();
		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
				indices)) {
			/* If all gone from start onwards, we're done */
			if (index == start)
				break;
			/* Otherwise restart to make sure all gone */
			index = start;
			continue;
		}

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			/* We rely upon deletion not changing page->index */

			if (xa_is_value(folio))
				continue;

			folio_lock(folio);
			VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
			folio_wait_writeback(folio);
			truncate_inode_folio(mapping, folio);
			folio_unlock(folio);
		}
		truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
		folio_batch_release(&fbatch);
	}

out:
	cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_rwsem and
 * mapping->invalidate_lock.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __filemap_remove_folio()) in the specified range.  Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Called under (and serialized by) inode->i_rwsem.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
	/*
	 * Page reclaim can not participate in regular inode lifetime
	 * management (can't call iput()) and thus can race with the
	 * inode teardown.  Tell it when the address space is exiting,
	 * so that it does not install eviction information after the
	 * final truncate has begun.
	 */
	mapping_set_exiting(mapping);

	if (!mapping_empty(mapping)) {
		/*
		 * As truncation uses a lockless tree lookup, cycle
		 * the tree lock to make sure any ongoing tree
		 * modification that does not see AS_EXITING is
		 * completed before starting the final truncate.
		 */
		xa_lock_irq(&mapping->i_pages);
		xa_unlock_irq(&mapping->i_pages);
	}

	/*
	 * Cleancache needs notification even if there are no pages or shadow
	 * entries.
	 */
	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);

/**
 * mapping_try_invalidate - Invalidate all the evictable folios of one inode
 * @mapping: the address_space which holds the folios to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 * @nr_failed: How many folio invalidations failed
 *
 * This function is similar to invalidate_mapping_pages(), except that it
 * returns the number of folios which could not be evicted in @nr_failed.
 */
unsigned long mapping_try_invalidate(struct address_space *mapping,
		pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio_batch fbatch;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;
	bool xa_has_values = false;
	bool skip = false;

	trace_android_vh_invalidate_mapping_pagevec(mapping, &skip);
	if (skip)
		return count;

	folio_batch_init(&fbatch);
	while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			/* We rely upon deletion not changing folio->index */

			if (xa_is_value(folio)) {
				xa_has_values = true;
				count++;
				continue;
			}

			ret = mapping_evict_folio(mapping, folio);
			folio_unlock(folio);
			/*
			 * Invalidation is a hint that the folio is no longer
			 * of interest, so try to speed up its reclaim.
			 */
			if (!ret) {
				deactivate_file_folio(folio);
				/* Likely in the lru cache of a remote CPU */
				if (nr_failed)
					(*nr_failed)++;
			}
			count += ret;
		}

		if (xa_has_values)
			clear_shadow_entries(mapping, &fbatch, indices);

		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return count;
}

/**
 * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
 * @mapping: the address_space which holds the cache to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function removes pages that are clean, unmapped and unlocked,
 * as well as shadow entries.  It will not block on IO activity.
 *
 * If you want to remove all the pages of one inode, regardless of
 * their use and writeback state, use truncate_inode_pages().
 *
 * Return: The number of indices that had their contents invalidated
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	return mapping_try_invalidate(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);

static int folio_launder(struct address_space *mapping, struct folio *folio)
{
	if (!folio_test_dirty(folio))
		return 0;
	if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
		return 0;
	return mapping->a_ops->launder_folio(folio);
}

/*
 * This is like mapping_evict_folio(), except it ignores the folio's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave folios behind because
 * shrink_folio_list() has a temp ref on them, or because they're transiently
 * sitting in the folio_add_lru() caches.
 */
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
		gfp_t gfp)
{
	int ret;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (folio_mapped(folio))
		unmap_mapping_folio(folio);
	BUG_ON(folio_mapped(folio));

	ret = folio_launder(mapping, folio);
	if (ret)
		return ret;
	if (folio->mapping != mapping)
		return -EBUSY;
	if (!filemap_release_folio(folio, gfp))
		return -EBUSY;

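	/*
	 * Remove the folio from the page cache while holding both the inode
	 * i_lock and the xarray lock.  If the folio was redirtied in the
	 * meantime, back off and report -EBUSY instead.
	 */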
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	if (folio_test_dirty(folio))
		goto failed;

	BUG_ON(folio_has_private(folio));
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
	return 1;
failed:
	xa_unlock_irq(&mapping->i_pages);
	spin_unlock(&mapping->host->i_lock);
	return -EBUSY;
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio_batch fbatch;
	pgoff_t index;
	int i;
	int ret = 0;
	int ret2 = 0;
	int did_range_unmap = 0;
	bool xa_has_values = false;

	if (mapping_empty(mapping))
		goto out;

	folio_batch_init(&fbatch);
	index = start;
	while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			/* We rely upon deletion not changing folio->index */

			if (xa_is_value(folio)) {
				xa_has_values = true;
				if (dax_mapping(mapping) &&
				    !dax_invalidate_mapping_entry_sync(mapping, indices[i]))
					ret = -EBUSY;
				continue;
			}

			if (!did_range_unmap && folio_mapped(folio)) {
				/*
				 * If folio is mapped, before taking its lock,
				 * zap the rest of the file in one hit.
				 */
				unmap_mapping_pages(mapping, indices[i],
						(1 + end - indices[i]), false);
				did_range_unmap = 1;
			}

			folio_lock(folio);
			if (unlikely(folio->mapping != mapping)) {
				folio_unlock(folio);
				continue;
			}
			VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
			folio_wait_writeback(folio);
			ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
			if (ret2 < 0)
				ret = ret2;
			folio_unlock(folio);
		}

		if (xa_has_values)
			clear_shadow_entries(mapping, &fbatch, indices);

		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
	/*
	 * For DAX we invalidate page tables after invalidating page cache.
	 * We could invalidate page tables while invalidating each entry,
	 * but that would be expensive.  Unmapping the range up front doesn't
	 * work either, as we have no cheap way to tell whether a page cache
	 * entry got remapped later.
	 */
	if (dax_mapping(mapping)) {
		unmap_mapping_pages(mapping, start, end - start + 1, false);
	}
out:
	cleancache_invalidate_inode(mapping);
	return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
	return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @newsize: new file size
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks).  This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);

	/*
	 * unmap_mapping_range is called twice, first simply for
	 * efficiency so that truncate_inode_pages does fewer
	 * single-page unmaps.  However after this first call, and
	 * before truncate_inode_pages finishes, it is possible for
	 * private pages to be COWed, which remain after
	 * truncate_inode_pages finishes, hence the second
	 * unmap_mapping_range call must be made for correctness.
	 */
	unmap_mapping_range(mapping, holebegin, 0, 1);
	truncate_inode_pages(mapping, newsize);
	unmap_mapping_range(mapping, holebegin, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize.  It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with a lock serializing truncates and writes (generally
 * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize = inode->i_size;

	i_size_write(inode, newsize);
	if (newsize > oldsize)
		pagecache_isize_extended(inode, oldsize, newsize);
	truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode: inode for which i_size was extended
 * @from: original inode size
 * @to: new inode size
 *
 * Handle extension of inode size either caused by extending truncate or
 * by write starting after current i_size.  We mark the page straddling
 * current i_size RO so that page_mkwrite() is called on the first
 * write access to the page.  The filesystem will update its per-block
 * information before user writes to the page via mmap after the i_size
 * has been changed.
 *
 * The function must be called after i_size is updated so that a page fault
 * coming after we unlock the folio will already see the new i_size.
 * The function must be called while we still hold i_rwsem - this not only
 * makes sure i_size is stable but also that userspace cannot observe new
 * i_size value before we are prepared to store mmap writes at new inode size.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
	int bsize = i_blocksize(inode);
	loff_t rounded_from;
	struct folio *folio;

	WARN_ON(to > inode->i_size);

	if (from >= to || bsize >= PAGE_SIZE)
		return;
	/* Page straddling @from will not have any hole block created? */
	rounded_from = round_up(from, bsize);
	if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
		return;

	folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE);
	/* Folio not cached? Nothing to do */
	if (IS_ERR(folio))
		return;
	/*
	 * See folio_clear_dirty_for_io() for details why folio_mark_dirty()
	 * is needed.
	 */
	if (folio_mkclean(folio))
		folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);
}
EXPORT_SYMBOL(pagecache_isize_extended);

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks).  This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;
	loff_t unmap_start = round_up(lstart, PAGE_SIZE);
	loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
	/*
	 * This rounding is currently just for example: unmap_mapping_range
	 * expands its hole outwards, whereas we want it to contract the hole
	 * inwards.  However, existing callers of truncate_pagecache_range are
	 * doing their own page rounding first.  Note that unmap_mapping_range
	 * allows holelen 0 for all, and we allow lend -1 for end of file.
	 */

	/*
	 * Unlike in truncate_pagecache, unmap_mapping_range is called only
	 * once (before truncating pagecache), and without "even_cows" flag:
	 * hole-punching should not remove private COWed pages from the hole.
	 */
	if ((u64)unmap_end > (u64)unmap_start)
		unmap_mapping_range(mapping, unmap_start,
				    1 + unmap_end - unmap_start, 0);
	truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);