1 /*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 * 2000 Transmeta Corp.
6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
13 *
14 * Extended attribute support for tmpfs:
15 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17 *
18 * tiny-shmem:
19 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20 *
21 * This file is released under the GPL.
22 */
23
24 #include <linux/fs.h>
25 #include <linux/init.h>
26 #include <linux/vfs.h>
27 #include <linux/mount.h>
28 #include <linux/ramfs.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include <linux/fileattr.h>
32 #include <linux/mm.h>
33 #include <linux/random.h>
34 #include <linux/sched/signal.h>
35 #include <linux/export.h>
36 #include <linux/shmem_fs.h>
37 #include <linux/swap.h>
38 #include <linux/uio.h>
39 #include <linux/hugetlb.h>
40 #include <linux/fs_parser.h>
41 #include <linux/swapfile.h>
42 #include <linux/iversion.h>
43 #include <linux/mm_inline.h>
44 #include "swap.h"
45
46 #undef CREATE_TRACE_POINTS
47 #include <trace/hooks/mm.h>
48
49 static struct vfsmount *shm_mnt __ro_after_init;
50
51 #ifdef CONFIG_SHMEM
52 /*
53 * This virtual memory filesystem is heavily based on the ramfs. It
54 * extends ramfs by the ability to use swap and honor resource limits
55 * which makes it a completely usable filesystem.
56 */
57
58 #include <linux/xattr.h>
59 #include <linux/exportfs.h>
60 #include <linux/posix_acl.h>
61 #include <linux/posix_acl_xattr.h>
62 #include <linux/mman.h>
63 #include <linux/string.h>
64 #include <linux/slab.h>
65 #include <linux/backing-dev.h>
66 #include <linux/writeback.h>
67 #include <linux/pagevec.h>
68 #include <linux/percpu_counter.h>
69 #include <linux/falloc.h>
70 #include <linux/splice.h>
71 #include <linux/security.h>
72 #include <linux/swapops.h>
73 #include <linux/mempolicy.h>
74 #include <linux/namei.h>
75 #include <linux/ctype.h>
76 #include <linux/migrate.h>
77 #include <linux/highmem.h>
78 #include <linux/seq_file.h>
79 #include <linux/magic.h>
80 #include <linux/syscalls.h>
81 #include <linux/fcntl.h>
82 #include <uapi/linux/memfd.h>
83 #include <linux/rmap.h>
84 #include <linux/uuid.h>
85 #include <linux/quotaops.h>
86 #include <linux/rcupdate_wait.h>
87 #include <linux/android_vendor.h>
88
89 #include <linux/uaccess.h>
90
91 #include "internal.h"
92
93 #define BLOCKS_PER_PAGE (PAGE_SIZE/512)
94 #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
95
96 /* Pretend that each entry is of this size in directory's i_size */
97 #define BOGO_DIRENT_SIZE 20
98
99 /* Pretend that one inode + its dentry occupy this much memory */
100 #define BOGO_INODE_SIZE 1024
101
102 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
103 #define SHORT_SYMLINK_LEN 128
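
/*
 * Illustrative arithmetic for the macros above (a reader's sketch, not used
 * by the code; assumes 4K pages):
 *
 *	VM_ACCT(1)    == 1	one page charged for a 1-byte object
 *	VM_ACCT(4096) == 1
 *	VM_ACCT(4097) == 2	rounds up to the next page boundary
 *
 * BOGO_DIRENT_SIZE only affects what stat() reports: each entry added to a
 * tmpfs directory bumps the directory's i_size by 20 bytes.
 */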
104
105 /*
106 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
107 * inode->i_private (with i_rwsem making sure that it has only one user at
108 * a time): we would prefer not to enlarge the shmem inode just for that.
109 */
110 struct shmem_falloc {
111 wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
112 pgoff_t start; /* start of range currently being fallocated */
113 pgoff_t next; /* the next page offset to be fallocated */
114 pgoff_t nr_falloced; /* how many new pages have been fallocated */
115 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
116 };
117
118 struct shmem_options {
119 unsigned long long blocks;
120 unsigned long long inodes;
121 struct mempolicy *mpol;
122 kuid_t uid;
123 kgid_t gid;
124 umode_t mode;
125 bool full_inums;
126 int huge;
127 int seen;
128 bool noswap;
129 unsigned short quota_types;
130 struct shmem_quota_limits qlimits;
131 #define SHMEM_SEEN_BLOCKS 1
132 #define SHMEM_SEEN_INODES 2
133 #define SHMEM_SEEN_HUGE 4
134 #define SHMEM_SEEN_INUMS 8
135 #define SHMEM_SEEN_NOSWAP 16
136 #define SHMEM_SEEN_QUOTA 32
137 };
138
139 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
140 static unsigned long huge_shmem_orders_always __read_mostly;
141 static unsigned long huge_shmem_orders_madvise __read_mostly;
142 static unsigned long huge_shmem_orders_inherit __read_mostly;
143 static unsigned long huge_shmem_orders_within_size __read_mostly;
144 #endif
145
146 #ifdef CONFIG_TMPFS
147 static unsigned long shmem_default_max_blocks(void)
148 {
149 return totalram_pages() / 2;
150 }
151
152 static unsigned long shmem_default_max_inodes(void)
153 {
154 unsigned long nr_pages = totalram_pages();
155
156 return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
157 ULONG_MAX / BOGO_INODE_SIZE);
158 }
159 #endif
160
161 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
162 struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
163 struct vm_area_struct *vma, vm_fault_t *fault_type);
164
165 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
166 {
167 return sb->s_fs_info;
168 }
169
170 /*
171 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
172 * for shared memory and for shared anonymous (/dev/zero) mappings
173 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
174 * consistent with the pre-accounting of private mappings ...
175 */
176 static inline int shmem_acct_size(unsigned long flags, loff_t size)
177 {
178 return (flags & VM_NORESERVE) ?
179 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
180 }
181
182 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
183 {
184 if (!(flags & VM_NORESERVE))
185 vm_unacct_memory(VM_ACCT(size));
186 }
187
188 static inline int shmem_reacct_size(unsigned long flags,
189 loff_t oldsize, loff_t newsize)
190 {
191 if (!(flags & VM_NORESERVE)) {
192 if (VM_ACCT(newsize) > VM_ACCT(oldsize))
193 return security_vm_enough_memory_mm(current->mm,
194 VM_ACCT(newsize) - VM_ACCT(oldsize));
195 else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
196 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
197 }
198 return 0;
199 }
200
201 /*
202 * ... whereas tmpfs objects are accounted incrementally as
203 * pages are allocated, in order to allow large sparse files.
204 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
205 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
206 */
207 static inline int shmem_acct_blocks(unsigned long flags, long pages)
208 {
209 if (!(flags & VM_NORESERVE))
210 return 0;
211
212 return security_vm_enough_memory_mm(current->mm,
213 pages * VM_ACCT(PAGE_SIZE));
214 }
215
216 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
217 {
218 if (flags & VM_NORESERVE)
219 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
220 }
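
/*
 * Note the inverted VM_NORESERVE test relative to shmem_acct_size() above:
 * an object which pre-accounted its whole size at setup must not be charged
 * again per page, whereas a MAP_NORESERVE-style object is charged only as
 * pages are actually allocated.  Rough sketch of the two paths (for the
 * reader, not enforced here):
 *
 *	!VM_NORESERVE:	shmem_acct_size(whole size)	shmem_acct_blocks() == 0
 *	 VM_NORESERVE:	shmem_acct_size() == 0		shmem_acct_blocks(per page)
 */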
221
222 static int shmem_inode_acct_blocks(struct inode *inode, long pages)
223 {
224 struct shmem_inode_info *info = SHMEM_I(inode);
225 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
226 int err = -ENOSPC;
227
228 if (shmem_acct_blocks(info->flags, pages))
229 return err;
230
231 might_sleep(); /* when quotas */
232 if (sbinfo->max_blocks) {
233 if (!percpu_counter_limited_add(&sbinfo->used_blocks,
234 sbinfo->max_blocks, pages))
235 goto unacct;
236
237 err = dquot_alloc_block_nodirty(inode, pages);
238 if (err) {
239 percpu_counter_sub(&sbinfo->used_blocks, pages);
240 goto unacct;
241 }
242 } else {
243 err = dquot_alloc_block_nodirty(inode, pages);
244 if (err)
245 goto unacct;
246 }
247
248 return 0;
249
250 unacct:
251 shmem_unacct_blocks(info->flags, pages);
252 return err;
253 }
254
255 static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
256 {
257 struct shmem_inode_info *info = SHMEM_I(inode);
258 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
259
260 might_sleep(); /* when quotas */
261 dquot_free_block_nodirty(inode, pages);
262
263 if (sbinfo->max_blocks)
264 percpu_counter_sub(&sbinfo->used_blocks, pages);
265 shmem_unacct_blocks(info->flags, pages);
266 }
267
268 static const struct super_operations shmem_ops;
269 static const struct address_space_operations shmem_aops;
270 static const struct file_operations shmem_file_operations;
271 static const struct inode_operations shmem_inode_operations;
272 static const struct inode_operations shmem_dir_inode_operations;
273 static const struct inode_operations shmem_special_inode_operations;
274 static const struct vm_operations_struct shmem_vm_ops;
275 static const struct vm_operations_struct shmem_anon_vm_ops;
276 static struct file_system_type shmem_fs_type;
277
278 bool shmem_mapping(struct address_space *mapping)
279 {
280 return mapping->a_ops == &shmem_aops;
281 }
282 EXPORT_SYMBOL_GPL(shmem_mapping);
283
284 bool vma_is_anon_shmem(struct vm_area_struct *vma)
285 {
286 return vma->vm_ops == &shmem_anon_vm_ops;
287 }
288
289 bool vma_is_shmem(struct vm_area_struct *vma)
290 {
291 return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
292 }
293
294 static LIST_HEAD(shmem_swaplist);
295 static DEFINE_MUTEX(shmem_swaplist_mutex);
296
297 #ifdef CONFIG_TMPFS_QUOTA
298
299 static int shmem_enable_quotas(struct super_block *sb,
300 unsigned short quota_types)
301 {
302 int type, err = 0;
303
304 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
305 for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
306 if (!(quota_types & (1 << type)))
307 continue;
308 err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
309 DQUOT_USAGE_ENABLED |
310 DQUOT_LIMITS_ENABLED);
311 if (err)
312 goto out_err;
313 }
314 return 0;
315
316 out_err:
317 pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
318 type, err);
319 for (type--; type >= 0; type--)
320 dquot_quota_off(sb, type);
321 return err;
322 }
323
324 static void shmem_disable_quotas(struct super_block *sb)
325 {
326 int type;
327
328 for (type = 0; type < SHMEM_MAXQUOTAS; type++)
329 dquot_quota_off(sb, type);
330 }
331
332 static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
333 {
334 return SHMEM_I(inode)->i_dquot;
335 }
336 #endif /* CONFIG_TMPFS_QUOTA */
337
338 /*
339 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
340 * produces a novel ino for the newly allocated inode.
341 *
342 * It may also be called when making a hard link to permit the space needed by
343 * each dentry. However, in that case, no new inode number is needed since that
344 * internally draws from another pool of inode numbers (currently global
345 * get_next_ino()). This case is indicated by passing NULL as inop.
346 */
347 #define SHMEM_INO_BATCH 1024
348 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
349 {
350 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
351 ino_t ino;
352
353 if (!(sb->s_flags & SB_KERNMOUNT)) {
354 raw_spin_lock(&sbinfo->stat_lock);
355 if (sbinfo->max_inodes) {
356 if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
357 raw_spin_unlock(&sbinfo->stat_lock);
358 return -ENOSPC;
359 }
360 sbinfo->free_ispace -= BOGO_INODE_SIZE;
361 }
362 if (inop) {
363 ino = sbinfo->next_ino++;
364 if (unlikely(is_zero_ino(ino)))
365 ino = sbinfo->next_ino++;
366 if (unlikely(!sbinfo->full_inums &&
367 ino > UINT_MAX)) {
368 /*
369 * Emulate get_next_ino uint wraparound for
370 * compatibility
371 */
372 if (IS_ENABLED(CONFIG_64BIT))
373 pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
374 __func__, MINOR(sb->s_dev));
375 sbinfo->next_ino = 1;
376 ino = sbinfo->next_ino++;
377 }
378 *inop = ino;
379 }
380 raw_spin_unlock(&sbinfo->stat_lock);
381 } else if (inop) {
382 /*
383 * __shmem_file_setup, one of our callers, is lock-free: it
384 * doesn't hold stat_lock in shmem_reserve_inode since
385 * max_inodes is always 0, and is called from potentially
386 * unknown contexts. As such, use a per-cpu batched allocator
387 * which doesn't require the per-sb stat_lock unless we are at
388 * the batch boundary.
389 *
390 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
391 * shmem mounts are not exposed to userspace, so we don't need
392 * to worry about things like glibc compatibility.
393 */
394 ino_t *next_ino;
395
396 next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
397 ino = *next_ino;
398 if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
399 raw_spin_lock(&sbinfo->stat_lock);
400 ino = sbinfo->next_ino;
401 sbinfo->next_ino += SHMEM_INO_BATCH;
402 raw_spin_unlock(&sbinfo->stat_lock);
403 if (unlikely(is_zero_ino(ino)))
404 ino++;
405 }
406 *inop = ino;
407 *next_ino = ++ino;
408 put_cpu();
409 }
410
411 return 0;
412 }
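
/*
 * Worked example of the SB_KERNMOUNT batching above (illustrative only):
 * with SHMEM_INO_BATCH == 1024, the first allocation on a CPU sees
 * *next_ino % 1024 == 0, takes stat_lock once to claim the next block of
 * 1024 numbers for this CPU, and then hands out the remaining numbers of
 * that block with nothing more than a get_cpu()/put_cpu() pair.
 */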
413
414 static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
415 {
416 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
417 if (sbinfo->max_inodes) {
418 raw_spin_lock(&sbinfo->stat_lock);
419 sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
420 raw_spin_unlock(&sbinfo->stat_lock);
421 }
422 }
423
424 /**
425 * shmem_recalc_inode - recalculate the block usage of an inode
426 * @inode: inode to recalc
427 * @alloced: the change in number of pages allocated to inode
428 * @swapped: the change in number of pages swapped from inode
429 *
430 * We have to calculate the free blocks since the mm can drop
431 * undirtied hole pages behind our back.
432 *
433 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped,
434 * so mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped).
435 */
436 static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
437 {
438 struct shmem_inode_info *info = SHMEM_I(inode);
439 long freed;
440
441 spin_lock(&info->lock);
442 info->alloced += alloced;
443 info->swapped += swapped;
444 freed = info->alloced - info->swapped -
445 READ_ONCE(inode->i_mapping->nrpages);
446 /*
447 * Special case: whereas normally shmem_recalc_inode() is called
448 * after i_mapping->nrpages has already been adjusted (up or down),
449 * shmem_writepage() has to raise swapped before nrpages is lowered -
450 * to stop a racing shmem_recalc_inode() from thinking that a page has
451 * been freed. Compensate here, to avoid the need for a followup call.
452 */
453 if (swapped > 0)
454 freed += swapped;
455 if (freed > 0)
456 info->alloced -= freed;
457 spin_unlock(&info->lock);
458
459 /* The quota case may block */
460 if (freed > 0)
461 shmem_inode_unacct_blocks(inode, freed);
462 }
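
/*
 * Worked example of the calculation above (illustrative only): if 10 pages
 * were allocated to the inode, 2 now live in swap and the page cache holds
 * 6, then freed = 10 - 2 - 6 = 2 pages were reclaimed behind our back, so
 * alloced is trimmed to 8 and two pages' worth of block accounting is
 * given back via shmem_inode_unacct_blocks().
 */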
463
464 bool shmem_charge(struct inode *inode, long pages)
465 {
466 struct address_space *mapping = inode->i_mapping;
467
468 if (shmem_inode_acct_blocks(inode, pages))
469 return false;
470
471 /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
472 xa_lock_irq(&mapping->i_pages);
473 mapping->nrpages += pages;
474 xa_unlock_irq(&mapping->i_pages);
475
476 shmem_recalc_inode(inode, pages, 0);
477 return true;
478 }
479
480 void shmem_uncharge(struct inode *inode, long pages)
481 {
482 /* pages argument is currently unused: keep it to help debugging */
483 /* nrpages adjustment done by __filemap_remove_folio() or caller */
484
485 shmem_recalc_inode(inode, 0, 0);
486 }
487
488 /*
489 * Replace item expected in xarray by a new item, while holding xa_lock.
490 */
491 static int shmem_replace_entry(struct address_space *mapping,
492 pgoff_t index, void *expected, void *replacement)
493 {
494 XA_STATE(xas, &mapping->i_pages, index);
495 void *item;
496
497 VM_BUG_ON(!expected);
498 VM_BUG_ON(!replacement);
499 item = xas_load(&xas);
500 if (item != expected)
501 return -ENOENT;
502 xas_store(&xas, replacement);
503 return 0;
504 }
505
506 /*
507 * Sometimes, before we decide whether to proceed or to fail, we must check
508 * that an entry was not already brought back from swap by a racing thread.
509 *
510 * Checking folio is not enough: by the time a swapcache folio is locked, it
511 * might be reused, and again be swapcache, using the same swap as before.
512 */
513 static bool shmem_confirm_swap(struct address_space *mapping,
514 pgoff_t index, swp_entry_t swap)
515 {
516 return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
517 }
518
519 /*
520 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
521 *
522 * SHMEM_HUGE_NEVER:
523 * disables huge pages for the mount;
524 * SHMEM_HUGE_ALWAYS:
525 * enables huge pages for the mount;
526 * SHMEM_HUGE_WITHIN_SIZE:
527 * only allocate huge pages if the page will be fully within i_size,
528 * also respect fadvise()/madvise() hints;
529 * SHMEM_HUGE_ADVISE:
530 * only allocate huge pages if requested with fadvise()/madvise();
531 */
532
533 #define SHMEM_HUGE_NEVER 0
534 #define SHMEM_HUGE_ALWAYS 1
535 #define SHMEM_HUGE_WITHIN_SIZE 2
536 #define SHMEM_HUGE_ADVISE 3
537
538 /*
539 * Special values.
540 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
541 *
542 * SHMEM_HUGE_DENY:
543 * disables huge on shm_mnt and all mounts, for emergency use;
544 * SHMEM_HUGE_FORCE:
545 * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
546 *
547 */
548 #define SHMEM_HUGE_DENY (-1)
549 #define SHMEM_HUGE_FORCE (-2)
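
/*
 * Typical usage from userspace (illustrative; the knobs are documented in
 * Documentation/admin-guide/mm/transhuge.rst):
 *
 *	mount -t tmpfs -o huge=within_size tmpfs /mnt/tmp
 *	echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *
 * "deny" and "force" exist only at the sysfs level and override the
 * per-mount huge= setting for every tmpfs mount.
 */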
550
551 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
552 /* ifdef here to avoid bloating shmem.o when not necessary */
553
554 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
555
556 static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
557 loff_t write_end, bool shmem_huge_force,
558 struct vm_area_struct *vma,
559 unsigned long vm_flags)
560 {
561 struct mm_struct *mm = vma ? vma->vm_mm : NULL;
562 loff_t i_size;
563
564 if (!S_ISREG(inode->i_mode))
565 return false;
566 if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
567 return false;
568 if (shmem_huge == SHMEM_HUGE_DENY)
569 return false;
570 if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
571 return true;
572
573 switch (SHMEM_SB(inode->i_sb)->huge) {
574 case SHMEM_HUGE_ALWAYS:
575 return true;
576 case SHMEM_HUGE_WITHIN_SIZE:
577 index = round_up(index + 1, HPAGE_PMD_NR);
578 i_size = max(write_end, i_size_read(inode));
579 i_size = round_up(i_size, PAGE_SIZE);
580 if (i_size >> PAGE_SHIFT >= index)
581 return true;
582 fallthrough;
583 case SHMEM_HUGE_ADVISE:
584 if (mm && (vm_flags & VM_HUGEPAGE))
585 return true;
586 fallthrough;
587 default:
588 return false;
589 }
590 }
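
/*
 * Worked example of the SHMEM_HUGE_WITHIN_SIZE case above (illustrative,
 * assuming 4K pages and HPAGE_PMD_NR == 512): a fault at index 100 is
 * rounded up to index 512, so a PMD-sized folio is allowed only when the
 * rounded i_size (or write_end) reaches 512 pages == 2M; a 1M file falls
 * through and gets huge pages only if VM_HUGEPAGE was requested.
 */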
591
592 static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
593 loff_t write_end, bool shmem_huge_force,
594 struct vm_area_struct *vma, unsigned long vm_flags)
595 {
596 if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
597 return false;
598
599 return __shmem_huge_global_enabled(inode, index, write_end,
600 shmem_huge_force, vma, vm_flags);
601 }
602
603 #if defined(CONFIG_SYSFS)
604 static int shmem_parse_huge(const char *str)
605 {
606 if (!strcmp(str, "never"))
607 return SHMEM_HUGE_NEVER;
608 if (!strcmp(str, "always"))
609 return SHMEM_HUGE_ALWAYS;
610 if (!strcmp(str, "within_size"))
611 return SHMEM_HUGE_WITHIN_SIZE;
612 if (!strcmp(str, "advise"))
613 return SHMEM_HUGE_ADVISE;
614 if (!strcmp(str, "deny"))
615 return SHMEM_HUGE_DENY;
616 if (!strcmp(str, "force"))
617 return SHMEM_HUGE_FORCE;
618 return -EINVAL;
619 }
620 #endif
621
622 #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
623 static const char *shmem_format_huge(int huge)
624 {
625 switch (huge) {
626 case SHMEM_HUGE_NEVER:
627 return "never";
628 case SHMEM_HUGE_ALWAYS:
629 return "always";
630 case SHMEM_HUGE_WITHIN_SIZE:
631 return "within_size";
632 case SHMEM_HUGE_ADVISE:
633 return "advise";
634 case SHMEM_HUGE_DENY:
635 return "deny";
636 case SHMEM_HUGE_FORCE:
637 return "force";
638 default:
639 VM_BUG_ON(1);
640 return "bad_val";
641 }
642 }
643 #endif
644
645 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
646 struct shrink_control *sc, unsigned long nr_to_free)
647 {
648 LIST_HEAD(list), *pos, *next;
649 struct inode *inode;
650 struct shmem_inode_info *info;
651 struct folio *folio;
652 unsigned long batch = sc ? sc->nr_to_scan : 128;
653 unsigned long split = 0, freed = 0;
654
655 if (list_empty(&sbinfo->shrinklist))
656 return SHRINK_STOP;
657
658 spin_lock(&sbinfo->shrinklist_lock);
659 list_for_each_safe(pos, next, &sbinfo->shrinklist) {
660 info = list_entry(pos, struct shmem_inode_info, shrinklist);
661
662 /* pin the inode */
663 inode = igrab(&info->vfs_inode);
664
665 /* inode is about to be evicted */
666 if (!inode) {
667 list_del_init(&info->shrinklist);
668 goto next;
669 }
670
671 list_move(&info->shrinklist, &list);
672 next:
673 sbinfo->shrinklist_len--;
674 if (!--batch)
675 break;
676 }
677 spin_unlock(&sbinfo->shrinklist_lock);
678
679 list_for_each_safe(pos, next, &list) {
680 pgoff_t next, end;
681 loff_t i_size;
682 int ret;
683
684 info = list_entry(pos, struct shmem_inode_info, shrinklist);
685 inode = &info->vfs_inode;
686
687 if (nr_to_free && freed >= nr_to_free)
688 goto move_back;
689
690 i_size = i_size_read(inode);
691 folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
692 if (!folio || xa_is_value(folio))
693 goto drop;
694
695 /* No large folio at the end of the file: nothing to split */
696 if (!folio_test_large(folio)) {
697 folio_put(folio);
698 goto drop;
699 }
700
701 /* Check if there is anything to gain from splitting */
702 next = folio_next_index(folio);
703 end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
704 if (end <= folio->index || end >= next) {
705 folio_put(folio);
706 goto drop;
707 }
708
709 /*
710 * Move the inode on the list back to shrinklist if we failed
711 * to lock the page at this time.
712 *
713 * Waiting for the lock may lead to deadlock in the
714 * reclaim path.
715 */
716 if (!folio_trylock(folio)) {
717 folio_put(folio);
718 goto move_back;
719 }
720
721 ret = split_folio(folio);
722 folio_unlock(folio);
723 folio_put(folio);
724
725 /* If split failed move the inode on the list back to shrinklist */
726 if (ret)
727 goto move_back;
728
729 freed += next - end;
730 split++;
731 drop:
732 list_del_init(&info->shrinklist);
733 goto put;
734 move_back:
735 /*
736 * Make sure the inode is either on the global list or deleted
737 * from any local list before iput() since it could be deleted
738 * in another thread once we put the inode (then the local list
739 * is corrupted).
740 */
741 spin_lock(&sbinfo->shrinklist_lock);
742 list_move(&info->shrinklist, &sbinfo->shrinklist);
743 sbinfo->shrinklist_len++;
744 spin_unlock(&sbinfo->shrinklist_lock);
745 put:
746 iput(inode);
747 }
748
749 return split;
750 }
751
752 static long shmem_unused_huge_scan(struct super_block *sb,
753 struct shrink_control *sc)
754 {
755 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
756
757 if (!READ_ONCE(sbinfo->shrinklist_len))
758 return SHRINK_STOP;
759
760 return shmem_unused_huge_shrink(sbinfo, sc, 0);
761 }
762
763 static long shmem_unused_huge_count(struct super_block *sb,
764 struct shrink_control *sc)
765 {
766 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
767 return READ_ONCE(sbinfo->shrinklist_len);
768 }
769 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
770
771 #define shmem_huge SHMEM_HUGE_DENY
772
773 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
774 struct shrink_control *sc, unsigned long nr_to_free)
775 {
776 return 0;
777 }
778
779 static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
780 loff_t write_end, bool shmem_huge_force,
781 struct vm_area_struct *vma, unsigned long vm_flags)
782 {
783 return false;
784 }
785 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
786
787 static void shmem_update_stats(struct folio *folio, int nr_pages)
788 {
789 if (folio_test_pmd_mappable(folio))
790 __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
791 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
792 __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
793 }
794
795 /*
796 * Somewhat like filemap_add_folio, but error if expected item has gone.
797 */
798 static int shmem_add_to_page_cache(struct folio *folio,
799 struct address_space *mapping,
800 pgoff_t index, void *expected, gfp_t gfp)
801 {
802 XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
803 long nr = folio_nr_pages(folio);
804
805 VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
806 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
807 VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
808
809 folio_ref_add(folio, nr);
810 folio->mapping = mapping;
811 folio->index = index;
812
813 gfp &= GFP_RECLAIM_MASK;
814 folio_throttle_swaprate(folio, gfp);
815
816 do {
817 xas_lock_irq(&xas);
818 if (expected != xas_find_conflict(&xas)) {
819 xas_set_err(&xas, -EEXIST);
820 goto unlock;
821 }
822 if (expected && xas_find_conflict(&xas)) {
823 xas_set_err(&xas, -EEXIST);
824 goto unlock;
825 }
826 xas_store(&xas, folio);
827 if (xas_error(&xas))
828 goto unlock;
829 shmem_update_stats(folio, nr);
830 mapping->nrpages += nr;
831 trace_android_vh_shmem_mod_shmem(folio->mapping, nr);
832 unlock:
833 xas_unlock_irq(&xas);
834 } while (xas_nomem(&xas, gfp));
835
836 if (xas_error(&xas)) {
837 folio->mapping = NULL;
838 folio_ref_sub(folio, nr);
839 return xas_error(&xas);
840 }
841
842 return 0;
843 }
844
845 /*
846 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
847 */
848 static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
849 {
850 struct address_space *mapping = folio->mapping;
851 long nr = folio_nr_pages(folio);
852 int error;
853
854 xa_lock_irq(&mapping->i_pages);
855 error = shmem_replace_entry(mapping, folio->index, folio, radswap);
856 trace_android_vh_shmem_mod_shmem(folio->mapping, -nr);
857 folio->mapping = NULL;
858 mapping->nrpages -= nr;
859 shmem_update_stats(folio, -nr);
860 xa_unlock_irq(&mapping->i_pages);
861 folio_put_refs(folio, nr);
862 BUG_ON(error);
863 }
864
865 /*
866 * Remove swap entry from page cache, free the swap and its page cache. Returns
867 * the number of pages being freed; 0 means the entry was not found in the
868 * XArray (no pages freed).
869 */
870 static long shmem_free_swap(struct address_space *mapping,
871 pgoff_t index, void *radswap)
872 {
873 int order = xa_get_order(&mapping->i_pages, index);
874 void *old;
875
876 old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
877 if (old != radswap)
878 return 0;
879 free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
880
881 return 1 << order;
882 }
883
884 /*
885 * Determine (in bytes) how many of the shmem object's pages mapped by the
886 * given offsets are swapped out.
887 *
888 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
889 * as long as the inode doesn't go away and racy results are not a problem.
890 */
891 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
892 pgoff_t start, pgoff_t end)
893 {
894 XA_STATE(xas, &mapping->i_pages, start);
895 struct page *page;
896 unsigned long swapped = 0;
897 unsigned long max = end - 1;
898
899 rcu_read_lock();
900 xas_for_each(&xas, page, max) {
901 if (xas_retry(&xas, page))
902 continue;
903 if (xa_is_value(page))
904 swapped += 1 << xas_get_order(&xas);
905 if (xas.xa_index == max)
906 break;
907 if (need_resched()) {
908 xas_pause(&xas);
909 cond_resched_rcu();
910 }
911 }
912 rcu_read_unlock();
913
914 return swapped << PAGE_SHIFT;
915 }
916
917 /*
918 * Determine (in bytes) how many of the shmem object's pages mapped by the
919 * given vma is swapped out.
920 *
921 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
922 * as long as the inode doesn't go away and racy results are not a problem.
923 */
924 unsigned long shmem_swap_usage(struct vm_area_struct *vma)
925 {
926 struct inode *inode = file_inode(vma->vm_file);
927 struct shmem_inode_info *info = SHMEM_I(inode);
928 struct address_space *mapping = inode->i_mapping;
929 unsigned long swapped;
930
931 /* Be careful as we don't hold info->lock */
932 swapped = READ_ONCE(info->swapped);
933
934 /*
935 * The easier cases are when the shmem object has nothing in swap, or
936 * the vma maps it whole. Then we can simply use the stats that we
937 * already track.
938 */
939 if (!swapped)
940 return 0;
941
942 if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
943 return swapped << PAGE_SHIFT;
944
945 /* Here comes the more involved part */
946 return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
947 vma->vm_pgoff + vma_pages(vma));
948 }
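
/*
 * Example of the shortcut above (illustrative only): a vma set up with
 * mmap(NULL, len, ..., fd, 0) covering the whole object has vm_pgoff == 0
 * and vm_end - vm_start >= i_size, so the cached info->swapped count is
 * returned directly; only a partial mapping pays for the xarray walk in
 * shmem_partial_swap_usage().
 */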
949
950 /*
951 * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
952 */
953 void shmem_unlock_mapping(struct address_space *mapping)
954 {
955 struct folio_batch fbatch;
956 pgoff_t index = 0;
957
958 folio_batch_init(&fbatch);
959 /*
960 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
961 */
962 while (!mapping_unevictable(mapping) &&
963 filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
964 check_move_unevictable_folios(&fbatch);
965 folio_batch_release(&fbatch);
966 cond_resched();
967 }
968 }
969
970 static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
971 {
972 struct folio *folio;
973
974 /*
975 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
976 * beyond i_size, and reports fallocated folios as holes.
977 */
978 folio = filemap_get_entry(inode->i_mapping, index);
979 if (!folio)
980 return folio;
981 if (!xa_is_value(folio)) {
982 folio_lock(folio);
983 if (folio->mapping == inode->i_mapping)
984 return folio;
985 /* The folio has been swapped out */
986 folio_unlock(folio);
987 folio_put(folio);
988 }
989 /*
990 * But read a folio back from swap if any of it is within i_size
991 * (although in some cases this is just a waste of time).
992 */
993 folio = NULL;
994 shmem_get_folio(inode, index, 0, &folio, SGP_READ);
995 return folio;
996 }
997
998 /*
999 * Remove range of pages and swap entries from page cache, and free them.
1000 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
1001 */
1002 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
1003 bool unfalloc)
1004 {
1005 struct address_space *mapping = inode->i_mapping;
1006 struct shmem_inode_info *info = SHMEM_I(inode);
1007 pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
1008 pgoff_t end = (lend + 1) >> PAGE_SHIFT;
1009 struct folio_batch fbatch;
1010 pgoff_t indices[PAGEVEC_SIZE];
1011 struct folio *folio;
1012 bool same_folio;
1013 long nr_swaps_freed = 0;
1014 pgoff_t index;
1015 int i;
1016
1017 if (lend == -1)
1018 end = -1; /* unsigned, so actually very big */
1019
1020 if (info->fallocend > start && info->fallocend <= end && !unfalloc)
1021 info->fallocend = start;
1022
1023 folio_batch_init(&fbatch);
1024 index = start;
1025 while (index < end && find_lock_entries(mapping, &index, end - 1,
1026 &fbatch, indices)) {
1027 for (i = 0; i < folio_batch_count(&fbatch); i++) {
1028 folio = fbatch.folios[i];
1029
1030 if (xa_is_value(folio)) {
1031 if (unfalloc)
1032 continue;
1033 nr_swaps_freed += shmem_free_swap(mapping,
1034 indices[i], folio);
1035 continue;
1036 }
1037
1038 if (!unfalloc || !folio_test_uptodate(folio))
1039 truncate_inode_folio(mapping, folio);
1040 folio_unlock(folio);
1041 }
1042 folio_batch_remove_exceptionals(&fbatch);
1043 folio_batch_release(&fbatch);
1044 cond_resched();
1045 }
1046
1047 /*
1048 * When undoing a failed fallocate, we want none of the partial folio
1049 * zeroing and splitting below, but shall want to truncate the whole
1050 * folio when !uptodate indicates that it was added by this fallocate,
1051 * even when [lstart, lend] covers only a part of the folio.
1052 */
1053 if (unfalloc)
1054 goto whole_folios;
1055
1056 same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
1057 folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
1058 if (folio) {
1059 same_folio = lend < folio_pos(folio) + folio_size(folio);
1060 folio_mark_dirty(folio);
1061 if (!truncate_inode_partial_folio(folio, lstart, lend)) {
1062 start = folio_next_index(folio);
1063 if (same_folio)
1064 end = folio->index;
1065 }
1066 folio_unlock(folio);
1067 folio_put(folio);
1068 folio = NULL;
1069 }
1070
1071 if (!same_folio)
1072 folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
1073 if (folio) {
1074 folio_mark_dirty(folio);
1075 if (!truncate_inode_partial_folio(folio, lstart, lend))
1076 end = folio->index;
1077 folio_unlock(folio);
1078 folio_put(folio);
1079 }
1080
1081 whole_folios:
1082
1083 index = start;
1084 while (index < end) {
1085 cond_resched();
1086
1087 if (!find_get_entries(mapping, &index, end - 1, &fbatch,
1088 indices)) {
1089 /* If all gone or hole-punch or unfalloc, we're done */
1090 if (index == start || end != -1)
1091 break;
1092 /* But if truncating, restart to make sure all gone */
1093 index = start;
1094 continue;
1095 }
1096 for (i = 0; i < folio_batch_count(&fbatch); i++) {
1097 folio = fbatch.folios[i];
1098
1099 if (xa_is_value(folio)) {
1100 long swaps_freed;
1101
1102 if (unfalloc)
1103 continue;
1104 swaps_freed = shmem_free_swap(mapping, indices[i], folio);
1105 if (!swaps_freed) {
1106 /* Swap was replaced by page: retry */
1107 index = indices[i];
1108 break;
1109 }
1110 nr_swaps_freed += swaps_freed;
1111 continue;
1112 }
1113
1114 folio_lock(folio);
1115
1116 if (!unfalloc || !folio_test_uptodate(folio)) {
1117 if (folio_mapping(folio) != mapping) {
1118 /* Page was replaced by swap: retry */
1119 folio_unlock(folio);
1120 index = indices[i];
1121 break;
1122 }
1123 VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1124 folio);
1125
1126 if (!folio_test_large(folio)) {
1127 truncate_inode_folio(mapping, folio);
1128 } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
1129 /*
1130 * If we split a page, reset the loop so
1131 * that we pick up the new sub pages.
1132 * Otherwise the THP was entirely
1133 * dropped or the target range was
1134 * zeroed, so just continue the loop as
1135 * is.
1136 */
1137 if (!folio_test_large(folio)) {
1138 folio_unlock(folio);
1139 index = start;
1140 break;
1141 }
1142 }
1143 }
1144 folio_unlock(folio);
1145 }
1146 folio_batch_remove_exceptionals(&fbatch);
1147 folio_batch_release(&fbatch);
1148 }
1149
1150 shmem_recalc_inode(inode, 0, -nr_swaps_freed);
1151 trace_android_vh_shmem_mod_swapped(mapping, -nr_swaps_freed);
1152 }
1153
1154 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1155 {
1156 shmem_undo_range(inode, lstart, lend, false);
1157 inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1158 inode_inc_iversion(inode);
1159 }
1160 EXPORT_SYMBOL_GPL(shmem_truncate_range);
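
/*
 * shmem_truncate_range() serves both truncation and hole punching, e.g.
 * (illustrative userspace sketch):
 *
 *	ftruncate(fd, 0);				lstart=0, lend=-1
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE |
 *		      FALLOC_FL_KEEP_SIZE, off, len);	lstart=off, lend=off+len-1
 *
 * Whole pages and swap entries inside the range are freed; partial folios
 * at either edge are zeroed rather than removed.
 */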
1161
1162 static int shmem_getattr(struct mnt_idmap *idmap,
1163 const struct path *path, struct kstat *stat,
1164 u32 request_mask, unsigned int query_flags)
1165 {
1166 struct inode *inode = path->dentry->d_inode;
1167 struct shmem_inode_info *info = SHMEM_I(inode);
1168
1169 if (info->alloced - info->swapped != inode->i_mapping->nrpages)
1170 shmem_recalc_inode(inode, 0, 0);
1171
1172 if (info->fsflags & FS_APPEND_FL)
1173 stat->attributes |= STATX_ATTR_APPEND;
1174 if (info->fsflags & FS_IMMUTABLE_FL)
1175 stat->attributes |= STATX_ATTR_IMMUTABLE;
1176 if (info->fsflags & FS_NODUMP_FL)
1177 stat->attributes |= STATX_ATTR_NODUMP;
1178 stat->attributes_mask |= (STATX_ATTR_APPEND |
1179 STATX_ATTR_IMMUTABLE |
1180 STATX_ATTR_NODUMP);
1181 generic_fillattr(idmap, request_mask, inode, stat);
1182
1183 if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
1184 stat->blksize = HPAGE_PMD_SIZE;
1185
1186 if (request_mask & STATX_BTIME) {
1187 stat->result_mask |= STATX_BTIME;
1188 stat->btime.tv_sec = info->i_crtime.tv_sec;
1189 stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1190 }
1191
1192 return 0;
1193 }
1194
1195 static int shmem_setattr(struct mnt_idmap *idmap,
1196 struct dentry *dentry, struct iattr *attr)
1197 {
1198 struct inode *inode = d_inode(dentry);
1199 struct shmem_inode_info *info = SHMEM_I(inode);
1200 int error;
1201 bool update_mtime = false;
1202 bool update_ctime = true;
1203
1204 error = setattr_prepare(idmap, dentry, attr);
1205 if (error)
1206 return error;
1207
1208 if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1209 if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1210 return -EPERM;
1211 }
1212 }
1213
1214 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1215 loff_t oldsize = inode->i_size;
1216 loff_t newsize = attr->ia_size;
1217
1218 /* protected by i_rwsem */
1219 if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1220 (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1221 return -EPERM;
1222
1223 if (newsize != oldsize) {
1224 error = shmem_reacct_size(SHMEM_I(inode)->flags,
1225 oldsize, newsize);
1226 if (error)
1227 return error;
1228 i_size_write(inode, newsize);
1229 update_mtime = true;
1230 } else {
1231 update_ctime = false;
1232 }
1233 if (newsize <= oldsize) {
1234 loff_t holebegin = round_up(newsize, PAGE_SIZE);
1235 if (oldsize > holebegin)
1236 unmap_mapping_range(inode->i_mapping,
1237 holebegin, 0, 1);
1238 if (info->alloced)
1239 shmem_truncate_range(inode,
1240 newsize, (loff_t)-1);
1241 /* unmap again to remove racily COWed private pages */
1242 if (oldsize > holebegin)
1243 unmap_mapping_range(inode->i_mapping,
1244 holebegin, 0, 1);
1245 }
1246 }
1247
1248 if (is_quota_modification(idmap, inode, attr)) {
1249 error = dquot_initialize(inode);
1250 if (error)
1251 return error;
1252 }
1253
1254 /* Transfer quota accounting */
1255 if (i_uid_needs_update(idmap, attr, inode) ||
1256 i_gid_needs_update(idmap, attr, inode)) {
1257 error = dquot_transfer(idmap, inode, attr);
1258 if (error)
1259 return error;
1260 }
1261
1262 setattr_copy(idmap, inode, attr);
1263 if (attr->ia_valid & ATTR_MODE)
1264 error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1265 if (!error && update_ctime) {
1266 inode_set_ctime_current(inode);
1267 if (update_mtime)
1268 inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
1269 inode_inc_iversion(inode);
1270 }
1271 return error;
1272 }
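
/*
 * The seal checks above are what make memfd sealing stick, e.g.
 * (illustrative userspace sketch):
 *
 *	int fd = memfd_create("buf", MFD_ALLOW_SEALING);
 *	ftruncate(fd, 4096);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK);
 *	ftruncate(fd, 8192);	now fails with EPERM in shmem_setattr()
 */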
1273
1274 static void shmem_evict_inode(struct inode *inode)
1275 {
1276 struct shmem_inode_info *info = SHMEM_I(inode);
1277 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1278 size_t freed = 0;
1279
1280 if (shmem_mapping(inode->i_mapping)) {
1281 shmem_unacct_size(info->flags, inode->i_size);
1282 inode->i_size = 0;
1283 mapping_set_exiting(inode->i_mapping);
1284 shmem_truncate_range(inode, 0, (loff_t)-1);
1285 if (!list_empty(&info->shrinklist)) {
1286 spin_lock(&sbinfo->shrinklist_lock);
1287 if (!list_empty(&info->shrinklist)) {
1288 list_del_init(&info->shrinklist);
1289 sbinfo->shrinklist_len--;
1290 }
1291 spin_unlock(&sbinfo->shrinklist_lock);
1292 }
1293 while (!list_empty(&info->swaplist)) {
1294 /* Wait while shmem_unuse() is scanning this inode... */
1295 wait_var_event(&info->stop_eviction,
1296 !atomic_read(&info->stop_eviction));
1297 mutex_lock(&shmem_swaplist_mutex);
1298 /* ...but beware of the race if we peeked too early */
1299 if (!atomic_read(&info->stop_eviction))
1300 list_del_init(&info->swaplist);
1301 mutex_unlock(&shmem_swaplist_mutex);
1302 }
1303 }
1304
1305 simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
1306 shmem_free_inode(inode->i_sb, freed);
1307 WARN_ON(inode->i_blocks);
1308 clear_inode(inode);
1309 #ifdef CONFIG_TMPFS_QUOTA
1310 dquot_free_inode(inode);
1311 dquot_drop(inode);
1312 #endif
1313 }
1314
1315 static int shmem_find_swap_entries(struct address_space *mapping,
1316 pgoff_t start, struct folio_batch *fbatch,
1317 pgoff_t *indices, unsigned int type)
1318 {
1319 XA_STATE(xas, &mapping->i_pages, start);
1320 struct folio *folio;
1321 swp_entry_t entry;
1322
1323 rcu_read_lock();
1324 xas_for_each(&xas, folio, ULONG_MAX) {
1325 if (xas_retry(&xas, folio))
1326 continue;
1327
1328 if (!xa_is_value(folio))
1329 continue;
1330
1331 entry = radix_to_swp_entry(folio);
1332 /*
1333 * swapin error entries can be found in the mapping. But they're
1334 * deliberately ignored here as we've done everything we can do.
1335 */
1336 if (swp_type(entry) != type)
1337 continue;
1338
1339 indices[folio_batch_count(fbatch)] = xas.xa_index;
1340 if (!folio_batch_add(fbatch, folio))
1341 break;
1342
1343 if (need_resched()) {
1344 xas_pause(&xas);
1345 cond_resched_rcu();
1346 }
1347 }
1348 rcu_read_unlock();
1349
1350 return xas.xa_index;
1351 }
1352
1353 /*
1354 * Move the swapped pages for an inode to page cache. Returns the count
1355 * of pages swapped in, or the error in case of failure.
1356 */
1357 static int shmem_unuse_swap_entries(struct inode *inode,
1358 struct folio_batch *fbatch, pgoff_t *indices)
1359 {
1360 int i = 0;
1361 int ret = 0;
1362 int error = 0;
1363 struct address_space *mapping = inode->i_mapping;
1364
1365 for (i = 0; i < folio_batch_count(fbatch); i++) {
1366 struct folio *folio = fbatch->folios[i];
1367
1368 if (!xa_is_value(folio))
1369 continue;
1370 error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
1371 mapping_gfp_mask(mapping), NULL, NULL);
1372 if (error == 0) {
1373 folio_unlock(folio);
1374 folio_put(folio);
1375 ret++;
1376 }
1377 if (error == -ENOMEM)
1378 break;
1379 error = 0;
1380 }
1381 return error ? error : ret;
1382 }
1383
1384 /*
1385 * If swap found in inode, free it and move page from swapcache to filecache.
1386 */
1387 static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1388 {
1389 struct address_space *mapping = inode->i_mapping;
1390 pgoff_t start = 0;
1391 struct folio_batch fbatch;
1392 pgoff_t indices[PAGEVEC_SIZE];
1393 int ret = 0;
1394
1395 do {
1396 folio_batch_init(&fbatch);
1397 shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
1398 if (folio_batch_count(&fbatch) == 0) {
1399 ret = 0;
1400 break;
1401 }
1402
1403 ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1404 if (ret < 0)
1405 break;
1406
1407 start = indices[folio_batch_count(&fbatch) - 1];
1408 } while (true);
1409
1410 return ret;
1411 }
1412
1413 /*
1414 * Read all the shared memory data that resides in the swap
1415 * device 'type' back into memory, so the swap device can be
1416 * unused.
1417 */
1418 int shmem_unuse(unsigned int type)
1419 {
1420 struct shmem_inode_info *info, *next;
1421 int error = 0;
1422
1423 if (list_empty(&shmem_swaplist))
1424 return 0;
1425
1426 mutex_lock(&shmem_swaplist_mutex);
1427 list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1428 if (!info->swapped) {
1429 list_del_init(&info->swaplist);
1430 continue;
1431 }
1432 /*
1433 * Drop the swaplist mutex while searching the inode for swap;
1434 * but before doing so, make sure shmem_evict_inode() will not
1435 * remove placeholder inode from swaplist, nor let it be freed
1436 * (igrab() would protect from unlink, but not from unmount).
1437 */
1438 atomic_inc(&info->stop_eviction);
1439 mutex_unlock(&shmem_swaplist_mutex);
1440
1441 error = shmem_unuse_inode(&info->vfs_inode, type);
1442 cond_resched();
1443
1444 mutex_lock(&shmem_swaplist_mutex);
1445 next = list_next_entry(info, swaplist);
1446 if (!info->swapped)
1447 list_del_init(&info->swaplist);
1448 if (atomic_dec_and_test(&info->stop_eviction))
1449 wake_up_var(&info->stop_eviction);
1450 if (error)
1451 break;
1452 }
1453 mutex_unlock(&shmem_swaplist_mutex);
1454
1455 return error;
1456 }
1457
1458 /*
1459 * Move the page from the page cache to the swap cache.
1460 */
1461 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1462 {
1463 struct folio *folio = page_folio(page);
1464 struct address_space *mapping = folio->mapping;
1465 struct inode *inode = mapping->host;
1466 struct shmem_inode_info *info = SHMEM_I(inode);
1467 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1468 swp_entry_t swap;
1469 pgoff_t index;
1470 int nr_pages;
1471 bool split = false;
1472
1473 /*
1474 * Our capabilities prevent regular writeback or sync from ever calling
1475 * shmem_writepage; but a stacking filesystem might use ->writepage of
1476 * its underlying filesystem, in which case tmpfs should write out to
1477 * swap only in response to memory pressure, and not for the writeback
1478 * threads or sync.
1479 */
1480 if (WARN_ON_ONCE(!wbc->for_reclaim))
1481 goto redirty;
1482
1483 if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
1484 goto redirty;
1485
1486 if (!total_swap_pages)
1487 goto redirty;
1488
1489 /*
1490 * If CONFIG_THP_SWAP is not enabled, the large folio should be
1491 * split when swapping.
1492 *
1493 * And shrinkage of pages beyond i_size does not split swap, so
1494 * swapout of a large folio crossing i_size needs to split too
1495 * (unless fallocate has been used to preallocate beyond EOF).
1496 */
1497 if (folio_test_large(folio)) {
1498 index = shmem_fallocend(inode,
1499 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
1500 if ((index > folio->index && index < folio_next_index(folio)) ||
1501 !IS_ENABLED(CONFIG_THP_SWAP))
1502 split = true;
1503 }
1504
1505 if (split) {
1506 try_split:
1507 /* Ensure the subpages are still dirty */
1508 folio_test_set_dirty(folio);
1509 if (split_huge_page_to_list_to_order(page, wbc->list, 0))
1510 goto redirty;
1511 folio = page_folio(page);
1512 folio_clear_dirty(folio);
1513 }
1514
1515 index = folio->index;
1516 nr_pages = folio_nr_pages(folio);
1517
1518 /*
1519 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1520 * value into swapfile.c, the only way we can correctly account for a
1521 * fallocated folio arriving here is now to initialize it and write it.
1522 *
1523 * That's okay for a folio already fallocated earlier, but if we have
1524 * not yet completed the fallocation, then (a) we want to keep track
1525 * of this folio in case we have to undo it, and (b) it may not be a
1526 * good idea to continue anyway, once we're pushing into swap. So
1527 * reactivate the folio, and let shmem_fallocate() quit when too many.
1528 */
1529 if (!folio_test_uptodate(folio)) {
1530 if (inode->i_private) {
1531 struct shmem_falloc *shmem_falloc;
1532 spin_lock(&inode->i_lock);
1533 shmem_falloc = inode->i_private;
1534 if (shmem_falloc &&
1535 !shmem_falloc->waitq &&
1536 index >= shmem_falloc->start &&
1537 index < shmem_falloc->next)
1538 shmem_falloc->nr_unswapped += nr_pages;
1539 else
1540 shmem_falloc = NULL;
1541 spin_unlock(&inode->i_lock);
1542 if (shmem_falloc)
1543 goto redirty;
1544 }
1545 folio_zero_range(folio, 0, folio_size(folio));
1546 flush_dcache_folio(folio);
1547 folio_mark_uptodate(folio);
1548 }
1549
1550 swap = folio_alloc_swap(folio);
1551 if (!swap.val) {
1552 if (nr_pages > 1)
1553 goto try_split;
1554
1555 goto redirty;
1556 }
1557
1558 /*
1559 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1560 * if it's not already there. Do it now before the folio is
1561 * moved to swap cache, when its pagelock no longer protects
1562 * the inode from eviction. But don't unlock the mutex until
1563 * we've incremented swapped, because shmem_unuse_inode() will
1564 * prune a !swapped inode from the swaplist under this mutex.
1565 */
1566 mutex_lock(&shmem_swaplist_mutex);
1567 if (list_empty(&info->swaplist))
1568 list_add(&info->swaplist, &shmem_swaplist);
1569
1570 if (add_to_swap_cache(folio, swap,
1571 __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1572 NULL) == 0) {
1573 shmem_recalc_inode(inode, 0, nr_pages);
1574 trace_android_vh_shmem_mod_swapped(folio->mapping, nr_pages);
1575 swap_shmem_alloc(swap, nr_pages);
1576 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
1577
1578 mutex_unlock(&shmem_swaplist_mutex);
1579 BUG_ON(folio_mapped(folio));
1580 return swap_writepage(&folio->page, wbc);
1581 }
1582
1583 mutex_unlock(&shmem_swaplist_mutex);
1584 put_swap_folio(folio, swap);
1585 redirty:
1586 folio_mark_dirty(folio);
1587 if (wbc->for_reclaim)
1588 return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
1589 folio_unlock(folio);
1590 return 0;
1591 }
1592
1593 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1594 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1595 {
1596 char buffer[64];
1597
1598 if (!mpol || mpol->mode == MPOL_DEFAULT)
1599 return; /* show nothing */
1600
1601 mpol_to_str(buffer, sizeof(buffer), mpol);
1602
1603 seq_printf(seq, ",mpol=%s", buffer);
1604 }
1605
1606 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1607 {
1608 struct mempolicy *mpol = NULL;
1609 if (sbinfo->mpol) {
1610 raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1611 mpol = sbinfo->mpol;
1612 mpol_get(mpol);
1613 raw_spin_unlock(&sbinfo->stat_lock);
1614 }
1615 return mpol;
1616 }
1617 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1618 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1619 {
1620 }
1621 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1622 {
1623 return NULL;
1624 }
1625 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
1626
1627 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
1628 pgoff_t index, unsigned int order, pgoff_t *ilx);
1629
1630 static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
1631 struct shmem_inode_info *info, pgoff_t index)
1632 {
1633 struct mempolicy *mpol;
1634 pgoff_t ilx;
1635 struct folio *folio;
1636
1637 mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
1638 folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
1639 mpol_cond_put(mpol);
1640
1641 return folio;
1642 }
1643
1644 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1645 /*
1646 * Make sure huge_gfp is always more limited than limit_gfp.
1647 * Some of the flags set permissions, while others set limitations.
1648 */
1649 static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1650 {
1651 gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1652 gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1653 gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1654 gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1655
1656 /* Allow allocations only from the originally specified zones. */
1657 result |= zoneflags;
1658
1659 /*
1660 * Minimize the result gfp by taking the union with the deny flags,
1661 * and the intersection of the allow flags.
1662 */
1663 result |= (limit_gfp & denyflags);
1664 result |= (huge_gfp & limit_gfp) & allowflags;
1665
1666 return result;
1667 }
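
/*
 * Worked example (illustrative only): zone bits such as __GFP_DMA32 are
 * taken solely from the limiting mask; __GFP_IO, __GFP_FS and the reclaim
 * bits survive only if both masks contain them; and __GFP_NOWARN or
 * __GFP_NORETRY set in the limiting mask are added on top of the huge
 * gfp's own restrictions, so the combined mask is never more permissive
 * than what the caller originally allowed.
 */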
1668
1669 unsigned long shmem_allowable_huge_orders(struct inode *inode,
1670 struct vm_area_struct *vma, pgoff_t index,
1671 loff_t write_end, bool shmem_huge_force)
1672 {
1673 unsigned long mask = READ_ONCE(huge_shmem_orders_always);
1674 unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
1675 unsigned long vm_flags = vma ? vma->vm_flags : 0;
1676 pgoff_t aligned_index;
1677 bool global_huge;
1678 loff_t i_size;
1679 int order;
1680
1681 if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
1682 return 0;
1683
1684 global_huge = shmem_huge_global_enabled(inode, index, write_end,
1685 shmem_huge_force, vma, vm_flags);
1686 if (!vma || !vma_is_anon_shmem(vma)) {
1687 /*
1688 * For tmpfs, we now only support PMD sized THP if huge page
1689 * is enabled, otherwise fallback to order 0.
1690 */
1691 return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
1692 }
1693
1694 /*
1695 * Following the 'deny' semantics of the top level, force the huge
1696 * option off from all mounts.
1697 */
1698 if (shmem_huge == SHMEM_HUGE_DENY)
1699 return 0;
1700
1701 /*
1702 * Only allow inherit orders if the top-level value is 'force', which
1703 * means non-PMD sized THP can not override 'huge' mount option now.
1704 */
1705 if (shmem_huge == SHMEM_HUGE_FORCE)
1706 return READ_ONCE(huge_shmem_orders_inherit);
1707
1708 /* Allow mTHP that will be fully within i_size. */
1709 order = highest_order(within_size_orders);
1710 while (within_size_orders) {
1711 aligned_index = round_up(index + 1, 1 << order);
1712 i_size = round_up(i_size_read(inode), PAGE_SIZE);
1713 if (i_size >> PAGE_SHIFT >= aligned_index) {
1714 mask |= within_size_orders;
1715 break;
1716 }
1717
1718 order = next_order(&within_size_orders, order);
1719 }
1720
1721 if (vm_flags & VM_HUGEPAGE)
1722 mask |= READ_ONCE(huge_shmem_orders_madvise);
1723
1724 if (global_huge)
1725 mask |= READ_ONCE(huge_shmem_orders_inherit);
1726
1727 return THP_ORDERS_ALL_FILE_DEFAULT & mask;
1728 }
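
/*
 * Example of the order mask built above for an anonymous shmem mapping
 * (illustrative; assumes 4K pages, per-size knobs live under
 * /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled):
 * with 64K (order 4) set to "always" and 2M (order 9) set to "inherit"
 * while the top-level shmem_enabled is "always", global_huge is true and
 * the function returns BIT(4) | BIT(9), which shmem_suitable_orders() may
 * then narrow further.
 */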
1729
1730 static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1731 struct address_space *mapping, pgoff_t index,
1732 unsigned long orders)
1733 {
1734 struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
1735 pgoff_t aligned_index;
1736 unsigned long pages;
1737 int order;
1738
1739 if (vma) {
1740 orders = thp_vma_suitable_orders(vma, vmf->address, orders);
1741 if (!orders)
1742 return 0;
1743 }
1744
1745 /* Find the highest order that can add into the page cache */
1746 order = highest_order(orders);
1747 while (orders) {
1748 pages = 1UL << order;
1749 aligned_index = round_down(index, pages);
1750 /*
1751 * Check for conflict before waiting on a huge allocation.
1752 * Conflict might be that a huge page has just been allocated
1753 * and added to page cache by a racing thread, or that there
1754 * is already at least one small page in the huge extent.
1755 * Be careful to retry when appropriate, but not forever!
1756 * Elsewhere -EEXIST would be the right code, but not here.
1757 */
1758 if (!xa_find(&mapping->i_pages, &aligned_index,
1759 aligned_index + pages - 1, XA_PRESENT))
1760 break;
1761 order = next_order(&orders, order);
1762 }
1763
1764 return orders;
1765 }
1766 #else
1767 static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1768 struct address_space *mapping, pgoff_t index,
1769 unsigned long orders)
1770 {
1771 return 0;
1772 }
1773 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1774
1775 static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
1776 struct shmem_inode_info *info, pgoff_t index)
1777 {
1778 struct mempolicy *mpol;
1779 pgoff_t ilx;
1780 struct folio *folio = NULL;
1781
1782 mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
1783 trace_android_rvh_shmem_get_folio(info, &folio, order);
1784 if (folio)
1785 goto done;
1786 folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
1787 done:
1788 mpol_cond_put(mpol);
1789
1790 return folio;
1791 }
1792
1793 static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
1794 gfp_t gfp, struct inode *inode, pgoff_t index,
1795 struct mm_struct *fault_mm, unsigned long orders)
1796 {
1797 struct address_space *mapping = inode->i_mapping;
1798 struct shmem_inode_info *info = SHMEM_I(inode);
1799 unsigned long suitable_orders = 0;
1800 struct folio *folio = NULL;
1801 long pages;
1802 int error, order;
1803
1804 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1805 orders = 0;
1806
1807 if (orders > 0) {
1808 suitable_orders = shmem_suitable_orders(inode, vmf,
1809 mapping, index, orders);
1810
1811 trace_android_rvh_shmem_suitable_orders(inode, index,
1812 orders, &suitable_orders);
1813 order = highest_order(suitable_orders);
1814 while (suitable_orders) {
1815 pages = 1UL << order;
1816 index = round_down(index, pages);
1817 folio = shmem_alloc_folio(gfp, order, info, index);
1818 if (folio)
1819 goto allocated;
1820
1821 if (pages == HPAGE_PMD_NR)
1822 count_vm_event(THP_FILE_FALLBACK);
1823 count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
1824 order = next_order(&suitable_orders, order);
1825 }
1826 } else {
1827 pages = 1;
1828 folio = shmem_alloc_folio(gfp, 0, info, index);
1829 }
1830 if (!folio)
1831 return ERR_PTR(-ENOMEM);
1832
1833 allocated:
1834 __folio_set_locked(folio);
1835 __folio_set_swapbacked(folio);
1836
1837 gfp &= GFP_RECLAIM_MASK;
1838 error = mem_cgroup_charge(folio, fault_mm, gfp);
1839 if (error) {
1840 if (xa_find(&mapping->i_pages, &index,
1841 index + pages - 1, XA_PRESENT)) {
1842 error = -EEXIST;
1843 } else if (pages > 1) {
1844 if (pages == HPAGE_PMD_NR) {
1845 count_vm_event(THP_FILE_FALLBACK);
1846 count_vm_event(THP_FILE_FALLBACK_CHARGE);
1847 }
1848 count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
1849 count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
1850 }
1851 goto unlock;
1852 }
1853
1854 error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
1855 if (error)
1856 goto unlock;
1857
1858 error = shmem_inode_acct_blocks(inode, pages);
1859 if (error) {
1860 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1861 long freed;
1862 /*
1863 * Try to reclaim some space by splitting a few
1864 * large folios beyond i_size on the filesystem.
1865 */
1866 shmem_unused_huge_shrink(sbinfo, NULL, pages);
1867 /*
1868 * And do a shmem_recalc_inode() to account for freed pages:
1869 * except our folio is there in cache, so not quite balanced.
1870 */
1871 spin_lock(&info->lock);
1872 freed = pages + info->alloced - info->swapped -
1873 READ_ONCE(mapping->nrpages);
1874 if (freed > 0)
1875 info->alloced -= freed;
1876 spin_unlock(&info->lock);
1877 if (freed > 0)
1878 shmem_inode_unacct_blocks(inode, freed);
1879 error = shmem_inode_acct_blocks(inode, pages);
1880 if (error) {
1881 filemap_remove_folio(folio);
1882 goto unlock;
1883 }
1884 }
1885
1886 shmem_recalc_inode(inode, pages, 0);
1887 folio_add_lru(folio);
1888 return folio;
1889
1890 unlock:
1891 folio_unlock(folio);
1892 folio_put(folio);
1893 return ERR_PTR(error);
1894 }
1895
1896 /*
1897 * When a page is moved from swapcache to shmem filecache (either by the
1898 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
1899 * shmem_unuse_inode()), it may have been read in earlier from swap, in
1900 * ignorance of the mapping it belongs to. If that mapping has special
1901 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1902 * we may need to copy to a suitable page before moving to filecache.
1903 *
1904 * In a future release, this may well be extended to respect cpuset and
1905 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1906 * but for now it is a simple matter of zone.
1907 */
1908 static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
1909 {
1910 return folio_zonenum(folio) > gfp_zone(gfp);
1911 }
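
/*
 * Example (added for illustration): with gfp restricted to GFP_DMA32, as
 * for the gma500 case mentioned above, a folio that was read into
 * ZONE_NORMAL makes folio_zonenum(folio) > gfp_zone(gfp) true on
 * configurations where ZONE_DMA32 sits below ZONE_NORMAL, so the folio is
 * copied by shmem_replace_folio() before entering the filecache.
 */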
1912
1913 static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
1914 struct shmem_inode_info *info, pgoff_t index,
1915 struct vm_area_struct *vma)
1916 {
1917 struct folio *new, *old = *foliop;
1918 swp_entry_t entry = old->swap;
1919 struct address_space *swap_mapping = swap_address_space(entry);
1920 pgoff_t swap_index = swap_cache_index(entry);
1921 XA_STATE(xas, &swap_mapping->i_pages, swap_index);
1922 int nr_pages = folio_nr_pages(old);
1923 int error = 0, i;
1924
1925 /*
1926 * We have arrived here because our zones are constrained, so don't
1927 * limit chance of success by further cpuset and node constraints.
1928 */
1929 gfp &= ~GFP_CONSTRAINT_MASK;
1930 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1931 if (nr_pages > 1) {
1932 gfp_t huge_gfp = vma_thp_gfp_mask(vma);
1933
1934 gfp = limit_gfp_mask(huge_gfp, gfp);
1935 }
1936 #endif
1937
1938 new = shmem_alloc_folio(gfp, folio_order(old), info, index);
1939 if (!new)
1940 return -ENOMEM;
1941
1942 folio_ref_add(new, nr_pages);
1943 folio_copy(new, old);
1944 flush_dcache_folio(new);
1945
1946 __folio_set_locked(new);
1947 __folio_set_swapbacked(new);
1948 folio_mark_uptodate(new);
1949 new->swap = entry;
1950 folio_set_swapcache(new);
1951
1952 /* Swap cache still stores N entries instead of a high-order entry */
1953 xa_lock_irq(&swap_mapping->i_pages);
1954 for (i = 0; i < nr_pages; i++) {
1955 void *item = xas_load(&xas);
1956
1957 if (item != old) {
1958 error = -ENOENT;
1959 break;
1960 }
1961
1962 xas_store(&xas, new);
1963 xas_next(&xas);
1964 }
1965 if (!error) {
1966 mem_cgroup_replace_folio(old, new);
1967 shmem_update_stats(new, nr_pages);
1968 shmem_update_stats(old, -nr_pages);
1969 }
1970 xa_unlock_irq(&swap_mapping->i_pages);
1971
1972 if (unlikely(error)) {
1973 /*
1974 * Is this possible? I think not, now that our callers
1975 * check both the swapcache flag and folio->private
1976 * after getting the folio lock; but be defensive.
1977 * Reverse old to newpage for clear and free.
1978 */
1979 old = new;
1980 } else {
1981 folio_add_lru(new);
1982 *foliop = new;
1983 }
1984
1985 folio_clear_swapcache(old);
1986 old->private = NULL;
1987
1988 folio_unlock(old);
1989 /*
1990 * The old folio has been removed from the swap cache: drop the 'nr_pages'
1991 * references it held there, as well as the one temporary reference
1992 * taken from the swap cache lookup.
1993 */
1994 folio_put_refs(old, nr_pages + 1);
1995 return error;
1996 }
1997
1998 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
1999 struct folio *folio, swp_entry_t swap)
2000 {
2001 struct address_space *mapping = inode->i_mapping;
2002 swp_entry_t swapin_error;
2003 void *old;
2004 int nr_pages;
2005
2006 swapin_error = make_poisoned_swp_entry();
2007 old = xa_cmpxchg_irq(&mapping->i_pages, index,
2008 swp_to_radix_entry(swap),
2009 swp_to_radix_entry(swapin_error), 0);
2010 if (old != swp_to_radix_entry(swap))
2011 return;
2012
2013 nr_pages = folio_nr_pages(folio);
2014 folio_wait_writeback(folio);
2015 delete_from_swap_cache(folio);
2016 /*
2017 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
2018 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
2019 * in shmem_evict_inode().
2020 */
2021 shmem_recalc_inode(inode, -nr_pages, -nr_pages);
2022 swap_free_nr(swap, nr_pages);
2023 }
2024
2025 static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
2026 swp_entry_t swap, gfp_t gfp)
2027 {
2028 struct address_space *mapping = inode->i_mapping;
2029 XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
2030 void *alloced_shadow = NULL;
2031 int alloced_order = 0, i;
2032
2033 /* Convert user data gfp flags to xarray node gfp flags */
2034 gfp &= GFP_RECLAIM_MASK;
2035
2036 for (;;) {
2037 int order = -1, split_order = 0;
2038 void *old = NULL;
2039
2040 xas_lock_irq(&xas);
2041 old = xas_load(&xas);
2042 if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
2043 xas_set_err(&xas, -EEXIST);
2044 goto unlock;
2045 }
2046
2047 order = xas_get_order(&xas);
2048
2049 /* Swap entry may have changed before we re-acquire the lock */
2050 if (alloced_order &&
2051 (old != alloced_shadow || order != alloced_order)) {
2052 xas_destroy(&xas);
2053 alloced_order = 0;
2054 }
2055
2056 /* Try to split large swap entry in pagecache */
2057 if (order > 0) {
2058 if (!alloced_order) {
2059 split_order = order;
2060 goto unlock;
2061 }
2062 xas_split(&xas, old, order);
2063
2064 /*
2065 * Re-set the swap entries after splitting; the swap offsets
2066 * of the original large entry are contiguous.
2067 */
2068 for (i = 0; i < 1 << order; i++) {
2069 pgoff_t aligned_index = round_down(index, 1 << order);
2070 swp_entry_t tmp;
2071
2072 tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
2073 __xa_store(&mapping->i_pages, aligned_index + i,
2074 swp_to_radix_entry(tmp), 0);
2075 }
2076 }
2077
2078 unlock:
2079 xas_unlock_irq(&xas);
2080
2081 /* split needed, alloc here and retry. */
2082 if (split_order) {
2083 xas_split_alloc(&xas, old, split_order, gfp);
2084 if (xas_error(&xas))
2085 goto error;
2086 alloced_shadow = old;
2087 alloced_order = split_order;
2088 xas_reset(&xas);
2089 continue;
2090 }
2091
2092 if (!xas_nomem(&xas, gfp))
2093 break;
2094 }
2095
2096 error:
2097 if (xas_error(&xas))
2098 return xas_error(&xas);
2099
2100 return alloced_order;
2101 }
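
/*
 * Summary of the return contract above (added as a descriptive note):
 * a negative xas error on failure, 0 if the entry was already order 0,
 * or the order that was just split - in which case the caller in
 * shmem_swapin_folio() recomputes its swap entry from the old order
 * alignment (see the split_order handling below).
 */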
2102
2103 /*
2104 * Swap in the folio pointed to by *foliop.
2105 * Caller has to make sure that *foliop contains a valid swapped folio.
2106 * Returns 0 and the folio in *foliop on success. On failure, returns the
2107 * error code and NULL in *foliop.
2108 */
2109 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
2110 struct folio **foliop, enum sgp_type sgp,
2111 gfp_t gfp, struct vm_area_struct *vma,
2112 vm_fault_t *fault_type)
2113 {
2114 struct address_space *mapping = inode->i_mapping;
2115 struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
2116 struct shmem_inode_info *info = SHMEM_I(inode);
2117 struct swap_info_struct *si;
2118 struct folio *folio = NULL;
2119 swp_entry_t swap;
2120 int error, nr_pages;
2121
2122 VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
2123 swap = radix_to_swp_entry(*foliop);
2124 *foliop = NULL;
2125
2126 if (is_poisoned_swp_entry(swap))
2127 return -EIO;
2128
2129 si = get_swap_device(swap);
2130 if (!si) {
2131 if (!shmem_confirm_swap(mapping, index, swap))
2132 return -EEXIST;
2133 else
2134 return -EINVAL;
2135 }
2136
2137 /* Look it up and read it in.. */
2138 folio = swap_cache_get_folio(swap, NULL, 0);
2139 if (!folio) {
2140 int split_order;
2141
2142 /* Or update major stats only when swapin succeeds?? */
2143 if (fault_type) {
2144 *fault_type |= VM_FAULT_MAJOR;
2145 count_vm_event(PGMAJFAULT);
2146 count_memcg_event_mm(fault_mm, PGMAJFAULT);
2147 }
2148
2149 /*
2150 * The swap device can currently only swap in order-0 folios, so
2151 * split the large swap entry stored in the pagecache if
2152 * necessary.
2153 */
2154 split_order = shmem_split_large_entry(inode, index, swap, gfp);
2155 if (split_order < 0) {
2156 error = split_order;
2157 goto failed;
2158 }
2159
2160 /*
2161 * If the large swap entry has already been split, it is
2162 * necessary to recalculate the new swap entry based on
2163 * the old order alignment.
2164 */
2165 if (split_order > 0) {
2166 pgoff_t offset = index - round_down(index, 1 << split_order);
2167
2168 swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
2169 }
2170
2171 /* Here we actually start the io */
2172 folio = shmem_swapin_cluster(swap, gfp, info, index);
2173 if (!folio) {
2174 error = -ENOMEM;
2175 goto failed;
2176 }
2177 }
2178
2179 /* We have to do this with folio locked to prevent races */
2180 folio_lock(folio);
2181 trace_android_vh_shmem_swapin_folio(folio);
2182 if (!folio_test_swapcache(folio) ||
2183 folio->swap.val != swap.val ||
2184 !shmem_confirm_swap(mapping, index, swap)) {
2185 error = -EEXIST;
2186 goto unlock;
2187 }
2188 if (!folio_test_uptodate(folio)) {
2189 error = -EIO;
2190 goto failed;
2191 }
2192 folio_wait_writeback(folio);
2193 nr_pages = folio_nr_pages(folio);
2194
2195 /*
2196 * Some architectures may have to restore extra metadata to the
2197 * folio after reading from swap.
2198 */
2199 arch_swap_restore(folio_swap(swap, folio), folio);
2200
2201 if (shmem_should_replace_folio(folio, gfp)) {
2202 error = shmem_replace_folio(&folio, gfp, info, index, vma);
2203 if (error)
2204 goto failed;
2205 }
2206
2207 error = shmem_add_to_page_cache(folio, mapping,
2208 round_down(index, nr_pages),
2209 swp_to_radix_entry(swap), gfp);
2210 if (error)
2211 goto failed;
2212
2213 shmem_recalc_inode(inode, 0, -nr_pages);
2214 trace_android_vh_shmem_mod_swapped(folio->mapping, -nr_pages);
2215
2216 if (sgp == SGP_WRITE)
2217 folio_mark_accessed(folio);
2218
2219 delete_from_swap_cache(folio);
2220 folio_mark_dirty(folio);
2221 swap_free_nr(swap, nr_pages);
2222 put_swap_device(si);
2223
2224 *foliop = folio;
2225 return 0;
2226 failed:
2227 if (!shmem_confirm_swap(mapping, index, swap))
2228 error = -EEXIST;
2229 if (error == -EIO)
2230 shmem_set_folio_swapin_error(inode, index, folio, swap);
2231 unlock:
2232 if (folio) {
2233 folio_unlock(folio);
2234 folio_put(folio);
2235 }
2236 put_swap_device(si);
2237
2238 return error;
2239 }
2240
2241 /*
2242 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
2243 *
2244 * If we allocate a new one we do not mark it dirty. That's up to the
2245 * vm. If we swap it in we mark it dirty, since we also free the swap
2246 * entry: a page cannot live in both the swap and page cache.
2247 *
2248 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
2249 */
2250 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
2251 loff_t write_end, struct folio **foliop, enum sgp_type sgp,
2252 gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
2253 {
2254 struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
2255 struct mm_struct *fault_mm;
2256 struct folio *folio;
2257 int error;
2258 bool alloced;
2259 unsigned long orders = 0;
2260
2261 if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
2262 return -EINVAL;
2263
2264 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
2265 return -EFBIG;
2266 repeat:
2267 if (sgp <= SGP_CACHE &&
2268 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
2269 return -EINVAL;
2270
2271 alloced = false;
2272 fault_mm = vma ? vma->vm_mm : NULL;
2273
2274 folio = filemap_get_entry(inode->i_mapping, index);
2275 if (folio && vma && userfaultfd_minor(vma)) {
2276 if (!xa_is_value(folio))
2277 folio_put(folio);
2278 *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
2279 return 0;
2280 }
2281
2282 if (xa_is_value(folio)) {
2283 error = shmem_swapin_folio(inode, index, &folio,
2284 sgp, gfp, vma, fault_type);
2285 if (error == -EEXIST)
2286 goto repeat;
2287
2288 *foliop = folio;
2289 return error;
2290 }
2291
2292 if (folio) {
2293 folio_lock(folio);
2294
2295 /* Has the folio been truncated or swapped out? */
2296 if (unlikely(folio->mapping != inode->i_mapping)) {
2297 folio_unlock(folio);
2298 folio_put(folio);
2299 goto repeat;
2300 }
2301 if (sgp == SGP_WRITE)
2302 folio_mark_accessed(folio);
2303 if (folio_test_uptodate(folio))
2304 goto out;
2305 /* fallocated folio */
2306 if (sgp != SGP_READ)
2307 goto clear;
2308 folio_unlock(folio);
2309 folio_put(folio);
2310 }
2311
2312 /*
2313 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
2314 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
2315 */
2316 *foliop = NULL;
2317 if (sgp == SGP_READ)
2318 return 0;
2319 if (sgp == SGP_NOALLOC)
2320 return -ENOENT;
2321
2322 /*
2323 * Fast cache lookup and swap lookup did not find it: allocate.
2324 */
2325
2326 if (vma && userfaultfd_missing(vma)) {
2327 *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
2328 return 0;
2329 }
2330
2331 /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
2332 orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
2333 trace_android_rvh_shmem_allowable_huge_orders(inode, index, vma, &orders);
2334 /*
2335 * With the above hook, `orders` is not always 0, so the following
2336 * if block is not compiled out. With CONFIG_TRANSPARENT_HUGEPAGE=n,
2337 * vma_thp_gfp_mask() is not defined and the link would fail.
2338 */
2339 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2340 if (orders > 0) {
2341 gfp_t huge_gfp;
2342
2343 huge_gfp = vma_thp_gfp_mask(vma);
2344 huge_gfp = limit_gfp_mask(huge_gfp, gfp);
2345 folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
2346 inode, index, fault_mm, orders);
2347 if (!IS_ERR(folio)) {
2348 if (folio_test_pmd_mappable(folio))
2349 count_vm_event(THP_FILE_ALLOC);
2350 count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
2351 goto alloced;
2352 }
2353 if (PTR_ERR(folio) == -EEXIST)
2354 goto repeat;
2355 }
2356 #endif
2357
2358 folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
2359 if (IS_ERR(folio)) {
2360 error = PTR_ERR(folio);
2361 if (error == -EEXIST)
2362 goto repeat;
2363 folio = NULL;
2364 goto unlock;
2365 }
2366
2367 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2368 alloced:
2369 #endif
2370 alloced = true;
2371 if (folio_test_large(folio) &&
2372 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
2373 folio_next_index(folio)) {
2374 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2375 struct shmem_inode_info *info = SHMEM_I(inode);
2376 /*
2377 * Part of the large folio is beyond i_size: subject
2378 * to shrink under memory pressure.
2379 */
2380 spin_lock(&sbinfo->shrinklist_lock);
2381 /*
2382 * list_empty_careful() defends against unlocked access to
2383 * ->shrink_list in shmem_unused_huge_shrink()
2384 */
2385 if (list_empty_careful(&info->shrinklist)) {
2386 list_add_tail(&info->shrinklist,
2387 &sbinfo->shrinklist);
2388 sbinfo->shrinklist_len++;
2389 }
2390 spin_unlock(&sbinfo->shrinklist_lock);
2391 }
2392
2393 if (sgp == SGP_WRITE)
2394 folio_set_referenced(folio);
2395 /*
2396 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
2397 */
2398 if (sgp == SGP_FALLOC)
2399 sgp = SGP_WRITE;
2400 clear:
2401 /*
2402 * Let SGP_WRITE caller clear ends if write does not fill folio;
2403 * but SGP_FALLOC on a folio fallocated earlier must initialize
2404 * it now, lest undo on failure cancel our earlier guarantee.
2405 */
2406 if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2407 long i, n = folio_nr_pages(folio);
2408
2409 for (i = 0; i < n; i++)
2410 clear_highpage(folio_page(folio, i));
2411 flush_dcache_folio(folio);
2412 folio_mark_uptodate(folio);
2413 }
2414
2415 /* Perhaps the file has been truncated since we checked */
2416 if (sgp <= SGP_CACHE &&
2417 ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2418 error = -EINVAL;
2419 goto unlock;
2420 }
2421 out:
2422 *foliop = folio;
2423 return 0;
2424
2425 /*
2426 * Error recovery.
2427 */
2428 unlock:
2429 if (alloced)
2430 filemap_remove_folio(folio);
2431 shmem_recalc_inode(inode, 0, 0);
2432 if (folio) {
2433 folio_unlock(folio);
2434 folio_put(folio);
2435 }
2436 return error;
2437 }
2438
2439 /**
2440 * shmem_get_folio - find, and lock a shmem folio.
2441 * @inode: inode to search
2442 * @index: the page index.
2443 * @write_end: end of a write, could extend inode size
2444 * @foliop: pointer to the folio if found
2445 * @sgp: SGP_* flags to control behavior
2446 *
2447 * Looks up the page cache entry at @inode & @index. If a folio is
2448 * present, it is returned locked with an increased refcount.
2449 *
2450 * If the caller modifies data in the folio, it must call folio_mark_dirty()
2451 * before unlocking the folio to ensure that the folio is not reclaimed.
2452 * There is no need to reserve space before calling folio_mark_dirty().
2453 *
2454 * When no folio is found, the behavior depends on @sgp:
2455 * - for SGP_READ, *@foliop is %NULL and 0 is returned
2456 * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
2457 * - for all other flags a new folio is allocated, inserted into the
2458 * page cache and returned locked in @foliop.
2459 *
2460 * Context: May sleep.
2461 * Return: 0 if successful, else a negative error code.
2462 */
2463 int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
2464 struct folio **foliop, enum sgp_type sgp)
2465 {
2466 return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
2467 mapping_gfp_mask(inode->i_mapping), NULL, NULL);
2468 }
2469 EXPORT_SYMBOL_GPL(shmem_get_folio);
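
/*
 * A minimal usage sketch for in-kernel callers of shmem_get_folio()
 * (illustration only; "my_inode", "index" and "my_fill" are hypothetical):
 *
 *	struct folio *folio;
 *	int err = shmem_get_folio(my_inode, index, 0, &folio, SGP_CACHE);
 *
 *	if (err)
 *		return err;
 *	my_fill(folio);			// caller writes into the folio
 *	folio_mark_dirty(folio);	// required if the folio was modified
 *	folio_unlock(folio);
 *	folio_put(folio);
 */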
2470
2471 /*
2472 * This is like autoremove_wake_function, but it removes the wait queue
2473 * entry unconditionally - even if something else had already woken the
2474 * target.
2475 */
2476 static int synchronous_wake_function(wait_queue_entry_t *wait,
2477 unsigned int mode, int sync, void *key)
2478 {
2479 int ret = default_wake_function(wait, mode, sync, key);
2480 list_del_init(&wait->entry);
2481 return ret;
2482 }
2483
2484 /*
2485 * Trinity finds that probing a hole which tmpfs is punching can
2486 * prevent the hole-punch from ever completing: which in turn
2487 * locks writers out with its hold on i_rwsem. So refrain from
2488 * faulting pages into the hole while it's being punched. Although
2489 * shmem_undo_range() does remove the additions, it may be unable to
2490 * keep up, as each new page needs its own unmap_mapping_range() call,
2491 * and the i_mmap tree grows ever slower to scan if new vmas are added.
2492 *
2493 * It does not matter if we sometimes reach this check just before the
2494 * hole-punch begins, so that one fault then races with the punch:
2495 * we just need to make racing faults a rare case.
2496 *
2497 * The implementation below would be much simpler if we just used a
2498 * standard mutex or completion: but we cannot take i_rwsem in fault,
2499 * and bloating every shmem inode for this unlikely case would be sad.
2500 */
2501 static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
2502 {
2503 struct shmem_falloc *shmem_falloc;
2504 struct file *fpin = NULL;
2505 vm_fault_t ret = 0;
2506
2507 spin_lock(&inode->i_lock);
2508 shmem_falloc = inode->i_private;
2509 if (shmem_falloc &&
2510 shmem_falloc->waitq &&
2511 vmf->pgoff >= shmem_falloc->start &&
2512 vmf->pgoff < shmem_falloc->next) {
2513 wait_queue_head_t *shmem_falloc_waitq;
2514 DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2515
2516 ret = VM_FAULT_NOPAGE;
2517 fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2518 shmem_falloc_waitq = shmem_falloc->waitq;
2519 prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2520 TASK_UNINTERRUPTIBLE);
2521 spin_unlock(&inode->i_lock);
2522 schedule();
2523
2524 /*
2525 * shmem_falloc_waitq points into the shmem_fallocate()
2526 * stack of the hole-punching task: shmem_falloc_waitq
2527 * is usually invalid by the time we reach here, but
2528 * finish_wait() does not dereference it in that case;
2529 * though i_lock needed lest racing with wake_up_all().
2530 */
2531 spin_lock(&inode->i_lock);
2532 finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2533 }
2534 spin_unlock(&inode->i_lock);
2535 if (fpin) {
2536 fput(fpin);
2537 ret = VM_FAULT_RETRY;
2538 }
2539 return ret;
2540 }
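
/*
 * Descriptive note (added): the other half of this handshake lives in
 * shmem_fallocate() below - the hole-punching task publishes a
 * stack-allocated shmem_falloc with a waitq in inode->i_private, and
 * wake_up_all()s it once shmem_truncate_range() is done, releasing any
 * faulting task parked here.
 */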
2541
2542 vm_fault_t shmem_fault(struct vm_fault *vmf)
2543 {
2544 struct inode *inode = file_inode(vmf->vma->vm_file);
2545 gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2546 struct folio *folio = NULL;
2547 vm_fault_t ret = 0;
2548 int err;
2549
2550 /*
2551 * Trinity finds that probing a hole which tmpfs is punching can
2552 * prevent the hole-punch from ever completing: noted in i_private.
2553 */
2554 if (unlikely(inode->i_private)) {
2555 ret = shmem_falloc_wait(vmf, inode);
2556 if (ret)
2557 return ret;
2558 }
2559
2560 WARN_ON_ONCE(vmf->page != NULL);
2561 err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
2562 gfp, vmf, &ret);
2563 if (err)
2564 return vmf_error(err);
2565 if (folio) {
2566 vmf->page = folio_file_page(folio, vmf->pgoff);
2567 ret |= VM_FAULT_LOCKED;
2568 }
2569 return ret;
2570 }
2571
2572 unsigned long shmem_get_unmapped_area(struct file *file,
2573 unsigned long uaddr, unsigned long len,
2574 unsigned long pgoff, unsigned long flags)
2575 {
2576 unsigned long addr;
2577 unsigned long offset;
2578 unsigned long inflated_len;
2579 unsigned long inflated_addr;
2580 unsigned long inflated_offset;
2581 unsigned long hpage_size;
2582
2583 if (len > TASK_SIZE)
2584 return -ENOMEM;
2585
2586 addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
2587 flags);
2588
2589 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2590 return addr;
2591 if (IS_ERR_VALUE(addr))
2592 return addr;
2593 if (addr & ~PAGE_MASK)
2594 return addr;
2595 if (addr > TASK_SIZE - len)
2596 return addr;
2597
2598 if (shmem_huge == SHMEM_HUGE_DENY)
2599 return addr;
2600 if (flags & MAP_FIXED)
2601 return addr;
2602 /*
2603 * Our priority is to support MAP_SHARED mapped hugely;
2604 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2605 * But if caller specified an address hint and we allocated area there
2606 * successfully, respect that as before.
2607 */
2608 if (uaddr == addr)
2609 return addr;
2610
2611 hpage_size = HPAGE_PMD_SIZE;
2612 if (shmem_huge != SHMEM_HUGE_FORCE) {
2613 struct super_block *sb;
2614 unsigned long __maybe_unused hpage_orders;
2615 int order = 0;
2616
2617 if (file) {
2618 VM_BUG_ON(file->f_op != &shmem_file_operations);
2619 sb = file_inode(file)->i_sb;
2620 } else {
2621 /*
2622 * Called directly from mm/mmap.c, or drivers/char/mem.c
2623 * for "/dev/zero", to create a shared anonymous object.
2624 */
2625 if (IS_ERR(shm_mnt))
2626 return addr;
2627 sb = shm_mnt->mnt_sb;
2628
2629 /*
2630 * Find the highest mTHP order used for anonymous shmem to
2631 * provide a suitable alignment address.
2632 */
2633 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2634 hpage_orders = READ_ONCE(huge_shmem_orders_always);
2635 hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
2636 hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
2637 if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
2638 hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);
2639
2640 if (hpage_orders > 0) {
2641 order = highest_order(hpage_orders);
2642 hpage_size = PAGE_SIZE << order;
2643 }
2644 #endif
2645 }
2646 if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
2647 return addr;
2648 }
2649
2650 if (len < hpage_size)
2651 return addr;
2652
2653 offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
2654 if (offset && offset + len < 2 * hpage_size)
2655 return addr;
2656 if ((addr & (hpage_size - 1)) == offset)
2657 return addr;
2658
2659 inflated_len = len + hpage_size - PAGE_SIZE;
2660 if (inflated_len > TASK_SIZE)
2661 return addr;
2662 if (inflated_len < len)
2663 return addr;
2664
2665 inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
2666 inflated_len, 0, flags);
2667 if (IS_ERR_VALUE(inflated_addr))
2668 return addr;
2669 if (inflated_addr & ~PAGE_MASK)
2670 return addr;
2671
2672 inflated_offset = inflated_addr & (hpage_size - 1);
2673 inflated_addr += offset - inflated_offset;
2674 if (inflated_offset > offset)
2675 inflated_addr += hpage_size;
2676
2677 if (inflated_addr > TASK_SIZE - len)
2678 return addr;
2679 return inflated_addr;
2680 }
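
/*
 * Worked example of the alignment arithmetic above (numbers are purely
 * hypothetical): hpage_size = 2MB (0x200000), pgoff = 0, so offset = 0.
 * If mm_get_unmapped_area() returns inflated_addr = 0x7f0000056000, then
 * inflated_offset = 0x56000; subtracting it makes inflated_addr
 * 2MB-aligned, and since inflated_offset > offset another 2MB is added,
 * placing the mapping at 0x7f0000200000 - still within the inflated range
 * because inflated_len = len + 2MB - 4KB.
 */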
2681
2682 #ifdef CONFIG_NUMA
2683 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2684 {
2685 struct inode *inode = file_inode(vma->vm_file);
2686 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2687 }
2688
2689 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2690 unsigned long addr, pgoff_t *ilx)
2691 {
2692 struct inode *inode = file_inode(vma->vm_file);
2693 pgoff_t index;
2694
2695 /*
2696 * Bias interleave by inode number to distribute better across nodes;
2697 * but this interface is independent of which page order is used, so
2698 * supplies only that bias, letting caller apply the offset (adjusted
2699 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
2700 */
2701 *ilx = inode->i_ino;
2702 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2703 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2704 }
2705
2706 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2707 pgoff_t index, unsigned int order, pgoff_t *ilx)
2708 {
2709 struct mempolicy *mpol;
2710
2711 /* Bias interleave by inode number to distribute better across nodes */
2712 *ilx = info->vfs_inode.i_ino + (index >> order);
2713
2714 mpol = mpol_shared_policy_lookup(&info->policy, index);
2715 return mpol ? mpol : get_task_policy(current);
2716 }
2717 #else
2718 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2719 pgoff_t index, unsigned int order, pgoff_t *ilx)
2720 {
2721 *ilx = 0;
2722 return NULL;
2723 }
2724 #endif /* CONFIG_NUMA */
2725
2726 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2727 {
2728 struct inode *inode = file_inode(file);
2729 struct shmem_inode_info *info = SHMEM_I(inode);
2730 int retval = -ENOMEM;
2731
2732 /*
2733 * What serializes the accesses to info->flags?
2734 * ipc_lock_object() when called from shmctl_do_lock(),
2735 * no serialization needed when called from shm_destroy().
2736 */
2737 if (lock && !(info->flags & VM_LOCKED)) {
2738 if (!user_shm_lock(inode->i_size, ucounts))
2739 goto out_nomem;
2740 info->flags |= VM_LOCKED;
2741 mapping_set_unevictable(file->f_mapping);
2742 }
2743 if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2744 user_shm_unlock(inode->i_size, ucounts);
2745 info->flags &= ~VM_LOCKED;
2746 mapping_clear_unevictable(file->f_mapping);
2747 }
2748 retval = 0;
2749
2750 out_nomem:
2751 return retval;
2752 }
2753
2754 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2755 {
2756 struct inode *inode = file_inode(file);
2757 struct shmem_inode_info *info = SHMEM_I(inode);
2758 int ret;
2759
2760 ret = seal_check_write(info->seals, vma);
2761 if (ret)
2762 return ret;
2763
2764 file_accessed(file);
2765 /* This is anonymous shared memory if it is unlinked at the time of mmap */
2766 if (inode->i_nlink)
2767 vma->vm_ops = &shmem_vm_ops;
2768 else
2769 vma->vm_ops = &shmem_anon_vm_ops;
2770 return 0;
2771 }
2772
2773 static int shmem_file_open(struct inode *inode, struct file *file)
2774 {
2775 file->f_mode |= FMODE_CAN_ODIRECT;
2776 return generic_file_open(inode, file);
2777 }
2778
2779 #ifdef CONFIG_TMPFS_XATTR
2780 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2781
2782 /*
2783 * chattr's fsflags are unrelated to extended attributes,
2784 * but tmpfs has chosen to enable them under the same config option.
2785 */
2786 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2787 {
2788 unsigned int i_flags = 0;
2789
2790 if (fsflags & FS_NOATIME_FL)
2791 i_flags |= S_NOATIME;
2792 if (fsflags & FS_APPEND_FL)
2793 i_flags |= S_APPEND;
2794 if (fsflags & FS_IMMUTABLE_FL)
2795 i_flags |= S_IMMUTABLE;
2796 /*
2797 * But FS_NODUMP_FL does not require any action in i_flags.
2798 */
2799 inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
2800 }
2801 #else
2802 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2803 {
2804 }
2805 #define shmem_initxattrs NULL
2806 #endif
2807
2808 static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
2809 {
2810 return &SHMEM_I(inode)->dir_offsets;
2811 }
2812
2813 static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
2814 struct super_block *sb,
2815 struct inode *dir, umode_t mode,
2816 dev_t dev, unsigned long flags)
2817 {
2818 struct inode *inode;
2819 struct shmem_inode_info *info;
2820 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2821 ino_t ino;
2822 int err;
2823
2824 err = shmem_reserve_inode(sb, &ino);
2825 if (err)
2826 return ERR_PTR(err);
2827
2828 inode = new_inode(sb);
2829 if (!inode) {
2830 shmem_free_inode(sb, 0);
2831 return ERR_PTR(-ENOSPC);
2832 }
2833
2834 inode->i_ino = ino;
2835 inode_init_owner(idmap, inode, dir, mode);
2836 inode->i_blocks = 0;
2837 simple_inode_init_ts(inode);
2838 inode->i_generation = get_random_u32();
2839 info = SHMEM_I(inode);
2840 memset(info, 0, (char *)inode - (char *)info);
2841 android_init_vendor_data(info, 1);
2842 spin_lock_init(&info->lock);
2843 atomic_set(&info->stop_eviction, 0);
2844 info->seals = F_SEAL_SEAL;
2845 info->flags = flags & VM_NORESERVE;
2846 info->i_crtime = inode_get_mtime(inode);
2847 info->fsflags = (dir == NULL) ? 0 :
2848 SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
2849 if (info->fsflags)
2850 shmem_set_inode_flags(inode, info->fsflags);
2851 INIT_LIST_HEAD(&info->shrinklist);
2852 INIT_LIST_HEAD(&info->swaplist);
2853 simple_xattrs_init(&info->xattrs);
2854 cache_no_acl(inode);
2855 if (sbinfo->noswap)
2856 mapping_set_unevictable(inode->i_mapping);
2857 mapping_set_large_folios(inode->i_mapping);
2858
2859 switch (mode & S_IFMT) {
2860 default:
2861 inode->i_op = &shmem_special_inode_operations;
2862 init_special_inode(inode, mode, dev);
2863 break;
2864 case S_IFREG:
2865 inode->i_mapping->a_ops = &shmem_aops;
2866 inode->i_op = &shmem_inode_operations;
2867 inode->i_fop = &shmem_file_operations;
2868 mpol_shared_policy_init(&info->policy,
2869 shmem_get_sbmpol(sbinfo));
2870 break;
2871 case S_IFDIR:
2872 inc_nlink(inode);
2873 /* Some things misbehave if size == 0 on a directory */
2874 inode->i_size = 2 * BOGO_DIRENT_SIZE;
2875 inode->i_op = &shmem_dir_inode_operations;
2876 inode->i_fop = &simple_offset_dir_operations;
2877 simple_offset_init(shmem_get_offset_ctx(inode));
2878 break;
2879 case S_IFLNK:
2880 /*
2881 * Must not load anything in the rbtree,
2882 * mpol_free_shared_policy will not be called.
2883 */
2884 mpol_shared_policy_init(&info->policy, NULL);
2885 break;
2886 }
2887
2888 lockdep_annotate_inode_mutex_key(inode);
2889 return inode;
2890 }
2891
2892 #ifdef CONFIG_TMPFS_QUOTA
2893 static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
2894 struct super_block *sb, struct inode *dir,
2895 umode_t mode, dev_t dev, unsigned long flags)
2896 {
2897 int err;
2898 struct inode *inode;
2899
2900 inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
2901 if (IS_ERR(inode))
2902 return inode;
2903
2904 err = dquot_initialize(inode);
2905 if (err)
2906 goto errout;
2907
2908 err = dquot_alloc_inode(inode);
2909 if (err) {
2910 dquot_drop(inode);
2911 goto errout;
2912 }
2913 return inode;
2914
2915 errout:
2916 inode->i_flags |= S_NOQUOTA;
2917 iput(inode);
2918 return ERR_PTR(err);
2919 }
2920 #else
2921 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
2922 struct super_block *sb, struct inode *dir,
2923 umode_t mode, dev_t dev, unsigned long flags)
2924 {
2925 return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
2926 }
2927 #endif /* CONFIG_TMPFS_QUOTA */
2928
2929 #ifdef CONFIG_USERFAULTFD
2930 int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
2931 struct vm_area_struct *dst_vma,
2932 unsigned long dst_addr,
2933 unsigned long src_addr,
2934 uffd_flags_t flags,
2935 struct folio **foliop)
2936 {
2937 struct inode *inode = file_inode(dst_vma->vm_file);
2938 struct shmem_inode_info *info = SHMEM_I(inode);
2939 struct address_space *mapping = inode->i_mapping;
2940 gfp_t gfp = mapping_gfp_mask(mapping);
2941 pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2942 void *page_kaddr;
2943 struct folio *folio;
2944 int ret;
2945 pgoff_t max_off;
2946
2947 if (shmem_inode_acct_blocks(inode, 1)) {
2948 /*
2949 * We may have got a page, returned -ENOENT triggering a retry,
2950 * and now we find ourselves with -ENOMEM. Release the page, to
2951 * avoid a BUG_ON in our caller.
2952 */
2953 if (unlikely(*foliop)) {
2954 folio_put(*foliop);
2955 *foliop = NULL;
2956 }
2957 return -ENOMEM;
2958 }
2959
2960 if (!*foliop) {
2961 ret = -ENOMEM;
2962 folio = shmem_alloc_folio(gfp, 0, info, pgoff);
2963 if (!folio)
2964 goto out_unacct_blocks;
2965
2966 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
2967 page_kaddr = kmap_local_folio(folio, 0);
2968 /*
2969 * The read mmap_lock is held here. Despite the
2970 * mmap_lock being read recursive a deadlock is still
2971 * possible if a writer has taken a lock. For example:
2972 *
2973 * process A thread 1 takes read lock on own mmap_lock
2974 * process A thread 2 calls mmap, blocks taking write lock
2975 * process B thread 1 takes page fault, read lock on own mmap lock
2976 * process B thread 2 calls mmap, blocks taking write lock
2977 * process A thread 1 blocks taking read lock on process B
2978 * process B thread 1 blocks taking read lock on process A
2979 *
2980 * Disable page faults to prevent potential deadlock
2981 * and retry the copy outside the mmap_lock.
2982 */
2983 pagefault_disable();
2984 ret = copy_from_user(page_kaddr,
2985 (const void __user *)src_addr,
2986 PAGE_SIZE);
2987 pagefault_enable();
2988 kunmap_local(page_kaddr);
2989
2990 /* fall back to copy_from_user outside mmap_lock */
2991 if (unlikely(ret)) {
2992 *foliop = folio;
2993 ret = -ENOENT;
2994 /* don't free the page */
2995 goto out_unacct_blocks;
2996 }
2997
2998 flush_dcache_folio(folio);
2999 } else { /* ZEROPAGE */
3000 clear_user_highpage(&folio->page, dst_addr);
3001 }
3002 } else {
3003 folio = *foliop;
3004 VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
3005 *foliop = NULL;
3006 }
3007
3008 VM_BUG_ON(folio_test_locked(folio));
3009 VM_BUG_ON(folio_test_swapbacked(folio));
3010 __folio_set_locked(folio);
3011 __folio_set_swapbacked(folio);
3012 __folio_mark_uptodate(folio);
3013
3014 ret = -EFAULT;
3015 max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3016 if (unlikely(pgoff >= max_off))
3017 goto out_release;
3018
3019 ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
3020 if (ret)
3021 goto out_release;
3022 ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
3023 if (ret)
3024 goto out_release;
3025
3026 ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
3027 &folio->page, true, flags);
3028 if (ret)
3029 goto out_delete_from_cache;
3030
3031 shmem_recalc_inode(inode, 1, 0);
3032 folio_unlock(folio);
3033 return 0;
3034 out_delete_from_cache:
3035 filemap_remove_folio(folio);
3036 out_release:
3037 folio_unlock(folio);
3038 folio_put(folio);
3039 out_unacct_blocks:
3040 shmem_inode_unacct_blocks(inode, 1);
3041 return ret;
3042 }
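
/*
 * Sketch of the -ENOENT retry contract above (illustration only; the real
 * caller loop lives in mm/userfaultfd.c):
 *
 *	err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
 *				     src_addr, flags, &folio);
 *	if (err == -ENOENT) {
 *		// the folio was allocated but the copy faulted with page
 *		// faults disabled: drop mmap_lock, copy the user data into
 *		// the returned folio, then retry with *foliop still set.
 *	}
 */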
3043 #endif /* CONFIG_USERFAULTFD */
3044
3045 #ifdef CONFIG_TMPFS
3046 static const struct inode_operations shmem_symlink_inode_operations;
3047 static const struct inode_operations shmem_short_symlink_operations;
3048
3049 static int
3050 shmem_write_begin(struct file *file, struct address_space *mapping,
3051 loff_t pos, unsigned len,
3052 struct folio **foliop, void **fsdata)
3053 {
3054 struct inode *inode = mapping->host;
3055 struct shmem_inode_info *info = SHMEM_I(inode);
3056 pgoff_t index = pos >> PAGE_SHIFT;
3057 struct folio *folio;
3058 int ret = 0;
3059
3060 /* i_rwsem is held by caller */
3061 if (unlikely(info->seals & (F_SEAL_GROW |
3062 F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
3063 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
3064 return -EPERM;
3065 if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
3066 return -EPERM;
3067 }
3068
3069 ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
3070 if (ret)
3071 return ret;
3072
3073 if (folio_contain_hwpoisoned_page(folio)) {
3074 folio_unlock(folio);
3075 folio_put(folio);
3076 return -EIO;
3077 }
3078
3079 *foliop = folio;
3080 return 0;
3081 }
3082
3083 static int
3084 shmem_write_end(struct file *file, struct address_space *mapping,
3085 loff_t pos, unsigned len, unsigned copied,
3086 struct folio *folio, void *fsdata)
3087 {
3088 struct inode *inode = mapping->host;
3089
3090 if (pos + copied > inode->i_size)
3091 i_size_write(inode, pos + copied);
3092
3093 if (!folio_test_uptodate(folio)) {
3094 if (copied < folio_size(folio)) {
3095 size_t from = offset_in_folio(folio, pos);
3096 folio_zero_segments(folio, 0, from,
3097 from + copied, folio_size(folio));
3098 }
3099 folio_mark_uptodate(folio);
3100 }
3101 folio_mark_dirty(folio);
3102 folio_unlock(folio);
3103 folio_put(folio);
3104
3105 return copied;
3106 }
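
/*
 * Example of the partial-write zeroing above (added for illustration): a
 * 4-byte copy at pos 100 into a freshly allocated, not-yet-uptodate 4KB
 * folio zeroes bytes 0..99 and 104..4095 via folio_zero_segments() before
 * the folio is marked uptodate, so no stale data can be exposed by a
 * later read.
 */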
3107
3108 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3109 {
3110 struct file *file = iocb->ki_filp;
3111 struct inode *inode = file_inode(file);
3112 struct address_space *mapping = inode->i_mapping;
3113 pgoff_t index;
3114 unsigned long offset;
3115 int error = 0;
3116 ssize_t retval = 0;
3117 loff_t *ppos = &iocb->ki_pos;
3118
3119 index = *ppos >> PAGE_SHIFT;
3120 offset = *ppos & ~PAGE_MASK;
3121
3122 for (;;) {
3123 struct folio *folio = NULL;
3124 struct page *page = NULL;
3125 pgoff_t end_index;
3126 unsigned long nr, ret;
3127 loff_t i_size = i_size_read(inode);
3128
3129 end_index = i_size >> PAGE_SHIFT;
3130 if (index > end_index)
3131 break;
3132 if (index == end_index) {
3133 nr = i_size & ~PAGE_MASK;
3134 if (nr <= offset)
3135 break;
3136 }
3137
3138 error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
3139 if (error) {
3140 if (error == -EINVAL)
3141 error = 0;
3142 break;
3143 }
3144 if (folio) {
3145 folio_unlock(folio);
3146
3147 page = folio_file_page(folio, index);
3148 if (PageHWPoison(page)) {
3149 folio_put(folio);
3150 error = -EIO;
3151 break;
3152 }
3153 }
3154
3155 /*
3156 * We must evaluate after, since reads (unlike writes)
3157 * are called without i_rwsem protection against truncate
3158 */
3159 nr = PAGE_SIZE;
3160 i_size = i_size_read(inode);
3161 end_index = i_size >> PAGE_SHIFT;
3162 if (index == end_index) {
3163 nr = i_size & ~PAGE_MASK;
3164 if (nr <= offset) {
3165 if (folio)
3166 folio_put(folio);
3167 break;
3168 }
3169 }
3170 nr -= offset;
3171
3172 if (folio) {
3173 /*
3174 * If users can be writing to this page using arbitrary
3175 * virtual addresses, take care about potential aliasing
3176 * before reading the page on the kernel side.
3177 */
3178 if (mapping_writably_mapped(mapping))
3179 flush_dcache_page(page);
3180 /*
3181 * Mark the page accessed if we read the beginning.
3182 */
3183 if (!offset)
3184 folio_mark_accessed(folio);
3185 /*
3186 * Ok, we have the page, and it's up-to-date, so
3187 * now we can copy it to user space...
3188 */
3189 ret = copy_page_to_iter(page, offset, nr, to);
3190 folio_put(folio);
3191
3192 } else if (user_backed_iter(to)) {
3193 /*
3194 * Copy to user tends to be so well optimized, but
3195 * clear_user() not so much, that it is noticeably
3196 * faster to copy the zero page instead of clearing.
3197 */
3198 ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
3199 } else {
3200 /*
3201 * But submitting the same page twice in a row to
3202 * splice() - or others? - can result in confusion:
3203 * so don't attempt that optimization on pipes etc.
3204 */
3205 ret = iov_iter_zero(nr, to);
3206 }
3207
3208 retval += ret;
3209 offset += ret;
3210 index += offset >> PAGE_SHIFT;
3211 offset &= ~PAGE_MASK;
3212
3213 if (!iov_iter_count(to))
3214 break;
3215 if (ret < nr) {
3216 error = -EFAULT;
3217 break;
3218 }
3219 cond_resched();
3220 }
3221
3222 *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
3223 file_accessed(file);
3224 return retval ? retval : error;
3225 }
3226
3227 static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3228 {
3229 struct file *file = iocb->ki_filp;
3230 struct inode *inode = file->f_mapping->host;
3231 ssize_t ret;
3232
3233 inode_lock(inode);
3234 ret = generic_write_checks(iocb, from);
3235 if (ret <= 0)
3236 goto unlock;
3237 ret = file_remove_privs(file);
3238 if (ret)
3239 goto unlock;
3240 ret = file_update_time(file);
3241 if (ret)
3242 goto unlock;
3243 ret = generic_perform_write(iocb, from);
3244 unlock:
3245 inode_unlock(inode);
3246 return ret;
3247 }
3248
3249 static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
3250 struct pipe_buffer *buf)
3251 {
3252 return true;
3253 }
3254
3255 static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
3256 struct pipe_buffer *buf)
3257 {
3258 }
3259
3260 static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
3261 struct pipe_buffer *buf)
3262 {
3263 return false;
3264 }
3265
3266 static const struct pipe_buf_operations zero_pipe_buf_ops = {
3267 .release = zero_pipe_buf_release,
3268 .try_steal = zero_pipe_buf_try_steal,
3269 .get = zero_pipe_buf_get,
3270 };
3271
3272 static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
3273 loff_t fpos, size_t size)
3274 {
3275 size_t offset = fpos & ~PAGE_MASK;
3276
3277 size = min_t(size_t, size, PAGE_SIZE - offset);
3278
3279 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
3280 struct pipe_buffer *buf = pipe_head_buf(pipe);
3281
3282 *buf = (struct pipe_buffer) {
3283 .ops = &zero_pipe_buf_ops,
3284 .page = ZERO_PAGE(0),
3285 .offset = offset,
3286 .len = size,
3287 };
3288 pipe->head++;
3289 }
3290
3291 return size;
3292 }
3293
3294 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
3295 struct pipe_inode_info *pipe,
3296 size_t len, unsigned int flags)
3297 {
3298 struct inode *inode = file_inode(in);
3299 struct address_space *mapping = inode->i_mapping;
3300 struct folio *folio = NULL;
3301 size_t total_spliced = 0, used, npages, n, part;
3302 loff_t isize;
3303 int error = 0;
3304
3305 /* Work out how much data we can actually add into the pipe */
3306 used = pipe_occupancy(pipe->head, pipe->tail);
3307 npages = max_t(ssize_t, pipe->max_usage - used, 0);
3308 len = min_t(size_t, len, npages * PAGE_SIZE);
3309
3310 do {
3311 if (*ppos >= i_size_read(inode))
3312 break;
3313
3314 error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
3315 SGP_READ);
3316 if (error) {
3317 if (error == -EINVAL)
3318 error = 0;
3319 break;
3320 }
3321 if (folio) {
3322 folio_unlock(folio);
3323
3324 if (folio_test_hwpoison(folio) ||
3325 (folio_test_large(folio) &&
3326 folio_test_has_hwpoisoned(folio))) {
3327 error = -EIO;
3328 break;
3329 }
3330 }
3331
3332 /*
3333 * i_size must be checked after we know the pages are Uptodate.
3334 *
3335 * Checking i_size after the check allows us to calculate
3336 * the correct value for "nr", which means the zero-filled
3337 * part of the page is not copied back to userspace (unless
3338 * another truncate extends the file - this is desired though).
3339 */
3340 isize = i_size_read(inode);
3341 if (unlikely(*ppos >= isize))
3342 break;
3343 part = min_t(loff_t, isize - *ppos, len);
3344
3345 if (folio) {
3346 /*
3347 * If users can be writing to this page using arbitrary
3348 * virtual addresses, take care about potential aliasing
3349 * before reading the page on the kernel side.
3350 */
3351 if (mapping_writably_mapped(mapping))
3352 flush_dcache_folio(folio);
3353 folio_mark_accessed(folio);
3354 /*
3355 * Ok, we have the page, and it's up-to-date, so we can
3356 * now splice it into the pipe.
3357 */
3358 n = splice_folio_into_pipe(pipe, folio, *ppos, part);
3359 folio_put(folio);
3360 folio = NULL;
3361 } else {
3362 n = splice_zeropage_into_pipe(pipe, *ppos, part);
3363 }
3364
3365 if (!n)
3366 break;
3367 len -= n;
3368 total_spliced += n;
3369 *ppos += n;
3370 in->f_ra.prev_pos = *ppos;
3371 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
3372 break;
3373
3374 cond_resched();
3375 } while (len);
3376
3377 if (folio)
3378 folio_put(folio);
3379
3380 file_accessed(in);
3381 return total_spliced ? total_spliced : error;
3382 }
3383
3384 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
3385 {
3386 struct address_space *mapping = file->f_mapping;
3387 struct inode *inode = mapping->host;
3388
3389 if (whence != SEEK_DATA && whence != SEEK_HOLE)
3390 return generic_file_llseek_size(file, offset, whence,
3391 MAX_LFS_FILESIZE, i_size_read(inode));
3392 if (offset < 0)
3393 return -ENXIO;
3394
3395 inode_lock(inode);
3396 /* We're holding i_rwsem so we can access i_size directly */
3397 offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
3398 if (offset >= 0)
3399 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
3400 inode_unlock(inode);
3401 return offset;
3402 }
3403
3404 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
3405 loff_t len)
3406 {
3407 struct inode *inode = file_inode(file);
3408 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
3409 struct shmem_inode_info *info = SHMEM_I(inode);
3410 struct shmem_falloc shmem_falloc;
3411 pgoff_t start, index, end, undo_fallocend;
3412 int error;
3413
3414 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3415 return -EOPNOTSUPP;
3416
3417 inode_lock(inode);
3418
3419 if (mode & FALLOC_FL_PUNCH_HOLE) {
3420 struct address_space *mapping = file->f_mapping;
3421 loff_t unmap_start = round_up(offset, PAGE_SIZE);
3422 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
3423 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
3424
3425 /* protected by i_rwsem */
3426 if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
3427 error = -EPERM;
3428 goto out;
3429 }
3430
3431 shmem_falloc.waitq = &shmem_falloc_waitq;
3432 shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
3433 shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
3434 spin_lock(&inode->i_lock);
3435 inode->i_private = &shmem_falloc;
3436 spin_unlock(&inode->i_lock);
3437
3438 if ((u64)unmap_end > (u64)unmap_start)
3439 unmap_mapping_range(mapping, unmap_start,
3440 1 + unmap_end - unmap_start, 0);
3441 shmem_truncate_range(inode, offset, offset + len - 1);
3442 /* No need to unmap again: hole-punching leaves COWed pages */
3443
3444 spin_lock(&inode->i_lock);
3445 inode->i_private = NULL;
3446 wake_up_all(&shmem_falloc_waitq);
3447 WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
3448 spin_unlock(&inode->i_lock);
3449 error = 0;
3450 goto out;
3451 }
3452
3453 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
3454 error = inode_newsize_ok(inode, offset + len);
3455 if (error)
3456 goto out;
3457
3458 if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
3459 error = -EPERM;
3460 goto out;
3461 }
3462
3463 start = offset >> PAGE_SHIFT;
3464 end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
3465 /* Try to avoid a swapstorm if len is impossible to satisfy */
3466 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
3467 error = -ENOSPC;
3468 goto out;
3469 }
3470
3471 shmem_falloc.waitq = NULL;
3472 shmem_falloc.start = start;
3473 shmem_falloc.next = start;
3474 shmem_falloc.nr_falloced = 0;
3475 shmem_falloc.nr_unswapped = 0;
3476 spin_lock(&inode->i_lock);
3477 inode->i_private = &shmem_falloc;
3478 spin_unlock(&inode->i_lock);
3479
3480 /*
3481 * info->fallocend is only relevant when huge pages might be
3482 * involved: to prevent split_huge_page() freeing fallocated
3483 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
3484 */
3485 undo_fallocend = info->fallocend;
3486 if (info->fallocend < end)
3487 info->fallocend = end;
3488
3489 for (index = start; index < end; ) {
3490 struct folio *folio;
3491
3492 /*
3493 * Check for fatal signal so that we abort early in OOM
3494 * situations. We don't want to abort in case of non-fatal
3495 * signals as large fallocate can take noticeable time and
3496 * e.g. periodic timers may result in fallocate constantly
3497 * restarting.
3498 */
3499 if (fatal_signal_pending(current))
3500 error = -EINTR;
3501 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
3502 error = -ENOMEM;
3503 else
3504 error = shmem_get_folio(inode, index, offset + len,
3505 &folio, SGP_FALLOC);
3506 if (error) {
3507 info->fallocend = undo_fallocend;
3508 /* Remove the !uptodate folios we added */
3509 if (index > start) {
3510 shmem_undo_range(inode,
3511 (loff_t)start << PAGE_SHIFT,
3512 ((loff_t)index << PAGE_SHIFT) - 1, true);
3513 }
3514 goto undone;
3515 }
3516
3517 /*
3518 * Here is a more important optimization than it appears:
3519 * a second SGP_FALLOC on the same large folio will clear it,
3520 * making it uptodate and un-undoable if we fail later.
3521 */
3522 index = folio_next_index(folio);
3523 /* Beware 32-bit wraparound */
3524 if (!index)
3525 index--;
3526
3527 /*
3528 * Inform shmem_writepage() how far we have reached.
3529 * No need for lock or barrier: we have the page lock.
3530 */
3531 if (!folio_test_uptodate(folio))
3532 shmem_falloc.nr_falloced += index - shmem_falloc.next;
3533 shmem_falloc.next = index;
3534
3535 /*
3536 * If !uptodate, leave it that way so that freeable folios
3537 * can be recognized if we need to rollback on error later.
3538 * But mark it dirty so that memory pressure will swap rather
3539 * than free the folios we are allocating (and SGP_CACHE folios
3540 * might still be clean: we now need to mark those dirty too).
3541 */
3542 folio_mark_dirty(folio);
3543 folio_unlock(folio);
3544 folio_put(folio);
3545 cond_resched();
3546 }
3547
3548 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
3549 i_size_write(inode, offset + len);
3550 undone:
3551 spin_lock(&inode->i_lock);
3552 inode->i_private = NULL;
3553 spin_unlock(&inode->i_lock);
3554 out:
3555 if (!error)
3556 file_modified(file);
3557 inode_unlock(inode);
3558 return error;
3559 }
3560
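/*
 * tmpfs has no backing device: report page-sized "blocks" against the
 * mount's max_blocks limit, and derive free inodes from the shared
 * inode/xattr space pool (free_ispace) in BOGO_INODE_SIZE units.
 */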
3561 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
3562 {
3563 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
3564
3565 buf->f_type = TMPFS_MAGIC;
3566 buf->f_bsize = PAGE_SIZE;
3567 buf->f_namelen = NAME_MAX;
3568 if (sbinfo->max_blocks) {
3569 buf->f_blocks = sbinfo->max_blocks;
3570 buf->f_bavail =
3571 buf->f_bfree = sbinfo->max_blocks -
3572 percpu_counter_sum(&sbinfo->used_blocks);
3573 }
3574 if (sbinfo->max_inodes) {
3575 buf->f_files = sbinfo->max_inodes;
3576 buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
3577 }
3578 /* else leave those fields 0 like simple_statfs */
3579
3580 buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
3581
3582 return 0;
3583 }
3584
3585 /*
3586 * File creation. Allocate an inode, and we're done.
3587 */
3588 static int
3589 shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
3590 struct dentry *dentry, umode_t mode, dev_t dev)
3591 {
3592 struct inode *inode;
3593 int error;
3594
3595 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
3596 if (IS_ERR(inode))
3597 return PTR_ERR(inode);
3598
3599 error = simple_acl_create(dir, inode);
3600 if (error)
3601 goto out_iput;
3602 error = security_inode_init_security(inode, dir, &dentry->d_name,
3603 shmem_initxattrs, NULL);
3604 if (error && error != -EOPNOTSUPP)
3605 goto out_iput;
3606
3607 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3608 if (error)
3609 goto out_iput;
3610
3611 dir->i_size += BOGO_DIRENT_SIZE;
3612 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
3613 inode_inc_iversion(dir);
3614 d_instantiate(dentry, inode);
3615 dget(dentry); /* Extra count - pin the dentry in core */
3616 return error;
3617
3618 out_iput:
3619 iput(inode);
3620 return error;
3621 }
3622
3623 static int
3624 shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
3625 struct file *file, umode_t mode)
3626 {
3627 struct inode *inode;
3628 int error;
3629
3630 inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
3631 if (IS_ERR(inode)) {
3632 error = PTR_ERR(inode);
3633 goto err_out;
3634 }
3635 error = security_inode_init_security(inode, dir, NULL,
3636 shmem_initxattrs, NULL);
3637 if (error && error != -EOPNOTSUPP)
3638 goto out_iput;
3639 error = simple_acl_create(dir, inode);
3640 if (error)
3641 goto out_iput;
3642 d_tmpfile(file, inode);
3643
3644 err_out:
3645 return finish_open_simple(file, error);
3646 out_iput:
3647 iput(inode);
3648 return error;
3649 }
3650
3651 static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
3652 struct dentry *dentry, umode_t mode)
3653 {
3654 int error;
3655
3656 error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
3657 if (error)
3658 return error;
3659 inc_nlink(dir);
3660 return 0;
3661 }
3662
3663 static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3664 struct dentry *dentry, umode_t mode, bool excl)
3665 {
3666 return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3667 }
3668
3669 /*
3670 * Link a file.
3671 */
3672 static int shmem_link(struct dentry *old_dentry, struct inode *dir,
3673 struct dentry *dentry)
3674 {
3675 struct inode *inode = d_inode(old_dentry);
3676 int ret = 0;
3677
3678 /*
3679 * No ordinary (disk based) filesystem counts links as inodes;
3680 * but each new link needs a new dentry, pinning lowmem, and
3681 * tmpfs dentries cannot be pruned until they are unlinked.
3682 * But if an O_TMPFILE file is linked into the tmpfs, the
3683 * first link must skip that, to get the accounting right.
3684 */
3685 if (inode->i_nlink) {
3686 ret = shmem_reserve_inode(inode->i_sb, NULL);
3687 if (ret)
3688 goto out;
3689 }
3690
3691 ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3692 if (ret) {
3693 if (inode->i_nlink)
3694 shmem_free_inode(inode->i_sb, 0);
3695 goto out;
3696 }
3697
3698 dir->i_size += BOGO_DIRENT_SIZE;
3699 inode_set_mtime_to_ts(dir,
3700 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
3701 inode_inc_iversion(dir);
3702 inc_nlink(inode);
3703 ihold(inode); /* New dentry reference */
3704 dget(dentry); /* Extra pinning count for the created dentry */
3705 d_instantiate(dentry, inode);
3706 out:
3707 return ret;
3708 }
3709
3710 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
3711 {
3712 struct inode *inode = d_inode(dentry);
3713
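/*
 * Each extra hard link took an inode reservation in shmem_link();
 * give that back now, leaving the final link's reservation to be
 * released when the inode is evicted.
 */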
3714 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
3715 shmem_free_inode(inode->i_sb, 0);
3716
3717 simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
3718
3719 dir->i_size -= BOGO_DIRENT_SIZE;
3720 inode_set_mtime_to_ts(dir,
3721 inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
3722 inode_inc_iversion(dir);
3723 drop_nlink(inode);
3724 dput(dentry); /* Undo the count from "create" - does all the work */
3725 return 0;
3726 }
3727
3728 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
3729 {
3730 if (!simple_empty(dentry))
3731 return -ENOTEMPTY;
3732
3733 drop_nlink(d_inode(dentry));
3734 drop_nlink(dir);
3735 return shmem_unlink(dir, dentry);
3736 }
3737
3738 static int shmem_whiteout(struct mnt_idmap *idmap,
3739 struct inode *old_dir, struct dentry *old_dentry)
3740 {
3741 struct dentry *whiteout;
3742 int error;
3743
3744 whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3745 if (!whiteout)
3746 return -ENOMEM;
3747
3748 error = shmem_mknod(idmap, old_dir, whiteout,
3749 S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3750 dput(whiteout);
3751 if (error)
3752 return error;
3753
3754 /*
3755 * Cheat and hash the whiteout while the old dentry is still in
3756 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3757 *
3758 * d_lookup() will consistently find one of them at this point,
3759 * not sure which one, but that isn't even important.
3760 */
3761 d_rehash(whiteout);
3762 return 0;
3763 }
3764
3765 /*
3766 * The VFS layer already does all the dentry stuff for rename,
3767 * we just have to decrement the usage count for the target if
3768 * it exists so that the VFS layer correctly frees it when it
3769 * gets overwritten.
3770 */
3771 static int shmem_rename2(struct mnt_idmap *idmap,
3772 struct inode *old_dir, struct dentry *old_dentry,
3773 struct inode *new_dir, struct dentry *new_dentry,
3774 unsigned int flags)
3775 {
3776 struct inode *inode = d_inode(old_dentry);
3777 int they_are_dirs = S_ISDIR(inode->i_mode);
3778 int error;
3779
3780 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3781 return -EINVAL;
3782
3783 if (flags & RENAME_EXCHANGE)
3784 return simple_offset_rename_exchange(old_dir, old_dentry,
3785 new_dir, new_dentry);
3786
3787 if (!simple_empty(new_dentry))
3788 return -ENOTEMPTY;
3789
3790 if (flags & RENAME_WHITEOUT) {
3791 error = shmem_whiteout(idmap, old_dir, old_dentry);
3792 if (error)
3793 return error;
3794 }
3795
3796 error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
3797 if (error)
3798 return error;
3799
3800 if (d_really_is_positive(new_dentry)) {
3801 (void) shmem_unlink(new_dir, new_dentry);
3802 if (they_are_dirs) {
3803 drop_nlink(d_inode(new_dentry));
3804 drop_nlink(old_dir);
3805 }
3806 } else if (they_are_dirs) {
3807 drop_nlink(old_dir);
3808 inc_nlink(new_dir);
3809 }
3810
3811 old_dir->i_size -= BOGO_DIRENT_SIZE;
3812 new_dir->i_size += BOGO_DIRENT_SIZE;
3813 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
3814 inode_inc_iversion(old_dir);
3815 inode_inc_iversion(new_dir);
3816 return 0;
3817 }
3818
3819 static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
3820 struct dentry *dentry, const char *symname)
3821 {
3822 int error;
3823 int len;
3824 struct inode *inode;
3825 struct folio *folio;
3826
3827 len = strlen(symname) + 1;
3828 if (len > PAGE_SIZE)
3829 return -ENAMETOOLONG;
3830
3831 inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
3832 VM_NORESERVE);
3833 if (IS_ERR(inode))
3834 return PTR_ERR(inode);
3835
3836 error = security_inode_init_security(inode, dir, &dentry->d_name,
3837 shmem_initxattrs, NULL);
3838 if (error && error != -EOPNOTSUPP)
3839 goto out_iput;
3840
3841 error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3842 if (error)
3843 goto out_iput;
3844
3845 inode->i_size = len-1;
3846 if (len <= SHORT_SYMLINK_LEN) {
3847 inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3848 if (!inode->i_link) {
3849 error = -ENOMEM;
3850 goto out_remove_offset;
3851 }
3852 inode->i_op = &shmem_short_symlink_operations;
3853 } else {
3854 inode_nohighmem(inode);
3855 inode->i_mapping->a_ops = &shmem_aops;
3856 error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
3857 if (error)
3858 goto out_remove_offset;
3859 inode->i_op = &shmem_symlink_inode_operations;
3860 memcpy(folio_address(folio), symname, len);
3861 folio_mark_uptodate(folio);
3862 folio_mark_dirty(folio);
3863 folio_unlock(folio);
3864 folio_put(folio);
3865 }
3866 dir->i_size += BOGO_DIRENT_SIZE;
3867 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
3868 inode_inc_iversion(dir);
3869 d_instantiate(dentry, inode);
3870 dget(dentry);
3871 return 0;
3872
3873 out_remove_offset:
3874 simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
3875 out_iput:
3876 iput(inode);
3877 return error;
3878 }
3879
3880 static void shmem_put_link(void *arg)
3881 {
3882 folio_mark_accessed(arg);
3883 folio_put(arg);
3884 }
3885
3886 static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
3887 struct delayed_call *done)
3888 {
3889 struct folio *folio = NULL;
3890 int error;
3891
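/*
 * A NULL dentry means RCU-walk: we must not sleep, so only use a
 * folio that is already present and uptodate, otherwise return
 * -ECHILD and let the caller retry in ref-walk mode.
 */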
3892 if (!dentry) {
3893 folio = filemap_get_folio(inode->i_mapping, 0);
3894 if (IS_ERR(folio))
3895 return ERR_PTR(-ECHILD);
3896 if (PageHWPoison(folio_page(folio, 0)) ||
3897 !folio_test_uptodate(folio)) {
3898 folio_put(folio);
3899 return ERR_PTR(-ECHILD);
3900 }
3901 } else {
3902 error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
3903 if (error)
3904 return ERR_PTR(error);
3905 if (!folio)
3906 return ERR_PTR(-ECHILD);
3907 if (PageHWPoison(folio_page(folio, 0))) {
3908 folio_unlock(folio);
3909 folio_put(folio);
3910 return ERR_PTR(-ECHILD);
3911 }
3912 folio_unlock(folio);
3913 }
3914 set_delayed_call(done, shmem_put_link, folio);
3915 return folio_address(folio);
3916 }
3917
3918 #ifdef CONFIG_TMPFS_XATTR
3919
3920 static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
3921 {
3922 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3923
3924 fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
3925
3926 return 0;
3927 }
3928
3929 static int shmem_fileattr_set(struct mnt_idmap *idmap,
3930 struct dentry *dentry, struct fileattr *fa)
3931 {
3932 struct inode *inode = d_inode(dentry);
3933 struct shmem_inode_info *info = SHMEM_I(inode);
3934
3935 if (fileattr_has_fsx(fa))
3936 return -EOPNOTSUPP;
3937 if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
3938 return -EOPNOTSUPP;
3939
3940 info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
3941 (fa->flags & SHMEM_FL_USER_MODIFIABLE);
3942
3943 shmem_set_inode_flags(inode, info->fsflags);
3944 inode_set_ctime_current(inode);
3945 inode_inc_iversion(inode);
3946 return 0;
3947 }
3948
3949 /*
3950 * Superblocks without xattr inode operations may get some security.* xattr
3951 * support from the LSM "for free". As soon as we have any other xattrs
3952 * like ACLs, we also need to implement the security.* handlers at
3953 * filesystem level, though.
3954 */
3955
3956 /*
3957 * Callback for security_inode_init_security() for acquiring xattrs.
3958 */
3959 static int shmem_initxattrs(struct inode *inode,
3960 const struct xattr *xattr_array, void *fs_info)
3961 {
3962 struct shmem_inode_info *info = SHMEM_I(inode);
3963 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
3964 const struct xattr *xattr;
3965 struct simple_xattr *new_xattr;
3966 size_t ispace = 0;
3967 size_t len;
3968
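/*
 * Security xattrs are charged against the same free_ispace pool as
 * inodes; the charge is refunded below if any allocation fails.
 */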
3969 if (sbinfo->max_inodes) {
3970 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3971 ispace += simple_xattr_space(xattr->name,
3972 xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
3973 }
3974 if (ispace) {
3975 raw_spin_lock(&sbinfo->stat_lock);
3976 if (sbinfo->free_ispace < ispace)
3977 ispace = 0;
3978 else
3979 sbinfo->free_ispace -= ispace;
3980 raw_spin_unlock(&sbinfo->stat_lock);
3981 if (!ispace)
3982 return -ENOSPC;
3983 }
3984 }
3985
3986 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3987 new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3988 if (!new_xattr)
3989 break;
3990
3991 len = strlen(xattr->name) + 1;
3992 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3993 GFP_KERNEL_ACCOUNT);
3994 if (!new_xattr->name) {
3995 kvfree(new_xattr);
3996 break;
3997 }
3998
3999 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
4000 XATTR_SECURITY_PREFIX_LEN);
4001 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
4002 xattr->name, len);
4003
4004 simple_xattr_add(&info->xattrs, new_xattr);
4005 }
4006
4007 if (xattr->name != NULL) {
4008 if (ispace) {
4009 raw_spin_lock(&sbinfo->stat_lock);
4010 sbinfo->free_ispace += ispace;
4011 raw_spin_unlock(&sbinfo->stat_lock);
4012 }
4013 simple_xattrs_free(&info->xattrs, NULL);
4014 return -ENOMEM;
4015 }
4016
4017 return 0;
4018 }
4019
4020 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
4021 struct dentry *unused, struct inode *inode,
4022 const char *name, void *buffer, size_t size)
4023 {
4024 struct shmem_inode_info *info = SHMEM_I(inode);
4025
4026 name = xattr_full_name(handler, name);
4027 return simple_xattr_get(&info->xattrs, name, buffer, size);
4028 }
4029
4030 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
4031 struct mnt_idmap *idmap,
4032 struct dentry *unused, struct inode *inode,
4033 const char *name, const void *value,
4034 size_t size, int flags)
4035 {
4036 struct shmem_inode_info *info = SHMEM_I(inode);
4037 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4038 struct simple_xattr *old_xattr;
4039 size_t ispace = 0;
4040
4041 name = xattr_full_name(handler, name);
4042 if (value && sbinfo->max_inodes) {
4043 ispace = simple_xattr_space(name, size);
4044 raw_spin_lock(&sbinfo->stat_lock);
4045 if (sbinfo->free_ispace < ispace)
4046 ispace = 0;
4047 else
4048 sbinfo->free_ispace -= ispace;
4049 raw_spin_unlock(&sbinfo->stat_lock);
4050 if (!ispace)
4051 return -ENOSPC;
4052 }
4053
4054 old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
4055 if (!IS_ERR(old_xattr)) {
4056 ispace = 0;
4057 if (old_xattr && sbinfo->max_inodes)
4058 ispace = simple_xattr_space(old_xattr->name,
4059 old_xattr->size);
4060 simple_xattr_free(old_xattr);
4061 old_xattr = NULL;
4062 inode_set_ctime_current(inode);
4063 inode_inc_iversion(inode);
4064 }
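/*
 * On success, ispace now holds the replaced xattr's charge (if any);
 * on failure, the unused charge for the new value.  Refund it either
 * way.  old_xattr is NULL on success, so PTR_ERR() returns 0.
 */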
4065 if (ispace) {
4066 raw_spin_lock(&sbinfo->stat_lock);
4067 sbinfo->free_ispace += ispace;
4068 raw_spin_unlock(&sbinfo->stat_lock);
4069 }
4070 return PTR_ERR(old_xattr);
4071 }
4072
4073 static const struct xattr_handler shmem_security_xattr_handler = {
4074 .prefix = XATTR_SECURITY_PREFIX,
4075 .get = shmem_xattr_handler_get,
4076 .set = shmem_xattr_handler_set,
4077 };
4078
4079 static const struct xattr_handler shmem_trusted_xattr_handler = {
4080 .prefix = XATTR_TRUSTED_PREFIX,
4081 .get = shmem_xattr_handler_get,
4082 .set = shmem_xattr_handler_set,
4083 };
4084
4085 static const struct xattr_handler shmem_user_xattr_handler = {
4086 .prefix = XATTR_USER_PREFIX,
4087 .get = shmem_xattr_handler_get,
4088 .set = shmem_xattr_handler_set,
4089 };
4090
4091 static const struct xattr_handler * const shmem_xattr_handlers[] = {
4092 &shmem_security_xattr_handler,
4093 &shmem_trusted_xattr_handler,
4094 &shmem_user_xattr_handler,
4095 NULL
4096 };
4097
4098 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
4099 {
4100 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4101 return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
4102 }
4103 #endif /* CONFIG_TMPFS_XATTR */
4104
4105 static const struct inode_operations shmem_short_symlink_operations = {
4106 .getattr = shmem_getattr,
4107 .setattr = shmem_setattr,
4108 .get_link = simple_get_link,
4109 #ifdef CONFIG_TMPFS_XATTR
4110 .listxattr = shmem_listxattr,
4111 #endif
4112 };
4113
4114 static const struct inode_operations shmem_symlink_inode_operations = {
4115 .getattr = shmem_getattr,
4116 .setattr = shmem_setattr,
4117 .get_link = shmem_get_link,
4118 #ifdef CONFIG_TMPFS_XATTR
4119 .listxattr = shmem_listxattr,
4120 #endif
4121 };
4122
4123 static struct dentry *shmem_get_parent(struct dentry *child)
4124 {
4125 return ERR_PTR(-ESTALE);
4126 }
4127
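/*
 * NFS file handle layout: fh[0] = i_generation, fh[1] = low 32 bits of
 * i_ino, fh[2] = high 32 bits.  Inodes are hashed under
 * i_ino + i_generation so that ilookup5() can find them again.
 */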
4128 static int shmem_match(struct inode *ino, void *vfh)
4129 {
4130 __u32 *fh = vfh;
4131 __u64 inum = fh[2];
4132 inum = (inum << 32) | fh[1];
4133 return ino->i_ino == inum && fh[0] == ino->i_generation;
4134 }
4135
4136 /* Find any alias of inode, but prefer a hashed alias */
4137 static struct dentry *shmem_find_alias(struct inode *inode)
4138 {
4139 struct dentry *alias = d_find_alias(inode);
4140
4141 return alias ?: d_find_any_alias(inode);
4142 }
4143
4144 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
4145 struct fid *fid, int fh_len, int fh_type)
4146 {
4147 struct inode *inode;
4148 struct dentry *dentry = NULL;
4149 u64 inum;
4150
4151 if (fh_len < 3)
4152 return NULL;
4153
4154 inum = fid->raw[2];
4155 inum = (inum << 32) | fid->raw[1];
4156
4157 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
4158 shmem_match, fid->raw);
4159 if (inode) {
4160 dentry = shmem_find_alias(inode);
4161 iput(inode);
4162 }
4163
4164 return dentry;
4165 }
4166
4167 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
4168 struct inode *parent)
4169 {
4170 if (*len < 3) {
4171 *len = 3;
4172 return FILEID_INVALID;
4173 }
4174
4175 if (inode_unhashed(inode)) {
4176 /* Unfortunately insert_inode_hash is not idempotent,
4177 * so as we hash inodes here rather than at creation
4178 * time, we need a lock to ensure we only try
4179 * to do it once
4180 */
4181 static DEFINE_SPINLOCK(lock);
4182 spin_lock(&lock);
4183 if (inode_unhashed(inode))
4184 __insert_inode_hash(inode,
4185 inode->i_ino + inode->i_generation);
4186 spin_unlock(&lock);
4187 }
4188
4189 fh[0] = inode->i_generation;
4190 fh[1] = inode->i_ino;
4191 fh[2] = ((__u64)inode->i_ino) >> 32;
4192
4193 *len = 3;
4194 return 1;
4195 }
4196
4197 static const struct export_operations shmem_export_ops = {
4198 .get_parent = shmem_get_parent,
4199 .encode_fh = shmem_encode_fh,
4200 .fh_to_dentry = shmem_fh_to_dentry,
4201 };
4202
4203 enum shmem_param {
4204 Opt_gid,
4205 Opt_huge,
4206 Opt_mode,
4207 Opt_mpol,
4208 Opt_nr_blocks,
4209 Opt_nr_inodes,
4210 Opt_size,
4211 Opt_uid,
4212 Opt_inode32,
4213 Opt_inode64,
4214 Opt_noswap,
4215 Opt_quota,
4216 Opt_usrquota,
4217 Opt_grpquota,
4218 Opt_usrquota_block_hardlimit,
4219 Opt_usrquota_inode_hardlimit,
4220 Opt_grpquota_block_hardlimit,
4221 Opt_grpquota_inode_hardlimit,
4222 };
4223
4224 static const struct constant_table shmem_param_enums_huge[] = {
4225 {"never", SHMEM_HUGE_NEVER },
4226 {"always", SHMEM_HUGE_ALWAYS },
4227 {"within_size", SHMEM_HUGE_WITHIN_SIZE },
4228 {"advise", SHMEM_HUGE_ADVISE },
4229 {}
4230 };
4231
4232 const struct fs_parameter_spec shmem_fs_parameters[] = {
4233 fsparam_gid ("gid", Opt_gid),
4234 fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
4235 fsparam_u32oct("mode", Opt_mode),
4236 fsparam_string("mpol", Opt_mpol),
4237 fsparam_string("nr_blocks", Opt_nr_blocks),
4238 fsparam_string("nr_inodes", Opt_nr_inodes),
4239 fsparam_string("size", Opt_size),
4240 fsparam_uid ("uid", Opt_uid),
4241 fsparam_flag ("inode32", Opt_inode32),
4242 fsparam_flag ("inode64", Opt_inode64),
4243 fsparam_flag ("noswap", Opt_noswap),
4244 #ifdef CONFIG_TMPFS_QUOTA
4245 fsparam_flag ("quota", Opt_quota),
4246 fsparam_flag ("usrquota", Opt_usrquota),
4247 fsparam_flag ("grpquota", Opt_grpquota),
4248 fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
4249 fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
4250 fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
4251 fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
4252 #endif
4253 {}
4254 };
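/*
 * Example mount using these parameters (illustrative only):
 *	mount -t tmpfs -o size=512m,nr_inodes=10k,mode=1777 tmpfs /mnt
 */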
4255
4256 static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
4257 {
4258 struct shmem_options *ctx = fc->fs_private;
4259 struct fs_parse_result result;
4260 unsigned long long size;
4261 char *rest;
4262 int opt;
4263 kuid_t kuid;
4264 kgid_t kgid;
4265
4266 opt = fs_parse(fc, shmem_fs_parameters, param, &result);
4267 if (opt < 0)
4268 return opt;
4269
4270 switch (opt) {
4271 case Opt_size:
4272 size = memparse(param->string, &rest);
4273 if (*rest == '%') {
4274 size <<= PAGE_SHIFT;
4275 size *= totalram_pages();
4276 do_div(size, 100);
4277 rest++;
4278 }
4279 if (*rest)
4280 goto bad_value;
4281 ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
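/* e.g. "size=50%" on an 8G machine: size = 4G, so 1048576 blocks with 4K pages */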
4282 ctx->seen |= SHMEM_SEEN_BLOCKS;
4283 break;
4284 case Opt_nr_blocks:
4285 ctx->blocks = memparse(param->string, &rest);
4286 if (*rest || ctx->blocks > LONG_MAX)
4287 goto bad_value;
4288 ctx->seen |= SHMEM_SEEN_BLOCKS;
4289 break;
4290 case Opt_nr_inodes:
4291 ctx->inodes = memparse(param->string, &rest);
4292 if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
4293 goto bad_value;
4294 ctx->seen |= SHMEM_SEEN_INODES;
4295 break;
4296 case Opt_mode:
4297 ctx->mode = result.uint_32 & 07777;
4298 break;
4299 case Opt_uid:
4300 kuid = result.uid;
4301
4302 /*
4303 * The requested uid must be representable in the
4304 * filesystem's idmapping.
4305 */
4306 if (!kuid_has_mapping(fc->user_ns, kuid))
4307 goto bad_value;
4308
4309 ctx->uid = kuid;
4310 break;
4311 case Opt_gid:
4312 kgid = result.gid;
4313
4314 /*
4315 * The requested gid must be representable in the
4316 * filesystem's idmapping.
4317 */
4318 if (!kgid_has_mapping(fc->user_ns, kgid))
4319 goto bad_value;
4320
4321 ctx->gid = kgid;
4322 break;
4323 case Opt_huge:
4324 ctx->huge = result.uint_32;
4325 if (ctx->huge != SHMEM_HUGE_NEVER &&
4326 !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
4327 has_transparent_hugepage()))
4328 goto unsupported_parameter;
4329 ctx->seen |= SHMEM_SEEN_HUGE;
4330 break;
4331 case Opt_mpol:
4332 if (IS_ENABLED(CONFIG_NUMA)) {
4333 mpol_put(ctx->mpol);
4334 ctx->mpol = NULL;
4335 if (mpol_parse_str(param->string, &ctx->mpol))
4336 goto bad_value;
4337 break;
4338 }
4339 goto unsupported_parameter;
4340 case Opt_inode32:
4341 ctx->full_inums = false;
4342 ctx->seen |= SHMEM_SEEN_INUMS;
4343 break;
4344 case Opt_inode64:
4345 if (sizeof(ino_t) < 8) {
4346 return invalfc(fc,
4347 "Cannot use inode64 with <64bit inums in kernel\n");
4348 }
4349 ctx->full_inums = true;
4350 ctx->seen |= SHMEM_SEEN_INUMS;
4351 break;
4352 case Opt_noswap:
4353 if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
4354 return invalfc(fc,
4355 "Turning off swap in unprivileged tmpfs mounts unsupported");
4356 }
4357 ctx->noswap = true;
4358 ctx->seen |= SHMEM_SEEN_NOSWAP;
4359 break;
4360 case Opt_quota:
4361 if (fc->user_ns != &init_user_ns)
4362 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4363 ctx->seen |= SHMEM_SEEN_QUOTA;
4364 ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
4365 break;
4366 case Opt_usrquota:
4367 if (fc->user_ns != &init_user_ns)
4368 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4369 ctx->seen |= SHMEM_SEEN_QUOTA;
4370 ctx->quota_types |= QTYPE_MASK_USR;
4371 break;
4372 case Opt_grpquota:
4373 if (fc->user_ns != &init_user_ns)
4374 return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4375 ctx->seen |= SHMEM_SEEN_QUOTA;
4376 ctx->quota_types |= QTYPE_MASK_GRP;
4377 break;
4378 case Opt_usrquota_block_hardlimit:
4379 size = memparse(param->string, &rest);
4380 if (*rest || !size)
4381 goto bad_value;
4382 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4383 return invalfc(fc,
4384 "User quota block hardlimit too large.");
4385 ctx->qlimits.usrquota_bhardlimit = size;
4386 break;
4387 case Opt_grpquota_block_hardlimit:
4388 size = memparse(param->string, &rest);
4389 if (*rest || !size)
4390 goto bad_value;
4391 if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4392 return invalfc(fc,
4393 "Group quota block hardlimit too large.");
4394 ctx->qlimits.grpquota_bhardlimit = size;
4395 break;
4396 case Opt_usrquota_inode_hardlimit:
4397 size = memparse(param->string, &rest);
4398 if (*rest || !size)
4399 goto bad_value;
4400 if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4401 return invalfc(fc,
4402 "User quota inode hardlimit too large.");
4403 ctx->qlimits.usrquota_ihardlimit = size;
4404 break;
4405 case Opt_grpquota_inode_hardlimit:
4406 size = memparse(param->string, &rest);
4407 if (*rest || !size)
4408 goto bad_value;
4409 if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4410 return invalfc(fc,
4411 "Group quota inode hardlimit too large.");
4412 ctx->qlimits.grpquota_ihardlimit = size;
4413 break;
4414 }
4415 return 0;
4416
4417 unsupported_parameter:
4418 return invalfc(fc, "Unsupported parameter '%s'", param->key);
4419 bad_value:
4420 return invalfc(fc, "Bad value for '%s'", param->key);
4421 }
4422
4423 static int shmem_parse_options(struct fs_context *fc, void *data)
4424 {
4425 char *options = data;
4426
4427 if (options) {
4428 int err = security_sb_eat_lsm_opts(options, &fc->security);
4429 if (err)
4430 return err;
4431 }
4432
4433 while (options != NULL) {
4434 char *this_char = options;
4435 for (;;) {
4436 /*
4437 * NUL-terminate this option: unfortunately,
4438 * mount options form a comma-separated list,
4439 * but mpol's nodelist may also contain commas.
4440 */
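/* e.g. "mpol=bind:0,2,size=1g" splits into "mpol=bind:0,2" and "size=1g" */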
4441 options = strchr(options, ',');
4442 if (options == NULL)
4443 break;
4444 options++;
4445 if (!isdigit(*options)) {
4446 options[-1] = '\0';
4447 break;
4448 }
4449 }
4450 if (*this_char) {
4451 char *value = strchr(this_char, '=');
4452 size_t len = 0;
4453 int err;
4454
4455 if (value) {
4456 *value++ = '\0';
4457 len = strlen(value);
4458 }
4459 err = vfs_parse_fs_string(fc, this_char, value, len);
4460 if (err < 0)
4461 return err;
4462 }
4463 }
4464 return 0;
4465 }
4466
4467 /*
4468 * Reconfigure a shmem filesystem.
4469 */
4470 static int shmem_reconfigure(struct fs_context *fc)
4471 {
4472 struct shmem_options *ctx = fc->fs_private;
4473 struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
4474 unsigned long used_isp;
4475 struct mempolicy *mpol = NULL;
4476 const char *err;
4477
4478 raw_spin_lock(&sbinfo->stat_lock);
4479 used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
4480
4481 if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
4482 if (!sbinfo->max_blocks) {
4483 err = "Cannot retroactively limit size";
4484 goto out;
4485 }
4486 if (percpu_counter_compare(&sbinfo->used_blocks,
4487 ctx->blocks) > 0) {
4488 err = "Too small a size for current use";
4489 goto out;
4490 }
4491 }
4492 if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
4493 if (!sbinfo->max_inodes) {
4494 err = "Cannot retroactively limit inodes";
4495 goto out;
4496 }
4497 if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
4498 err = "Too few inodes for current use";
4499 goto out;
4500 }
4501 }
4502
4503 if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
4504 sbinfo->next_ino > UINT_MAX) {
4505 err = "Current inum too high to switch to 32-bit inums";
4506 goto out;
4507 }
4508 if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
4509 err = "Cannot disable swap on remount";
4510 goto out;
4511 }
4512 if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
4513 err = "Cannot enable swap on remount if it was disabled on first mount";
4514 goto out;
4515 }
4516
4517 if (ctx->seen & SHMEM_SEEN_QUOTA &&
4518 !sb_any_quota_loaded(fc->root->d_sb)) {
4519 err = "Cannot enable quota on remount";
4520 goto out;
4521 }
4522
4523 #ifdef CONFIG_TMPFS_QUOTA
4524 #define CHANGED_LIMIT(name) \
4525 (ctx->qlimits.name## hardlimit && \
4526 (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
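/* e.g. CHANGED_LIMIT(usrquota_b) compares the ctx and sbinfo usrquota_bhardlimit */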
4527
4528 if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
4529 CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
4530 err = "Cannot change global quota limit on remount";
4531 goto out;
4532 }
4533 #endif /* CONFIG_TMPFS_QUOTA */
4534
4535 if (ctx->seen & SHMEM_SEEN_HUGE)
4536 sbinfo->huge = ctx->huge;
4537 if (ctx->seen & SHMEM_SEEN_INUMS)
4538 sbinfo->full_inums = ctx->full_inums;
4539 if (ctx->seen & SHMEM_SEEN_BLOCKS)
4540 sbinfo->max_blocks = ctx->blocks;
4541 if (ctx->seen & SHMEM_SEEN_INODES) {
4542 sbinfo->max_inodes = ctx->inodes;
4543 sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
4544 }
4545
4546 /*
4547 * Preserve previous mempolicy unless mpol remount option was specified.
4548 */
4549 if (ctx->mpol) {
4550 mpol = sbinfo->mpol;
4551 sbinfo->mpol = ctx->mpol; /* transfers initial ref */
4552 ctx->mpol = NULL;
4553 }
4554
4555 if (ctx->noswap)
4556 sbinfo->noswap = true;
4557
4558 raw_spin_unlock(&sbinfo->stat_lock);
4559 mpol_put(mpol);
4560 return 0;
4561 out:
4562 raw_spin_unlock(&sbinfo->stat_lock);
4563 return invalfc(fc, "%s", err);
4564 }
4565
4566 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
4567 {
4568 struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
4569 struct mempolicy *mpol;
4570
4571 if (sbinfo->max_blocks != shmem_default_max_blocks())
4572 seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
4573 if (sbinfo->max_inodes != shmem_default_max_inodes())
4574 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
4575 if (sbinfo->mode != (0777 | S_ISVTX))
4576 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
4577 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
4578 seq_printf(seq, ",uid=%u",
4579 from_kuid_munged(&init_user_ns, sbinfo->uid));
4580 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
4581 seq_printf(seq, ",gid=%u",
4582 from_kgid_munged(&init_user_ns, sbinfo->gid));
4583
4584 /*
4585 * Showing inode{64,32} might be useful even if it's the system default,
4586 * since then people don't have to resort to checking both here and
4587 * /proc/config.gz to confirm 64-bit inums were successfully applied
4588 * (which may not even exist if IKCONFIG_PROC isn't enabled).
4589 *
4590 * We hide it when inode64 isn't the default and we are using 32-bit
4591 * inodes, since that probably just means the feature isn't even under
4592 * consideration.
4593 *
4594 * As such:
4595 *
4596 *                    +-----------------+-----------------+
4597 *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
4598 * +------------------+-----------------+-----------------+
4599 * | full_inums=true  | show            | show            |
4600 * | full_inums=false | show            | hide            |
4601 * +------------------+-----------------+-----------------+
4602 *
4603 */
4604 if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
4605 seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
4606 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4607 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
4608 if (sbinfo->huge)
4609 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
4610 #endif
4611 mpol = shmem_get_sbmpol(sbinfo);
4612 shmem_show_mpol(seq, mpol);
4613 mpol_put(mpol);
4614 if (sbinfo->noswap)
4615 seq_printf(seq, ",noswap");
4616 #ifdef CONFIG_TMPFS_QUOTA
4617 if (sb_has_quota_active(root->d_sb, USRQUOTA))
4618 seq_printf(seq, ",usrquota");
4619 if (sb_has_quota_active(root->d_sb, GRPQUOTA))
4620 seq_printf(seq, ",grpquota");
4621 if (sbinfo->qlimits.usrquota_bhardlimit)
4622 seq_printf(seq, ",usrquota_block_hardlimit=%lld",
4623 sbinfo->qlimits.usrquota_bhardlimit);
4624 if (sbinfo->qlimits.grpquota_bhardlimit)
4625 seq_printf(seq, ",grpquota_block_hardlimit=%lld",
4626 sbinfo->qlimits.grpquota_bhardlimit);
4627 if (sbinfo->qlimits.usrquota_ihardlimit)
4628 seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
4629 sbinfo->qlimits.usrquota_ihardlimit);
4630 if (sbinfo->qlimits.grpquota_ihardlimit)
4631 seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
4632 sbinfo->qlimits.grpquota_ihardlimit);
4633 #endif
4634 return 0;
4635 }
4636
4637 #endif /* CONFIG_TMPFS */
4638
4639 static void shmem_put_super(struct super_block *sb)
4640 {
4641 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
4642
4643 #ifdef CONFIG_TMPFS_QUOTA
4644 shmem_disable_quotas(sb);
4645 #endif
4646 free_percpu(sbinfo->ino_batch);
4647 percpu_counter_destroy(&sbinfo->used_blocks);
4648 mpol_put(sbinfo->mpol);
4649 kfree(sbinfo);
4650 sb->s_fs_info = NULL;
4651 }
4652
4653 static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
4654 {
4655 struct shmem_options *ctx = fc->fs_private;
4656 struct inode *inode;
4657 struct shmem_sb_info *sbinfo;
4658 int error = -ENOMEM;
4659
4660 /* Round up to L1_CACHE_BYTES to resist false sharing */
4661 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
4662 L1_CACHE_BYTES), GFP_KERNEL);
4663 if (!sbinfo)
4664 return error;
4665
4666 sb->s_fs_info = sbinfo;
4667
4668 #ifdef CONFIG_TMPFS
4669 /*
4670 * Per default we only allow half of the physical ram per
4671 * tmpfs instance, limiting inodes to one per page of lowmem;
4672 * but the internal instance is left unlimited.
4673 */
4674 if (!(sb->s_flags & SB_KERNMOUNT)) {
4675 if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
4676 ctx->blocks = shmem_default_max_blocks();
4677 if (!(ctx->seen & SHMEM_SEEN_INODES))
4678 ctx->inodes = shmem_default_max_inodes();
4679 if (!(ctx->seen & SHMEM_SEEN_INUMS))
4680 ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
4681 sbinfo->noswap = ctx->noswap;
4682 } else {
4683 sb->s_flags |= SB_NOUSER;
4684 }
4685 sb->s_export_op = &shmem_export_ops;
4686 sb->s_flags |= SB_NOSEC | SB_I_VERSION;
4687 #else
4688 sb->s_flags |= SB_NOUSER;
4689 #endif
4690 sbinfo->max_blocks = ctx->blocks;
4691 sbinfo->max_inodes = ctx->inodes;
4692 sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
4693 if (sb->s_flags & SB_KERNMOUNT) {
4694 sbinfo->ino_batch = alloc_percpu(ino_t);
4695 if (!sbinfo->ino_batch)
4696 goto failed;
4697 }
4698 sbinfo->uid = ctx->uid;
4699 sbinfo->gid = ctx->gid;
4700 sbinfo->full_inums = ctx->full_inums;
4701 sbinfo->mode = ctx->mode;
4702 sbinfo->huge = ctx->huge;
4703 sbinfo->mpol = ctx->mpol;
4704 ctx->mpol = NULL;
4705
4706 raw_spin_lock_init(&sbinfo->stat_lock);
4707 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
4708 goto failed;
4709 spin_lock_init(&sbinfo->shrinklist_lock);
4710 INIT_LIST_HEAD(&sbinfo->shrinklist);
4711
4712 sb->s_maxbytes = MAX_LFS_FILESIZE;
4713 sb->s_blocksize = PAGE_SIZE;
4714 sb->s_blocksize_bits = PAGE_SHIFT;
4715 sb->s_magic = TMPFS_MAGIC;
4716 sb->s_op = &shmem_ops;
4717 sb->s_time_gran = 1;
4718 #ifdef CONFIG_TMPFS_XATTR
4719 sb->s_xattr = shmem_xattr_handlers;
4720 #endif
4721 #ifdef CONFIG_TMPFS_POSIX_ACL
4722 sb->s_flags |= SB_POSIXACL;
4723 #endif
4724 uuid_t uuid;
4725 uuid_gen(&uuid);
4726 super_set_uuid(sb, uuid.b, sizeof(uuid));
4727
4728 #ifdef CONFIG_TMPFS_QUOTA
4729 if (ctx->seen & SHMEM_SEEN_QUOTA) {
4730 sb->dq_op = &shmem_quota_operations;
4731 sb->s_qcop = &dquot_quotactl_sysfile_ops;
4732 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
4733
4734 /* Copy the default limits from ctx into sbinfo */
4735 memcpy(&sbinfo->qlimits, &ctx->qlimits,
4736 sizeof(struct shmem_quota_limits));
4737
4738 if (shmem_enable_quotas(sb, ctx->quota_types))
4739 goto failed;
4740 }
4741 #endif /* CONFIG_TMPFS_QUOTA */
4742
4743 inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
4744 S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
4745 if (IS_ERR(inode)) {
4746 error = PTR_ERR(inode);
4747 goto failed;
4748 }
4749 inode->i_uid = sbinfo->uid;
4750 inode->i_gid = sbinfo->gid;
4751 sb->s_root = d_make_root(inode);
4752 if (!sb->s_root)
4753 goto failed;
4754 return 0;
4755
4756 failed:
4757 shmem_put_super(sb);
4758 return error;
4759 }
4760
4761 static int shmem_get_tree(struct fs_context *fc)
4762 {
4763 return get_tree_nodev(fc, shmem_fill_super);
4764 }
4765
4766 static void shmem_free_fc(struct fs_context *fc)
4767 {
4768 struct shmem_options *ctx = fc->fs_private;
4769
4770 if (ctx) {
4771 mpol_put(ctx->mpol);
4772 kfree(ctx);
4773 }
4774 }
4775
4776 static const struct fs_context_operations shmem_fs_context_ops = {
4777 .free = shmem_free_fc,
4778 .get_tree = shmem_get_tree,
4779 #ifdef CONFIG_TMPFS
4780 .parse_monolithic = shmem_parse_options,
4781 .parse_param = shmem_parse_one,
4782 .reconfigure = shmem_reconfigure,
4783 #endif
4784 };
4785
4786 static struct kmem_cache *shmem_inode_cachep __ro_after_init;
4787
4788 static struct inode *shmem_alloc_inode(struct super_block *sb)
4789 {
4790 struct shmem_inode_info *info;
4791 info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
4792 if (!info)
4793 return NULL;
4794 return &info->vfs_inode;
4795 }
4796
4797 static void shmem_free_in_core_inode(struct inode *inode)
4798 {
4799 if (S_ISLNK(inode->i_mode))
4800 kfree(inode->i_link);
4801 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
4802 }
4803
4804 static void shmem_destroy_inode(struct inode *inode)
4805 {
4806 if (S_ISREG(inode->i_mode))
4807 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
4808 if (S_ISDIR(inode->i_mode))
4809 simple_offset_destroy(shmem_get_offset_ctx(inode));
4810 }
4811
4812 static void shmem_init_inode(void *foo)
4813 {
4814 struct shmem_inode_info *info = foo;
4815 inode_init_once(&info->vfs_inode);
4816 }
4817
4818 static void __init shmem_init_inodecache(void)
4819 {
4820 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
4821 sizeof(struct shmem_inode_info),
4822 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
4823 }
4824
4825 static void __init shmem_destroy_inodecache(void)
4826 {
4827 kmem_cache_destroy(shmem_inode_cachep);
4828 }
4829
4830 /* Keep the page in page cache instead of truncating it */
4831 static int shmem_error_remove_folio(struct address_space *mapping,
4832 struct folio *folio)
4833 {
4834 return 0;
4835 }
4836
4837 static const struct address_space_operations shmem_aops = {
4838 .writepage = shmem_writepage,
4839 .dirty_folio = noop_dirty_folio,
4840 #ifdef CONFIG_TMPFS
4841 .write_begin = shmem_write_begin,
4842 .write_end = shmem_write_end,
4843 #endif
4844 #ifdef CONFIG_MIGRATION
4845 .migrate_folio = migrate_folio,
4846 #endif
4847 .error_remove_folio = shmem_error_remove_folio,
4848 };
4849
4850 #ifdef CONFIG_ASHMEM_RUST
4851 extern long ashmem_memfd_ioctl(struct file *file, unsigned int cmd,
4852 unsigned long arg);
4853 #endif
4854
4855 static const struct file_operations shmem_file_operations = {
4856 .mmap = shmem_mmap,
4857 .open = shmem_file_open,
4858 .get_unmapped_area = shmem_get_unmapped_area,
4859 #ifdef CONFIG_TMPFS
4860 .llseek = shmem_file_llseek,
4861 .read_iter = shmem_file_read_iter,
4862 .write_iter = shmem_file_write_iter,
4863 .fsync = noop_fsync,
4864 .splice_read = shmem_file_splice_read,
4865 .splice_write = iter_file_splice_write,
4866 .fallocate = shmem_fallocate,
4867 #endif
4868 #ifdef CONFIG_ASHMEM_RUST
4869 .unlocked_ioctl = ashmem_memfd_ioctl,
4870 #ifdef CONFIG_COMPAT
4871 .compat_ioctl = ashmem_memfd_ioctl,
4872 #endif
4873 #endif
4874 };
4875
4876 static const struct inode_operations shmem_inode_operations = {
4877 .getattr = shmem_getattr,
4878 .setattr = shmem_setattr,
4879 #ifdef CONFIG_TMPFS_XATTR
4880 .listxattr = shmem_listxattr,
4881 .set_acl = simple_set_acl,
4882 .fileattr_get = shmem_fileattr_get,
4883 .fileattr_set = shmem_fileattr_set,
4884 #endif
4885 };
4886
4887 static const struct inode_operations shmem_dir_inode_operations = {
4888 #ifdef CONFIG_TMPFS
4889 .getattr = shmem_getattr,
4890 .create = shmem_create,
4891 .lookup = simple_lookup,
4892 .link = shmem_link,
4893 .unlink = shmem_unlink,
4894 .symlink = shmem_symlink,
4895 .mkdir = shmem_mkdir,
4896 .rmdir = shmem_rmdir,
4897 .mknod = shmem_mknod,
4898 .rename = shmem_rename2,
4899 .tmpfile = shmem_tmpfile,
4900 .get_offset_ctx = shmem_get_offset_ctx,
4901 #endif
4902 #ifdef CONFIG_TMPFS_XATTR
4903 .listxattr = shmem_listxattr,
4904 .fileattr_get = shmem_fileattr_get,
4905 .fileattr_set = shmem_fileattr_set,
4906 #endif
4907 #ifdef CONFIG_TMPFS_POSIX_ACL
4908 .setattr = shmem_setattr,
4909 .set_acl = simple_set_acl,
4910 #endif
4911 };
4912
4913 static const struct inode_operations shmem_special_inode_operations = {
4914 .getattr = shmem_getattr,
4915 #ifdef CONFIG_TMPFS_XATTR
4916 .listxattr = shmem_listxattr,
4917 #endif
4918 #ifdef CONFIG_TMPFS_POSIX_ACL
4919 .setattr = shmem_setattr,
4920 .set_acl = simple_set_acl,
4921 #endif
4922 };
4923
4924 static const struct super_operations shmem_ops = {
4925 .alloc_inode = shmem_alloc_inode,
4926 .free_inode = shmem_free_in_core_inode,
4927 .destroy_inode = shmem_destroy_inode,
4928 #ifdef CONFIG_TMPFS
4929 .statfs = shmem_statfs,
4930 .show_options = shmem_show_options,
4931 #endif
4932 #ifdef CONFIG_TMPFS_QUOTA
4933 .get_dquots = shmem_get_dquots,
4934 #endif
4935 .evict_inode = shmem_evict_inode,
4936 .drop_inode = generic_delete_inode,
4937 .put_super = shmem_put_super,
4938 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4939 .nr_cached_objects = shmem_unused_huge_count,
4940 .free_cached_objects = shmem_unused_huge_scan,
4941 #endif
4942 };
4943
4944 static const struct vm_operations_struct shmem_vm_ops = {
4945 .fault = shmem_fault,
4946 .map_pages = filemap_map_pages,
4947 #ifdef CONFIG_NUMA
4948 .set_policy = shmem_set_policy,
4949 .get_policy = shmem_get_policy,
4950 #endif
4951 };
4952
4953 static const struct vm_operations_struct shmem_anon_vm_ops = {
4954 .fault = shmem_fault,
4955 .map_pages = filemap_map_pages,
4956 #ifdef CONFIG_NUMA
4957 .set_policy = shmem_set_policy,
4958 .get_policy = shmem_get_policy,
4959 #endif
4960 };
4961
4962 int shmem_init_fs_context(struct fs_context *fc)
4963 {
4964 struct shmem_options *ctx;
4965
4966 ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
4967 if (!ctx)
4968 return -ENOMEM;
4969
4970 ctx->mode = 0777 | S_ISVTX;
4971 ctx->uid = current_fsuid();
4972 ctx->gid = current_fsgid();
4973
4974 fc->fs_private = ctx;
4975 fc->ops = &shmem_fs_context_ops;
4976 return 0;
4977 }
4978
4979 static struct file_system_type shmem_fs_type = {
4980 .owner = THIS_MODULE,
4981 .name = "tmpfs",
4982 .init_fs_context = shmem_init_fs_context,
4983 #ifdef CONFIG_TMPFS
4984 .parameters = shmem_fs_parameters,
4985 #endif
4986 .kill_sb = kill_litter_super,
4987 .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
4988 };
4989
4990 void __init shmem_init(void)
4991 {
4992 int error;
4993
4994 shmem_init_inodecache();
4995
4996 #ifdef CONFIG_TMPFS_QUOTA
4997 register_quota_format(&shmem_quota_format);
4998 #endif
4999
5000 error = register_filesystem(&shmem_fs_type);
5001 if (error) {
5002 pr_err("Could not register tmpfs\n");
5003 goto out2;
5004 }
5005
5006 shm_mnt = kern_mount(&shmem_fs_type);
5007 if (IS_ERR(shm_mnt)) {
5008 error = PTR_ERR(shm_mnt);
5009 pr_err("Could not kern_mount tmpfs\n");
5010 goto out1;
5011 }
5012
5013 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5014 if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
5015 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
5016 else
5017 shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
5018
5019 /*
5020 * Default to setting PMD-sized THP to inherit the global setting and
5021 * disable all other multi-size THPs.
5022 */
5023 huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
5024 #endif
5025 return;
5026
5027 out1:
5028 unregister_filesystem(&shmem_fs_type);
5029 out2:
5030 #ifdef CONFIG_TMPFS_QUOTA
5031 unregister_quota_format(&shmem_quota_format);
5032 #endif
5033 shmem_destroy_inodecache();
5034 shm_mnt = ERR_PTR(error);
5035 }
5036
5037 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
5038 static ssize_t shmem_enabled_show(struct kobject *kobj,
5039 struct kobj_attribute *attr, char *buf)
5040 {
5041 static const int values[] = {
5042 SHMEM_HUGE_ALWAYS,
5043 SHMEM_HUGE_WITHIN_SIZE,
5044 SHMEM_HUGE_ADVISE,
5045 SHMEM_HUGE_NEVER,
5046 SHMEM_HUGE_DENY,
5047 SHMEM_HUGE_FORCE,
5048 };
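/*
 * Emits the list with the active value bracketed, e.g.
 * "always within_size advise [never] deny force" when
 * shmem_huge == SHMEM_HUGE_NEVER.
 */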
5049 int len = 0;
5050 int i;
5051
5052 for (i = 0; i < ARRAY_SIZE(values); i++) {
5053 len += sysfs_emit_at(buf, len,
5054 shmem_huge == values[i] ? "%s[%s]" : "%s%s",
5055 i ? " " : "", shmem_format_huge(values[i]));
5056 }
5057 len += sysfs_emit_at(buf, len, "\n");
5058
5059 return len;
5060 }
5061
5062 static ssize_t shmem_enabled_store(struct kobject *kobj,
5063 struct kobj_attribute *attr, const char *buf, size_t count)
5064 {
5065 char tmp[16];
5066 int huge;
5067
5068 if (count + 1 > sizeof(tmp))
5069 return -EINVAL;
5070 memcpy(tmp, buf, count);
5071 tmp[count] = '\0';
5072 if (count && tmp[count - 1] == '\n')
5073 tmp[count - 1] = '\0';
5074
5075 huge = shmem_parse_huge(tmp);
5076 if (huge == -EINVAL)
5077 return -EINVAL;
5078 if (!has_transparent_hugepage() &&
5079 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
5080 return -EINVAL;
5081
5082 /* Do not override huge allocation policy with non-PMD sized mTHP */
5083 if (huge == SHMEM_HUGE_FORCE &&
5084 huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
5085 return -EINVAL;
5086
5087 shmem_huge = huge;
5088 if (shmem_huge > SHMEM_HUGE_DENY)
5089 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
5090 return count;
5091 }
5092
5093 struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
5094 static DEFINE_SPINLOCK(huge_shmem_orders_lock);
5095
5096 static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
5097 struct kobj_attribute *attr, char *buf)
5098 {
5099 int order = to_thpsize(kobj)->order;
5100 const char *output;
5101
5102 if (test_bit(order, &huge_shmem_orders_always))
5103 output = "[always] inherit within_size advise never";
5104 else if (test_bit(order, &huge_shmem_orders_inherit))
5105 output = "always [inherit] within_size advise never";
5106 else if (test_bit(order, &huge_shmem_orders_within_size))
5107 output = "always inherit [within_size] advise never";
5108 else if (test_bit(order, &huge_shmem_orders_madvise))
5109 output = "always inherit within_size [advise] never";
5110 else
5111 output = "always inherit within_size advise [never]";
5112
5113 return sysfs_emit(buf, "%s\n", output);
5114 }
5115
5116 static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
5117 struct kobj_attribute *attr,
5118 const char *buf, size_t count)
5119 {
5120 int order = to_thpsize(kobj)->order;
5121 ssize_t ret = count;
5122
5123 if (sysfs_streq(buf, "always")) {
5124 spin_lock(&huge_shmem_orders_lock);
5125 clear_bit(order, &huge_shmem_orders_inherit);
5126 clear_bit(order, &huge_shmem_orders_madvise);
5127 clear_bit(order, &huge_shmem_orders_within_size);
5128 set_bit(order, &huge_shmem_orders_always);
5129 spin_unlock(&huge_shmem_orders_lock);
5130 } else if (sysfs_streq(buf, "inherit")) {
5131 /* Do not override huge allocation policy with non-PMD sized mTHP */
5132 if (shmem_huge == SHMEM_HUGE_FORCE &&
5133 order != HPAGE_PMD_ORDER)
5134 return -EINVAL;
5135
5136 spin_lock(&huge_shmem_orders_lock);
5137 clear_bit(order, &huge_shmem_orders_always);
5138 clear_bit(order, &huge_shmem_orders_madvise);
5139 clear_bit(order, &huge_shmem_orders_within_size);
5140 set_bit(order, &huge_shmem_orders_inherit);
5141 spin_unlock(&huge_shmem_orders_lock);
5142 } else if (sysfs_streq(buf, "within_size")) {
5143 spin_lock(&huge_shmem_orders_lock);
5144 clear_bit(order, &huge_shmem_orders_always);
5145 clear_bit(order, &huge_shmem_orders_inherit);
5146 clear_bit(order, &huge_shmem_orders_madvise);
5147 set_bit(order, &huge_shmem_orders_within_size);
5148 spin_unlock(&huge_shmem_orders_lock);
5149 } else if (sysfs_streq(buf, "advise")) {
5150 spin_lock(&huge_shmem_orders_lock);
5151 clear_bit(order, &huge_shmem_orders_always);
5152 clear_bit(order, &huge_shmem_orders_inherit);
5153 clear_bit(order, &huge_shmem_orders_within_size);
5154 set_bit(order, &huge_shmem_orders_madvise);
5155 spin_unlock(&huge_shmem_orders_lock);
5156 } else if (sysfs_streq(buf, "never")) {
5157 spin_lock(&huge_shmem_orders_lock);
5158 clear_bit(order, &huge_shmem_orders_always);
5159 clear_bit(order, &huge_shmem_orders_inherit);
5160 clear_bit(order, &huge_shmem_orders_within_size);
5161 clear_bit(order, &huge_shmem_orders_madvise);
5162 spin_unlock(&huge_shmem_orders_lock);
5163 } else {
5164 ret = -EINVAL;
5165 }
5166
5167 return ret;
5168 }
5169
5170 struct kobj_attribute thpsize_shmem_enabled_attr =
5171 __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
5172 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
5173
5174 #else /* !CONFIG_SHMEM */
5175
5176 /*
5177 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
5178 *
5179 * This is intended for small systems where the benefits of the full
5180 * shmem code (swap-backed and resource-limited) are outweighed by
5181 * their complexity. On systems without swap this code should be
5182 * effectively equivalent, but much lighter weight.
5183 */
5184
5185 static struct file_system_type shmem_fs_type = {
5186 .name = "tmpfs",
5187 .init_fs_context = ramfs_init_fs_context,
5188 .parameters = ramfs_fs_parameters,
5189 .kill_sb = ramfs_kill_sb,
5190 .fs_flags = FS_USERNS_MOUNT,
5191 };
5192
5193 void __init shmem_init(void)
5194 {
5195 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
5196
5197 shm_mnt = kern_mount(&shmem_fs_type);
5198 BUG_ON(IS_ERR(shm_mnt));
5199 }
5200
5201 int shmem_unuse(unsigned int type)
5202 {
5203 return 0;
5204 }
5205
5206 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
5207 {
5208 return 0;
5209 }
5210
5211 void shmem_unlock_mapping(struct address_space *mapping)
5212 {
5213 }
5214
5215 #ifdef CONFIG_MMU
5216 unsigned long shmem_get_unmapped_area(struct file *file,
5217 unsigned long addr, unsigned long len,
5218 unsigned long pgoff, unsigned long flags)
5219 {
5220 return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
5221 }
5222 #endif
5223
5224 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
5225 {
5226 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
5227 }
5228 EXPORT_SYMBOL_GPL(shmem_truncate_range);
5229
5230 #define shmem_vm_ops generic_file_vm_ops
5231 #define shmem_anon_vm_ops generic_file_vm_ops
5232 #define shmem_file_operations ramfs_file_operations
5233 #define shmem_acct_size(flags, size) 0
5234 #define shmem_unacct_size(flags, size) do {} while (0)
5235
5236 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
5237 struct super_block *sb, struct inode *dir,
5238 umode_t mode, dev_t dev, unsigned long flags)
5239 {
5240 struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
5241 return inode ? inode : ERR_PTR(-ENOSPC);
5242 }
5243
5244 #endif /* CONFIG_SHMEM */
5245
5246 /* common code */
5247
5248 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
5249 loff_t size, unsigned long flags, unsigned int i_flags)
5250 {
5251 struct inode *inode;
5252 struct file *res;
5253
5254 if (IS_ERR(mnt))
5255 return ERR_CAST(mnt);
5256
5257 if (size < 0 || size > MAX_LFS_FILESIZE)
5258 return ERR_PTR(-EINVAL);
5259
5260 if (shmem_acct_size(flags, size))
5261 return ERR_PTR(-ENOMEM);
5262
5263 if (is_idmapped_mnt(mnt))
5264 return ERR_PTR(-EINVAL);
5265
5266 inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
5267 S_IFREG | S_IRWXUGO, 0, flags);
5268 if (IS_ERR(inode)) {
5269 shmem_unacct_size(flags, size);
5270 return ERR_CAST(inode);
5271 }
5272 inode->i_flags |= i_flags;
5273 inode->i_size = size;
5274 clear_nlink(inode); /* It is unlinked */
5275 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
5276 if (!IS_ERR(res))
5277 res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
5278 &shmem_file_operations);
5279 if (IS_ERR(res))
5280 iput(inode);
5281 return res;
5282 }
5283
5284 /**
5285 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
5286 * kernel internal. There will be NO LSM permission checks against the
5287 * underlying inode. So users of this interface must do LSM checks at a
5288 * higher layer. The users are the big_key and shm implementations. LSM
5289 * checks are provided at the key or shm level rather than at the inode level.
5290 * @name: name for dentry (to be seen in /proc/<pid>/maps)
5291 * @size: size to be set for the file
5292 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5293 */
5294 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
5295 {
5296 return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
5297 }
5298 EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
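/*
 * Usage sketch (hypothetical caller, for illustration only): a kernel-
 * internal user creates an unlinked, LSM-exempt tmpfs file and releases
 * it with fput() when done; "example" and SZ_1M are arbitrary here:
 *
 *	struct file *file;
 *
 *	file = shmem_kernel_file_setup("example", SZ_1M, VM_NORESERVE);
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *	...
 *	fput(file);
 */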
5299
5300 /**
5301 * shmem_file_setup - get an unlinked file living in tmpfs
5302 * @name: name for dentry (to be seen in /proc/<pid>/maps)
5303 * @size: size to be set for the file
5304 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5305 */
5306 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
5307 {
5308 return __shmem_file_setup(shm_mnt, name, size, flags, 0);
5309 }
5310 EXPORT_SYMBOL_GPL(shmem_file_setup);
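/*
 * Usage sketch (hypothetical caller): same pattern as above, but the file
 * is created on the default shm_mnt and is subject to normal LSM checks:
 *
 *	struct file *file = shmem_file_setup("name", size, VM_NORESERVE);
 *
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 */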
5311
5312 /**
5313 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
5314 * @mnt: the tmpfs mount where the file will be created
5315 * @name: name for dentry (to be seen in /proc/<pid>/maps)
5316 * @size: size to be set for the file
5317 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5318 */
5319 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
5320 loff_t size, unsigned long flags)
5321 {
5322 return __shmem_file_setup(mnt, name, size, flags, 0);
5323 }
5324 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
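/*
 * Usage sketch ("my_tmpfs_mnt" is a hypothetical vfsmount owned by the
 * caller): drivers that keep a private tmpfs mount, instead of the global
 * shm_mnt, create their backing files against that mount:
 *
 *	struct file *file = shmem_file_setup_with_mnt(my_tmpfs_mnt, "obj",
 *						      size, VM_NORESERVE);
 */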
5325
5326 /**
5327 * shmem_zero_setup - setup a shared anonymous mapping
5328 * @vma: the vma to be mmapped, as prepared by do_mmap
5329 */
5330 int shmem_zero_setup(struct vm_area_struct *vma)
5331 {
5332 struct file *file;
5333 loff_t size = vma->vm_end - vma->vm_start;
5334
5335 /*
5336 * Cloning a new file under mmap_lock leads to a lock ordering conflict
5337 * between XFS directory reading and selinux: since this file is only
5338 * accessible to the user through its mapping, use S_PRIVATE flag to
5339 * bypass file security, in the same way as shmem_kernel_file_setup().
5340 */
5341 file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
5342 if (IS_ERR(file))
5343 return PTR_ERR(file);
5344
5345 if (vma->vm_file)
5346 fput(vma->vm_file);
5347 vma->vm_file = file;
5348 vma->vm_ops = &shmem_anon_vm_ops;
5349
5350 return 0;
5351 }
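/*
 * Usage sketch (simplified from what a core mmap-path caller would do for
 * a shared anonymous mapping; the "fail" label is hypothetical):
 *
 *	if (vma->vm_flags & VM_SHARED) {
 *		error = shmem_zero_setup(vma);
 *		if (error)
 *			goto fail;
 *	}
 */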
5352
5353 /**
5354 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
5355 * @mapping: the folio's address_space
5356 * @index: the folio index
5357 * @gfp: the page allocator flags to use if allocating
5358 *
5359 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
5360 * with any new page allocations done using the specified allocation flags.
5361 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
5362 * suit tmpfs, since it may have pages in swapcache, and needs to find those
5363 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
5364 *
5365 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
5366 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
5367 */
5368 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
5369 pgoff_t index, gfp_t gfp)
5370 {
5371 #ifdef CONFIG_SHMEM
5372 struct inode *inode = mapping->host;
5373 struct folio *folio;
5374 int error;
5375
5376 error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
5377 gfp, NULL, NULL);
5378 if (error)
5379 return ERR_PTR(error);
5380
5381 folio_unlock(folio);
5382 return folio;
5383 #else
5384 /*
5385 * The tiny !SHMEM case uses ramfs without swap
5386 */
5387 return mapping_read_folio_gfp(mapping, index, gfp);
5388 #endif
5389 }
5390 EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
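/*
 * Usage sketch (assumes "mapping" and "index" come from a shmem/tmpfs file
 * held by the caller): the folio is returned unlocked with a reference
 * held, so it is released with folio_put() when finished:
 *
 *	struct folio *folio;
 *
 *	folio = shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping));
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	...
 *	folio_put(folio);
 */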
5391
5392 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
5393 pgoff_t index, gfp_t gfp)
5394 {
5395 struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
5396 struct page *page;
5397
5398 if (IS_ERR(folio))
5399 return &folio->page;
5400
5401 page = folio_file_page(folio, index);
5402 if (PageHWPoison(page)) {
5403 folio_put(folio);
5404 return ERR_PTR(-EIO);
5405 }
5406
5407 return page;
5408 }
5409 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
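/*
 * Usage sketch (loop context omitted; the gfp mixing follows the i915
 * pattern described above shmem_read_folio_gfp()): pages are fetched one
 * at a time and released with put_page():
 *
 *	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
 *	struct page *page = shmem_read_mapping_page_gfp(mapping, index, gfp);
 *
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	...
 *	put_page(page);
 */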
5410
5411 int reclaim_shmem_address_space(struct address_space *mapping)
5412 {
5413 #ifdef CONFIG_SHMEM
5414 pgoff_t start = 0;
5415 struct page *page;
5416 LIST_HEAD(page_list);
5417 XA_STATE(xas, &mapping->i_pages, start);
5418
5419 if (!shmem_mapping(mapping))
5420 return -EINVAL;
5421
5422 lru_add_drain();
5423
5424 rcu_read_lock();
5425 xas_for_each(&xas, page, ULONG_MAX) {
5426 if (xas_retry(&xas, page))
5427 continue;
5428 if (xa_is_value(page))
5429 continue;
5430 if (!folio_isolate_lru(page_folio(page)))
5431 continue;
5432
5433 list_add(&page->lru, &page_list);
5434
5435 if (need_resched()) {
5436 xas_pause(&xas);
5437 cond_resched_rcu();
5438 }
5439 }
5440 rcu_read_unlock();
5441
5442 return reclaim_pages(&page_list);
5443 #else
5444 return 0;
5445 #endif
5446 }
5447 EXPORT_SYMBOL_GPL(reclaim_shmem_address_space);
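/*
 * Usage sketch (vendor-facing helper; "file" is a hypothetical shmem file
 * held by the caller): isolate and reclaim the file's resident pages; the
 * return value is the number of pages reclaimed, or -EINVAL if the mapping
 * is not shmem-backed:
 *
 *	int reclaimed = reclaim_shmem_address_space(file->f_mapping);
 *
 *	if (reclaimed < 0)
 *		return reclaimed;
 */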
5448