• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Resizable virtual memory filesystem for Linux.
3  *
4  * Copyright (C) 2000 Linus Torvalds.
5  *		 2000 Transmeta Corp.
6  *		 2000-2001 Christoph Rohland
7  *		 2000-2001 SAP AG
8  *		 2002 Red Hat Inc.
9  * Copyright (C) 2002-2011 Hugh Dickins.
10  * Copyright (C) 2011 Google Inc.
11  * Copyright (C) 2002-2005 VERITAS Software Corporation.
12  * Copyright (C) 2004 Andi Kleen, SuSE Labs
13  *
14  * Extended attribute support for tmpfs:
15  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17  *
18  * tiny-shmem:
19  * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20  *
21  * This file is released under the GPL.
22  */
23 
24 #include <linux/fs.h>
25 #include <linux/init.h>
26 #include <linux/vfs.h>
27 #include <linux/mount.h>
28 #include <linux/ramfs.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include <linux/mm.h>
32 #include <linux/random.h>
33 #include <linux/sched/signal.h>
34 #include <linux/export.h>
35 #include <linux/swap.h>
36 #include <linux/uio.h>
37 #include <linux/khugepaged.h>
38 #include <linux/hugetlb.h>
39 #include <linux/frontswap.h>
40 #include <linux/fs_parser.h>
41 #include <linux/swapfile.h>
42 #include <linux/mm_inline.h>
43 
/*
 * NOTE(review): presumably the kernel-internal shmem mount; it is neither
 * assigned nor used in this part of the file - confirm against the init code.
 */
static struct vfsmount *shm_mnt;
45 
46 #ifdef CONFIG_SHMEM
47 /*
48  * This virtual memory filesystem is heavily based on the ramfs. It
49  * extends ramfs by the ability to use swap and honor resource limits
50  * which makes it a completely usable filesystem.
51  */
52 
53 #include <linux/xattr.h>
54 #include <linux/exportfs.h>
55 #include <linux/posix_acl.h>
56 #include <linux/posix_acl_xattr.h>
57 #include <linux/mman.h>
58 #include <linux/string.h>
59 #include <linux/slab.h>
60 #include <linux/backing-dev.h>
61 #include <linux/shmem_fs.h>
62 #include <linux/writeback.h>
63 #include <linux/blkdev.h>
64 #include <linux/pagevec.h>
65 #include <linux/percpu_counter.h>
66 #include <linux/falloc.h>
67 #include <linux/splice.h>
68 #include <linux/security.h>
69 #include <linux/swapops.h>
70 #include <linux/mempolicy.h>
71 #include <linux/namei.h>
72 #include <linux/ctype.h>
73 #include <linux/migrate.h>
74 #include <linux/highmem.h>
75 #include <linux/seq_file.h>
76 #include <linux/magic.h>
77 #include <linux/syscalls.h>
78 #include <linux/fcntl.h>
79 #include <uapi/linux/memfd.h>
80 #include <linux/userfaultfd_k.h>
81 #include <linux/rmap.h>
82 #include <linux/uuid.h>
83 
84 #include <linux/uaccess.h>
85 
86 #include "internal.h"
87 
/* Number of 512-byte units in one page; scales page counts into i_blocks */
#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
/* Number of pages covering @size bytes (size is page-aligned before shifting) */
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
96 
97 /*
98  * shmem_fallocate communicates with shmem_fault or shmem_writepage via
99  * inode->i_private (with i_rwsem making sure that it has only one user at
100  * a time): we would prefer not to enlarge the shmem inode just for that.
101  */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* where faults into the hole wait for the punch to end */
	pgoff_t start;		/* page offset where the range being fallocated starts */
	pgoff_t next;		/* page offset of the next page to be fallocated */
	pgoff_t nr_falloced;	/* count of new pages fallocated so far */
	pgoff_t nr_unswapped;	/* count of times writepage refused to swap out */
};
109 
/*
 * NOTE(review): looks like the set of mount options gathered while parsing;
 * the parser is outside this part of the file - confirm the field meanings
 * there.
 */
struct shmem_options {
	unsigned long long blocks;	/* presumably the requested block limit */
	unsigned long long inodes;	/* presumably the requested inode limit */
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;	/* presumably one of the SHMEM_HUGE_* values */
	int seen;	/* presumably a bitmask of the SHMEM_SEEN_* flags below */
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
};
125 
126 #ifdef CONFIG_TMPFS
/* Default maximum number of blocks: half of totalram_pages(). */
static unsigned long shmem_default_max_blocks(void)
{
	unsigned long nr_pages = totalram_pages();

	return nr_pages / 2;
}
131 
/*
 * Default maximum number of inodes: the smaller of the non-highmem page
 * count and half of all RAM pages.
 */
static unsigned long shmem_default_max_inodes(void)
{
	unsigned long total = totalram_pages();
	unsigned long lowmem = total - totalhigh_pages();
	unsigned long half = total / 2;

	return min(lowmem, half);
}
138 #endif
139 
/* Forward declarations: shmem_getpage() below calls shmem_getpage_gfp(). */
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
			     struct page **pagep, enum sgp_type sgp,
			     gfp_t gfp, struct vm_area_struct *vma,
			     vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp,
		gfp_t gfp, struct vm_area_struct *vma,
		struct vm_fault *vmf, vm_fault_t *fault_type);
148 
/*
 * Convenience wrapper around shmem_getpage_gfp(): uses the mapping's own gfp
 * mask and passes NULL for the vma, the fault descriptor and the fault type.
 */
int shmem_getpage(struct inode *inode, pgoff_t index,
		struct page **pagep, enum sgp_type sgp)
{
	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);

	return shmem_getpage_gfp(inode, index, pagep, sgp, gfp,
				 NULL, NULL, NULL);
}
155 
SHMEM_SB(struct super_block * sb)156 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
157 {
158 	return sb->s_fs_info;
159 }
160 
161 /*
162  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
163  * for shared memory and for shared anonymous (/dev/zero) mappings
164  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
165  * consistent with the pre-accounting of private mappings ...
166  */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	/* VM_NORESERVE objects are not pre-accounted */
	if (flags & VM_NORESERVE)
		return 0;

	return security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}
172 
/* Undo shmem_acct_size(): nothing to release for VM_NORESERVE objects. */
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (flags & VM_NORESERVE)
		return;

	vm_unacct_memory(VM_ACCT(size));
}
178 
/*
 * Adjust the pre-accounting when an object changes from @oldsize to
 * @newsize: charge the extra pages when growing (this may fail), release
 * the surplus when shrinking.  VM_NORESERVE objects are left alone.
 */
static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	loff_t old_pages = VM_ACCT(oldsize);
	loff_t new_pages = VM_ACCT(newsize);

	if (flags & VM_NORESERVE)
		return 0;

	if (new_pages > old_pages)
		return security_vm_enough_memory_mm(current->mm,
						    new_pages - old_pages);

	if (new_pages < old_pages)
		vm_unacct_memory(old_pages - new_pages);

	return 0;
}
191 
192 /*
193  * ... whereas tmpfs objects are accounted incrementally as
194  * pages are allocated, in order to allow large sparse files.
195  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
196  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
197  */
shmem_acct_block(unsigned long flags,long pages)198 static inline int shmem_acct_block(unsigned long flags, long pages)
199 {
200 	if (!(flags & VM_NORESERVE))
201 		return 0;
202 
203 	return security_vm_enough_memory_mm(current->mm,
204 			pages * VM_ACCT(PAGE_SIZE));
205 }
206 
shmem_unacct_blocks(unsigned long flags,long pages)207 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
208 {
209 	if (flags & VM_NORESERVE)
210 		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
211 }
212 
shmem_inode_acct_block(struct inode * inode,long pages)213 static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
214 {
215 	struct shmem_inode_info *info = SHMEM_I(inode);
216 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
217 
218 	if (shmem_acct_block(info->flags, pages))
219 		return false;
220 
221 	if (sbinfo->max_blocks) {
222 		if (percpu_counter_compare(&sbinfo->used_blocks,
223 					   sbinfo->max_blocks - pages) > 0)
224 			goto unacct;
225 		percpu_counter_add(&sbinfo->used_blocks, pages);
226 	}
227 
228 	return true;
229 
230 unacct:
231 	shmem_unacct_blocks(info->flags, pages);
232 	return false;
233 }
234 
shmem_inode_unacct_blocks(struct inode * inode,long pages)235 static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
236 {
237 	struct shmem_inode_info *info = SHMEM_I(inode);
238 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
239 
240 	if (sbinfo->max_blocks)
241 		percpu_counter_sub(&sbinfo->used_blocks, pages);
242 	shmem_unacct_blocks(info->flags, pages);
243 }
244 
/*
 * Forward declarations of the operation tables and the filesystem type;
 * vma_is_shmem() below compares against shmem_vm_ops.
 */
static const struct super_operations shmem_ops;
const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static struct file_system_type shmem_fs_type;
253 
vma_is_shmem(struct vm_area_struct * vma)254 bool vma_is_shmem(struct vm_area_struct *vma)
255 {
256 	return vma->vm_ops == &shmem_vm_ops;
257 }
258 
/*
 * NOTE(review): presumably the list of inodes that have pages in swap and
 * the mutex protecting it; neither is used in this part of the file - confirm
 * against the swap-out/swap-off code.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
261 
262 /*
263  * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
264  * produces a novel ino for the newly allocated inode.
265  *
266  * It may also be called when making a hard link to permit the space needed by
267  * each dentry. However, in that case, no new inode number is needed since that
268  * internally draws from another pool of inode numbers (currently global
269  * get_next_ino()). This case is indicated by passing NULL as inop.
270  */
271 #define SHMEM_INO_BATCH 1024
shmem_reserve_inode(struct super_block * sb,ino_t * inop)272 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
273 {
274 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
275 	ino_t ino;
276 
277 	if (!(sb->s_flags & SB_KERNMOUNT)) {
278 		raw_spin_lock(&sbinfo->stat_lock);
279 		if (sbinfo->max_inodes) {
280 			if (!sbinfo->free_inodes) {
281 				raw_spin_unlock(&sbinfo->stat_lock);
282 				return -ENOSPC;
283 			}
284 			sbinfo->free_inodes--;
285 		}
286 		if (inop) {
287 			ino = sbinfo->next_ino++;
288 			if (unlikely(is_zero_ino(ino)))
289 				ino = sbinfo->next_ino++;
290 			if (unlikely(!sbinfo->full_inums &&
291 				     ino > UINT_MAX)) {
292 				/*
293 				 * Emulate get_next_ino uint wraparound for
294 				 * compatibility
295 				 */
296 				if (IS_ENABLED(CONFIG_64BIT))
297 					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
298 						__func__, MINOR(sb->s_dev));
299 				sbinfo->next_ino = 1;
300 				ino = sbinfo->next_ino++;
301 			}
302 			*inop = ino;
303 		}
304 		raw_spin_unlock(&sbinfo->stat_lock);
305 	} else if (inop) {
306 		/*
307 		 * __shmem_file_setup, one of our callers, is lock-free: it
308 		 * doesn't hold stat_lock in shmem_reserve_inode since
309 		 * max_inodes is always 0, and is called from potentially
310 		 * unknown contexts. As such, use a per-cpu batched allocator
311 		 * which doesn't require the per-sb stat_lock unless we are at
312 		 * the batch boundary.
313 		 *
314 		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
315 		 * shmem mounts are not exposed to userspace, so we don't need
316 		 * to worry about things like glibc compatibility.
317 		 */
318 		ino_t *next_ino;
319 
320 		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
321 		ino = *next_ino;
322 		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
323 			raw_spin_lock(&sbinfo->stat_lock);
324 			ino = sbinfo->next_ino;
325 			sbinfo->next_ino += SHMEM_INO_BATCH;
326 			raw_spin_unlock(&sbinfo->stat_lock);
327 			if (unlikely(is_zero_ino(ino)))
328 				ino++;
329 		}
330 		*inop = ino;
331 		*next_ino = ++ino;
332 		put_cpu();
333 	}
334 
335 	return 0;
336 }
337 
shmem_free_inode(struct super_block * sb)338 static void shmem_free_inode(struct super_block *sb)
339 {
340 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
341 	if (sbinfo->max_inodes) {
342 		raw_spin_lock(&sbinfo->stat_lock);
343 		sbinfo->free_inodes++;
344 		raw_spin_unlock(&sbinfo->stat_lock);
345 	}
346 }
347 
348 /**
349  * shmem_recalc_inode - recalculate the block usage of an inode
350  * @inode: inode to recalc
351  *
352  * We have to calculate the free blocks since the mm can drop
353  * undirtied hole pages behind our back.
354  *
355  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
356  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
357  *
358  * It has to be called with the spinlock held.
359  */
shmem_recalc_inode(struct inode * inode)360 static void shmem_recalc_inode(struct inode *inode)
361 {
362 	struct shmem_inode_info *info = SHMEM_I(inode);
363 	long freed;
364 
365 	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
366 	if (freed > 0) {
367 		info->alloced -= freed;
368 		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
369 		shmem_inode_unacct_blocks(inode, freed);
370 	}
371 }
372 
shmem_charge(struct inode * inode,long pages)373 bool shmem_charge(struct inode *inode, long pages)
374 {
375 	struct shmem_inode_info *info = SHMEM_I(inode);
376 	unsigned long flags;
377 
378 	if (!shmem_inode_acct_block(inode, pages))
379 		return false;
380 
381 	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
382 	inode->i_mapping->nrpages += pages;
383 
384 	spin_lock_irqsave(&info->lock, flags);
385 	info->alloced += pages;
386 	inode->i_blocks += pages * BLOCKS_PER_PAGE;
387 	shmem_recalc_inode(inode);
388 	spin_unlock_irqrestore(&info->lock, flags);
389 
390 	return true;
391 }
392 
shmem_uncharge(struct inode * inode,long pages)393 void shmem_uncharge(struct inode *inode, long pages)
394 {
395 	struct shmem_inode_info *info = SHMEM_I(inode);
396 	unsigned long flags;
397 
398 	/* nrpages adjustment done by __delete_from_page_cache() or caller */
399 
400 	spin_lock_irqsave(&info->lock, flags);
401 	info->alloced -= pages;
402 	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
403 	shmem_recalc_inode(inode);
404 	spin_unlock_irqrestore(&info->lock, flags);
405 
406 	shmem_inode_unacct_blocks(inode, pages);
407 }
408 
409 /*
410  * Replace item expected in xarray by a new item, while holding xa_lock.
411  */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);

	/* Only replace when the slot still holds what the caller expects */
	if (xas_load(&xas) != expected)
		return -ENOENT;

	xas_store(&xas, replacement);
	return 0;
}
426 
427 /*
428  * Sometimes, before we decide whether to proceed or to fail, we must check
429  * that an entry was not already brought back from swap by a racing thread.
430  *
431  * Checking page is not enough: by the time a SwapCache page is locked, it
432  * might be reused, and again be SwapCache, using the same swap as before.
433  */
shmem_confirm_swap(struct address_space * mapping,pgoff_t index,swp_entry_t swap)434 static bool shmem_confirm_swap(struct address_space *mapping,
435 			       pgoff_t index, swp_entry_t swap)
436 {
437 	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
438 }
439 
440 /*
441  * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
442  *
443  * SHMEM_HUGE_NEVER:
444  *	disables huge pages for the mount;
445  * SHMEM_HUGE_ALWAYS:
446  *	enables huge pages for the mount;
447  * SHMEM_HUGE_WITHIN_SIZE:
448  *	only allocate huge pages if the page will be fully within i_size,
449  *	also respect fadvise()/madvise() hints;
450  * SHMEM_HUGE_ADVISE:
451  *	only allocate huge pages if requested with fadvise()/madvise();
452  */
453 
#define SHMEM_HUGE_NEVER	0	/* spelled "never" */
#define SHMEM_HUGE_ALWAYS	1	/* spelled "always" */
#define SHMEM_HUGE_WITHIN_SIZE	2	/* spelled "within_size" */
#define SHMEM_HUGE_ADVISE	3	/* spelled "advise" */
458 
459 /*
460  * Special values.
461  * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
462  *
463  * SHMEM_HUGE_DENY:
464  *	disables huge on shm_mnt and all mounts, for emergency use;
465  * SHMEM_HUGE_FORCE:
466  *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
467  *
468  */
#define SHMEM_HUGE_DENY		(-1)	/* spelled "deny" */
#define SHMEM_HUGE_FORCE	(-2)	/* spelled "force" */
471 
472 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
473 /* ifdef here to avoid bloating shmem.o when not necessary */
474 
/* Global huge-page setting checked by shmem_is_huge(); starts as "never" */
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
476 
/*
 * Decide whether a huge page may be used for @index of @inode, optionally on
 * behalf of @vma (may be NULL).  The global shmem_huge setting and per-vma /
 * per-mm opt-outs are checked first, then the mount's huge= policy.
 */
bool shmem_is_huge(struct vm_area_struct *vma,
		   struct inode *inode, pgoff_t index)
{
	bool advised;
	int policy;

	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;

	if (vma) {
		if (vma->vm_flags & VM_NOHUGEPAGE)
			return false;
		if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
			return false;
	}

	if (shmem_huge == SHMEM_HUGE_FORCE)
		return true;

	/* A vma marked VM_HUGEPAGE satisfies both within_size and advise */
	advised = vma && (vma->vm_flags & VM_HUGEPAGE);
	policy = SHMEM_SB(inode->i_sb)->huge;

	if (policy == SHMEM_HUGE_ALWAYS)
		return true;

	if (policy == SHMEM_HUGE_WITHIN_SIZE) {
		pgoff_t huge_end = round_up(index + 1, HPAGE_PMD_NR);
		loff_t i_size = round_up(i_size_read(inode), PAGE_SIZE);

		/* The whole huge page must lie within the page-rounded i_size */
		if (i_size >> PAGE_SHIFT >= huge_end)
			return true;
		return advised;
	}

	if (policy == SHMEM_HUGE_ADVISE)
		return advised;

	return false;
}
507 
508 #if defined(CONFIG_SYSFS)
shmem_parse_huge(const char * str)509 static int shmem_parse_huge(const char *str)
510 {
511 	if (!strcmp(str, "never"))
512 		return SHMEM_HUGE_NEVER;
513 	if (!strcmp(str, "always"))
514 		return SHMEM_HUGE_ALWAYS;
515 	if (!strcmp(str, "within_size"))
516 		return SHMEM_HUGE_WITHIN_SIZE;
517 	if (!strcmp(str, "advise"))
518 		return SHMEM_HUGE_ADVISE;
519 	if (!strcmp(str, "deny"))
520 		return SHMEM_HUGE_DENY;
521 	if (!strcmp(str, "force"))
522 		return SHMEM_HUGE_FORCE;
523 	return -EINVAL;
524 }
525 #endif
526 
527 #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
shmem_format_huge(int huge)528 static const char *shmem_format_huge(int huge)
529 {
530 	switch (huge) {
531 	case SHMEM_HUGE_NEVER:
532 		return "never";
533 	case SHMEM_HUGE_ALWAYS:
534 		return "always";
535 	case SHMEM_HUGE_WITHIN_SIZE:
536 		return "within_size";
537 	case SHMEM_HUGE_ADVISE:
538 		return "advise";
539 	case SHMEM_HUGE_DENY:
540 		return "deny";
541 	case SHMEM_HUGE_FORCE:
542 		return "force";
543 	default:
544 		VM_BUG_ON(1);
545 		return "bad_val";
546 	}
547 }
548 #endif
549 
/*
 * Walk the superblock's shrinklist and try to split the huge page at the end
 * of each listed inode.
 *
 * Works in three passes:
 *  1. under shrinklist_lock, take up to @sc->nr_to_scan entries (128 when @sc
 *     is NULL) off sbinfo->shrinklist, sorting them into a local "to_remove"
 *     list (nothing to gain) and a local work list;
 *  2. drop the "to_remove" inodes;
 *  3. for each inode on the work list, look up the page at the huge-aligned
 *     end of the file and try to split it.  Inodes whose page could not be
 *     locked or split, or that are left over once @nr_to_split splits have
 *     been done (0 means no limit), go back onto sbinfo->shrinklist.
 *
 * Returns the number of pages split, or SHRINK_STOP if the shrinklist was
 * empty on entry.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	LIST_HEAD(list), *pos, *next;
	LIST_HEAD(to_remove);
	struct inode *inode;
	struct shmem_inode_info *info;
	struct page *page;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	int split = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			goto next;
		}

		/* Check if there's anything to gain */
		if (round_up(inode->i_size, PAGE_SIZE) ==
				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
			list_move(&info->shrinklist, &to_remove);
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		/* Every entry visited has left sbinfo->shrinklist */
		sbinfo->shrinklist_len--;
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	/* Unlink before iput(), then drop the reference taken by igrab() */
	list_for_each_safe(pos, next, &to_remove) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;
		list_del_init(&info->shrinklist);
		iput(inode);
	}

	list_for_each_safe(pos, next, &list) {
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		/* Split budget used up: requeue the remaining inodes untouched */
		if (nr_to_split && split >= nr_to_split)
			goto move_back;

		/* Page at the start of the huge-page-sized extent holding i_size */
		page = find_get_page(inode->i_mapping,
				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
		if (!page)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!PageTransHuge(page)) {
			put_page(page);
			goto drop;
		}

		/*
		 * Move the inode on the list back to shrinklist if we failed
		 * to lock the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!trylock_page(page)) {
			put_page(page);
			goto move_back;
		}

		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);

		/* If split failed move the inode on the list back to shrinklist */
		if (ret)
			goto move_back;

		split++;
drop:
		list_del_init(&info->shrinklist);
		goto put;
move_back:
		/*
		 * Make sure the inode is either on the global list or deleted
		 * from any local list before iput() since it could be deleted
		 * in another thread once we put the inode (then the local list
		 * is corrupted).
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		list_move(&info->shrinklist, &sbinfo->shrinklist);
		sbinfo->shrinklist_len++;
		spin_unlock(&sbinfo->shrinklist_lock);
put:
		iput(inode);
	}

	return split;
}
660 
shmem_unused_huge_scan(struct super_block * sb,struct shrink_control * sc)661 static long shmem_unused_huge_scan(struct super_block *sb,
662 		struct shrink_control *sc)
663 {
664 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
665 
666 	if (!READ_ONCE(sbinfo->shrinklist_len))
667 		return SHRINK_STOP;
668 
669 	return shmem_unused_huge_shrink(sbinfo, sc, 0);
670 }
671 
shmem_unused_huge_count(struct super_block * sb,struct shrink_control * sc)672 static long shmem_unused_huge_count(struct super_block *sb,
673 		struct shrink_control *sc)
674 {
675 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
676 	return READ_ONCE(sbinfo->shrinklist_len);
677 }
678 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
679 
/* Without CONFIG_TRANSPARENT_HUGEPAGE the setting is fixed at "deny" */
#define shmem_huge SHMEM_HUGE_DENY
681 
/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: huge pages are never used. */
bool shmem_is_huge(struct vm_area_struct *vma,
		   struct inode *inode, pgoff_t index)
{
	return false;
}
687 
/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: there is nothing to split. */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_split)
{
	return 0;
}
693 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
694 
695 /*
696  * Like add_to_page_cache_locked, but error if expected item has gone.
697  */
static int shmem_add_to_page_cache(struct page *page,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp,
				   struct mm_struct *charge_mm)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
	unsigned long nr = compound_nr(page);
	unsigned long stored = 0;
	int error;

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(index != round_down(index, nr), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
	VM_BUG_ON(expected && PageTransHuge(page));

	/* Add nr references and attach the page; both are undone on failure */
	page_ref_add(page, nr);
	page->mapping = mapping;
	page->index = index;

	/* Pages already in the swap cache skip the memcg charge here */
	if (!PageSwapCache(page)) {
		error = mem_cgroup_charge(page, charge_mm, gfp);
		if (error) {
			if (PageTransHuge(page)) {
				count_vm_event(THP_FILE_FALLBACK);
				count_vm_event(THP_FILE_FALLBACK_CHARGE);
			}
			goto undo;
		}
	}
	cgroup_throttle_swaprate(page, gfp);

	do {
		xas_lock_irq(&xas);

		/* Anything other than the expected item in the range is a conflict */
		if (xas_find_conflict(&xas) != expected)
			xas_set_err(&xas, -EEXIST);
		xas_create_range(&xas);

		if (!xas_error(&xas)) {
			/* Point each of the nr consecutive slots at the page */
			for (;;) {
				xas_store(&xas, page);
				if (++stored >= nr)
					break;
				xas_next(&xas);
			}

			if (PageTransHuge(page)) {
				count_vm_event(THP_FILE_ALLOC);
				__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
			}
			mapping->nrpages += nr;
			__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
			__mod_lruvec_page_state(page, NR_SHMEM, nr);
		}

		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));	/* retry once memory was allocated */

	error = xas_error(&xas);
	if (!error)
		return 0;

undo:
	page->mapping = NULL;
	page_ref_sub(page, nr);
	return error;
}
767 
768 /*
769  * Like delete_from_page_cache, but substitutes swap for page.
770  */
shmem_delete_from_page_cache(struct page * page,void * radswap)771 static void shmem_delete_from_page_cache(struct page *page, void *radswap)
772 {
773 	struct address_space *mapping = page->mapping;
774 	int error;
775 
776 	VM_BUG_ON_PAGE(PageCompound(page), page);
777 
778 	xa_lock_irq(&mapping->i_pages);
779 	error = shmem_replace_entry(mapping, page->index, page, radswap);
780 	page->mapping = NULL;
781 	mapping->nrpages--;
782 	__dec_lruvec_page_state(page, NR_FILE_PAGES);
783 	__dec_lruvec_page_state(page, NR_SHMEM);
784 	xa_unlock_irq(&mapping->i_pages);
785 	put_page(page);
786 	BUG_ON(error);
787 }
788 
789 /*
790  * Remove swap entry from page cache, free the swap and its page cache.
791  */
static int shmem_free_swap(struct address_space *mapping,
			   pgoff_t index, void *radswap)
{
	void *old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);

	/* The slot was cleared only if it still held @radswap */
	if (old == radswap) {
		free_swap_and_cache(radix_to_swp_entry(radswap));
		return 0;
	}

	return -ENOENT;
}
803 
804 /*
805  * Determine (in bytes) how many of the shmem object's pages mapped by the
806  * given offsets are swapped out.
807  *
808  * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
809  * as long as the inode doesn't go away and racy results are not a problem.
810  */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned long nr_swapped = 0;
	void *entry;

	rcu_read_lock();
	xas_for_each(&xas, entry, end - 1) {
		if (xas_retry(&xas, entry))
			continue;

		/* Value entries in a shmem mapping are swap entries */
		if (xa_is_value(entry))
			nr_swapped++;

		if (!need_resched())
			continue;

		/* Pause the walk so it can resume safely after rescheduling */
		xas_pause(&xas);
		cond_resched_rcu();
	}
	rcu_read_unlock();

	return nr_swapped << PAGE_SHIFT;
}
835 
836 /*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma are swapped out.
839  *
840  * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
841  * as long as the inode doesn't go away and racy results are not a problem.
842  */
shmem_swap_usage(struct vm_area_struct * vma)843 unsigned long shmem_swap_usage(struct vm_area_struct *vma)
844 {
845 	struct inode *inode = file_inode(vma->vm_file);
846 	struct shmem_inode_info *info = SHMEM_I(inode);
847 	struct address_space *mapping = inode->i_mapping;
848 	unsigned long swapped;
849 
850 	/* Be careful as we don't hold info->lock */
851 	swapped = READ_ONCE(info->swapped);
852 
853 	/*
854 	 * The easier cases are when the shmem object has nothing in swap, or
855 	 * the vma maps it whole. Then we can simply use the stats that we
856 	 * already track.
857 	 */
858 	if (!swapped)
859 		return 0;
860 
861 	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
862 		return swapped << PAGE_SHIFT;
863 
864 	/* Here comes the more involved part */
865 	return shmem_partial_swap_usage(mapping,
866 			linear_page_index(vma, vma->vm_start),
867 			linear_page_index(vma, vma->vm_end));
868 }
869 
870 /*
 * SysV IPC SHM_UNLOCK restores unevictable pages to their evictable lists.
872  */
shmem_unlock_mapping(struct address_space * mapping)873 void shmem_unlock_mapping(struct address_space *mapping)
874 {
875 	struct pagevec pvec;
876 	pgoff_t index = 0;
877 
878 	pagevec_init(&pvec);
879 	/*
880 	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
881 	 */
882 	while (!mapping_unevictable(mapping)) {
883 		if (!pagevec_lookup(&pvec, mapping, &index))
884 			break;
885 		check_move_unevictable_pages(&pvec);
886 		pagevec_release(&pvec);
887 		cond_resched();
888 	}
889 }
890 
891 /*
892  * Check whether a hole-punch or truncation needs to split a huge page,
893  * returning true if no split was required, or the split has been successful.
894  *
895  * Eviction (or truncation to 0 size) should never need to split a huge page;
896  * but in rare cases might do so, if shmem_undo_range() failed to trylock on
897  * head, and then succeeded to trylock on tail.
898  *
899  * A split can only succeed when there are no additional references on the
900  * huge page: so the split below relies upon find_get_entries() having stopped
901  * when it found a subpage of the huge page, without getting further references.
902  */
static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
{
	bool wholly_inside;

	/* Not part of a huge page: no split needed */
	if (!PageTransCompound(page))
		return true;

	/* Just proceed to delete a huge page wholly within the range punched */
	wholly_inside = PageHead(page) &&
			page->index >= start &&
			page->index + HPAGE_PMD_NR <= end;
	if (wholly_inside)
		return true;

	/* Try to split huge page, so we can truly punch the hole or truncate */
	return split_huge_page(page) >= 0;
}
916 
917 /*
918  * Remove range of pages and swap entries from page cache, and free them.
919  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
920  */
shmem_undo_range(struct inode * inode,loff_t lstart,loff_t lend,bool unfalloc)921 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
922 								 bool unfalloc)
923 {
924 	struct address_space *mapping = inode->i_mapping;
925 	struct shmem_inode_info *info = SHMEM_I(inode);
926 	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
927 	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
928 	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
929 	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
930 	struct pagevec pvec;
931 	pgoff_t indices[PAGEVEC_SIZE];
932 	long nr_swaps_freed = 0;
933 	pgoff_t index;
934 	int i;
935 
936 	if (lend == -1)
937 		end = -1;	/* unsigned, so actually very big */
938 
939 	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
940 		info->fallocend = start;
941 
942 	pagevec_init(&pvec);
943 	index = start;
944 	while (index < end && find_lock_entries(mapping, index, end - 1,
945 			&pvec, indices)) {
946 		for (i = 0; i < pagevec_count(&pvec); i++) {
947 			struct page *page = pvec.pages[i];
948 
949 			index = indices[i];
950 
951 			if (xa_is_value(page)) {
952 				if (unfalloc)
953 					continue;
954 				nr_swaps_freed += !shmem_free_swap(mapping,
955 								index, page);
956 				continue;
957 			}
958 			index += thp_nr_pages(page) - 1;
959 
960 			if (!unfalloc || !PageUptodate(page))
961 				truncate_inode_page(mapping, page);
962 			unlock_page(page);
963 		}
964 		pagevec_remove_exceptionals(&pvec);
965 		pagevec_release(&pvec);
966 		cond_resched();
967 		index++;
968 	}
969 
970 	if (partial_start) {
971 		struct page *page = NULL;
972 		shmem_getpage(inode, start - 1, &page, SGP_READ);
973 		if (page) {
974 			unsigned int top = PAGE_SIZE;
975 			if (start > end) {
976 				top = partial_end;
977 				partial_end = 0;
978 			}
979 			zero_user_segment(page, partial_start, top);
980 			set_page_dirty(page);
981 			unlock_page(page);
982 			put_page(page);
983 		}
984 	}
985 	if (partial_end) {
986 		struct page *page = NULL;
987 		shmem_getpage(inode, end, &page, SGP_READ);
988 		if (page) {
989 			zero_user_segment(page, 0, partial_end);
990 			set_page_dirty(page);
991 			unlock_page(page);
992 			put_page(page);
993 		}
994 	}
995 	if (start >= end)
996 		return;
997 
998 	index = start;
999 	while (index < end) {
1000 		cond_resched();
1001 
1002 		if (!find_get_entries(mapping, index, end - 1, &pvec,
1003 				indices)) {
1004 			/* If all gone or hole-punch or unfalloc, we're done */
1005 			if (index == start || end != -1)
1006 				break;
1007 			/* But if truncating, restart to make sure all gone */
1008 			index = start;
1009 			continue;
1010 		}
1011 		for (i = 0; i < pagevec_count(&pvec); i++) {
1012 			struct page *page = pvec.pages[i];
1013 
1014 			index = indices[i];
1015 			if (xa_is_value(page)) {
1016 				if (unfalloc)
1017 					continue;
1018 				if (shmem_free_swap(mapping, index, page)) {
1019 					/* Swap was replaced by page: retry */
1020 					index--;
1021 					break;
1022 				}
1023 				nr_swaps_freed++;
1024 				continue;
1025 			}
1026 
1027 			lock_page(page);
1028 
1029 			if (!unfalloc || !PageUptodate(page)) {
1030 				if (page_mapping(page) != mapping) {
1031 					/* Page was replaced by swap: retry */
1032 					unlock_page(page);
1033 					index--;
1034 					break;
1035 				}
1036 				VM_BUG_ON_PAGE(PageWriteback(page), page);
1037 				if (shmem_punch_compound(page, start, end))
1038 					truncate_inode_page(mapping, page);
1039 				else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
1040 					/* Wipe the page and don't get stuck */
1041 					clear_highpage(page);
1042 					flush_dcache_page(page);
1043 					set_page_dirty(page);
1044 					if (index <
1045 					    round_up(start, HPAGE_PMD_NR))
1046 						start = index + 1;
1047 				}
1048 			}
1049 			unlock_page(page);
1050 		}
1051 		pagevec_remove_exceptionals(&pvec);
1052 		pagevec_release(&pvec);
1053 		index++;
1054 	}
1055 
1056 	spin_lock_irq(&info->lock);
1057 	info->swapped -= nr_swaps_freed;
1058 	shmem_recalc_inode(inode);
1059 	spin_unlock_irq(&info->lock);
1060 }
1061 
/*
 * Truncate or hole-punch bytes lstart..lend of a shmem inode, then stamp
 * its mtime and ctime with the current time.
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	/* unfalloc == false: a real truncate/punch, not a fallocate undo */
	shmem_undo_range(inode, lstart, lend, false);

	inode->i_mtime = current_time(inode);
	inode->i_ctime = inode->i_mtime;
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
1068 
/*
 * Fill @stat for the inode behind @path.  Always returns 0.
 *
 * mnt_userns, request_mask and query_flags are not consulted here.
 */
static int shmem_getattr(struct user_namespace *mnt_userns,
			 const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = path->dentry->d_inode;
	struct shmem_inode_info *shinfo = SHMEM_I(inode);
	bool stale;

	/* Refresh the accounting if it disagrees with the page cache count */
	stale = shinfo->alloced - shinfo->swapped != inode->i_mapping->nrpages;
	if (stale) {
		spin_lock_irq(&shinfo->lock);
		shmem_recalc_inode(inode);
		spin_unlock_irq(&shinfo->lock);
	}

	generic_fillattr(&init_user_ns, inode, stat);

	/* Report the huge page size as block size when huge pages apply */
	if (shmem_is_huge(NULL, inode, 0))
		stat->blksize = HPAGE_PMD_SIZE;

	return 0;
}
1088 
shmem_setattr(struct user_namespace * mnt_userns,struct dentry * dentry,struct iattr * attr)1089 static int shmem_setattr(struct user_namespace *mnt_userns,
1090 			 struct dentry *dentry, struct iattr *attr)
1091 {
1092 	struct inode *inode = d_inode(dentry);
1093 	struct shmem_inode_info *info = SHMEM_I(inode);
1094 	int error;
1095 
1096 	error = setattr_prepare(&init_user_ns, dentry, attr);
1097 	if (error)
1098 		return error;
1099 
1100 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1101 		loff_t oldsize = inode->i_size;
1102 		loff_t newsize = attr->ia_size;
1103 
1104 		/* protected by i_rwsem */
1105 		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1106 		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1107 			return -EPERM;
1108 
1109 		if (newsize != oldsize) {
1110 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
1111 					oldsize, newsize);
1112 			if (error)
1113 				return error;
1114 			i_size_write(inode, newsize);
1115 			inode->i_ctime = inode->i_mtime = current_time(inode);
1116 		}
1117 		if (newsize <= oldsize) {
1118 			loff_t holebegin = round_up(newsize, PAGE_SIZE);
1119 			if (oldsize > holebegin)
1120 				unmap_mapping_range(inode->i_mapping,
1121 							holebegin, 0, 1);
1122 			if (info->alloced)
1123 				shmem_truncate_range(inode,
1124 							newsize, (loff_t)-1);
1125 			/* unmap again to remove racily COWed private pages */
1126 			if (oldsize > holebegin)
1127 				unmap_mapping_range(inode->i_mapping,
1128 							holebegin, 0, 1);
1129 		}
1130 	}
1131 
1132 	setattr_copy(&init_user_ns, inode, attr);
1133 	if (attr->ia_valid & ATTR_MODE)
1134 		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
1135 	return error;
1136 }
1137 
shmem_evict_inode(struct inode * inode)1138 static void shmem_evict_inode(struct inode *inode)
1139 {
1140 	struct shmem_inode_info *info = SHMEM_I(inode);
1141 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1142 
1143 	if (shmem_mapping(inode->i_mapping)) {
1144 		shmem_unacct_size(info->flags, inode->i_size);
1145 		inode->i_size = 0;
1146 		shmem_truncate_range(inode, 0, (loff_t)-1);
1147 		if (!list_empty(&info->shrinklist)) {
1148 			spin_lock(&sbinfo->shrinklist_lock);
1149 			if (!list_empty(&info->shrinklist)) {
1150 				list_del_init(&info->shrinklist);
1151 				sbinfo->shrinklist_len--;
1152 			}
1153 			spin_unlock(&sbinfo->shrinklist_lock);
1154 		}
1155 		while (!list_empty(&info->swaplist)) {
1156 			/* Wait while shmem_unuse() is scanning this inode... */
1157 			wait_var_event(&info->stop_eviction,
1158 				       !atomic_read(&info->stop_eviction));
1159 			mutex_lock(&shmem_swaplist_mutex);
1160 			/* ...but beware of the race if we peeked too early */
1161 			if (!atomic_read(&info->stop_eviction))
1162 				list_del_init(&info->swaplist);
1163 			mutex_unlock(&shmem_swaplist_mutex);
1164 		}
1165 	}
1166 
1167 	simple_xattrs_free(&info->xattrs);
1168 	WARN_ON(inode->i_blocks);
1169 	shmem_free_inode(inode->i_sb);
1170 	clear_inode(inode);
1171 }
1172 
/*
 * Walk @mapping from index @start and collect up to @nr_entries swap
 * entries that belong to swap device @type.  With @frontswap set, only
 * entries for which frontswap_test() succeeds are collected.
 *
 * Each match is stored in @entries[] with its index in @indices[].
 * Returns the number of matches stored.
 */
static int shmem_find_swap_entries(struct address_space *mapping,
				   pgoff_t start, unsigned int nr_entries,
				   struct page **entries, pgoff_t *indices,
				   unsigned int type, bool frontswap)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned int found = 0;
	struct page *item;
	swp_entry_t swp;

	if (nr_entries == 0)
		return 0;

	rcu_read_lock();
	xas_for_each(&xas, item, ULONG_MAX) {
		/* Skip retry markers and anything that is not a value entry */
		if (xas_retry(&xas, item) || !xa_is_value(item))
			continue;

		swp = radix_to_swp_entry(item);
		if (swp_type(swp) != type)
			continue;
		if (frontswap &&
		    !frontswap_test(swap_info[type], swp_offset(swp)))
			continue;

		indices[found] = xas.xa_index;
		entries[found] = item;

		/* Pause the walk before giving up the CPU */
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
		found++;
		if (found == nr_entries)
			break;
	}
	rcu_read_unlock();

	return found;
}
1215 
1216 /*
1217  * Move the swapped pages for an inode to page cache. Returns the count
1218  * of pages swapped in, or the error in case of failure.
1219  */
shmem_unuse_swap_entries(struct inode * inode,struct pagevec pvec,pgoff_t * indices)1220 static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
1221 				    pgoff_t *indices)
1222 {
1223 	int i = 0;
1224 	int ret = 0;
1225 	int error = 0;
1226 	struct address_space *mapping = inode->i_mapping;
1227 
1228 	for (i = 0; i < pvec.nr; i++) {
1229 		struct page *page = pvec.pages[i];
1230 
1231 		if (!xa_is_value(page))
1232 			continue;
1233 		error = shmem_swapin_page(inode, indices[i],
1234 					  &page, SGP_CACHE,
1235 					  mapping_gfp_mask(mapping),
1236 					  NULL, NULL);
1237 		if (error == 0) {
1238 			unlock_page(page);
1239 			put_page(page);
1240 			ret++;
1241 		}
1242 		if (error == -ENOMEM)
1243 			break;
1244 		error = 0;
1245 	}
1246 	return error ? error : ret;
1247 }
1248 
1249 /*
1250  * If swap found in inode, free it and move page from swapcache to filecache.
1251  */
shmem_unuse_inode(struct inode * inode,unsigned int type,bool frontswap,unsigned long * fs_pages_to_unuse)1252 static int shmem_unuse_inode(struct inode *inode, unsigned int type,
1253 			     bool frontswap, unsigned long *fs_pages_to_unuse)
1254 {
1255 	struct address_space *mapping = inode->i_mapping;
1256 	pgoff_t start = 0;
1257 	struct pagevec pvec;
1258 	pgoff_t indices[PAGEVEC_SIZE];
1259 	bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
1260 	int ret = 0;
1261 
1262 	pagevec_init(&pvec);
1263 	do {
1264 		unsigned int nr_entries = PAGEVEC_SIZE;
1265 
1266 		if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
1267 			nr_entries = *fs_pages_to_unuse;
1268 
1269 		pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
1270 						  pvec.pages, indices,
1271 						  type, frontswap);
1272 		if (pvec.nr == 0) {
1273 			ret = 0;
1274 			break;
1275 		}
1276 
1277 		ret = shmem_unuse_swap_entries(inode, pvec, indices);
1278 		if (ret < 0)
1279 			break;
1280 
1281 		if (frontswap_partial) {
1282 			*fs_pages_to_unuse -= ret;
1283 			if (*fs_pages_to_unuse == 0) {
1284 				ret = FRONTSWAP_PAGES_UNUSED;
1285 				break;
1286 			}
1287 		}
1288 
1289 		start = indices[pvec.nr - 1];
1290 	} while (true);
1291 
1292 	return ret;
1293 }
1294 
1295 /*
1296  * Read all the shared memory data that resides in the swap
1297  * device 'type' back into memory, so the swap device can be
1298  * unused.
1299  */
shmem_unuse(unsigned int type,bool frontswap,unsigned long * fs_pages_to_unuse)1300 int shmem_unuse(unsigned int type, bool frontswap,
1301 		unsigned long *fs_pages_to_unuse)
1302 {
1303 	struct shmem_inode_info *info, *next;
1304 	int error = 0;
1305 
1306 	if (list_empty(&shmem_swaplist))
1307 		return 0;
1308 
1309 	mutex_lock(&shmem_swaplist_mutex);
1310 	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1311 		if (!info->swapped) {
1312 			list_del_init(&info->swaplist);
1313 			continue;
1314 		}
1315 		/*
1316 		 * Drop the swaplist mutex while searching the inode for swap;
1317 		 * but before doing so, make sure shmem_evict_inode() will not
1318 		 * remove placeholder inode from swaplist, nor let it be freed
1319 		 * (igrab() would protect from unlink, but not from unmount).
1320 		 */
1321 		atomic_inc(&info->stop_eviction);
1322 		mutex_unlock(&shmem_swaplist_mutex);
1323 
1324 		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
1325 					  fs_pages_to_unuse);
1326 		cond_resched();
1327 
1328 		mutex_lock(&shmem_swaplist_mutex);
1329 		next = list_next_entry(info, swaplist);
1330 		if (!info->swapped)
1331 			list_del_init(&info->swaplist);
1332 		if (atomic_dec_and_test(&info->stop_eviction))
1333 			wake_up_var(&info->stop_eviction);
1334 		if (error)
1335 			break;
1336 	}
1337 	mutex_unlock(&shmem_swaplist_mutex);
1338 
1339 	return error;
1340 }
1341 
1342 /*
1343  * Move the page from the page cache to the swap cache.
1344  */
shmem_writepage(struct page * page,struct writeback_control * wbc)1345 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1346 {
1347 	struct shmem_inode_info *info;
1348 	struct address_space *mapping;
1349 	struct inode *inode;
1350 	swp_entry_t swap;
1351 	pgoff_t index;
1352 
1353 	/*
1354 	 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
1355 	 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
1356 	 * and its shmem_writeback() needs them to be split when swapping.
1357 	 */
1358 	if (PageTransCompound(page)) {
1359 		/* Ensure the subpages are still dirty */
1360 		SetPageDirty(page);
1361 		if (split_huge_page(page) < 0)
1362 			goto redirty;
1363 		ClearPageDirty(page);
1364 	}
1365 
1366 	BUG_ON(!PageLocked(page));
1367 	mapping = page->mapping;
1368 	index = page->index;
1369 	inode = mapping->host;
1370 	info = SHMEM_I(inode);
1371 	if (info->flags & VM_LOCKED)
1372 		goto redirty;
1373 	if (!total_swap_pages)
1374 		goto redirty;
1375 
1376 	/*
1377 	 * Our capabilities prevent regular writeback or sync from ever calling
1378 	 * shmem_writepage; but a stacking filesystem might use ->writepage of
1379 	 * its underlying filesystem, in which case tmpfs should write out to
1380 	 * swap only in response to memory pressure, and not for the writeback
1381 	 * threads or sync.
1382 	 */
1383 	if (!wbc->for_reclaim) {
1384 		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
1385 		goto redirty;
1386 	}
1387 
1388 	/*
1389 	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1390 	 * value into swapfile.c, the only way we can correctly account for a
1391 	 * fallocated page arriving here is now to initialize it and write it.
1392 	 *
1393 	 * That's okay for a page already fallocated earlier, but if we have
1394 	 * not yet completed the fallocation, then (a) we want to keep track
1395 	 * of this page in case we have to undo it, and (b) it may not be a
1396 	 * good idea to continue anyway, once we're pushing into swap.  So
1397 	 * reactivate the page, and let shmem_fallocate() quit when too many.
1398 	 */
1399 	if (!PageUptodate(page)) {
1400 		if (inode->i_private) {
1401 			struct shmem_falloc *shmem_falloc;
1402 			spin_lock(&inode->i_lock);
1403 			shmem_falloc = inode->i_private;
1404 			if (shmem_falloc &&
1405 			    !shmem_falloc->waitq &&
1406 			    index >= shmem_falloc->start &&
1407 			    index < shmem_falloc->next)
1408 				shmem_falloc->nr_unswapped++;
1409 			else
1410 				shmem_falloc = NULL;
1411 			spin_unlock(&inode->i_lock);
1412 			if (shmem_falloc)
1413 				goto redirty;
1414 		}
1415 		clear_highpage(page);
1416 		flush_dcache_page(page);
1417 		SetPageUptodate(page);
1418 	}
1419 
1420 	swap = get_swap_page(page);
1421 	if (!swap.val)
1422 		goto redirty;
1423 
1424 	/*
1425 	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1426 	 * if it's not already there.  Do it now before the page is
1427 	 * moved to swap cache, when its pagelock no longer protects
1428 	 * the inode from eviction.  But don't unlock the mutex until
1429 	 * we've incremented swapped, because shmem_unuse_inode() will
1430 	 * prune a !swapped inode from the swaplist under this mutex.
1431 	 */
1432 	mutex_lock(&shmem_swaplist_mutex);
1433 	if (list_empty(&info->swaplist))
1434 		list_add(&info->swaplist, &shmem_swaplist);
1435 
1436 	if (add_to_swap_cache(page, swap,
1437 			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1438 			NULL) == 0) {
1439 		spin_lock_irq(&info->lock);
1440 		shmem_recalc_inode(inode);
1441 		info->swapped++;
1442 		spin_unlock_irq(&info->lock);
1443 
1444 		swap_shmem_alloc(swap);
1445 		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
1446 
1447 		mutex_unlock(&shmem_swaplist_mutex);
1448 		BUG_ON(page_mapped(page));
1449 		swap_writepage(page, wbc);
1450 		return 0;
1451 	}
1452 
1453 	mutex_unlock(&shmem_swaplist_mutex);
1454 	put_swap_page(page, swap);
1455 redirty:
1456 	set_page_dirty(page);
1457 	if (wbc->for_reclaim)
1458 		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
1459 	unlock_page(page);
1460 	return 0;
1461 }
1462 
1463 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
shmem_show_mpol(struct seq_file * seq,struct mempolicy * mpol)1464 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1465 {
1466 	char buffer[64];
1467 
1468 	if (!mpol || mpol->mode == MPOL_DEFAULT)
1469 		return;		/* show nothing */
1470 
1471 	mpol_to_str(buffer, sizeof(buffer), mpol);
1472 
1473 	seq_printf(seq, ",mpol=%s", buffer);
1474 }
1475 
shmem_get_sbmpol(struct shmem_sb_info * sbinfo)1476 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1477 {
1478 	struct mempolicy *mpol = NULL;
1479 	if (sbinfo->mpol) {
1480 		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
1481 		mpol = sbinfo->mpol;
1482 		mpol_get(mpol);
1483 		raw_spin_unlock(&sbinfo->stat_lock);
1484 	}
1485 	return mpol;
1486 }
1487 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub for builds without both CONFIG_NUMA and CONFIG_TMPFS: prints nothing */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
shmem_get_sbmpol(struct shmem_sb_info * sbinfo)1491 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1492 {
1493 	return NULL;
1494 }
1495 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
1496 #ifndef CONFIG_NUMA
1497 #define vm_policy vm_private_data
1498 #endif
1499 
/*
 * Set up @vma as a pseudo vma that carries nothing but the memory policy
 * looked up for page @index of the inode described by @info.
 * Pair with shmem_pseudo_vma_destroy().
 */
static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
		struct shmem_inode_info *info, pgoff_t index)
{
	vma_init(vma, NULL);
	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
	/* Offset by inode number so interleave spreads better across nodes */
	vma->vm_pgoff = index + info->vfs_inode.i_ino;
}
1509 
shmem_pseudo_vma_destroy(struct vm_area_struct * vma)1510 static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1511 {
1512 	/* Drop reference taken by mpol_shared_policy_lookup() */
1513 	mpol_cond_put(vma->vm_policy);
1514 }
1515 
/*
 * Read in swap entry @swap through swap_cluster_readahead(), using a
 * pseudo vma that carries the memory policy for page @index of @info.
 * Returns whatever page swap_cluster_readahead() returned.
 */
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct vm_fault vmf = {
		.vma = &pvma,
	};
	struct page *result;

	shmem_pseudo_vma_init(&pvma, info, index);
	result = swap_cluster_readahead(swap, gfp, &vmf);
	shmem_pseudo_vma_destroy(&pvma);

	return result;
}
1531 
1532 /*
1533  * Make sure huge_gfp is always more limited than limit_gfp.
1534  * Some of the flags set permissions, while others set limitations.
1535  */
limit_gfp_mask(gfp_t huge_gfp,gfp_t limit_gfp)1536 static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1537 {
1538 	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1539 	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1540 	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1541 	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1542 
1543 	/* Allow allocations only from the originally specified zones. */
1544 	result |= zoneflags;
1545 
1546 	/*
1547 	 * Minimize the result gfp by taking the union with the deny flags,
1548 	 * and the intersection of the allow flags.
1549 	 */
1550 	result |= (limit_gfp & denyflags);
1551 	result |= (huge_gfp & limit_gfp) & allowflags;
1552 
1553 	return result;
1554 }
1555 
/*
 * Allocate a PMD-order huge page for the huge-page-aligned extent that
 * contains @index.
 *
 * Returns NULL without allocating if anything is already present in that
 * extent of the mapping.  Also returns NULL, after counting
 * THP_FILE_FALLBACK, if the allocation itself fails.
 */
static struct page *shmem_alloc_hugepage(gfp_t gfp,
		struct shmem_inode_info *info, pgoff_t index)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	struct vm_area_struct pvma;
	struct page *hpage;
	pgoff_t hindex;

	hindex = round_down(index, HPAGE_PMD_NR);
	if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
		    XA_PRESENT))
		return NULL;

	shmem_pseudo_vma_init(&pvma, info, hindex);
	hpage = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
				true);
	shmem_pseudo_vma_destroy(&pvma);

	if (!hpage) {
		count_vm_event(THP_FILE_FALLBACK);
		return NULL;
	}

	prep_transhuge_page(hpage);
	return hpage;
}
1579 
/*
 * Allocate one page for page @index of the inode described by @info,
 * through a pseudo vma carrying that index's memory policy.
 * Returns the result of alloc_page_vma().
 */
static struct page *shmem_alloc_page(gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct vm_area_struct pvma;
	struct page *newpage;

	shmem_pseudo_vma_init(&pvma, info, index);
	newpage = alloc_page_vma(gfp, &pvma, 0);
	shmem_pseudo_vma_destroy(&pvma);

	return newpage;
}
1592 
/*
 * Account for, then allocate, a small or huge page at @index of @inode.
 *
 * On success the page is returned with its Locked and SwapBacked flags
 * set.  On failure returns ERR_PTR(-ENOSPC) if the blocks could not be
 * accounted, or ERR_PTR(-ENOMEM) if the allocation failed (the accounting
 * is then undone).
 */
static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
		struct inode *inode,
		pgoff_t index, bool huge)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct page *page;
	int nr;

	/* Without THP support a huge request degrades to a small page */
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		huge = false;
	nr = huge ? HPAGE_PMD_NR : 1;

	if (!shmem_inode_acct_block(inode, nr))
		return ERR_PTR(-ENOSPC);

	page = huge ? shmem_alloc_hugepage(gfp, info, index) :
		      shmem_alloc_page(gfp, info, index);
	if (!page) {
		shmem_inode_unacct_blocks(inode, nr);
		return ERR_PTR(-ENOMEM);
	}

	__SetPageLocked(page);
	__SetPageSwapBacked(page);
	return page;
}
1624 
1625 /*
1626  * When a page is moved from swapcache to shmem filecache (either by the
1627  * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
1628  * shmem_unuse_inode()), it may have been read in earlier from swap, in
1629  * ignorance of the mapping it belongs to.  If that mapping has special
1630  * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1631  * we may need to copy to a suitable page before moving to filecache.
1632  *
1633  * In a future release, this may well be extended to respect cpuset and
1634  * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1635  * but for now it is a simple matter of zone.
1636  */
shmem_should_replace_page(struct page * page,gfp_t gfp)1637 static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
1638 {
1639 	return page_zonenum(page) > gfp_zone(gfp);
1640 }
1641 
/*
 * Replace the swapcache page *pagep by a newly allocated copy of it.
 *
 * On success *pagep points at the new page and 0 is returned; the old
 * page is unlocked and released.  Returns -ENOMEM if no new page could be
 * allocated, or the error from shmem_replace_entry(); in both cases
 * *pagep is left untouched.
 */
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
				struct shmem_inode_info *info, pgoff_t index)
{
	struct page *oldpage = *pagep;
	struct page *newpage;
	struct page *discard;
	struct address_space *swap_mapping;
	pgoff_t swap_index;
	swp_entry_t entry;
	int error;

	entry.val = page_private(oldpage);
	swap_index = swp_offset(entry);
	swap_mapping = page_mapping(oldpage);

	/*
	 * We are here because our zones are constrained, so do not reduce
	 * the chance of success with cpuset and node constraints as well.
	 */
	gfp &= ~GFP_CONSTRAINT_MASK;
	newpage = shmem_alloc_page(gfp, info, index);
	if (!newpage)
		return -ENOMEM;

	get_page(newpage);
	copy_highpage(newpage, oldpage);
	flush_dcache_page(newpage);

	/* Dress the copy up as a locked, uptodate swapcache page */
	__SetPageLocked(newpage);
	__SetPageSwapBacked(newpage);
	SetPageUptodate(newpage);
	set_page_private(newpage, entry.val);
	SetPageSwapCache(newpage);

	/*
	 * The caller will move newpage out of swapcache very soon, but
	 * swapping oldpage for newpage there gives us a nice clean interface.
	 */
	xa_lock_irq(&swap_mapping->i_pages);
	error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
	if (!error) {
		mem_cgroup_migrate(oldpage, newpage);
		__inc_lruvec_page_state(newpage, NR_FILE_PAGES);
		__dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
	}
	xa_unlock_irq(&swap_mapping->i_pages);

	if (unlikely(error)) {
		/*
		 * Probably impossible now that callers check PageSwapCache
		 * and page_private under page lock; but be defensive and
		 * throw the new page away instead of the old one.
		 */
		discard = newpage;
	} else {
		lru_cache_add(newpage);
		*pagep = newpage;
		discard = oldpage;
	}

	ClearPageSwapCache(discard);
	set_page_private(discard, 0);

	unlock_page(discard);
	put_page(discard);
	put_page(discard);
	return error;
}
1708 
1709 /*
1710  * Swap in the page pointed to by *pagep.
1711  * Caller has to make sure that *pagep contains a valid swapped page.
1712  * Returns 0 and the page in pagep if success. On failure, returns the
1713  * error code and NULL in *pagep.
1714  */
shmem_swapin_page(struct inode * inode,pgoff_t index,struct page ** pagep,enum sgp_type sgp,gfp_t gfp,struct vm_area_struct * vma,vm_fault_t * fault_type)1715 static int shmem_swapin_page(struct inode *inode, pgoff_t index,
1716 			     struct page **pagep, enum sgp_type sgp,
1717 			     gfp_t gfp, struct vm_area_struct *vma,
1718 			     vm_fault_t *fault_type)
1719 {
1720 	struct address_space *mapping = inode->i_mapping;
1721 	struct shmem_inode_info *info = SHMEM_I(inode);
1722 	struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
1723 	struct page *page;
1724 	swp_entry_t swap;
1725 	int error;
1726 
1727 	VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
1728 	swap = radix_to_swp_entry(*pagep);
1729 	*pagep = NULL;
1730 
1731 	/* Look it up and read it in.. */
1732 	page = lookup_swap_cache(swap, NULL, 0);
1733 	if (!page) {
1734 		/* Or update major stats only when swapin succeeds?? */
1735 		if (fault_type) {
1736 			*fault_type |= VM_FAULT_MAJOR;
1737 			count_vm_event(PGMAJFAULT);
1738 			count_memcg_event_mm(charge_mm, PGMAJFAULT);
1739 		}
1740 		/* Here we actually start the io */
1741 		page = shmem_swapin(swap, gfp, info, index);
1742 		if (!page) {
1743 			error = -ENOMEM;
1744 			goto failed;
1745 		}
1746 	}
1747 
1748 	/* We have to do this with page locked to prevent races */
1749 	lock_page(page);
1750 	if (!PageSwapCache(page) || page_private(page) != swap.val ||
1751 	    !shmem_confirm_swap(mapping, index, swap)) {
1752 		error = -EEXIST;
1753 		goto unlock;
1754 	}
1755 	if (!PageUptodate(page)) {
1756 		error = -EIO;
1757 		goto failed;
1758 	}
1759 	wait_on_page_writeback(page);
1760 
1761 	/*
1762 	 * Some architectures may have to restore extra metadata to the
1763 	 * physical page after reading from swap.
1764 	 */
1765 	arch_swap_restore(swap, page);
1766 
1767 	if (shmem_should_replace_page(page, gfp)) {
1768 		error = shmem_replace_page(&page, gfp, info, index);
1769 		if (error)
1770 			goto failed;
1771 	}
1772 
1773 	error = shmem_add_to_page_cache(page, mapping, index,
1774 					swp_to_radix_entry(swap), gfp,
1775 					charge_mm);
1776 	if (error)
1777 		goto failed;
1778 
1779 	spin_lock_irq(&info->lock);
1780 	info->swapped--;
1781 	shmem_recalc_inode(inode);
1782 	spin_unlock_irq(&info->lock);
1783 
1784 	if (sgp == SGP_WRITE)
1785 		mark_page_accessed(page);
1786 
1787 	delete_from_swap_cache(page);
1788 	set_page_dirty(page);
1789 	swap_free(swap);
1790 
1791 	*pagep = page;
1792 	return 0;
1793 failed:
1794 	if (!shmem_confirm_swap(mapping, index, swap))
1795 		error = -EEXIST;
1796 unlock:
1797 	if (page) {
1798 		unlock_page(page);
1799 		put_page(page);
1800 	}
1801 
1802 	return error;
1803 }
1804 
1805 /*
1806  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1807  *
1808  * If we allocate a new one we do not mark it dirty. That's up to the
1809  * vm. If we swap it in we mark it dirty since we also free the swap
1810  * entry since a page cannot live in both the swap and page cache.
1811  *
1812  * vma, vmf, and fault_type are only supplied by shmem_fault:
1813  * otherwise they are NULL.
1814  */
shmem_getpage_gfp(struct inode * inode,pgoff_t index,struct page ** pagep,enum sgp_type sgp,gfp_t gfp,struct vm_area_struct * vma,struct vm_fault * vmf,vm_fault_t * fault_type)1815 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1816 	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
1817 	struct vm_area_struct *vma, struct vm_fault *vmf,
1818 			vm_fault_t *fault_type)
1819 {
1820 	struct address_space *mapping = inode->i_mapping;
1821 	struct shmem_inode_info *info = SHMEM_I(inode);
1822 	struct shmem_sb_info *sbinfo;
1823 	struct mm_struct *charge_mm;
1824 	struct page *page;
1825 	pgoff_t hindex = index;
1826 	gfp_t huge_gfp;
1827 	int error;
1828 	int once = 0;
1829 	int alloced = 0;
1830 
1831 	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1832 		return -EFBIG;
1833 repeat:
1834 	if (sgp <= SGP_CACHE &&
1835 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1836 		return -EINVAL;
1837 	}
1838 
1839 	sbinfo = SHMEM_SB(inode->i_sb);
1840 	charge_mm = vma ? vma->vm_mm : NULL;
1841 
1842 	page = pagecache_get_page(mapping, index,
1843 					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
1844 
1845 	if (page && vma && userfaultfd_minor(vma)) {
1846 		if (!xa_is_value(page)) {
1847 			unlock_page(page);
1848 			put_page(page);
1849 		}
1850 		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1851 		return 0;
1852 	}
1853 
1854 	if (xa_is_value(page)) {
1855 		error = shmem_swapin_page(inode, index, &page,
1856 					  sgp, gfp, vma, fault_type);
1857 		if (error == -EEXIST)
1858 			goto repeat;
1859 
1860 		*pagep = page;
1861 		return error;
1862 	}
1863 
1864 	if (page) {
1865 		hindex = page->index;
1866 		if (sgp == SGP_WRITE)
1867 			mark_page_accessed(page);
1868 		if (PageUptodate(page))
1869 			goto out;
1870 		/* fallocated page */
1871 		if (sgp != SGP_READ)
1872 			goto clear;
1873 		unlock_page(page);
1874 		put_page(page);
1875 	}
1876 
1877 	/*
1878 	 * SGP_READ: succeed on hole, with NULL page, letting caller zero.
1879 	 * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail.
1880 	 */
1881 	*pagep = NULL;
1882 	if (sgp == SGP_READ)
1883 		return 0;
1884 	if (sgp == SGP_NOALLOC)
1885 		return -ENOENT;
1886 
1887 	/*
1888 	 * Fast cache lookup and swap lookup did not find it: allocate.
1889 	 */
1890 
1891 	if (vma && userfaultfd_missing(vma)) {
1892 		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1893 		return 0;
1894 	}
1895 
1896 	/* Never use a huge page for shmem_symlink() */
1897 	if (S_ISLNK(inode->i_mode))
1898 		goto alloc_nohuge;
1899 	if (!shmem_is_huge(vma, inode, index))
1900 		goto alloc_nohuge;
1901 
1902 	huge_gfp = vma_thp_gfp_mask(vma);
1903 	huge_gfp = limit_gfp_mask(huge_gfp, gfp);
1904 	page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
1905 	if (IS_ERR(page)) {
1906 alloc_nohuge:
1907 		page = shmem_alloc_and_acct_page(gfp, inode,
1908 						 index, false);
1909 	}
1910 	if (IS_ERR(page)) {
1911 		int retry = 5;
1912 
1913 		error = PTR_ERR(page);
1914 		page = NULL;
1915 		if (error != -ENOSPC)
1916 			goto unlock;
1917 		/*
1918 		 * Try to reclaim some space by splitting a huge page
1919 		 * beyond i_size on the filesystem.
1920 		 */
1921 		while (retry--) {
1922 			int ret;
1923 
1924 			ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1925 			if (ret == SHRINK_STOP)
1926 				break;
1927 			if (ret)
1928 				goto alloc_nohuge;
1929 		}
1930 		goto unlock;
1931 	}
1932 
1933 	if (PageTransHuge(page))
1934 		hindex = round_down(index, HPAGE_PMD_NR);
1935 	else
1936 		hindex = index;
1937 
1938 	if (sgp == SGP_WRITE)
1939 		__SetPageReferenced(page);
1940 
1941 	error = shmem_add_to_page_cache(page, mapping, hindex,
1942 					NULL, gfp & GFP_RECLAIM_MASK,
1943 					charge_mm);
1944 	if (error)
1945 		goto unacct;
1946 	lru_cache_add(page);
1947 
1948 	spin_lock_irq(&info->lock);
1949 	info->alloced += compound_nr(page);
1950 	inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1951 	shmem_recalc_inode(inode);
1952 	spin_unlock_irq(&info->lock);
1953 	alloced = true;
1954 
1955 	if (PageTransHuge(page) &&
1956 	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1957 			hindex + HPAGE_PMD_NR - 1) {
1958 		/*
1959 		 * Part of the huge page is beyond i_size: subject
1960 		 * to shrink under memory pressure.
1961 		 */
1962 		spin_lock(&sbinfo->shrinklist_lock);
1963 		/*
1964 		 * _careful to defend against unlocked access to
1965 		 * ->shrink_list in shmem_unused_huge_shrink()
1966 		 */
1967 		if (list_empty_careful(&info->shrinklist)) {
1968 			list_add_tail(&info->shrinklist,
1969 				      &sbinfo->shrinklist);
1970 			sbinfo->shrinklist_len++;
1971 		}
1972 		spin_unlock(&sbinfo->shrinklist_lock);
1973 	}
1974 
1975 	/*
1976 	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1977 	 */
1978 	if (sgp == SGP_FALLOC)
1979 		sgp = SGP_WRITE;
1980 clear:
1981 	/*
1982 	 * Let SGP_WRITE caller clear ends if write does not fill page;
1983 	 * but SGP_FALLOC on a page fallocated earlier must initialize
1984 	 * it now, lest undo on failure cancel our earlier guarantee.
1985 	 */
1986 	if (sgp != SGP_WRITE && !PageUptodate(page)) {
1987 		int i;
1988 
1989 		for (i = 0; i < compound_nr(page); i++) {
1990 			clear_highpage(page + i);
1991 			flush_dcache_page(page + i);
1992 		}
1993 		SetPageUptodate(page);
1994 	}
1995 
1996 	/* Perhaps the file has been truncated since we checked */
1997 	if (sgp <= SGP_CACHE &&
1998 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1999 		if (alloced) {
2000 			ClearPageDirty(page);
2001 			delete_from_page_cache(page);
2002 			spin_lock_irq(&info->lock);
2003 			shmem_recalc_inode(inode);
2004 			spin_unlock_irq(&info->lock);
2005 		}
2006 		error = -EINVAL;
2007 		goto unlock;
2008 	}
2009 out:
2010 	*pagep = page + index - hindex;
2011 	return 0;
2012 
2013 	/*
2014 	 * Error recovery.
2015 	 */
2016 unacct:
2017 	shmem_inode_unacct_blocks(inode, compound_nr(page));
2018 
2019 	if (PageTransHuge(page)) {
2020 		unlock_page(page);
2021 		put_page(page);
2022 		goto alloc_nohuge;
2023 	}
2024 unlock:
2025 	if (page) {
2026 		unlock_page(page);
2027 		put_page(page);
2028 	}
2029 	if (error == -ENOSPC && !once++) {
2030 		spin_lock_irq(&info->lock);
2031 		shmem_recalc_inode(inode);
2032 		spin_unlock_irq(&info->lock);
2033 		goto repeat;
2034 	}
2035 	if (error == -EEXIST)
2036 		goto repeat;
2037 	return error;
2038 }
2039 
2040 /*
2041  * This is like autoremove_wake_function, but it removes the wait queue
2042  * entry unconditionally - even if something else had already woken the
2043  * target.
2044  */
synchronous_wake_function(wait_queue_entry_t * wait,unsigned mode,int sync,void * key)2045 static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
2046 {
2047 	int ret = default_wake_function(wait, mode, sync, key);
2048 	list_del_init(&wait->entry);
2049 	return ret;
2050 }
2051 
/*
 * Fault handler: fetch the page at vmf->pgoff into vmf->page through
 * shmem_getpage_gfp() with SGP_CACHE.
 *
 * If inode->i_private points at a shmem_falloc with a waitq whose
 * [start, next) range covers vmf->pgoff, no page is looked up: the task
 * sleeps on that waitq and then returns VM_FAULT_NOPAGE, or
 * VM_FAULT_RETRY when maybe_unlock_mmap_for_io() handed back a pinned
 * file.
 *
 * Otherwise returns the fault type (initially VM_FAULT_LOCKED, which
 * shmem_getpage_gfp() may overwrite through &ret), or vmf_error() of the
 * error shmem_getpage_gfp() reported.
 */
static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = file_inode(vma->vm_file);
	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
	int err;
	vm_fault_t ret = VM_FAULT_LOCKED;

	/*
	 * Trinity finds that probing a hole which tmpfs is punching can
	 * prevent the hole-punch from ever completing: which in turn
	 * locks writers out with its hold on i_rwsem.  So refrain from
	 * faulting pages into the hole while it's being punched.  Although
	 * shmem_undo_range() does remove the additions, it may be unable to
	 * keep up, as each new page needs its own unmap_mapping_range() call,
	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
	 *
	 * It does not matter if we sometimes reach this check just before the
	 * hole-punch begins, so that one fault then races with the punch:
	 * we just need to make racing faults a rare case.
	 *
	 * The implementation below would be much simpler if we just used a
	 * standard mutex or completion: but we cannot take i_rwsem in fault,
	 * and bloating every shmem inode for this unlikely case would be sad.
	 */
	/* Unlocked peek first; i_private is re-read under i_lock below */
	if (unlikely(inode->i_private)) {
		struct shmem_falloc *shmem_falloc;

		spin_lock(&inode->i_lock);
		shmem_falloc = inode->i_private;
		if (shmem_falloc &&
		    shmem_falloc->waitq &&
		    vmf->pgoff >= shmem_falloc->start &&
		    vmf->pgoff < shmem_falloc->next) {
			struct file *fpin;
			wait_queue_head_t *shmem_falloc_waitq;
			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

			ret = VM_FAULT_NOPAGE;
			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
			if (fpin)
				ret = VM_FAULT_RETRY;

			/* Queue ourselves while still holding i_lock, then sleep */
			shmem_falloc_waitq = shmem_falloc->waitq;
			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			schedule();

			/*
			 * shmem_falloc_waitq points into the shmem_fallocate()
			 * stack of the hole-punching task: shmem_falloc_waitq
			 * is usually invalid by the time we reach here, but
			 * finish_wait() does not dereference it in that case;
			 * though i_lock needed lest racing with wake_up_all().
			 */
			spin_lock(&inode->i_lock);
			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
			spin_unlock(&inode->i_lock);

			/* Drop the file reference taken by maybe_unlock_mmap_for_io() */
			if (fpin)
				fput(fpin);
			return ret;
		}
		spin_unlock(&inode->i_lock);
	}

	err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
				  gfp, vma, vmf, &ret);
	if (err)
		return vmf_error(err);
	return ret;
}
2125 
/*
 * Choose an address for a shmem mapping of @len bytes at file page
 * offset @pgoff.
 *
 * An address is first obtained from current->mm->get_unmapped_area().
 * When CONFIG_TRANSPARENT_HUGEPAGE is enabled and none of the bail-out
 * checks below applies, a second search is made for a longer area, and
 * the start is then slid forward so that the address has the same offset
 * within a HPAGE_PMD_SIZE unit as the file offset does.  Any failure in
 * that second stage falls back to the address found first.
 *
 * Returns the chosen address, -ENOMEM if @len exceeds TASK_SIZE, or the
 * error value produced by the first get_unmapped_area() call.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long uaddr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	unsigned long (*get_area)(struct file *,
		unsigned long, unsigned long, unsigned long, unsigned long);
	unsigned long addr;
	unsigned long offset;
	unsigned long inflated_len;
	unsigned long inflated_addr;
	unsigned long inflated_offset;

	if (len > TASK_SIZE)
		return -ENOMEM;

	get_area = current->mm->get_unmapped_area;
	addr = get_area(file, uaddr, len, pgoff, flags);

	/* From here on, every "return addr" keeps the first answer unchanged */
	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
		return addr;
	if (IS_ERR_VALUE(addr))
		return addr;
	if (addr & ~PAGE_MASK)
		return addr;
	if (addr > TASK_SIZE - len)
		return addr;

	if (shmem_huge == SHMEM_HUGE_DENY)
		return addr;
	if (len < HPAGE_PMD_SIZE)
		return addr;
	if (flags & MAP_FIXED)
		return addr;
	/*
	 * Our priority is to support MAP_SHARED mapped hugely;
	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
	 * But if caller specified an address hint and we allocated area there
	 * successfully, respect that as before.
	 */
	if (uaddr == addr)
		return addr;

	if (shmem_huge != SHMEM_HUGE_FORCE) {
		struct super_block *sb;

		if (file) {
			VM_BUG_ON(file->f_op != &shmem_file_operations);
			sb = file_inode(file)->i_sb;
		} else {
			/*
			 * Called directly from mm/mmap.c, or drivers/char/mem.c
			 * for "/dev/zero", to create a shared anonymous object.
			 */
			if (IS_ERR(shm_mnt))
				return addr;
			sb = shm_mnt->mnt_sb;
		}
		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
			return addr;
	}

	/* Byte offset of @pgoff within a HPAGE_PMD_SIZE unit of the file */
	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
		return addr;
	/* The first address already has the wanted offset: nothing to do */
	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
		return addr;

	/* Ask for extra room so the start can be slid forward afterwards */
	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
	if (inflated_len > TASK_SIZE)
		return addr;
	/* Unsigned wraparound in the addition above */
	if (inflated_len < len)
		return addr;

	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
	if (IS_ERR_VALUE(inflated_addr))
		return addr;
	if (inflated_addr & ~PAGE_MASK)
		return addr;

	/*
	 * Replace the low bits of inflated_addr with @offset; if that moved
	 * the address backwards, step up by one HPAGE_PMD_SIZE so the result
	 * is never below the start of the area just found.
	 */
	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)
		inflated_addr += HPAGE_PMD_SIZE;

	if (inflated_addr > TASK_SIZE - len)
		return addr;
	return inflated_addr;
}
2214 
2215 #ifdef CONFIG_NUMA
shmem_set_policy(struct vm_area_struct * vma,struct mempolicy * mpol)2216 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2217 {
2218 	struct inode *inode = file_inode(vma->vm_file);
2219 	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2220 }
2221 
shmem_get_policy(struct vm_area_struct * vma,unsigned long addr)2222 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2223 					  unsigned long addr)
2224 {
2225 	struct inode *inode = file_inode(vma->vm_file);
2226 	pgoff_t index;
2227 
2228 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2229 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2230 }
2231 #endif
2232 
shmem_lock(struct file * file,int lock,struct ucounts * ucounts)2233 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2234 {
2235 	struct inode *inode = file_inode(file);
2236 	struct shmem_inode_info *info = SHMEM_I(inode);
2237 	int retval = -ENOMEM;
2238 
2239 	/*
2240 	 * What serializes the accesses to info->flags?
2241 	 * ipc_lock_object() when called from shmctl_do_lock(),
2242 	 * no serialization needed when called from shm_destroy().
2243 	 */
2244 	if (lock && !(info->flags & VM_LOCKED)) {
2245 		if (!user_shm_lock(inode->i_size, ucounts))
2246 			goto out_nomem;
2247 		info->flags |= VM_LOCKED;
2248 		mapping_set_unevictable(file->f_mapping);
2249 	}
2250 	if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2251 		user_shm_unlock(inode->i_size, ucounts);
2252 		info->flags &= ~VM_LOCKED;
2253 		mapping_clear_unevictable(file->f_mapping);
2254 	}
2255 	retval = 0;
2256 
2257 out_nomem:
2258 	return retval;
2259 }
2260 
shmem_mmap(struct file * file,struct vm_area_struct * vma)2261 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2262 {
2263 	struct shmem_inode_info *info = SHMEM_I(file_inode(file));
2264 	int ret;
2265 
2266 	ret = seal_check_future_write(info->seals, vma);
2267 	if (ret)
2268 		return ret;
2269 
2270 	/* arm64 - allow memory tagging on RAM-based files */
2271 	vma->vm_flags |= VM_MTE_ALLOWED;
2272 
2273 	file_accessed(file);
2274 	vma->vm_ops = &shmem_vm_ops;
2275 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2276 			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
2277 			(vma->vm_end & HPAGE_PMD_MASK)) {
2278 		khugepaged_enter(vma, vma->vm_flags);
2279 	}
2280 	return 0;
2281 }
2282 
/*
 * Reserve an inode number on @sb, allocate a new inode and initialise
 * both its generic fields and its shmem_inode_info, then hook up the
 * operations that match the file type in @mode.
 *
 * Returns the new inode, or NULL when shmem_reserve_inode() or
 * new_inode() fails (the reservation is released in the latter case).
 */
static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
				     umode_t mode, dev_t dev, unsigned long flags)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	struct shmem_inode_info *info;
	struct inode *inode;
	ino_t ino;

	if (shmem_reserve_inode(sb, &ino))
		return NULL;

	inode = new_inode(sb);
	if (!inode) {
		/* Undo the reservation made just above */
		shmem_free_inode(sb);
		return NULL;
	}

	inode->i_ino = ino;
	inode_init_owner(&init_user_ns, inode, dir, mode);
	inode->i_blocks = 0;
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
	inode->i_generation = prandom_u32();

	info = SHMEM_I(inode);
	/* Zero the shmem-private fields lying between info and the inode */
	memset(info, 0, (char *)inode - (char *)info);
	spin_lock_init(&info->lock);
	atomic_set(&info->stop_eviction, 0);
	info->seals = F_SEAL_SEAL;
	info->flags = flags & VM_NORESERVE;
	INIT_LIST_HEAD(&info->shrinklist);
	INIT_LIST_HEAD(&info->swaplist);
	simple_xattrs_init(&info->xattrs);
	cache_no_acl(inode);

	switch (mode & S_IFMT) {
	case S_IFREG:
		inode->i_mapping->a_ops = &shmem_aops;
		inode->i_op = &shmem_inode_operations;
		inode->i_fop = &shmem_file_operations;
		mpol_shared_policy_init(&info->policy,
					 shmem_get_sbmpol(sbinfo));
		break;
	case S_IFDIR:
		inc_nlink(inode);
		/* Some things misbehave if size == 0 on a directory */
		inode->i_size = 2 * BOGO_DIRENT_SIZE;
		inode->i_op = &shmem_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		break;
	case S_IFLNK:
		/*
		 * Must not load anything in the rbtree,
		 * mpol_free_shared_policy will not be called.
		 */
		mpol_shared_policy_init(&info->policy, NULL);
		break;
	default:
		inode->i_op = &shmem_special_inode_operations;
		init_special_inode(inode, mode, dev);
		break;
	}

	lockdep_annotate_inode_mutex_key(inode);
	return inode;
}
2345 
2346 #ifdef CONFIG_USERFAULTFD
/*
 * Fill one page of the shmem file behind @dst_vma at @dst_addr and map
 * it there.
 *
 * One block is accounted to the inode first.  Unless the caller passes a
 * page back in through *@pagep, a page is allocated and either filled by
 * copy_from_user() from @src_addr or, when @zeropage is set, cleared.
 * The page is then added to the page cache at the matching file index
 * and installed with mfill_atomic_install_pte().
 *
 * Returns 0 on success.  On failure the block accounting is undone and
 * the result is:
 *   -ENOMEM  accounting or page allocation failed (a page passed in
 *            through *@pagep is released and *@pagep cleared when the
 *            accounting is what failed);
 *   -ENOENT  copy_from_user() could not copy everything: the page is
 *            handed to the caller in *@pagep and is not freed here;
 *   -EFAULT  the file index lies at or beyond i_size;
 *   or the error from shmem_add_to_page_cache() or
 *   mfill_atomic_install_pte().
 */
int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
			   pmd_t *dst_pmd,
			   struct vm_area_struct *dst_vma,
			   unsigned long dst_addr,
			   unsigned long src_addr,
			   bool zeropage,
			   struct page **pagep)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	void *page_kaddr;
	struct page *page;
	int ret;
	pgoff_t max_off;

	if (!shmem_inode_acct_block(inode, 1)) {
		/*
		 * We may have got a page, returned -ENOENT triggering a retry,
		 * and now we find ourselves with -ENOMEM. Release the page, to
		 * avoid a BUG_ON in our caller.
		 */
		if (unlikely(*pagep)) {
			put_page(*pagep);
			*pagep = NULL;
		}
		return -ENOMEM;
	}

	if (!*pagep) {
		ret = -ENOMEM;
		page = shmem_alloc_page(gfp, info, pgoff);
		if (!page)
			goto out_unacct_blocks;

		if (!zeropage) {	/* COPY */
			page_kaddr = kmap_atomic(page);
			ret = copy_from_user(page_kaddr,
					     (const void __user *)src_addr,
					     PAGE_SIZE);
			kunmap_atomic(page_kaddr);

			/* fallback to copy_from_user outside mmap_lock */
			if (unlikely(ret)) {
				*pagep = page;
				ret = -ENOENT;
				/* don't free the page */
				goto out_unacct_blocks;
			}

			flush_dcache_page(page);
		} else {		/* ZEROPAGE */
			clear_user_highpage(page, dst_addr);
		}
	} else {
		/* Take over the page the caller carried across the retry */
		page = *pagep;
		*pagep = NULL;
	}

	VM_BUG_ON(PageLocked(page));
	VM_BUG_ON(PageSwapBacked(page));
	__SetPageLocked(page);
	__SetPageSwapBacked(page);
	__SetPageUptodate(page);

	ret = -EFAULT;
	/* Number of pages needed to cover the current i_size */
	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(pgoff >= max_off))
		goto out_release;

	ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
				      gfp & GFP_RECLAIM_MASK, dst_mm);
	if (ret)
		goto out_release;

	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       page, true, false);
	if (ret)
		goto out_delete_from_cache;

	/* Success: count the new page in the inode's bookkeeping */
	spin_lock_irq(&info->lock);
	info->alloced++;
	inode->i_blocks += BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock_irq(&info->lock);

	SetPageDirty(page);
	unlock_page(page);
	return 0;
	/* Error unwinding: each label falls through to the ones below it */
out_delete_from_cache:
	delete_from_page_cache(page);
out_release:
	unlock_page(page);
	put_page(page);
out_unacct_blocks:
	shmem_inode_unacct_blocks(inode, 1);
	return ret;
}
2447 #endif /* CONFIG_USERFAULTFD */
2448 
2449 #ifdef CONFIG_TMPFS
2450 static const struct inode_operations shmem_symlink_inode_operations;
2451 static const struct inode_operations shmem_short_symlink_operations;
2452 
2453 #ifdef CONFIG_TMPFS_XATTR
2454 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2455 #else
2456 #define shmem_initxattrs NULL
2457 #endif
2458 
2459 static int
shmem_write_begin(struct file * file,struct address_space * mapping,loff_t pos,unsigned len,unsigned flags,struct page ** pagep,void ** fsdata)2460 shmem_write_begin(struct file *file, struct address_space *mapping,
2461 			loff_t pos, unsigned len, unsigned flags,
2462 			struct page **pagep, void **fsdata)
2463 {
2464 	struct inode *inode = mapping->host;
2465 	struct shmem_inode_info *info = SHMEM_I(inode);
2466 	pgoff_t index = pos >> PAGE_SHIFT;
2467 	int ret = 0;
2468 
2469 	/* i_rwsem is held by caller */
2470 	if (unlikely(info->seals & (F_SEAL_GROW |
2471 				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2472 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2473 			return -EPERM;
2474 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2475 			return -EPERM;
2476 	}
2477 
2478 	ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
2479 
2480 	if (ret)
2481 		return ret;
2482 
2483 	if (PageHWPoison(*pagep)) {
2484 		unlock_page(*pagep);
2485 		put_page(*pagep);
2486 		*pagep = NULL;
2487 		return -EIO;
2488 	}
2489 
2490 	return 0;
2491 }
2492 
/*
 * Complete a write of @copied bytes at @pos into @page.
 *
 * Extends i_size when the write ended beyond it.  If the page was not
 * yet uptodate, everything the write did not cover is zeroed before the
 * head page is marked uptodate.  The page is then dirtied, unlocked and
 * its reference dropped.
 *
 * Returns @copied.
 */
static int
shmem_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	if (!PageUptodate(page)) {
		struct page *head = compound_head(page);
		if (PageTransCompound(page)) {
			int i;

			/* Clear every subpage except the one being written */
			for (i = 0; i < HPAGE_PMD_NR; i++) {
				if (head + i == page)
					continue;
				clear_highpage(head + i);
				flush_dcache_page(head + i);
			}
		}
		if (copied < PAGE_SIZE) {
			unsigned from = pos & (PAGE_SIZE - 1);
			/* Zero [0, from) and [from + copied, PAGE_SIZE) */
			zero_user_segments(page, 0, from,
					from + copied, PAGE_SIZE);
		}
		SetPageUptodate(head);
	}
	set_page_dirty(page);
	unlock_page(page);
	put_page(page);

	return copied;
}
2528 
/*
 * Read from the file at iocb->ki_pos into @to, one page per iteration.
 *
 * Pages are fetched with shmem_getpage(): with SGP_READ for an iovec
 * iterator, where a NULL page is served from ZERO_PAGE(0), and with
 * SGP_CACHE for any other iterator, where the page is also marked
 * dirty.  i_size is sampled again after each page is obtained, because
 * nothing here holds off a concurrent truncate.
 *
 * iocb->ki_pos is advanced by the amount copied.  Returns the number of
 * bytes copied, or, when nothing was copied, the error that stopped the
 * loop (0 at end of file).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	enum sgp_type sgp = SGP_READ;
	int error = 0;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;

	/*
	 * Might this read be for a stacking filesystem?  Then when reading
	 * holes of a sparse file, we actually need to allocate those pages,
	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
	 */
	if (!iter_is_iovec(to))
		sgp = SGP_CACHE;

	/* Split the file position into a page index and an in-page offset */
	index = *ppos >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	for (;;) {
		struct page *page = NULL;
		pgoff_t end_index;
		unsigned long nr, ret;
		loff_t i_size = i_size_read(inode);

		/* Stop before touching the page cache if already at or past EOF */
		end_index = i_size >> PAGE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset)
				break;
		}

		error = shmem_getpage(inode, index, &page, sgp);
		if (error) {
			/* -EINVAL ends the read without being reported as an error */
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (page) {
			if (sgp == SGP_CACHE)
				set_page_dirty(page);
			unlock_page(page);

			if (PageHWPoison(page)) {
				put_page(page);
				error = -EIO;
				break;
			}
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_rwsem protection against truncate
		 */
		nr = PAGE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset) {
				if (page)
					put_page(page);
				break;
			}
		}
		/* nr: bytes available in this page from offset onwards */
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		} else {
			/* No page was returned: read zeroes from the shared zero page */
			page = ZERO_PAGE(0);
			get_page(page);
		}

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, to);
		retval += ret;
		/* Advance, carrying a completed page from offset into index */
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;

		put_page(page);
		if (!iov_iter_count(to))
			break;
		/* Short copy with room left in the iterator */
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
	file_accessed(file);
	return retval ? retval : error;
}
2644 
/*
 * llseek for shmem files.  SEEK_DATA and SEEK_HOLE are answered by
 * mapping_seek_hole_data() under the inode lock; every other @whence is
 * delegated to generic_file_llseek_size().
 *
 * Returns the new file position, -ENXIO for a negative @offset with
 * SEEK_DATA/SEEK_HOLE, or the negative value the helpers produced.
 */
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos;

	switch (whence) {
	case SEEK_DATA:
	case SEEK_HOLE:
		break;
	default:
		return generic_file_llseek_size(file, offset, whence,
					MAX_LFS_FILESIZE, i_size_read(inode));
	}

	if (offset < 0)
		return -ENXIO;

	inode_lock(inode);
	/* We're holding i_rwsem so we can access i_size directly */
	pos = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
	if (pos >= 0)
		pos = vfs_setpos(file, pos, MAX_LFS_FILESIZE);
	inode_unlock(inode);

	return pos;
}
2664 
/*
 * fallocate for shmem files.  Only FALLOC_FL_KEEP_SIZE and
 * FALLOC_FL_PUNCH_HOLE are accepted in @mode; anything else yields
 * -EOPNOTSUPP.  The whole operation runs under inode_lock().
 *
 * FALLOC_FL_PUNCH_HOLE: the range is published through inode->i_private
 * together with an on-stack wait queue, then unmapped and truncated;
 * afterwards i_private is cleared and all waiters are woken.
 *
 * Otherwise: pages covering [@offset, @offset + @len) are obtained one by
 * one with shmem_getpage(SGP_FALLOC) and marked dirty.  If that fails
 * part-way, the pages added so far are removed with shmem_undo_range()
 * and info->fallocend is restored.  i_size is extended unless
 * FALLOC_FL_KEEP_SIZE was given.
 *
 * Returns 0 on success, or a negative errno: -EPERM when a seal forbids
 * the operation, the result of inode_newsize_ok(), -ENOSPC when the
 * request exceeds sbinfo->max_blocks, -EINTR on a pending signal,
 * -ENOMEM when nr_unswapped exceeds nr_falloced, or the error from
 * shmem_getpage().
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
							 loff_t len)
{
	struct inode *inode = file_inode(file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_falloc shmem_falloc;
	pgoff_t start, index, end, undo_fallocend;
	int error;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		struct address_space *mapping = file->f_mapping;
		/* Only whole pages inside the range are unmapped */
		loff_t unmap_start = round_up(offset, PAGE_SIZE);
		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

		/* protected by i_rwsem */
		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
			error = -EPERM;
			goto out;
		}

		/* Publish the range being punched, along with the queue to wait on */
		shmem_falloc.waitq = &shmem_falloc_waitq;
		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
		spin_lock(&inode->i_lock);
		inode->i_private = &shmem_falloc;
		spin_unlock(&inode->i_lock);

		if ((u64)unmap_end > (u64)unmap_start)
			unmap_mapping_range(mapping, unmap_start,
					    1 + unmap_end - unmap_start, 0);
		shmem_truncate_range(inode, offset, offset + len - 1);
		/* No need to unmap again: hole-punching leaves COWed pages */

		/* Withdraw the on-stack state before it goes out of scope */
		spin_lock(&inode->i_lock);
		inode->i_private = NULL;
		wake_up_all(&shmem_falloc_waitq);
		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
		spin_unlock(&inode->i_lock);
		error = 0;
		goto out;
	}

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/* Page index range [start, end) covering the byte range */
	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Try to avoid a swapstorm if len is impossible to satisfy */
	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
		error = -ENOSPC;
		goto out;
	}

	/* waitq stays NULL here: only the hole-punch path sets one */
	shmem_falloc.waitq = NULL;
	shmem_falloc.start = start;
	shmem_falloc.next  = start;
	shmem_falloc.nr_falloced = 0;
	shmem_falloc.nr_unswapped = 0;
	spin_lock(&inode->i_lock);
	inode->i_private = &shmem_falloc;
	spin_unlock(&inode->i_lock);

	/*
	 * info->fallocend is only relevant when huge pages might be
	 * involved: to prevent split_huge_page() freeing fallocated
	 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
	 */
	undo_fallocend = info->fallocend;
	if (info->fallocend < end)
		info->fallocend = end;

	for (index = start; index < end; ) {
		struct page *page;

		/*
		 * Good, the fallocate(2) manpage permits EINTR: we may have
		 * been interrupted because we are using up too much memory.
		 */
		if (signal_pending(current))
			error = -EINTR;
		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
			error = -ENOMEM;
		else
			error = shmem_getpage(inode, index, &page, SGP_FALLOC);
		if (error) {
			info->fallocend = undo_fallocend;
			/* Remove the !PageUptodate pages we added */
			if (index > start) {
				shmem_undo_range(inode,
				    (loff_t)start << PAGE_SHIFT,
				    ((loff_t)index << PAGE_SHIFT) - 1, true);
			}
			goto undone;
		}

		index++;
		/*
		 * Here is a more important optimization than it appears:
		 * a second SGP_FALLOC on the same huge page will clear it,
		 * making it PageUptodate and un-undoable if we fail later.
		 */
		if (PageTransCompound(page)) {
			index = round_up(index, HPAGE_PMD_NR);
			/* Beware 32-bit wraparound */
			if (!index)
				index--;
		}

		/*
		 * Inform shmem_writepage() how far we have reached.
		 * No need for lock or barrier: we have the page lock.
		 */
		if (!PageUptodate(page))
			shmem_falloc.nr_falloced += index - shmem_falloc.next;
		shmem_falloc.next = index;

		/*
		 * If !PageUptodate, leave it that way so that freeable pages
		 * can be recognized if we need to rollback on error later.
		 * But set_page_dirty so that memory pressure will swap rather
		 * than free the pages we are allocating (and SGP_CACHE pages
		 * might still be clean: we now need to mark those dirty too).
		 */
		set_page_dirty(page);
		unlock_page(page);
		put_page(page);
		cond_resched();
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = current_time(inode);
undone:
	/* Reached on success and after rollback: withdraw the on-stack state */
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	inode_unlock(inode);
	return error;
}
2819 
shmem_statfs(struct dentry * dentry,struct kstatfs * buf)2820 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
2821 {
2822 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
2823 
2824 	buf->f_type = TMPFS_MAGIC;
2825 	buf->f_bsize = PAGE_SIZE;
2826 	buf->f_namelen = NAME_MAX;
2827 	if (sbinfo->max_blocks) {
2828 		buf->f_blocks = sbinfo->max_blocks;
2829 		buf->f_bavail =
2830 		buf->f_bfree  = sbinfo->max_blocks -
2831 				percpu_counter_sum(&sbinfo->used_blocks);
2832 	}
2833 	if (sbinfo->max_inodes) {
2834 		buf->f_files = sbinfo->max_inodes;
2835 		buf->f_ffree = sbinfo->free_inodes;
2836 	}
2837 	/* else leave those fields 0 like simple_statfs */
2838 
2839 	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
2840 
2841 	return 0;
2842 }
2843 
2844 /*
2845  * File creation. Allocate an inode, and we're done..
2846  */
2847 static int
shmem_mknod(struct user_namespace * mnt_userns,struct inode * dir,struct dentry * dentry,umode_t mode,dev_t dev)2848 shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
2849 	    struct dentry *dentry, umode_t mode, dev_t dev)
2850 {
2851 	struct inode *inode;
2852 	int error = -ENOSPC;
2853 
2854 	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
2855 	if (inode) {
2856 		error = simple_acl_create(dir, inode);
2857 		if (error)
2858 			goto out_iput;
2859 		error = security_inode_init_security(inode, dir,
2860 						     &dentry->d_name,
2861 						     shmem_initxattrs, NULL);
2862 		if (error && error != -EOPNOTSUPP)
2863 			goto out_iput;
2864 
2865 		error = 0;
2866 		dir->i_size += BOGO_DIRENT_SIZE;
2867 		dir->i_ctime = dir->i_mtime = current_time(dir);
2868 		d_instantiate(dentry, inode);
2869 		dget(dentry); /* Extra count - pin the dentry in core */
2870 	}
2871 	return error;
2872 out_iput:
2873 	iput(inode);
2874 	return error;
2875 }
2876 
2877 static int
shmem_tmpfile(struct user_namespace * mnt_userns,struct inode * dir,struct dentry * dentry,umode_t mode)2878 shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
2879 	      struct dentry *dentry, umode_t mode)
2880 {
2881 	struct inode *inode;
2882 	int error = -ENOSPC;
2883 
2884 	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
2885 	if (inode) {
2886 		error = security_inode_init_security(inode, dir,
2887 						     NULL,
2888 						     shmem_initxattrs, NULL);
2889 		if (error && error != -EOPNOTSUPP)
2890 			goto out_iput;
2891 		error = simple_acl_create(dir, inode);
2892 		if (error)
2893 			goto out_iput;
2894 		d_tmpfile(dentry, inode);
2895 	}
2896 	return error;
2897 out_iput:
2898 	iput(inode);
2899 	return error;
2900 }
2901 
/*
 * Create a directory: shmem_mknod() with S_IFDIR added to @mode, then
 * one more link on the parent @dir.  Returns 0 or shmem_mknod()'s error.
 */
static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
		       struct dentry *dentry, umode_t mode)
{
	int error = shmem_mknod(&init_user_ns, dir, dentry,
				mode | S_IFDIR, 0);

	if (error)
		return error;

	inc_nlink(dir);
	return 0;
}
2913 
/* Create a regular file: shmem_mknod() with S_IFREG added to @mode. */
static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
			struct dentry *dentry, umode_t mode, bool excl)
{
	umode_t file_mode = mode | S_IFREG;

	return shmem_mknod(&init_user_ns, dir, dentry, file_mode, 0);
}
2919 
2920 /*
2921  * Link a file..
2922  */
shmem_link(struct dentry * old_dentry,struct inode * dir,struct dentry * dentry)2923 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2924 {
2925 	struct inode *inode = d_inode(old_dentry);
2926 	int ret = 0;
2927 
2928 	/*
2929 	 * No ordinary (disk based) filesystem counts links as inodes;
2930 	 * but each new link needs a new dentry, pinning lowmem, and
2931 	 * tmpfs dentries cannot be pruned until they are unlinked.
2932 	 * But if an O_TMPFILE file is linked into the tmpfs, the
2933 	 * first link must skip that, to get the accounting right.
2934 	 */
2935 	if (inode->i_nlink) {
2936 		ret = shmem_reserve_inode(inode->i_sb, NULL);
2937 		if (ret)
2938 			goto out;
2939 	}
2940 
2941 	dir->i_size += BOGO_DIRENT_SIZE;
2942 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2943 	inc_nlink(inode);
2944 	ihold(inode);	/* New dentry reference */
2945 	dget(dentry);		/* Extra pinning count for the created dentry */
2946 	d_instantiate(dentry, inode);
2947 out:
2948 	return ret;
2949 }
2950 
shmem_unlink(struct inode * dir,struct dentry * dentry)2951 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2952 {
2953 	struct inode *inode = d_inode(dentry);
2954 
2955 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2956 		shmem_free_inode(inode->i_sb);
2957 
2958 	dir->i_size -= BOGO_DIRENT_SIZE;
2959 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
2960 	drop_nlink(inode);
2961 	dput(dentry);	/* Undo the count from "create" - this does all the work */
2962 	return 0;
2963 }
2964 
shmem_rmdir(struct inode * dir,struct dentry * dentry)2965 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2966 {
2967 	if (!simple_empty(dentry))
2968 		return -ENOTEMPTY;
2969 
2970 	drop_nlink(d_inode(dentry));
2971 	drop_nlink(dir);
2972 	return shmem_unlink(dir, dentry);
2973 }
2974 
shmem_exchange(struct inode * old_dir,struct dentry * old_dentry,struct inode * new_dir,struct dentry * new_dentry)2975 static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
2976 {
2977 	bool old_is_dir = d_is_dir(old_dentry);
2978 	bool new_is_dir = d_is_dir(new_dentry);
2979 
2980 	if (old_dir != new_dir && old_is_dir != new_is_dir) {
2981 		if (old_is_dir) {
2982 			drop_nlink(old_dir);
2983 			inc_nlink(new_dir);
2984 		} else {
2985 			drop_nlink(new_dir);
2986 			inc_nlink(old_dir);
2987 		}
2988 	}
2989 	old_dir->i_ctime = old_dir->i_mtime =
2990 	new_dir->i_ctime = new_dir->i_mtime =
2991 	d_inode(old_dentry)->i_ctime =
2992 	d_inode(new_dentry)->i_ctime = current_time(old_dir);
2993 
2994 	return 0;
2995 }
2996 
shmem_whiteout(struct user_namespace * mnt_userns,struct inode * old_dir,struct dentry * old_dentry)2997 static int shmem_whiteout(struct user_namespace *mnt_userns,
2998 			  struct inode *old_dir, struct dentry *old_dentry)
2999 {
3000 	struct dentry *whiteout;
3001 	int error;
3002 
3003 	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3004 	if (!whiteout)
3005 		return -ENOMEM;
3006 
3007 	error = shmem_mknod(&init_user_ns, old_dir, whiteout,
3008 			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3009 	dput(whiteout);
3010 	if (error)
3011 		return error;
3012 
3013 	/*
3014 	 * Cheat and hash the whiteout while the old dentry is still in
3015 	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3016 	 *
3017 	 * d_lookup() will consistently find one of them at this point,
3018 	 * not sure which one, but that isn't even important.
3019 	 */
3020 	d_rehash(whiteout);
3021 	return 0;
3022 }
3023 
3024 /*
3025  * The VFS layer already does all the dentry stuff for rename,
3026  * we just have to decrement the usage count for the target if
3027  * it exists so that the VFS layer correctly free's it when it
3028  * gets overwritten.
3029  */
shmem_rename2(struct user_namespace * mnt_userns,struct inode * old_dir,struct dentry * old_dentry,struct inode * new_dir,struct dentry * new_dentry,unsigned int flags)3030 static int shmem_rename2(struct user_namespace *mnt_userns,
3031 			 struct inode *old_dir, struct dentry *old_dentry,
3032 			 struct inode *new_dir, struct dentry *new_dentry,
3033 			 unsigned int flags)
3034 {
3035 	struct inode *inode = d_inode(old_dentry);
3036 	int they_are_dirs = S_ISDIR(inode->i_mode);
3037 
3038 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3039 		return -EINVAL;
3040 
3041 	if (flags & RENAME_EXCHANGE)
3042 		return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
3043 
3044 	if (!simple_empty(new_dentry))
3045 		return -ENOTEMPTY;
3046 
3047 	if (flags & RENAME_WHITEOUT) {
3048 		int error;
3049 
3050 		error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
3051 		if (error)
3052 			return error;
3053 	}
3054 
3055 	if (d_really_is_positive(new_dentry)) {
3056 		(void) shmem_unlink(new_dir, new_dentry);
3057 		if (they_are_dirs) {
3058 			drop_nlink(d_inode(new_dentry));
3059 			drop_nlink(old_dir);
3060 		}
3061 	} else if (they_are_dirs) {
3062 		drop_nlink(old_dir);
3063 		inc_nlink(new_dir);
3064 	}
3065 
3066 	old_dir->i_size -= BOGO_DIRENT_SIZE;
3067 	new_dir->i_size += BOGO_DIRENT_SIZE;
3068 	old_dir->i_ctime = old_dir->i_mtime =
3069 	new_dir->i_ctime = new_dir->i_mtime =
3070 	inode->i_ctime = current_time(old_dir);
3071 	return 0;
3072 }
3073 
shmem_symlink(struct user_namespace * mnt_userns,struct inode * dir,struct dentry * dentry,const char * symname)3074 static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
3075 			 struct dentry *dentry, const char *symname)
3076 {
3077 	int error;
3078 	int len;
3079 	struct inode *inode;
3080 	struct page *page;
3081 
3082 	len = strlen(symname) + 1;
3083 	if (len > PAGE_SIZE)
3084 		return -ENAMETOOLONG;
3085 
3086 	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
3087 				VM_NORESERVE);
3088 	if (!inode)
3089 		return -ENOSPC;
3090 
3091 	error = security_inode_init_security(inode, dir, &dentry->d_name,
3092 					     shmem_initxattrs, NULL);
3093 	if (error && error != -EOPNOTSUPP) {
3094 		iput(inode);
3095 		return error;
3096 	}
3097 
3098 	inode->i_size = len-1;
3099 	if (len <= SHORT_SYMLINK_LEN) {
3100 		inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3101 		if (!inode->i_link) {
3102 			iput(inode);
3103 			return -ENOMEM;
3104 		}
3105 		inode->i_op = &shmem_short_symlink_operations;
3106 	} else {
3107 		inode_nohighmem(inode);
3108 		error = shmem_getpage(inode, 0, &page, SGP_WRITE);
3109 		if (error) {
3110 			iput(inode);
3111 			return error;
3112 		}
3113 		inode->i_mapping->a_ops = &shmem_aops;
3114 		inode->i_op = &shmem_symlink_inode_operations;
3115 		memcpy(page_address(page), symname, len);
3116 		SetPageUptodate(page);
3117 		set_page_dirty(page);
3118 		unlock_page(page);
3119 		put_page(page);
3120 	}
3121 	dir->i_size += BOGO_DIRENT_SIZE;
3122 	dir->i_ctime = dir->i_mtime = current_time(dir);
3123 	d_instantiate(dentry, inode);
3124 	dget(dentry);
3125 	return 0;
3126 }
3127 
/*
 * Delayed-call callback installed by shmem_get_link(): @arg is the page
 * that was handed out; mark it accessed and drop the reference.
 */
static void shmem_put_link(void *arg)
{
	struct page *page = arg;

	mark_page_accessed(page);
	put_page(page);
}
3133 
shmem_get_link(struct dentry * dentry,struct inode * inode,struct delayed_call * done)3134 static const char *shmem_get_link(struct dentry *dentry,
3135 				  struct inode *inode,
3136 				  struct delayed_call *done)
3137 {
3138 	struct page *page = NULL;
3139 	int error;
3140 	if (!dentry) {
3141 		page = find_get_page(inode->i_mapping, 0);
3142 		if (!page)
3143 			return ERR_PTR(-ECHILD);
3144 		if (PageHWPoison(page) ||
3145 		    !PageUptodate(page)) {
3146 			put_page(page);
3147 			return ERR_PTR(-ECHILD);
3148 		}
3149 	} else {
3150 		error = shmem_getpage(inode, 0, &page, SGP_READ);
3151 		if (error)
3152 			return ERR_PTR(error);
3153 		if (!page)
3154 			return ERR_PTR(-ECHILD);
3155 		if (PageHWPoison(page)) {
3156 			unlock_page(page);
3157 			put_page(page);
3158 			return ERR_PTR(-ECHILD);
3159 		}
3160 		unlock_page(page);
3161 	}
3162 	set_delayed_call(done, shmem_put_link, page);
3163 	return page_address(page);
3164 }
3165 
3166 #ifdef CONFIG_TMPFS_XATTR
3167 /*
3168  * Superblocks without xattr inode operations may get some security.* xattr
3169  * support from the LSM "for free". As soon as we have any other xattrs
3170  * like ACLs, we also need to implement the security.* handlers at
3171  * filesystem level, though.
3172  */
3173 
3174 /*
3175  * Callback for security_inode_init_security() for acquiring xattrs.
3176  */
shmem_initxattrs(struct inode * inode,const struct xattr * xattr_array,void * fs_info)3177 static int shmem_initxattrs(struct inode *inode,
3178 			    const struct xattr *xattr_array,
3179 			    void *fs_info)
3180 {
3181 	struct shmem_inode_info *info = SHMEM_I(inode);
3182 	const struct xattr *xattr;
3183 	struct simple_xattr *new_xattr;
3184 	size_t len;
3185 
3186 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3187 		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3188 		if (!new_xattr)
3189 			return -ENOMEM;
3190 
3191 		len = strlen(xattr->name) + 1;
3192 		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3193 					  GFP_KERNEL);
3194 		if (!new_xattr->name) {
3195 			kvfree(new_xattr);
3196 			return -ENOMEM;
3197 		}
3198 
3199 		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3200 		       XATTR_SECURITY_PREFIX_LEN);
3201 		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3202 		       xattr->name, len);
3203 
3204 		simple_xattr_list_add(&info->xattrs, new_xattr);
3205 	}
3206 
3207 	return 0;
3208 }
3209 
/*
 * xattr_handler ->get: rebuild the full attribute name from @handler and
 * @name, then look it up in the inode's simple xattr list.
 */
static int shmem_xattr_handler_get(const struct xattr_handler *handler,
				   struct dentry *unused, struct inode *inode,
				   const char *name, void *buffer, size_t size)
{
	const char *full_name = xattr_full_name(handler, name);

	return simple_xattr_get(&SHMEM_I(inode)->xattrs, full_name,
				buffer, size);
}
3219 
/*
 * xattr_handler ->set: rebuild the full attribute name from @handler and
 * @name, then store the value in the inode's simple xattr list.
 */
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
				   struct user_namespace *mnt_userns,
				   struct dentry *unused, struct inode *inode,
				   const char *name, const void *value,
				   size_t size, int flags)
{
	const char *full_name = xattr_full_name(handler, name);

	return simple_xattr_set(&SHMEM_I(inode)->xattrs, full_name,
				value, size, flags, NULL);
}
3231 
3232 static const struct xattr_handler shmem_security_xattr_handler = {
3233 	.prefix = XATTR_SECURITY_PREFIX,
3234 	.get = shmem_xattr_handler_get,
3235 	.set = shmem_xattr_handler_set,
3236 };
3237 
3238 static const struct xattr_handler shmem_trusted_xattr_handler = {
3239 	.prefix = XATTR_TRUSTED_PREFIX,
3240 	.get = shmem_xattr_handler_get,
3241 	.set = shmem_xattr_handler_set,
3242 };
3243 
3244 static const struct xattr_handler *shmem_xattr_handlers[] = {
3245 #ifdef CONFIG_TMPFS_POSIX_ACL
3246 	&posix_acl_access_xattr_handler,
3247 	&posix_acl_default_xattr_handler,
3248 #endif
3249 	&shmem_security_xattr_handler,
3250 	&shmem_trusted_xattr_handler,
3251 	NULL
3252 };
3253 
shmem_listxattr(struct dentry * dentry,char * buffer,size_t size)3254 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3255 {
3256 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3257 	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3258 }
3259 #endif /* CONFIG_TMPFS_XATTR */
3260 
3261 static const struct inode_operations shmem_short_symlink_operations = {
3262 	.get_link	= simple_get_link,
3263 #ifdef CONFIG_TMPFS_XATTR
3264 	.listxattr	= shmem_listxattr,
3265 #endif
3266 };
3267 
3268 static const struct inode_operations shmem_symlink_inode_operations = {
3269 	.get_link	= shmem_get_link,
3270 #ifdef CONFIG_TMPFS_XATTR
3271 	.listxattr	= shmem_listxattr,
3272 #endif
3273 };
3274 
shmem_get_parent(struct dentry * child)3275 static struct dentry *shmem_get_parent(struct dentry *child)
3276 {
3277 	return ERR_PTR(-ESTALE);
3278 }
3279 
shmem_match(struct inode * ino,void * vfh)3280 static int shmem_match(struct inode *ino, void *vfh)
3281 {
3282 	__u32 *fh = vfh;
3283 	__u64 inum = fh[2];
3284 	inum = (inum << 32) | fh[1];
3285 	return ino->i_ino == inum && fh[0] == ino->i_generation;
3286 }
3287 
/* Find any alias of inode, but prefer the one d_find_alias() returns */
static struct dentry *shmem_find_alias(struct inode *inode)
{
	struct dentry *alias = d_find_alias(inode);

	if (alias)
		return alias;
	return d_find_any_alias(inode);
}
3295 
3296 
shmem_fh_to_dentry(struct super_block * sb,struct fid * fid,int fh_len,int fh_type)3297 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3298 		struct fid *fid, int fh_len, int fh_type)
3299 {
3300 	struct inode *inode;
3301 	struct dentry *dentry = NULL;
3302 	u64 inum;
3303 
3304 	if (fh_len < 3)
3305 		return NULL;
3306 
3307 	inum = fid->raw[2];
3308 	inum = (inum << 32) | fid->raw[1];
3309 
3310 	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3311 			shmem_match, fid->raw);
3312 	if (inode) {
3313 		dentry = shmem_find_alias(inode);
3314 		iput(inode);
3315 	}
3316 
3317 	return dentry;
3318 }
3319 
/*
 * export_operations ->encode_fh: write a three-word handle
 * (generation, low inode number half, high inode number half) into @fh.
 *
 * If *@len is below 3 it is set to 3 and FILEID_INVALID is returned;
 * otherwise *@len is set to 3 and 1 is returned.  @parent is unused.
 */
static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
				struct inode *parent)
{
	__u64 ino;

	if (*len < 3) {
		*len = 3;
		return FILEID_INVALID;
	}

	if (inode_unhashed(inode)) {
		/*
		 * insert_inode_hash is unfortunately not idempotent, and
		 * inodes are hashed here rather than at creation time, so
		 * a lock is needed to make sure it is only tried once.
		 */
		static DEFINE_SPINLOCK(hash_lock);

		spin_lock(&hash_lock);
		if (inode_unhashed(inode))
			__insert_inode_hash(inode,
					    inode->i_ino + inode->i_generation);
		spin_unlock(&hash_lock);
	}

	ino = inode->i_ino;
	fh[0] = inode->i_generation;
	fh[1] = ino;
	fh[2] = ino >> 32;

	*len = 3;
	return 1;
}
3349 
3350 static const struct export_operations shmem_export_ops = {
3351 	.get_parent     = shmem_get_parent,
3352 	.encode_fh      = shmem_encode_fh,
3353 	.fh_to_dentry	= shmem_fh_to_dentry,
3354 };
3355 
/*
 * Option identifiers returned by fs_parse() for the entries of
 * shmem_fs_parameters and dispatched on in shmem_parse_one().
 */
enum shmem_param {
	Opt_gid,
	Opt_huge,
	Opt_mode,
	Opt_mpol,
	Opt_nr_blocks,
	Opt_nr_inodes,
	Opt_size,
	Opt_uid,
	Opt_inode32,
	Opt_inode64,
};
3368 
3369 static const struct constant_table shmem_param_enums_huge[] = {
3370 	{"never",	SHMEM_HUGE_NEVER },
3371 	{"always",	SHMEM_HUGE_ALWAYS },
3372 	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
3373 	{"advise",	SHMEM_HUGE_ADVISE },
3374 	{}
3375 };
3376 
3377 const struct fs_parameter_spec shmem_fs_parameters[] = {
3378 	fsparam_u32   ("gid",		Opt_gid),
3379 	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
3380 	fsparam_u32oct("mode",		Opt_mode),
3381 	fsparam_string("mpol",		Opt_mpol),
3382 	fsparam_string("nr_blocks",	Opt_nr_blocks),
3383 	fsparam_string("nr_inodes",	Opt_nr_inodes),
3384 	fsparam_string("size",		Opt_size),
3385 	fsparam_u32   ("uid",		Opt_uid),
3386 	fsparam_flag  ("inode32",	Opt_inode32),
3387 	fsparam_flag  ("inode64",	Opt_inode64),
3388 	{}
3389 };
3390 
shmem_parse_one(struct fs_context * fc,struct fs_parameter * param)3391 static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
3392 {
3393 	struct shmem_options *ctx = fc->fs_private;
3394 	struct fs_parse_result result;
3395 	unsigned long long size;
3396 	char *rest;
3397 	int opt;
3398 	kuid_t kuid;
3399 	kgid_t kgid;
3400 
3401 	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3402 	if (opt < 0)
3403 		return opt;
3404 
3405 	switch (opt) {
3406 	case Opt_size:
3407 		size = memparse(param->string, &rest);
3408 		if (*rest == '%') {
3409 			size <<= PAGE_SHIFT;
3410 			size *= totalram_pages();
3411 			do_div(size, 100);
3412 			rest++;
3413 		}
3414 		if (*rest)
3415 			goto bad_value;
3416 		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3417 		ctx->seen |= SHMEM_SEEN_BLOCKS;
3418 		break;
3419 	case Opt_nr_blocks:
3420 		ctx->blocks = memparse(param->string, &rest);
3421 		if (*rest)
3422 			goto bad_value;
3423 		ctx->seen |= SHMEM_SEEN_BLOCKS;
3424 		break;
3425 	case Opt_nr_inodes:
3426 		ctx->inodes = memparse(param->string, &rest);
3427 		if (*rest)
3428 			goto bad_value;
3429 		ctx->seen |= SHMEM_SEEN_INODES;
3430 		break;
3431 	case Opt_mode:
3432 		ctx->mode = result.uint_32 & 07777;
3433 		break;
3434 	case Opt_uid:
3435 		kuid = make_kuid(current_user_ns(), result.uint_32);
3436 		if (!uid_valid(kuid))
3437 			goto bad_value;
3438 
3439 		/*
3440 		 * The requested uid must be representable in the
3441 		 * filesystem's idmapping.
3442 		 */
3443 		if (!kuid_has_mapping(fc->user_ns, kuid))
3444 			goto bad_value;
3445 
3446 		ctx->uid = kuid;
3447 		break;
3448 	case Opt_gid:
3449 		kgid = make_kgid(current_user_ns(), result.uint_32);
3450 		if (!gid_valid(kgid))
3451 			goto bad_value;
3452 
3453 		/*
3454 		 * The requested gid must be representable in the
3455 		 * filesystem's idmapping.
3456 		 */
3457 		if (!kgid_has_mapping(fc->user_ns, kgid))
3458 			goto bad_value;
3459 
3460 		ctx->gid = kgid;
3461 		break;
3462 	case Opt_huge:
3463 		ctx->huge = result.uint_32;
3464 		if (ctx->huge != SHMEM_HUGE_NEVER &&
3465 		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3466 		      has_transparent_hugepage()))
3467 			goto unsupported_parameter;
3468 		ctx->seen |= SHMEM_SEEN_HUGE;
3469 		break;
3470 	case Opt_mpol:
3471 		if (IS_ENABLED(CONFIG_NUMA)) {
3472 			mpol_put(ctx->mpol);
3473 			ctx->mpol = NULL;
3474 			if (mpol_parse_str(param->string, &ctx->mpol))
3475 				goto bad_value;
3476 			break;
3477 		}
3478 		goto unsupported_parameter;
3479 	case Opt_inode32:
3480 		ctx->full_inums = false;
3481 		ctx->seen |= SHMEM_SEEN_INUMS;
3482 		break;
3483 	case Opt_inode64:
3484 		if (sizeof(ino_t) < 8) {
3485 			return invalfc(fc,
3486 				       "Cannot use inode64 with <64bit inums in kernel\n");
3487 		}
3488 		ctx->full_inums = true;
3489 		ctx->seen |= SHMEM_SEEN_INUMS;
3490 		break;
3491 	}
3492 	return 0;
3493 
3494 unsupported_parameter:
3495 	return invalfc(fc, "Unsupported parameter '%s'", param->key);
3496 bad_value:
3497 	return invalfc(fc, "Bad value for '%s'", param->key);
3498 }
3499 
shmem_parse_options(struct fs_context * fc,void * data)3500 static int shmem_parse_options(struct fs_context *fc, void *data)
3501 {
3502 	char *options = data;
3503 
3504 	if (options) {
3505 		int err = security_sb_eat_lsm_opts(options, &fc->security);
3506 		if (err)
3507 			return err;
3508 	}
3509 
3510 	while (options != NULL) {
3511 		char *this_char = options;
3512 		for (;;) {
3513 			/*
3514 			 * NUL-terminate this option: unfortunately,
3515 			 * mount options form a comma-separated list,
3516 			 * but mpol's nodelist may also contain commas.
3517 			 */
3518 			options = strchr(options, ',');
3519 			if (options == NULL)
3520 				break;
3521 			options++;
3522 			if (!isdigit(*options)) {
3523 				options[-1] = '\0';
3524 				break;
3525 			}
3526 		}
3527 		if (*this_char) {
3528 			char *value = strchr(this_char, '=');
3529 			size_t len = 0;
3530 			int err;
3531 
3532 			if (value) {
3533 				*value++ = '\0';
3534 				len = strlen(value);
3535 			}
3536 			err = vfs_parse_fs_string(fc, this_char, value, len);
3537 			if (err < 0)
3538 				return err;
3539 		}
3540 	}
3541 	return 0;
3542 }
3543 
3544 /*
3545  * Reconfigure a shmem filesystem.
3546  *
3547  * Note that we disallow change from limited->unlimited blocks/inodes while any
3548  * are in use; but we must separately disallow unlimited->limited, because in
3549  * that case we have no record of how much is already in use.
3550  */
shmem_reconfigure(struct fs_context * fc)3551 static int shmem_reconfigure(struct fs_context *fc)
3552 {
3553 	struct shmem_options *ctx = fc->fs_private;
3554 	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
3555 	unsigned long inodes;
3556 	struct mempolicy *mpol = NULL;
3557 	const char *err;
3558 
3559 	raw_spin_lock(&sbinfo->stat_lock);
3560 	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3561 	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3562 		if (!sbinfo->max_blocks) {
3563 			err = "Cannot retroactively limit size";
3564 			goto out;
3565 		}
3566 		if (percpu_counter_compare(&sbinfo->used_blocks,
3567 					   ctx->blocks) > 0) {
3568 			err = "Too small a size for current use";
3569 			goto out;
3570 		}
3571 	}
3572 	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3573 		if (!sbinfo->max_inodes) {
3574 			err = "Cannot retroactively limit inodes";
3575 			goto out;
3576 		}
3577 		if (ctx->inodes < inodes) {
3578 			err = "Too few inodes for current use";
3579 			goto out;
3580 		}
3581 	}
3582 
3583 	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3584 	    sbinfo->next_ino > UINT_MAX) {
3585 		err = "Current inum too high to switch to 32-bit inums";
3586 		goto out;
3587 	}
3588 
3589 	if (ctx->seen & SHMEM_SEEN_HUGE)
3590 		sbinfo->huge = ctx->huge;
3591 	if (ctx->seen & SHMEM_SEEN_INUMS)
3592 		sbinfo->full_inums = ctx->full_inums;
3593 	if (ctx->seen & SHMEM_SEEN_BLOCKS)
3594 		sbinfo->max_blocks  = ctx->blocks;
3595 	if (ctx->seen & SHMEM_SEEN_INODES) {
3596 		sbinfo->max_inodes  = ctx->inodes;
3597 		sbinfo->free_inodes = ctx->inodes - inodes;
3598 	}
3599 
3600 	/*
3601 	 * Preserve previous mempolicy unless mpol remount option was specified.
3602 	 */
3603 	if (ctx->mpol) {
3604 		mpol = sbinfo->mpol;
3605 		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
3606 		ctx->mpol = NULL;
3607 	}
3608 	raw_spin_unlock(&sbinfo->stat_lock);
3609 	mpol_put(mpol);
3610 	return 0;
3611 out:
3612 	raw_spin_unlock(&sbinfo->stat_lock);
3613 	return invalfc(fc, "%s", err);
3614 }
3615 
shmem_show_options(struct seq_file * seq,struct dentry * root)3616 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3617 {
3618 	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
3619 
3620 	if (sbinfo->max_blocks != shmem_default_max_blocks())
3621 		seq_printf(seq, ",size=%luk",
3622 			sbinfo->max_blocks << (PAGE_SHIFT - 10));
3623 	if (sbinfo->max_inodes != shmem_default_max_inodes())
3624 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
3625 	if (sbinfo->mode != (0777 | S_ISVTX))
3626 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
3627 	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
3628 		seq_printf(seq, ",uid=%u",
3629 				from_kuid_munged(&init_user_ns, sbinfo->uid));
3630 	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
3631 		seq_printf(seq, ",gid=%u",
3632 				from_kgid_munged(&init_user_ns, sbinfo->gid));
3633 
3634 	/*
3635 	 * Showing inode{64,32} might be useful even if it's the system default,
3636 	 * since then people don't have to resort to checking both here and
3637 	 * /proc/config.gz to confirm 64-bit inums were successfully applied
3638 	 * (which may not even exist if IKCONFIG_PROC isn't enabled).
3639 	 *
3640 	 * We hide it when inode64 isn't the default and we are using 32-bit
3641 	 * inodes, since that probably just means the feature isn't even under
3642 	 * consideration.
3643 	 *
3644 	 * As such:
3645 	 *
3646 	 *                     +-----------------+-----------------+
3647 	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
3648 	 *  +------------------+-----------------+-----------------+
3649 	 *  | full_inums=true  | show            | show            |
3650 	 *  | full_inums=false | show            | hide            |
3651 	 *  +------------------+-----------------+-----------------+
3652 	 *
3653 	 */
3654 	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3655 		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3656 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3657 	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3658 	if (sbinfo->huge)
3659 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3660 #endif
3661 	shmem_show_mpol(seq, sbinfo->mpol);
3662 	return 0;
3663 }
3664 
3665 #endif /* CONFIG_TMPFS */
3666 
shmem_put_super(struct super_block * sb)3667 static void shmem_put_super(struct super_block *sb)
3668 {
3669 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3670 
3671 	free_percpu(sbinfo->ino_batch);
3672 	percpu_counter_destroy(&sbinfo->used_blocks);
3673 	mpol_put(sbinfo->mpol);
3674 	kfree(sbinfo);
3675 	sb->s_fs_info = NULL;
3676 }
3677 
shmem_fill_super(struct super_block * sb,struct fs_context * fc)3678 static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
3679 {
3680 	struct shmem_options *ctx = fc->fs_private;
3681 	struct inode *inode;
3682 	struct shmem_sb_info *sbinfo;
3683 
3684 	/* Round up to L1_CACHE_BYTES to resist false sharing */
3685 	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
3686 				L1_CACHE_BYTES), GFP_KERNEL);
3687 	if (!sbinfo)
3688 		return -ENOMEM;
3689 
3690 	sb->s_fs_info = sbinfo;
3691 
3692 #ifdef CONFIG_TMPFS
3693 	/*
3694 	 * Per default we only allow half of the physical ram per
3695 	 * tmpfs instance, limiting inodes to one per page of lowmem;
3696 	 * but the internal instance is left unlimited.
3697 	 */
3698 	if (!(sb->s_flags & SB_KERNMOUNT)) {
3699 		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3700 			ctx->blocks = shmem_default_max_blocks();
3701 		if (!(ctx->seen & SHMEM_SEEN_INODES))
3702 			ctx->inodes = shmem_default_max_inodes();
3703 		if (!(ctx->seen & SHMEM_SEEN_INUMS))
3704 			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
3705 	} else {
3706 		sb->s_flags |= SB_NOUSER;
3707 	}
3708 	sb->s_export_op = &shmem_export_ops;
3709 	sb->s_flags |= SB_NOSEC;
3710 #else
3711 	sb->s_flags |= SB_NOUSER;
3712 #endif
3713 	sbinfo->max_blocks = ctx->blocks;
3714 	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3715 	if (sb->s_flags & SB_KERNMOUNT) {
3716 		sbinfo->ino_batch = alloc_percpu(ino_t);
3717 		if (!sbinfo->ino_batch)
3718 			goto failed;
3719 	}
3720 	sbinfo->uid = ctx->uid;
3721 	sbinfo->gid = ctx->gid;
3722 	sbinfo->full_inums = ctx->full_inums;
3723 	sbinfo->mode = ctx->mode;
3724 	sbinfo->huge = ctx->huge;
3725 	sbinfo->mpol = ctx->mpol;
3726 	ctx->mpol = NULL;
3727 
3728 	raw_spin_lock_init(&sbinfo->stat_lock);
3729 	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3730 		goto failed;
3731 	spin_lock_init(&sbinfo->shrinklist_lock);
3732 	INIT_LIST_HEAD(&sbinfo->shrinklist);
3733 
3734 	sb->s_maxbytes = MAX_LFS_FILESIZE;
3735 	sb->s_blocksize = PAGE_SIZE;
3736 	sb->s_blocksize_bits = PAGE_SHIFT;
3737 	sb->s_magic = TMPFS_MAGIC;
3738 	sb->s_op = &shmem_ops;
3739 	sb->s_time_gran = 1;
3740 #ifdef CONFIG_TMPFS_XATTR
3741 	sb->s_xattr = shmem_xattr_handlers;
3742 #endif
3743 #ifdef CONFIG_TMPFS_POSIX_ACL
3744 	sb->s_flags |= SB_POSIXACL;
3745 #endif
3746 	uuid_gen(&sb->s_uuid);
3747 
3748 	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
3749 	if (!inode)
3750 		goto failed;
3751 	inode->i_uid = sbinfo->uid;
3752 	inode->i_gid = sbinfo->gid;
3753 	sb->s_root = d_make_root(inode);
3754 	if (!sb->s_root)
3755 		goto failed;
3756 	return 0;
3757 
3758 failed:
3759 	shmem_put_super(sb);
3760 	return -ENOMEM;
3761 }
3762 
shmem_get_tree(struct fs_context * fc)3763 static int shmem_get_tree(struct fs_context *fc)
3764 {
3765 	return get_tree_nodev(fc, shmem_fill_super);
3766 }
3767 
shmem_free_fc(struct fs_context * fc)3768 static void shmem_free_fc(struct fs_context *fc)
3769 {
3770 	struct shmem_options *ctx = fc->fs_private;
3771 
3772 	if (ctx) {
3773 		mpol_put(ctx->mpol);
3774 		kfree(ctx);
3775 	}
3776 }
3777 
3778 static const struct fs_context_operations shmem_fs_context_ops = {
3779 	.free			= shmem_free_fc,
3780 	.get_tree		= shmem_get_tree,
3781 #ifdef CONFIG_TMPFS
3782 	.parse_monolithic	= shmem_parse_options,
3783 	.parse_param		= shmem_parse_one,
3784 	.reconfigure		= shmem_reconfigure,
3785 #endif
3786 };
3787 
/* Slab cache of struct shmem_inode_info, set up by shmem_init_inodecache(). */
static struct kmem_cache *shmem_inode_cachep;
3789 
shmem_alloc_inode(struct super_block * sb)3790 static struct inode *shmem_alloc_inode(struct super_block *sb)
3791 {
3792 	struct shmem_inode_info *info;
3793 	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
3794 	if (!info)
3795 		return NULL;
3796 	return &info->vfs_inode;
3797 }
3798 
shmem_free_in_core_inode(struct inode * inode)3799 static void shmem_free_in_core_inode(struct inode *inode)
3800 {
3801 	if (S_ISLNK(inode->i_mode))
3802 		kfree(inode->i_link);
3803 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
3804 }
3805 
shmem_destroy_inode(struct inode * inode)3806 static void shmem_destroy_inode(struct inode *inode)
3807 {
3808 	if (S_ISREG(inode->i_mode))
3809 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
3810 }
3811 
shmem_init_inode(void * foo)3812 static void shmem_init_inode(void *foo)
3813 {
3814 	struct shmem_inode_info *info = foo;
3815 	inode_init_once(&info->vfs_inode);
3816 }
3817 
shmem_init_inodecache(void)3818 static void shmem_init_inodecache(void)
3819 {
3820 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
3821 				sizeof(struct shmem_inode_info),
3822 				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
3823 }
3824 
shmem_destroy_inodecache(void)3825 static void shmem_destroy_inodecache(void)
3826 {
3827 	kmem_cache_destroy(shmem_inode_cachep);
3828 }
3829 
/*
 * ->error_remove_page: keep the page in the page cache instead of
 * truncating it; nothing is done and 0 is returned.
 */
static int shmem_error_remove_page(struct address_space *mapping,
				   struct page *page)
{
	return 0;
}
3836 
3837 const struct address_space_operations shmem_aops = {
3838 	.writepage	= shmem_writepage,
3839 	.set_page_dirty	= __set_page_dirty_no_writeback,
3840 #ifdef CONFIG_TMPFS
3841 	.write_begin	= shmem_write_begin,
3842 	.write_end	= shmem_write_end,
3843 #endif
3844 #ifdef CONFIG_MIGRATION
3845 	.migratepage	= migrate_page,
3846 #endif
3847 	.error_remove_page = shmem_error_remove_page,
3848 };
3849 EXPORT_SYMBOL(shmem_aops);
3850 
3851 static const struct file_operations shmem_file_operations = {
3852 	.mmap		= shmem_mmap,
3853 	.get_unmapped_area = shmem_get_unmapped_area,
3854 #ifdef CONFIG_TMPFS
3855 	.llseek		= shmem_file_llseek,
3856 	.read_iter	= shmem_file_read_iter,
3857 	.write_iter	= generic_file_write_iter,
3858 	.fsync		= noop_fsync,
3859 	.splice_read	= generic_file_splice_read,
3860 	.splice_write	= iter_file_splice_write,
3861 	.fallocate	= shmem_fallocate,
3862 #endif
3863 };
3864 
3865 static const struct inode_operations shmem_inode_operations = {
3866 	.getattr	= shmem_getattr,
3867 	.setattr	= shmem_setattr,
3868 #ifdef CONFIG_TMPFS_XATTR
3869 	.listxattr	= shmem_listxattr,
3870 	.set_acl	= simple_set_acl,
3871 #endif
3872 };
3873 
3874 static const struct inode_operations shmem_dir_inode_operations = {
3875 #ifdef CONFIG_TMPFS
3876 	.create		= shmem_create,
3877 	.lookup		= simple_lookup,
3878 	.link		= shmem_link,
3879 	.unlink		= shmem_unlink,
3880 	.symlink	= shmem_symlink,
3881 	.mkdir		= shmem_mkdir,
3882 	.rmdir		= shmem_rmdir,
3883 	.mknod		= shmem_mknod,
3884 	.rename		= shmem_rename2,
3885 	.tmpfile	= shmem_tmpfile,
3886 #endif
3887 #ifdef CONFIG_TMPFS_XATTR
3888 	.listxattr	= shmem_listxattr,
3889 #endif
3890 #ifdef CONFIG_TMPFS_POSIX_ACL
3891 	.setattr	= shmem_setattr,
3892 	.set_acl	= simple_set_acl,
3893 #endif
3894 };
3895 
3896 static const struct inode_operations shmem_special_inode_operations = {
3897 #ifdef CONFIG_TMPFS_XATTR
3898 	.listxattr	= shmem_listxattr,
3899 #endif
3900 #ifdef CONFIG_TMPFS_POSIX_ACL
3901 	.setattr	= shmem_setattr,
3902 	.set_acl	= simple_set_acl,
3903 #endif
3904 };
3905 
3906 static const struct super_operations shmem_ops = {
3907 	.alloc_inode	= shmem_alloc_inode,
3908 	.free_inode	= shmem_free_in_core_inode,
3909 	.destroy_inode	= shmem_destroy_inode,
3910 #ifdef CONFIG_TMPFS
3911 	.statfs		= shmem_statfs,
3912 	.show_options	= shmem_show_options,
3913 #endif
3914 	.evict_inode	= shmem_evict_inode,
3915 	.drop_inode	= generic_delete_inode,
3916 	.put_super	= shmem_put_super,
3917 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3918 	.nr_cached_objects	= shmem_unused_huge_count,
3919 	.free_cached_objects	= shmem_unused_huge_scan,
3920 #endif
3921 };
3922 
3923 static const struct vm_operations_struct shmem_vm_ops = {
3924 	.fault		= shmem_fault,
3925 	.map_pages	= filemap_map_pages,
3926 #ifdef CONFIG_NUMA
3927 	.set_policy     = shmem_set_policy,
3928 	.get_policy     = shmem_get_policy,
3929 #endif
3930 };
3931 
shmem_init_fs_context(struct fs_context * fc)3932 int shmem_init_fs_context(struct fs_context *fc)
3933 {
3934 	struct shmem_options *ctx;
3935 
3936 	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
3937 	if (!ctx)
3938 		return -ENOMEM;
3939 
3940 	ctx->mode = 0777 | S_ISVTX;
3941 	ctx->uid = current_fsuid();
3942 	ctx->gid = current_fsgid();
3943 
3944 	fc->fs_private = ctx;
3945 	fc->ops = &shmem_fs_context_ops;
3946 	return 0;
3947 }
3948 
3949 static struct file_system_type shmem_fs_type = {
3950 	.owner		= THIS_MODULE,
3951 	.name		= "tmpfs",
3952 	.init_fs_context = shmem_init_fs_context,
3953 #ifdef CONFIG_TMPFS
3954 	.parameters	= shmem_fs_parameters,
3955 #endif
3956 	.kill_sb	= kill_litter_super,
3957 	.fs_flags	= FS_USERNS_MOUNT | FS_THP_SUPPORT,
3958 };
3959 
shmem_init(void)3960 int __init shmem_init(void)
3961 {
3962 	int error;
3963 
3964 	shmem_init_inodecache();
3965 
3966 	error = register_filesystem(&shmem_fs_type);
3967 	if (error) {
3968 		pr_err("Could not register tmpfs\n");
3969 		goto out2;
3970 	}
3971 
3972 	shm_mnt = kern_mount(&shmem_fs_type);
3973 	if (IS_ERR(shm_mnt)) {
3974 		error = PTR_ERR(shm_mnt);
3975 		pr_err("Could not kern_mount tmpfs\n");
3976 		goto out1;
3977 	}
3978 
3979 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3980 	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
3981 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3982 	else
3983 		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
3984 #endif
3985 	return 0;
3986 
3987 out1:
3988 	unregister_filesystem(&shmem_fs_type);
3989 out2:
3990 	shmem_destroy_inodecache();
3991 	shm_mnt = ERR_PTR(error);
3992 	return error;
3993 }
3994 
3995 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
shmem_enabled_show(struct kobject * kobj,struct kobj_attribute * attr,char * buf)3996 static ssize_t shmem_enabled_show(struct kobject *kobj,
3997 				  struct kobj_attribute *attr, char *buf)
3998 {
3999 	static const int values[] = {
4000 		SHMEM_HUGE_ALWAYS,
4001 		SHMEM_HUGE_WITHIN_SIZE,
4002 		SHMEM_HUGE_ADVISE,
4003 		SHMEM_HUGE_NEVER,
4004 		SHMEM_HUGE_DENY,
4005 		SHMEM_HUGE_FORCE,
4006 	};
4007 	int len = 0;
4008 	int i;
4009 
4010 	for (i = 0; i < ARRAY_SIZE(values); i++) {
4011 		len += sysfs_emit_at(buf, len,
4012 				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
4013 				     i ? " " : "",
4014 				     shmem_format_huge(values[i]));
4015 	}
4016 
4017 	len += sysfs_emit_at(buf, len, "\n");
4018 
4019 	return len;
4020 }
4021 
shmem_enabled_store(struct kobject * kobj,struct kobj_attribute * attr,const char * buf,size_t count)4022 static ssize_t shmem_enabled_store(struct kobject *kobj,
4023 		struct kobj_attribute *attr, const char *buf, size_t count)
4024 {
4025 	char tmp[16];
4026 	int huge;
4027 
4028 	if (count + 1 > sizeof(tmp))
4029 		return -EINVAL;
4030 	memcpy(tmp, buf, count);
4031 	tmp[count] = '\0';
4032 	if (count && tmp[count - 1] == '\n')
4033 		tmp[count - 1] = '\0';
4034 
4035 	huge = shmem_parse_huge(tmp);
4036 	if (huge == -EINVAL)
4037 		return -EINVAL;
4038 	if (!has_transparent_hugepage() &&
4039 			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
4040 		return -EINVAL;
4041 
4042 	shmem_huge = huge;
4043 	if (shmem_huge > SHMEM_HUGE_DENY)
4044 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4045 	return count;
4046 }
4047 
4048 struct kobj_attribute shmem_enabled_attr =
4049 	__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
4050 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
4051 
4052 #else /* !CONFIG_SHMEM */
4053 
/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */
4062 
4063 static struct file_system_type shmem_fs_type = {
4064 	.name		= "tmpfs",
4065 	.init_fs_context = ramfs_init_fs_context,
4066 	.parameters	= ramfs_fs_parameters,
4067 	.kill_sb	= ramfs_kill_sb,
4068 	.fs_flags	= FS_USERNS_MOUNT,
4069 };
4070 
shmem_init(void)4071 int __init shmem_init(void)
4072 {
4073 	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
4074 
4075 	shm_mnt = kern_mount(&shmem_fs_type);
4076 	BUG_ON(IS_ERR(shm_mnt));
4077 
4078 	return 0;
4079 }
4080 
/*
 * tiny-shmem stub: ignores all of its arguments (including
 * @fs_pages_to_unuse, which is left untouched) and returns 0.
 */
int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse)
{
	return 0;
}
4086 
/* tiny-shmem stub: ignores its arguments and returns 0. */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}
4091 
/* tiny-shmem stub: intentionally does nothing. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}
4095 
4096 #ifdef CONFIG_MMU
shmem_get_unmapped_area(struct file * file,unsigned long addr,unsigned long len,unsigned long pgoff,unsigned long flags)4097 unsigned long shmem_get_unmapped_area(struct file *file,
4098 				      unsigned long addr, unsigned long len,
4099 				      unsigned long pgoff, unsigned long flags)
4100 {
4101 	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4102 }
4103 #endif
4104 
/*
 * tiny-shmem version: pass @lstart and @lend through to
 * truncate_inode_pages_range() on the inode's mapping.
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	struct address_space *mapping = inode->i_mapping;

	truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
4110 
/*
 * Without CONFIG_SHMEM the common code below is built on top of ramfs: map
 * the shmem names it uses onto their ramfs/generic counterparts (the inode
 * helper drops its @flags argument) and turn size accounting into a no-op
 * that always reports success.
 */
#define shmem_file_operations			ramfs_file_operations
#define shmem_vm_ops				generic_file_vm_ops
#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
#define shmem_unacct_size(flags, size)		do {} while (0)
#define shmem_acct_size(flags, size)		0
4116 
4117 #endif /* CONFIG_SHMEM */
4118 
4119 /* common code */
4120 
/*
 * __shmem_file_setup - common worker for the shmem_*file_setup() helpers
 * @mnt: mount to create the file on; an ERR_PTR value is passed straight back
 * @name: name for the dentry
 * @size: size to be set for the file, must be within 0..MAX_LFS_FILESIZE
 * @flags: handed to the size accounting and to shmem_get_inode()
 * @i_flags: extra bits OR-ed into the new inode's i_flags
 *
 * Creates a regular inode (S_IFREG | S_IRWXUGO) with its link count cleared
 * and wraps it in a file opened O_RDWR.
 *
 * Return: the new file, or an ERR_PTR: the error carried by @mnt, -EINVAL for
 * a bad @size, -ENOMEM if size accounting fails, -ENOSPC if no inode could be
 * obtained, or whatever ramfs_nommu_expand_for_mapping() or
 * alloc_file_pseudo() reported.
 */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
				       unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *file;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
	if (unlikely(!inode)) {
		/* Nothing was created: give back the accounting taken above. */
		shmem_unacct_size(flags, size);
		return ERR_PTR(-ENOSPC);
	}

	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */

	file = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (IS_ERR(file))
		goto drop_inode;

	file = alloc_file_pseudo(inode, mnt, name, O_RDWR, &shmem_file_operations);
	if (IS_ERR(file))
		goto drop_inode;

	return file;

drop_inode:
	iput(inode);
	return file;
}
4153 
4154 /**
4155  * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4156  * 	kernel internal.  There will be NO LSM permission checks against the
4157  * 	underlying inode.  So users of this interface must do LSM checks at a
4158  *	higher layer.  The users are the big_key and shm implementations.  LSM
4159  *	checks are provided at the key or shm level rather than the inode.
4160  * @name: name for dentry (to be seen in /proc/<pid>/maps
4161  * @size: size to be set for the file
4162  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4163  */
shmem_kernel_file_setup(const char * name,loff_t size,unsigned long flags)4164 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4165 {
4166 	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4167 }
4168 
4169 /**
4170  * shmem_file_setup - get an unlinked file living in tmpfs
4171  * @name: name for dentry (to be seen in /proc/<pid>/maps
4172  * @size: size to be set for the file
4173  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4174  */
shmem_file_setup(const char * name,loff_t size,unsigned long flags)4175 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4176 {
4177 	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4178 }
4179 EXPORT_SYMBOL_GPL(shmem_file_setup);
4180 
4181 /**
4182  * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4183  * @mnt: the tmpfs mount where the file will be created
4184  * @name: name for dentry (to be seen in /proc/<pid>/maps
4185  * @size: size to be set for the file
4186  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4187  */
shmem_file_setup_with_mnt(struct vfsmount * mnt,const char * name,loff_t size,unsigned long flags)4188 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4189 				       loff_t size, unsigned long flags)
4190 {
4191 	return __shmem_file_setup(mnt, name, size, flags, 0);
4192 }
4193 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4194 
4195 /**
4196  * shmem_zero_setup - setup a shared anonymous mapping
4197  * @vma: the vma to be mmapped is prepared by do_mmap
4198  */
shmem_zero_setup(struct vm_area_struct * vma)4199 int shmem_zero_setup(struct vm_area_struct *vma)
4200 {
4201 	struct file *file;
4202 	loff_t size = vma->vm_end - vma->vm_start;
4203 
4204 	/*
4205 	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
4206 	 * between XFS directory reading and selinux: since this file is only
4207 	 * accessible to the user through its mapping, use S_PRIVATE flag to
4208 	 * bypass file security, in the same way as shmem_kernel_file_setup().
4209 	 */
4210 	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
4211 	if (IS_ERR(file))
4212 		return PTR_ERR(file);
4213 
4214 	if (vma->vm_file)
4215 		fput(vma->vm_file);
4216 	vma->vm_file = file;
4217 	vma->vm_ops = &shmem_vm_ops;
4218 
4219 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
4220 			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
4221 			(vma->vm_end & HPAGE_PMD_MASK)) {
4222 		khugepaged_enter(vma, vma->vm_flags);
4223 	}
4224 
4225 	return 0;
4226 }
4227 
4228 /**
4229  * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
4230  * @mapping:	the page's address_space
4231  * @index:	the page index
4232  * @gfp:	the page allocator flags to use if allocating
4233  *
4234  * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4235  * with any new page allocations done using the specified allocation flags.
4236  * But read_cache_page_gfp() uses the ->readpage() method: which does not
4237  * suit tmpfs, since it may have pages in swapcache, and needs to find those
4238  * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
4239  *
4240  * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4241  * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4242  */
shmem_read_mapping_page_gfp(struct address_space * mapping,pgoff_t index,gfp_t gfp)4243 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4244 					 pgoff_t index, gfp_t gfp)
4245 {
4246 #ifdef CONFIG_SHMEM
4247 	struct inode *inode = mapping->host;
4248 	struct page *page;
4249 	int error;
4250 
4251 	BUG_ON(!shmem_mapping(mapping));
4252 	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
4253 				  gfp, NULL, NULL, NULL);
4254 	if (error)
4255 		return ERR_PTR(error);
4256 
4257 	unlock_page(page);
4258 	if (PageHWPoison(page)) {
4259 		put_page(page);
4260 		return ERR_PTR(-EIO);
4261 	}
4262 
4263 	return page;
4264 #else
4265 	/*
4266 	 * The tiny !SHMEM case uses ramfs without swap
4267 	 */
4268 	return read_cache_page_gfp(mapping, index, gfp);
4269 #endif
4270 }
4271 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
4272 
/*
 * reclaim_shmem_address_space - isolate and reclaim the pages of a shmem mapping
 * @mapping: the address_space to walk
 *
 * Walks every entry of @mapping->i_pages under rcu_read_lock(), moves each
 * page that isolate_lru_page() accepts onto a private list, and hands that
 * list to reclaim_pages().
 *
 * Return: -EINVAL if @mapping is not a shmem mapping, otherwise the value
 * returned by reclaim_pages().  Without CONFIG_SHMEM this always returns 0.
 */
int reclaim_shmem_address_space(struct address_space *mapping)
{
#ifdef CONFIG_SHMEM
	struct page *page;
	LIST_HEAD(isolated);
	XA_STATE(xas, &mapping->i_pages, 0);

	if (!shmem_mapping(mapping))
		return -EINVAL;

	lru_add_drain();

	rcu_read_lock();
	xas_for_each(&xas, page, ULONG_MAX) {
		/*
		 * Skip retry entries, value entries, and pages for which
		 * isolate_lru_page() returns nonzero.  The checks must run in
		 * this order.
		 */
		if (xas_retry(&xas, page) || xa_is_value(page) ||
		    isolate_lru_page(page))
			continue;

		list_add(&page->lru, &isolated);

		/* Pause the walk before giving up the CPU under RCU. */
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	return reclaim_pages(&isolated);
#else
	return 0;
#endif
}
EXPORT_SYMBOL_GPL(reclaim_shmem_address_space);
4310