• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  // SPDX-License-Identifier: GPL-2.0-only
2  #include <linux/slab.h>
3  #include <linux/stat.h>
4  #include <linux/sched/xacct.h>
5  #include <linux/fcntl.h>
6  #include <linux/file.h>
7  #include <linux/uio.h>
8  #include <linux/fsnotify.h>
9  #include <linux/security.h>
10  #include <linux/export.h>
11  #include <linux/syscalls.h>
12  #include <linux/pagemap.h>
13  #include <linux/splice.h>
14  #include <linux/compat.h>
15  #include <linux/mount.h>
16  #include <linux/fs.h>
17  #include "internal.h"
18  
19  #include <linux/uaccess.h>
20  #include <asm/unistd.h>
21  
22  /*
23   * Performs necessary checks before doing a clone.
24   *
25   * Can adjust amount of bytes to clone via @req_count argument.
26   * Returns appropriate error code that caller should return or
27   * zero in case the clone should be allowed.
28   */
generic_remap_checks(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,loff_t * req_count,unsigned int remap_flags)29  static int generic_remap_checks(struct file *file_in, loff_t pos_in,
30  				struct file *file_out, loff_t pos_out,
31  				loff_t *req_count, unsigned int remap_flags)
32  {
33  	struct inode *inode_in = file_in->f_mapping->host;
34  	struct inode *inode_out = file_out->f_mapping->host;
35  	uint64_t count = *req_count;
36  	uint64_t bcount;
37  	loff_t size_in, size_out;
38  	loff_t bs = inode_out->i_sb->s_blocksize;
39  	int ret;
40  
41  	/* The start of both ranges must be aligned to an fs block. */
42  	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
43  		return -EINVAL;
44  
45  	/* Ensure offsets don't wrap. */
46  	if (pos_in + count < pos_in || pos_out + count < pos_out)
47  		return -EINVAL;
48  
49  	size_in = i_size_read(inode_in);
50  	size_out = i_size_read(inode_out);
51  
52  	/* Dedupe requires both ranges to be within EOF. */
53  	if ((remap_flags & REMAP_FILE_DEDUP) &&
54  	    (pos_in >= size_in || pos_in + count > size_in ||
55  	     pos_out >= size_out || pos_out + count > size_out))
56  		return -EINVAL;
57  
58  	/* Ensure the infile range is within the infile. */
59  	if (pos_in >= size_in)
60  		return -EINVAL;
61  	count = min(count, size_in - (uint64_t)pos_in);
62  
63  	ret = generic_write_check_limits(file_out, pos_out, &count);
64  	if (ret)
65  		return ret;
66  
67  	/*
68  	 * If the user wanted us to link to the infile's EOF, round up to the
69  	 * next block boundary for this check.
70  	 *
71  	 * Otherwise, make sure the count is also block-aligned, having
72  	 * already confirmed the starting offsets' block alignment.
73  	 */
74  	if (pos_in + count == size_in &&
75  	    (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
76  		bcount = ALIGN(size_in, bs) - pos_in;
77  	} else {
78  		if (!IS_ALIGNED(count, bs))
79  			count = ALIGN_DOWN(count, bs);
80  		bcount = count;
81  	}
82  
83  	/* Don't allow overlapped cloning within the same file. */
84  	if (inode_in == inode_out &&
85  	    pos_out + bcount > pos_in &&
86  	    pos_out < pos_in + bcount)
87  		return -EINVAL;
88  
89  	/*
90  	 * We shortened the request but the caller can't deal with that, so
91  	 * bounce the request back to userspace.
92  	 */
93  	if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
94  		return -EINVAL;
95  
96  	*req_count = count;
97  	return 0;
98  }
99  
/*
 * Validate one side of a remap request.
 *
 * Rejects a negative @pos or @len, or a range whose end goes negative, with
 * -EINVAL.  If the inode has a lock context and mandatory locking applies,
 * the range is checked with locks_mandatory_area() (a zero @len checks up to
 * OFFSET_MAX).  Finally the LSM is asked for read or write permission,
 * depending on @write, and its answer is returned.
 */
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
			     bool write)
{
	struct inode *inode = file_inode(file);

	if (unlikely(pos < 0 || len < 0 || (loff_t) (pos + len) < 0))
		return -EINVAL;

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		loff_t last = OFFSET_MAX;
		int lock_type = F_RDLCK;
		int err;

		if (len)
			last = pos + len - 1;
		if (write)
			lock_type = F_WRLCK;

		err = locks_mandatory_area(inode, file, pos, last, lock_type);
		if (err < 0)
			return err;
	}

	if (write)
		return security_file_permission(file, MAY_WRITE);
	return security_file_permission(file, MAY_READ);
}
123  
124  /*
125   * Ensure that we don't remap a partial EOF block in the middle of something
126   * else.  Assume that the offsets have already been checked for block
127   * alignment.
128   *
129   * For clone we only link a partial EOF block above or at the destination file's
130   * EOF.  For deduplication we accept a partial EOF block only if it ends at the
131   * destination file's EOF (can not link it into the middle of a file).
132   *
133   * Shorten the request if possible.
134   */
generic_remap_check_len(struct inode * inode_in,struct inode * inode_out,loff_t pos_out,loff_t * len,unsigned int remap_flags)135  static int generic_remap_check_len(struct inode *inode_in,
136  				   struct inode *inode_out,
137  				   loff_t pos_out,
138  				   loff_t *len,
139  				   unsigned int remap_flags)
140  {
141  	u64 blkmask = i_blocksize(inode_in) - 1;
142  	loff_t new_len = *len;
143  
144  	if ((*len & blkmask) == 0)
145  		return 0;
146  
147  	if (pos_out + *len < i_size_read(inode_out))
148  		new_len &= ~blkmask;
149  
150  	if (new_len == *len)
151  		return 0;
152  
153  	if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
154  		*len = new_len;
155  		return 0;
156  	}
157  
158  	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
159  }
160  
161  /* Read a page's worth of file data into the page cache. */
vfs_dedupe_get_page(struct inode * inode,loff_t offset)162  static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
163  {
164  	struct page *page;
165  
166  	page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
167  	if (IS_ERR(page))
168  		return page;
169  	if (!PageUptodate(page)) {
170  		put_page(page);
171  		return ERR_PTR(-EIO);
172  	}
173  	return page;
174  }
175  
176  /*
177   * Lock two pages, ensuring that we lock in offset order if the pages are from
178   * the same file.
179   */
vfs_lock_two_pages(struct page * page1,struct page * page2)180  static void vfs_lock_two_pages(struct page *page1, struct page *page2)
181  {
182  	/* Always lock in order of increasing index. */
183  	if (page1->index > page2->index)
184  		swap(page1, page2);
185  
186  	lock_page(page1);
187  	if (page1 != page2)
188  		lock_page(page2);
189  }
190  
/* Unlock a pair of pages; a page passed twice is unlocked only once. */
static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
{
	unlock_page(page1);
	if (page2 == page1)
		return;
	unlock_page(page2);
}
198  
199  /*
200   * Compare extents of two files to see if they are the same.
201   * Caller must have locked both inodes to prevent write races.
202   */
vfs_dedupe_file_range_compare(struct inode * src,loff_t srcoff,struct inode * dest,loff_t destoff,loff_t len,bool * is_same)203  static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
204  					 struct inode *dest, loff_t destoff,
205  					 loff_t len, bool *is_same)
206  {
207  	loff_t src_poff;
208  	loff_t dest_poff;
209  	void *src_addr;
210  	void *dest_addr;
211  	struct page *src_page;
212  	struct page *dest_page;
213  	loff_t cmp_len;
214  	bool same;
215  	int error;
216  
217  	error = -EINVAL;
218  	same = true;
219  	while (len) {
220  		src_poff = srcoff & (PAGE_SIZE - 1);
221  		dest_poff = destoff & (PAGE_SIZE - 1);
222  		cmp_len = min(PAGE_SIZE - src_poff,
223  			      PAGE_SIZE - dest_poff);
224  		cmp_len = min(cmp_len, len);
225  		if (cmp_len <= 0)
226  			goto out_error;
227  
228  		src_page = vfs_dedupe_get_page(src, srcoff);
229  		if (IS_ERR(src_page)) {
230  			error = PTR_ERR(src_page);
231  			goto out_error;
232  		}
233  		dest_page = vfs_dedupe_get_page(dest, destoff);
234  		if (IS_ERR(dest_page)) {
235  			error = PTR_ERR(dest_page);
236  			put_page(src_page);
237  			goto out_error;
238  		}
239  
240  		vfs_lock_two_pages(src_page, dest_page);
241  
242  		/*
243  		 * Now that we've locked both pages, make sure they're still
244  		 * mapped to the file data we're interested in.  If not,
245  		 * someone is invalidating pages on us and we lose.
246  		 */
247  		if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
248  		    src_page->mapping != src->i_mapping ||
249  		    dest_page->mapping != dest->i_mapping) {
250  			same = false;
251  			goto unlock;
252  		}
253  
254  		src_addr = kmap_atomic(src_page);
255  		dest_addr = kmap_atomic(dest_page);
256  
257  		flush_dcache_page(src_page);
258  		flush_dcache_page(dest_page);
259  
260  		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
261  			same = false;
262  
263  		kunmap_atomic(dest_addr);
264  		kunmap_atomic(src_addr);
265  unlock:
266  		vfs_unlock_two_pages(src_page, dest_page);
267  		put_page(dest_page);
268  		put_page(src_page);
269  
270  		if (!same)
271  			break;
272  
273  		srcoff += cmp_len;
274  		destoff += cmp_len;
275  		len -= cmp_len;
276  	}
277  
278  	*is_same = same;
279  	return 0;
280  
281  out_error:
282  	return error;
283  }
284  
285  /*
286   * Check that the two inodes are eligible for cloning, the ranges make
287   * sense, and then flush all dirty data.  Caller must ensure that the
288   * inodes have been locked against any other modifications.
289   *
290   * If there's an error, then the usual negative error code is returned.
291   * Otherwise returns 0 with *len set to the request length.
292   */
generic_remap_file_range_prep(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,loff_t * len,unsigned int remap_flags)293  int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
294  				  struct file *file_out, loff_t pos_out,
295  				  loff_t *len, unsigned int remap_flags)
296  {
297  	struct inode *inode_in = file_inode(file_in);
298  	struct inode *inode_out = file_inode(file_out);
299  	bool same_inode = (inode_in == inode_out);
300  	int ret;
301  
302  	/* Don't touch certain kinds of inodes */
303  	if (IS_IMMUTABLE(inode_out))
304  		return -EPERM;
305  
306  	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
307  		return -ETXTBSY;
308  
309  	/* Don't reflink dirs, pipes, sockets... */
310  	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
311  		return -EISDIR;
312  	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
313  		return -EINVAL;
314  
315  	/* Zero length dedupe exits immediately; reflink goes to EOF. */
316  	if (*len == 0) {
317  		loff_t isize = i_size_read(inode_in);
318  
319  		if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
320  			return 0;
321  		if (pos_in > isize)
322  			return -EINVAL;
323  		*len = isize - pos_in;
324  		if (*len == 0)
325  			return 0;
326  	}
327  
328  	/* Check that we don't violate system file offset limits. */
329  	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
330  			remap_flags);
331  	if (ret)
332  		return ret;
333  
334  	/* Wait for the completion of any pending IOs on both files */
335  	inode_dio_wait(inode_in);
336  	if (!same_inode)
337  		inode_dio_wait(inode_out);
338  
339  	ret = filemap_write_and_wait_range(inode_in->i_mapping,
340  			pos_in, pos_in + *len - 1);
341  	if (ret)
342  		return ret;
343  
344  	ret = filemap_write_and_wait_range(inode_out->i_mapping,
345  			pos_out, pos_out + *len - 1);
346  	if (ret)
347  		return ret;
348  
349  	/*
350  	 * Check that the extents are the same.
351  	 */
352  	if (remap_flags & REMAP_FILE_DEDUP) {
353  		bool		is_same = false;
354  
355  		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
356  				inode_out, pos_out, *len, &is_same);
357  		if (ret)
358  			return ret;
359  		if (!is_same)
360  			return -EBADE;
361  	}
362  
363  	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
364  			remap_flags);
365  	if (ret)
366  		return ret;
367  
368  	/* If can't alter the file contents, we're done. */
369  	if (!(remap_flags & REMAP_FILE_DEDUP))
370  		ret = file_modified(file_out);
371  
372  	return ret;
373  }
374  EXPORT_SYMBOL(generic_remap_file_range_prep);
375  
/*
 * Clone @len bytes from @file_in at @pos_in into @file_out at @pos_out via
 * the source file's ->remap_file_range() handler.
 *
 * Must not be called with REMAP_FILE_DEDUP set (warns once if it is).
 * Returns whatever the handler returns on success, or a negative errno:
 * -EXDEV if the files live on different superblocks, -EOPNOTSUPP if the
 * source has no handler, or the error from one of the preliminary checks.
 */
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
			   struct file *file_out, loff_t pos_out,
			   loff_t len, unsigned int remap_flags)
{
	loff_t cloned;

	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);

	/*
	 * The FICLONE/FICLONERANGE ioctls insist on a common mount for the
	 * source and destination, but sharing a filesystem is all that is
	 * really needed.
	 */
	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
		return -EXDEV;

	cloned = generic_file_rw_checks(file_in, file_out);
	if (cloned < 0)
		return cloned;

	if (!file_in->f_op->remap_file_range)
		return -EOPNOTSUPP;

	/* Source is verified for reading, destination for writing. */
	cloned = remap_verify_area(file_in, pos_in, len, false);
	if (!cloned)
		cloned = remap_verify_area(file_out, pos_out, len, true);
	if (cloned)
		return cloned;

	cloned = file_in->f_op->remap_file_range(file_in, pos_in,
			file_out, pos_out, len, remap_flags);
	if (cloned < 0)
		return cloned;

	/* Success: emit the fsnotify access/modify events. */
	fsnotify_access(file_in);
	fsnotify_modify(file_out);
	return cloned;
}
EXPORT_SYMBOL(do_clone_file_range);
417  
/*
 * Wrapper around do_clone_file_range() that brackets the call with
 * file_start_write()/file_end_write() on the destination file.  The return
 * value is passed through unchanged.
 */
loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
			    struct file *file_out, loff_t pos_out,
			    loff_t len, unsigned int remap_flags)
{
	loff_t cloned;

	file_start_write(file_out);
	cloned = do_clone_file_range(file_in, pos_in, file_out, pos_out,
				     len, remap_flags);
	file_end_write(file_out);

	return cloned;
}
EXPORT_SYMBOL(vfs_clone_file_range);
432  
433  /* Check whether we are allowed to dedupe the destination file */
allow_file_dedupe(struct file * file)434  static bool allow_file_dedupe(struct file *file)
435  {
436  	if (capable(CAP_SYS_ADMIN))
437  		return true;
438  	if (file->f_mode & FMODE_WRITE)
439  		return true;
440  	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
441  		return true;
442  	if (!inode_permission(file_inode(file), MAY_WRITE))
443  		return true;
444  	return false;
445  }
446  
/*
 * Dedupe @len bytes of @src_file at @src_pos against @dst_file at @dst_pos.
 *
 * Only REMAP_FILE_DEDUP and REMAP_FILE_CAN_SHORTEN are expected in
 * @remap_flags (anything else triggers a one-time warning); REMAP_FILE_DEDUP
 * is always added before the handler is called.  The whole operation runs
 * between mnt_want_write_file() and mnt_drop_write_file() on the
 * destination.
 *
 * Returns the destination handler's ->remap_file_range() result, 0 for a
 * zero-length request, or a negative errno: -EPERM if the caller may not
 * dedupe into the destination, -EXDEV if the files are on different mounts,
 * -EISDIR for a directory destination, -EINVAL if the destination has no
 * handler, or the error from the preliminary checks.
 */
loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
				 struct file *dst_file, loff_t dst_pos,
				 loff_t len, unsigned int remap_flags)
{
	loff_t ret;

	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
				     REMAP_FILE_CAN_SHORTEN));

	ret = mnt_want_write_file(dst_file);
	if (ret)
		return ret;

	ret = remap_verify_area(dst_file, dst_pos, len, true);
	if (ret < 0)
		goto out_drop_write;

	if (!allow_file_dedupe(dst_file)) {
		ret = -EPERM;
		goto out_drop_write;
	}

	if (src_file->f_path.mnt != dst_file->f_path.mnt) {
		ret = -EXDEV;
		goto out_drop_write;
	}

	if (S_ISDIR(file_inode(dst_file)->i_mode)) {
		ret = -EISDIR;
		goto out_drop_write;
	}

	if (!dst_file->f_op->remap_file_range) {
		ret = -EINVAL;
		goto out_drop_write;
	}

	/* An empty request succeeds without calling into the filesystem. */
	ret = 0;
	if (len != 0)
		ret = dst_file->f_op->remap_file_range(src_file, src_pos,
				dst_file, dst_pos, len,
				remap_flags | REMAP_FILE_DEDUP);

out_drop_write:
	mnt_drop_write_file(dst_file);

	return ret;
}
EXPORT_SYMBOL(vfs_dedupe_file_range_one);
493  
vfs_dedupe_file_range(struct file * file,struct file_dedupe_range * same)494  int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
495  {
496  	struct file_dedupe_range_info *info;
497  	struct inode *src = file_inode(file);
498  	u64 off;
499  	u64 len;
500  	int i;
501  	int ret;
502  	u16 count = same->dest_count;
503  	loff_t deduped;
504  
505  	if (!(file->f_mode & FMODE_READ))
506  		return -EINVAL;
507  
508  	if (same->reserved1 || same->reserved2)
509  		return -EINVAL;
510  
511  	off = same->src_offset;
512  	len = same->src_length;
513  
514  	if (S_ISDIR(src->i_mode))
515  		return -EISDIR;
516  
517  	if (!S_ISREG(src->i_mode))
518  		return -EINVAL;
519  
520  	if (!file->f_op->remap_file_range)
521  		return -EOPNOTSUPP;
522  
523  	ret = remap_verify_area(file, off, len, false);
524  	if (ret < 0)
525  		return ret;
526  	ret = 0;
527  
528  	if (off + len > i_size_read(src))
529  		return -EINVAL;
530  
531  	/* Arbitrary 1G limit on a single dedupe request, can be raised. */
532  	len = min_t(u64, len, 1 << 30);
533  
534  	/* pre-format output fields to sane values */
535  	for (i = 0; i < count; i++) {
536  		same->info[i].bytes_deduped = 0ULL;
537  		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
538  	}
539  
540  	for (i = 0, info = same->info; i < count; i++, info++) {
541  		struct fd dst_fd = fdget(info->dest_fd);
542  		struct file *dst_file = dst_fd.file;
543  
544  		if (!dst_file) {
545  			info->status = -EBADF;
546  			goto next_loop;
547  		}
548  
549  		if (info->reserved) {
550  			info->status = -EINVAL;
551  			goto next_fdput;
552  		}
553  
554  		deduped = vfs_dedupe_file_range_one(file, off, dst_file,
555  						    info->dest_offset, len,
556  						    REMAP_FILE_CAN_SHORTEN);
557  		if (deduped == -EBADE)
558  			info->status = FILE_DEDUPE_RANGE_DIFFERS;
559  		else if (deduped < 0)
560  			info->status = deduped;
561  		else
562  			info->bytes_deduped = len;
563  
564  next_fdput:
565  		fdput(dst_fd);
566  next_loop:
567  		if (fatal_signal_pending(current))
568  			break;
569  	}
570  	return ret;
571  }
572  EXPORT_SYMBOL(vfs_dedupe_file_range);
573