• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/fs/read_write.c
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  */
7 
8 #include <linux/slab.h>
9 #include <linux/stat.h>
10 #include <linux/sched/xacct.h>
11 #include <linux/fcntl.h>
12 #include <linux/file.h>
13 #include <linux/uio.h>
14 #include <linux/fsnotify.h>
15 #include <linux/security.h>
16 #include <linux/export.h>
17 #include <linux/syscalls.h>
18 #include <linux/pagemap.h>
19 #include <linux/splice.h>
20 #include <linux/compat.h>
21 #include <linux/mount.h>
22 #include <linux/fs.h>
23 #include "internal.h"
24 
25 #include <linux/uaccess.h>
26 #include <asm/unistd.h>
27 
28 const struct file_operations generic_ro_fops = {
29 	.llseek		= generic_file_llseek,
30 	.read_iter	= generic_file_read_iter,
31 	.mmap		= generic_file_readonly_mmap,
32 	.splice_read	= generic_file_splice_read,
33 };
34 
35 EXPORT_SYMBOL(generic_ro_fops);
36 
unsigned_offsets(struct file * file)37 static inline bool unsigned_offsets(struct file *file)
38 {
39 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
40 }
41 
42 /**
43  * vfs_setpos - update the file offset for lseek
44  * @file:	file structure in question
45  * @offset:	file offset to seek to
46  * @maxsize:	maximum file size
47  *
48  * This is a low-level filesystem helper for updating the file offset to
49  * the value specified by @offset if the given offset is valid and it is
50  * not equal to the current file offset.
51  *
52  * Return the specified offset on success and -EINVAL on invalid offset.
53  */
vfs_setpos(struct file * file,loff_t offset,loff_t maxsize)54 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
55 {
56 	if (offset < 0 && !unsigned_offsets(file))
57 		return -EINVAL;
58 	if (offset > maxsize)
59 		return -EINVAL;
60 
61 	if (offset != file->f_pos) {
62 		file->f_pos = offset;
63 		file->f_version = 0;
64 	}
65 	return offset;
66 }
67 EXPORT_SYMBOL(vfs_setpos);
68 
69 /**
70  * generic_file_llseek_size - generic llseek implementation for regular files
71  * @file:	file structure to seek on
72  * @offset:	file offset to seek to
73  * @whence:	type of seek
74  * @size:	max size of this file in file system
75  * @eof:	offset used for SEEK_END position
76  *
77  * This is a variant of generic_file_llseek that allows passing in a custom
78  * maximum file size and a custom EOF position, for e.g. hashed directories
79  *
80  * Synchronization:
81  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
82  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
83  * read/writes behave like SEEK_SET against seeks.
84  */
85 loff_t
generic_file_llseek_size(struct file * file,loff_t offset,int whence,loff_t maxsize,loff_t eof)86 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
87 		loff_t maxsize, loff_t eof)
88 {
89 	switch (whence) {
90 	case SEEK_END:
91 		offset += eof;
92 		break;
93 	case SEEK_CUR:
94 		/*
95 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
96 		 * position-querying operation.  Avoid rewriting the "same"
97 		 * f_pos value back to the file because a concurrent read(),
98 		 * write() or lseek() might have altered it
99 		 */
100 		if (offset == 0)
101 			return file->f_pos;
102 		/*
103 		 * f_lock protects against read/modify/write race with other
104 		 * SEEK_CURs. Note that parallel writes and reads behave
105 		 * like SEEK_SET.
106 		 */
107 		spin_lock(&file->f_lock);
108 		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
109 		spin_unlock(&file->f_lock);
110 		return offset;
111 	case SEEK_DATA:
112 		/*
113 		 * In the generic case the entire file is data, so as long as
114 		 * offset isn't at the end of the file then the offset is data.
115 		 */
116 		if ((unsigned long long)offset >= eof)
117 			return -ENXIO;
118 		break;
119 	case SEEK_HOLE:
120 		/*
121 		 * There is a virtual hole at the end of the file, so as long as
122 		 * offset isn't i_size or larger, return i_size.
123 		 */
124 		if ((unsigned long long)offset >= eof)
125 			return -ENXIO;
126 		offset = eof;
127 		break;
128 	}
129 
130 	return vfs_setpos(file, offset, maxsize);
131 }
132 EXPORT_SYMBOL(generic_file_llseek_size);
133 
134 /**
135  * generic_file_llseek - generic llseek implementation for regular files
136  * @file:	file structure to seek on
137  * @offset:	file offset to seek to
138  * @whence:	type of seek
139  *
140  * This is a generic implemenation of ->llseek useable for all normal local
141  * filesystems.  It just updates the file offset to the value specified by
142  * @offset and @whence.
143  */
generic_file_llseek(struct file * file,loff_t offset,int whence)144 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145 {
146 	struct inode *inode = file->f_mapping->host;
147 
148 	return generic_file_llseek_size(file, offset, whence,
149 					inode->i_sb->s_maxbytes,
150 					i_size_read(inode));
151 }
152 EXPORT_SYMBOL(generic_file_llseek);
153 
154 /**
155  * fixed_size_llseek - llseek implementation for fixed-sized devices
156  * @file:	file structure to seek on
157  * @offset:	file offset to seek to
158  * @whence:	type of seek
159  * @size:	size of the file
160  *
161  */
fixed_size_llseek(struct file * file,loff_t offset,int whence,loff_t size)162 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163 {
164 	switch (whence) {
165 	case SEEK_SET: case SEEK_CUR: case SEEK_END:
166 		return generic_file_llseek_size(file, offset, whence,
167 						size, size);
168 	default:
169 		return -EINVAL;
170 	}
171 }
172 EXPORT_SYMBOL(fixed_size_llseek);
173 
174 /**
175  * no_seek_end_llseek - llseek implementation for fixed-sized devices
176  * @file:	file structure to seek on
177  * @offset:	file offset to seek to
178  * @whence:	type of seek
179  *
180  */
no_seek_end_llseek(struct file * file,loff_t offset,int whence)181 loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182 {
183 	switch (whence) {
184 	case SEEK_SET: case SEEK_CUR:
185 		return generic_file_llseek_size(file, offset, whence,
186 						OFFSET_MAX, 0);
187 	default:
188 		return -EINVAL;
189 	}
190 }
191 EXPORT_SYMBOL(no_seek_end_llseek);
192 
193 /**
194  * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195  * @file:	file structure to seek on
196  * @offset:	file offset to seek to
197  * @whence:	type of seek
198  * @size:	maximal offset allowed
199  *
200  */
no_seek_end_llseek_size(struct file * file,loff_t offset,int whence,loff_t size)201 loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202 {
203 	switch (whence) {
204 	case SEEK_SET: case SEEK_CUR:
205 		return generic_file_llseek_size(file, offset, whence,
206 						size, 0);
207 	default:
208 		return -EINVAL;
209 	}
210 }
211 EXPORT_SYMBOL(no_seek_end_llseek_size);
212 
213 /**
214  * noop_llseek - No Operation Performed llseek implementation
215  * @file:	file structure to seek on
216  * @offset:	file offset to seek to
217  * @whence:	type of seek
218  *
219  * This is an implementation of ->llseek useable for the rare special case when
220  * userspace expects the seek to succeed but the (device) file is actually not
221  * able to perform the seek. In this case you use noop_llseek() instead of
222  * falling back to the default implementation of ->llseek.
223  */
noop_llseek(struct file * file,loff_t offset,int whence)224 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
225 {
226 	return file->f_pos;
227 }
228 EXPORT_SYMBOL(noop_llseek);
229 
/*
 * llseek method for files that cannot seek: always fails with -ESPIPE,
 * whatever the arguments.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
235 
/**
 * default_llseek - llseek implementation performed under the inode lock
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * Computes the new position from @offset and @whence while holding the
 * inode lock, then stores it in @file->f_pos (resetting @file->f_version)
 * if it differs from the current position.
 *
 * The inode size is always obtained through i_size_read(), and SEEK_HOLE
 * takes a single snapshot of it so that the bounds check and the returned
 * position agree.
 *
 * Return: the new offset; -ENXIO for SEEK_DATA/SEEK_HOLE at or beyond the
 * file size; -EINVAL if the resulting offset is negative and the file does
 * not use unsigned offsets.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t isize;
	loff_t retval;

	inode_lock(inode);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		/* Pure position query: report f_pos without rewriting it. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is data.
		 */
		if (offset >= i_size_read(inode)) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so as long
		 * as offset isn't i_size or larger, return i_size.
		 */
		isize = i_size_read(inode);
		if (offset >= isize) {
			retval = -ENXIO;
			goto out;
		}
		offset = isize;
		break;
	}

	retval = -EINVAL;
	if (offset >= 0 || unsigned_offsets(file)) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);
290 
/*
 * Dispatch a seek to the file's ->llseek method.  no_llseek() is used when
 * the file lacks FMODE_LSEEK or provides no ->llseek method.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t (*seek)(struct file *, loff_t, int) = no_llseek;

	if ((file->f_mode & FMODE_LSEEK) && file->f_op->llseek)
		seek = file->f_op->llseek;

	return seek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);
303 
/*
 * In-kernel body of lseek(2).
 *
 * Returns the new offset, -EBADF if @fd has no file, -EINVAL if @whence is
 * above SEEK_MAX, or -EOVERFLOW if the resulting loff_t does not fit in an
 * off_t.
 */
off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	struct fd f = fdget_pos(fd);
	off_t retval = -EINVAL;
	loff_t res;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX)
		goto out;

	res = vfs_llseek(f.file, offset, whence);
	retval = res;
	/* LFS: should only happen on 32 bit platforms */
	if ((loff_t)retval != res)
		retval = -EOVERFLOW;
out:
	fdput_pos(f);
	return retval;
}
321 
/* lseek system call: all the work is done by ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
326 
#ifdef CONFIG_COMPAT
/* Compat lseek system call: takes a compat_off_t and defers to ksys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif
333 
334 #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
SYSCALL_DEFINE5(llseek,unsigned int,fd,unsigned long,offset_high,unsigned long,offset_low,loff_t __user *,result,unsigned int,whence)335 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
336 		unsigned long, offset_low, loff_t __user *, result,
337 		unsigned int, whence)
338 {
339 	int retval;
340 	struct fd f = fdget_pos(fd);
341 	loff_t offset;
342 
343 	if (!f.file)
344 		return -EBADF;
345 
346 	retval = -EINVAL;
347 	if (whence > SEEK_MAX)
348 		goto out_putf;
349 
350 	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
351 			whence);
352 
353 	retval = (int)offset;
354 	if (offset >= 0) {
355 		retval = -EFAULT;
356 		if (!copy_to_user(result, &offset, sizeof(offset)))
357 			retval = 0;
358 	}
359 out_putf:
360 	fdput_pos(f);
361 	return retval;
362 }
363 #endif
364 
/*
 * Validate a read or write of @count bytes at *@ppos on @file.
 *
 * @ppos may be NULL, in which case only the @count check and the security
 * hook apply.  Returns -EINVAL for a @count that is negative as ssize_t or
 * for an offset range that is negative/wraps on a file without unsigned
 * offsets, -EOVERFLOW when a negative (unsigned-mode) offset plus @count
 * would wrap, a negative error from the mandatory-lock check, or otherwise
 * the result of security_file_permission().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode = file_inode(file);
	loff_t pos;
	int err;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	/*
	 * ranged mandatory locking does not apply to streams - it makes sense
	 * only for files where position has a meaning.
	 */
	if (!ppos)
		goto permission;

	pos = *ppos;
	if (unlikely(pos < 0)) {
		if (!unsigned_offsets(file))
			return -EINVAL;
		if (count >= -pos) /* both values are in 0..LLONG_MAX */
			return -EOVERFLOW;
	} else if (unlikely((loff_t) (pos + count) < 0)) {
		if (!unsigned_offsets(file))
			return -EINVAL;
	}

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		err = locks_mandatory_area(inode, file, pos, pos + count - 1,
				read_write == READ ? F_RDLCK : F_WRLCK);
		if (err < 0)
			return err;
	}

permission:
	return security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
}
402 
new_sync_read(struct file * filp,char __user * buf,size_t len,loff_t * ppos)403 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
404 {
405 	struct iovec iov = { .iov_base = buf, .iov_len = len };
406 	struct kiocb kiocb;
407 	struct iov_iter iter;
408 	ssize_t ret;
409 
410 	init_sync_kiocb(&kiocb, filp);
411 	kiocb.ki_pos = (ppos ? *ppos : 0);
412 	iov_iter_init(&iter, READ, &iov, 1, len);
413 
414 	ret = call_read_iter(filp, &kiocb, &iter);
415 	BUG_ON(ret == -EIOCBQUEUED);
416 	if (ppos)
417 		*ppos = kiocb.ki_pos;
418 	return ret;
419 }
420 
__vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)421 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
422 		   loff_t *pos)
423 {
424 	if (file->f_op->read)
425 		return file->f_op->read(file, buf, count, pos);
426 	else if (file->f_op->read_iter)
427 		return new_sync_read(file, buf, count, pos);
428 	else
429 		return -EINVAL;
430 }
431 
kernel_read(struct file * file,void * buf,size_t count,loff_t * pos)432 ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
433 {
434 	mm_segment_t old_fs;
435 	ssize_t result;
436 
437 	old_fs = get_fs();
438 	set_fs(KERNEL_DS);
439 	/* The cast to a user pointer is valid due to the set_fs() */
440 	result = vfs_read(file, (void __user *)buf, count, pos);
441 	set_fs(old_fs);
442 	return result;
443 }
444 EXPORT_SYMBOL(kernel_read);
445 
vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)446 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
447 {
448 	ssize_t ret;
449 
450 	if (!(file->f_mode & FMODE_READ))
451 		return -EBADF;
452 	if (!(file->f_mode & FMODE_CAN_READ))
453 		return -EINVAL;
454 	if (unlikely(!access_ok(buf, count)))
455 		return -EFAULT;
456 
457 	ret = rw_verify_area(READ, file, pos, count);
458 	if (!ret) {
459 		if (count > MAX_RW_COUNT)
460 			count =  MAX_RW_COUNT;
461 		ret = __vfs_read(file, buf, count, pos);
462 		if (ret > 0) {
463 			fsnotify_access(file);
464 			add_rchar(current, ret);
465 		}
466 		inc_syscr(current);
467 	}
468 
469 	return ret;
470 }
471 
472 EXPORT_SYMBOL(vfs_read);
473 
new_sync_write(struct file * filp,const char __user * buf,size_t len,loff_t * ppos)474 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
475 {
476 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
477 	struct kiocb kiocb;
478 	struct iov_iter iter;
479 	ssize_t ret;
480 
481 	init_sync_kiocb(&kiocb, filp);
482 	kiocb.ki_pos = (ppos ? *ppos : 0);
483 	iov_iter_init(&iter, WRITE, &iov, 1, len);
484 
485 	ret = call_write_iter(filp, &kiocb, &iter);
486 	BUG_ON(ret == -EIOCBQUEUED);
487 	if (ret > 0 && ppos)
488 		*ppos = kiocb.ki_pos;
489 	return ret;
490 }
491 
__vfs_write(struct file * file,const char __user * p,size_t count,loff_t * pos)492 static ssize_t __vfs_write(struct file *file, const char __user *p,
493 			   size_t count, loff_t *pos)
494 {
495 	if (file->f_op->write)
496 		return file->f_op->write(file, p, count, pos);
497 	else if (file->f_op->write_iter)
498 		return new_sync_write(file, p, count, pos);
499 	else
500 		return -EINVAL;
501 }
502 
__kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)503 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
504 {
505 	mm_segment_t old_fs;
506 	const char __user *p;
507 	ssize_t ret;
508 
509 	if (!(file->f_mode & FMODE_CAN_WRITE))
510 		return -EINVAL;
511 
512 	old_fs = get_fs();
513 	set_fs(KERNEL_DS);
514 	p = (__force const char __user *)buf;
515 	if (count > MAX_RW_COUNT)
516 		count =  MAX_RW_COUNT;
517 	ret = __vfs_write(file, p, count, pos);
518 	set_fs(old_fs);
519 	if (ret > 0) {
520 		fsnotify_modify(file);
521 		add_wchar(current, ret);
522 	}
523 	inc_syscw(current);
524 	return ret;
525 }
526 EXPORT_SYMBOL(__kernel_write);
527 
kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)528 ssize_t kernel_write(struct file *file, const void *buf, size_t count,
529 			    loff_t *pos)
530 {
531 	mm_segment_t old_fs;
532 	ssize_t res;
533 
534 	old_fs = get_fs();
535 	set_fs(KERNEL_DS);
536 	/* The cast to a user pointer is valid due to the set_fs() */
537 	res = vfs_write(file, (__force const char __user *)buf, count, pos);
538 	set_fs(old_fs);
539 
540 	return res;
541 }
542 EXPORT_SYMBOL(kernel_write);
543 
vfs_write(struct file * file,const char __user * buf,size_t count,loff_t * pos)544 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
545 {
546 	ssize_t ret;
547 
548 	if (!(file->f_mode & FMODE_WRITE))
549 		return -EBADF;
550 	if (!(file->f_mode & FMODE_CAN_WRITE))
551 		return -EINVAL;
552 	if (unlikely(!access_ok(buf, count)))
553 		return -EFAULT;
554 
555 	ret = rw_verify_area(WRITE, file, pos, count);
556 	if (!ret) {
557 		if (count > MAX_RW_COUNT)
558 			count =  MAX_RW_COUNT;
559 		file_start_write(file);
560 		ret = __vfs_write(file, buf, count, pos);
561 		if (ret > 0) {
562 			fsnotify_modify(file);
563 			add_wchar(current, ret);
564 		}
565 		inc_syscw(current);
566 		file_end_write(file);
567 	}
568 
569 	return ret;
570 }
571 EXPORT_SYMBOL(vfs_write);
572 
573 /* file_ppos returns &file->f_pos or NULL if file is stream */
file_ppos(struct file * file)574 static inline loff_t *file_ppos(struct file *file)
575 {
576 	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
577 }
578 
ksys_read(unsigned int fd,char __user * buf,size_t count)579 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
580 {
581 	struct fd f = fdget_pos(fd);
582 	ssize_t ret = -EBADF;
583 
584 	if (f.file) {
585 		loff_t pos, *ppos = file_ppos(f.file);
586 		if (ppos) {
587 			pos = *ppos;
588 			ppos = &pos;
589 		}
590 		ret = vfs_read(f.file, buf, count, ppos);
591 		if (ret >= 0 && ppos)
592 			f.file->f_pos = pos;
593 		fdput_pos(f);
594 	}
595 	return ret;
596 }
597 
/* read system call: all the work is done by ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}
602 
ksys_write(unsigned int fd,const char __user * buf,size_t count)603 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
604 {
605 	struct fd f = fdget_pos(fd);
606 	ssize_t ret = -EBADF;
607 
608 	if (f.file) {
609 		loff_t pos, *ppos = file_ppos(f.file);
610 		if (ppos) {
611 			pos = *ppos;
612 			ppos = &pos;
613 		}
614 		ret = vfs_write(f.file, buf, count, ppos);
615 		if (ret >= 0 && ppos)
616 			f.file->f_pos = pos;
617 		fdput_pos(f);
618 	}
619 
620 	return ret;
621 }
622 
/* write system call: all the work is done by ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}
628 
ksys_pread64(unsigned int fd,char __user * buf,size_t count,loff_t pos)629 ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
630 		     loff_t pos)
631 {
632 	struct fd f;
633 	ssize_t ret = -EBADF;
634 
635 	if (pos < 0)
636 		return -EINVAL;
637 
638 	f = fdget(fd);
639 	if (f.file) {
640 		ret = -ESPIPE;
641 		if (f.file->f_mode & FMODE_PREAD)
642 			ret = vfs_read(f.file, buf, count, &pos);
643 		fdput(f);
644 	}
645 
646 	return ret;
647 }
648 
/* pread64 system call: all the work is done by ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}
654 
ksys_pwrite64(unsigned int fd,const char __user * buf,size_t count,loff_t pos)655 ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
656 		      size_t count, loff_t pos)
657 {
658 	struct fd f;
659 	ssize_t ret = -EBADF;
660 
661 	if (pos < 0)
662 		return -EINVAL;
663 
664 	f = fdget(fd);
665 	if (f.file) {
666 		ret = -ESPIPE;
667 		if (f.file->f_mode & FMODE_PWRITE)
668 			ret = vfs_write(f.file, buf, count, &pos);
669 		fdput(f);
670 	}
671 
672 	return ret;
673 }
674 
/* pwrite64 system call: all the work is done by ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	return ksys_pwrite64(fd, buf, count, pos);
}
680 
do_iter_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)681 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
682 		loff_t *ppos, int type, rwf_t flags)
683 {
684 	struct kiocb kiocb;
685 	ssize_t ret;
686 
687 	init_sync_kiocb(&kiocb, filp);
688 	ret = kiocb_set_rw_flags(&kiocb, flags);
689 	if (ret)
690 		return ret;
691 	kiocb.ki_pos = (ppos ? *ppos : 0);
692 
693 	if (type == READ)
694 		ret = call_read_iter(filp, &kiocb, iter);
695 	else
696 		ret = call_write_iter(filp, &kiocb, iter);
697 	BUG_ON(ret == -EIOCBQUEUED);
698 	if (ppos)
699 		*ppos = kiocb.ki_pos;
700 	return ret;
701 }
702 
703 /* Do it by hand, with file-ops */
do_loop_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)704 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
705 		loff_t *ppos, int type, rwf_t flags)
706 {
707 	ssize_t ret = 0;
708 
709 	if (flags & ~RWF_HIPRI)
710 		return -EOPNOTSUPP;
711 
712 	while (iov_iter_count(iter)) {
713 		struct iovec iovec = iov_iter_iovec(iter);
714 		ssize_t nr;
715 
716 		if (type == READ) {
717 			nr = filp->f_op->read(filp, iovec.iov_base,
718 					      iovec.iov_len, ppos);
719 		} else {
720 			nr = filp->f_op->write(filp, iovec.iov_base,
721 					       iovec.iov_len, ppos);
722 		}
723 
724 		if (nr < 0) {
725 			if (!ret)
726 				ret = nr;
727 			break;
728 		}
729 		ret += nr;
730 		if (nr != iovec.iov_len)
731 			break;
732 		iov_iter_advance(iter, nr);
733 	}
734 
735 	return ret;
736 }
737 
738 /**
739  * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
740  *     into the kernel and check that it is valid.
741  *
742  * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
743  * @uvector: Pointer to the userspace array.
744  * @nr_segs: Number of elements in userspace array.
745  * @fast_segs: Number of elements in @fast_pointer.
746  * @fast_pointer: Pointer to (usually small on-stack) kernel array.
747  * @ret_pointer: (output parameter) Pointer to a variable that will point to
748  *     either @fast_pointer, a newly allocated kernel array, or NULL,
749  *     depending on which array was used.
750  *
751  * This function copies an array of &struct iovec of @nr_segs from
752  * userspace into the kernel and checks that each element is valid (e.g.
753  * it does not point to a kernel address or cause overflow by being too
754  * large, etc.).
755  *
756  * As an optimization, the caller may provide a pointer to a small
757  * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
758  * (the size of this array, or 0 if unused, should be given in @fast_segs).
759  *
760  * @ret_pointer will always point to the array that was used, so the
761  * caller must take care not to call kfree() on it e.g. in case the
762  * @fast_pointer array was used and it was allocated on the stack.
763  *
764  * Return: The total number of bytes covered by the iovec array on success
765  *   or a negative error code on error.
766  */
rw_copy_check_uvector(int type,const struct iovec __user * uvector,unsigned long nr_segs,unsigned long fast_segs,struct iovec * fast_pointer,struct iovec ** ret_pointer)767 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
768 			      unsigned long nr_segs, unsigned long fast_segs,
769 			      struct iovec *fast_pointer,
770 			      struct iovec **ret_pointer)
771 {
772 	unsigned long seg;
773 	ssize_t ret;
774 	struct iovec *iov = fast_pointer;
775 
776 	/*
777 	 * SuS says "The readv() function *may* fail if the iovcnt argument
778 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
779 	 * traditionally returned zero for zero segments, so...
780 	 */
781 	if (nr_segs == 0) {
782 		ret = 0;
783 		goto out;
784 	}
785 
786 	/*
787 	 * First get the "struct iovec" from user memory and
788 	 * verify all the pointers
789 	 */
790 	if (nr_segs > UIO_MAXIOV) {
791 		ret = -EINVAL;
792 		goto out;
793 	}
794 	if (nr_segs > fast_segs) {
795 		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
796 		if (iov == NULL) {
797 			ret = -ENOMEM;
798 			goto out;
799 		}
800 	}
801 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
802 		ret = -EFAULT;
803 		goto out;
804 	}
805 
806 	/*
807 	 * According to the Single Unix Specification we should return EINVAL
808 	 * if an element length is < 0 when cast to ssize_t or if the
809 	 * total length would overflow the ssize_t return value of the
810 	 * system call.
811 	 *
812 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
813 	 * overflow case.
814 	 */
815 	ret = 0;
816 	for (seg = 0; seg < nr_segs; seg++) {
817 		void __user *buf = iov[seg].iov_base;
818 		ssize_t len = (ssize_t)iov[seg].iov_len;
819 
820 		/* see if we we're about to use an invalid len or if
821 		 * it's about to overflow ssize_t */
822 		if (len < 0) {
823 			ret = -EINVAL;
824 			goto out;
825 		}
826 		if (type >= 0
827 		    && unlikely(!access_ok(buf, len))) {
828 			ret = -EFAULT;
829 			goto out;
830 		}
831 		if (len > MAX_RW_COUNT - ret) {
832 			len = MAX_RW_COUNT - ret;
833 			iov[seg].iov_len = len;
834 		}
835 		ret += len;
836 	}
837 out:
838 	*ret_pointer = iov;
839 	return ret;
840 }
841 
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of rw_copy_check_uvector(): read @nr_segs
 * &struct compat_iovec elements from userspace, validate them and store
 * them as native &struct iovec entries.
 *
 * *@ret_pointer is set to @fast_pointer on entry and changed to the
 * allocated array only once an allocation (needed when @nr_segs exceeds
 * @fast_segs) has succeeded.
 *
 * Returns the total length (capped at MAX_RW_COUNT) or a negative error.
 */
ssize_t compat_rw_copy_check_uvector(int type,
		const struct compat_iovec __user *uvector, unsigned long nr_segs,
		unsigned long fast_segs, struct iovec *fast_pointer,
		struct iovec **ret_pointer)
{
	struct iovec *iov = *ret_pointer = fast_pointer;
	compat_ssize_t tot_len = 0;
	int seg;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}."  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return 0;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL)
			return -ENOMEM;
	}
	*ret_pointer = iov;

	if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
		return -EFAULT;

	/*
	 * Single unix specification:
	 * We should -EINVAL if an element length is not >= 0 and fitting an
	 * ssize_t.
	 *
	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
	 * no overflow possibility.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		if (__get_user(len, &uvector[seg].iov_len) ||
		    __get_user(buf, &uvector[seg].iov_base))
			return -EFAULT;

		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
			return -EINVAL;

		if (type >= 0 && !access_ok(compat_ptr(buf), len))
			return -EFAULT;

		/* Trim this segment so the running total stays in bounds. */
		if (len > MAX_RW_COUNT - tot_len)
			len = MAX_RW_COUNT - tot_len;
		tot_len += len;

		iov[seg].iov_base = compat_ptr(buf);
		iov[seg].iov_len = (compat_size_t) len;
	}

	return tot_len;
}
#endif
916 
do_iter_read(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)917 static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
918 		loff_t *pos, rwf_t flags)
919 {
920 	size_t tot_len;
921 	ssize_t ret = 0;
922 
923 	if (!(file->f_mode & FMODE_READ))
924 		return -EBADF;
925 	if (!(file->f_mode & FMODE_CAN_READ))
926 		return -EINVAL;
927 
928 	tot_len = iov_iter_count(iter);
929 	if (!tot_len)
930 		goto out;
931 	ret = rw_verify_area(READ, file, pos, tot_len);
932 	if (ret < 0)
933 		return ret;
934 
935 	if (file->f_op->read_iter)
936 		ret = do_iter_readv_writev(file, iter, pos, READ, flags);
937 	else
938 		ret = do_loop_readv_writev(file, iter, pos, READ, flags);
939 out:
940 	if (ret >= 0)
941 		fsnotify_access(file);
942 	return ret;
943 }
944 
vfs_iter_read(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)945 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
946 		rwf_t flags)
947 {
948 	if (!file->f_op->read_iter)
949 		return -EINVAL;
950 	return do_iter_read(file, iter, ppos, flags);
951 }
952 EXPORT_SYMBOL(vfs_iter_read);
953 
do_iter_write(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)954 static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
955 		loff_t *pos, rwf_t flags)
956 {
957 	size_t tot_len;
958 	ssize_t ret = 0;
959 
960 	if (!(file->f_mode & FMODE_WRITE))
961 		return -EBADF;
962 	if (!(file->f_mode & FMODE_CAN_WRITE))
963 		return -EINVAL;
964 
965 	tot_len = iov_iter_count(iter);
966 	if (!tot_len)
967 		return 0;
968 	ret = rw_verify_area(WRITE, file, pos, tot_len);
969 	if (ret < 0)
970 		return ret;
971 
972 	if (file->f_op->write_iter)
973 		ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
974 	else
975 		ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
976 	if (ret > 0)
977 		fsnotify_modify(file);
978 	return ret;
979 }
980 
vfs_iter_write(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)981 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
982 		rwf_t flags)
983 {
984 	if (!file->f_op->write_iter)
985 		return -EINVAL;
986 	return do_iter_write(file, iter, ppos, flags);
987 }
988 EXPORT_SYMBOL(vfs_iter_write);
989 
vfs_readv(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)990 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
991 		  unsigned long vlen, loff_t *pos, rwf_t flags)
992 {
993 	struct iovec iovstack[UIO_FASTIOV];
994 	struct iovec *iov = iovstack;
995 	struct iov_iter iter;
996 	ssize_t ret;
997 
998 	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
999 	if (ret >= 0) {
1000 		ret = do_iter_read(file, &iter, pos, flags);
1001 		kfree(iov);
1002 	}
1003 
1004 	return ret;
1005 }
1006 
vfs_writev(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)1007 static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
1008 		   unsigned long vlen, loff_t *pos, rwf_t flags)
1009 {
1010 	struct iovec iovstack[UIO_FASTIOV];
1011 	struct iovec *iov = iovstack;
1012 	struct iov_iter iter;
1013 	ssize_t ret;
1014 
1015 	ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1016 	if (ret >= 0) {
1017 		file_start_write(file);
1018 		ret = do_iter_write(file, &iter, pos, flags);
1019 		file_end_write(file);
1020 		kfree(iov);
1021 	}
1022 	return ret;
1023 }
1024 
do_readv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)1025 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1026 			unsigned long vlen, rwf_t flags)
1027 {
1028 	struct fd f = fdget_pos(fd);
1029 	ssize_t ret = -EBADF;
1030 
1031 	if (f.file) {
1032 		loff_t pos, *ppos = file_ppos(f.file);
1033 		if (ppos) {
1034 			pos = *ppos;
1035 			ppos = &pos;
1036 		}
1037 		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
1038 		if (ret >= 0 && ppos)
1039 			f.file->f_pos = pos;
1040 		fdput_pos(f);
1041 	}
1042 
1043 	if (ret > 0)
1044 		add_rchar(current, ret);
1045 	inc_syscr(current);
1046 	return ret;
1047 }
1048 
do_writev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)1049 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1050 			 unsigned long vlen, rwf_t flags)
1051 {
1052 	struct fd f = fdget_pos(fd);
1053 	ssize_t ret = -EBADF;
1054 
1055 	if (f.file) {
1056 		loff_t pos, *ppos = file_ppos(f.file);
1057 		if (ppos) {
1058 			pos = *ppos;
1059 			ppos = &pos;
1060 		}
1061 		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
1062 		if (ret >= 0 && ppos)
1063 			f.file->f_pos = pos;
1064 		fdput_pos(f);
1065 	}
1066 
1067 	if (ret > 0)
1068 		add_wchar(current, ret);
1069 	inc_syscw(current);
1070 	return ret;
1071 }
1072 
pos_from_hilo(unsigned long high,unsigned long low)1073 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1074 {
1075 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
1076 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1077 }
1078 
do_preadv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)1079 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1080 			 unsigned long vlen, loff_t pos, rwf_t flags)
1081 {
1082 	struct fd f;
1083 	ssize_t ret = -EBADF;
1084 
1085 	if (pos < 0)
1086 		return -EINVAL;
1087 
1088 	f = fdget(fd);
1089 	if (f.file) {
1090 		ret = -ESPIPE;
1091 		if (f.file->f_mode & FMODE_PREAD)
1092 			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1093 		fdput(f);
1094 	}
1095 
1096 	if (ret > 0)
1097 		add_rchar(current, ret);
1098 	inc_syscr(current);
1099 	return ret;
1100 }
1101 
do_pwritev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)1102 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1103 			  unsigned long vlen, loff_t pos, rwf_t flags)
1104 {
1105 	struct fd f;
1106 	ssize_t ret = -EBADF;
1107 
1108 	if (pos < 0)
1109 		return -EINVAL;
1110 
1111 	f = fdget(fd);
1112 	if (f.file) {
1113 		ret = -ESPIPE;
1114 		if (f.file->f_mode & FMODE_PWRITE)
1115 			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1116 		fdput(f);
1117 	}
1118 
1119 	if (ret > 0)
1120 		add_wchar(current, ret);
1121 	inc_syscw(current);
1122 	return ret;
1123 }
1124 
/* readv: vectored read through do_readv() with no per-call flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const rwf_t no_flags = 0;

	return do_readv(fd, vec, vlen, no_flags);
}
1130 
/* writev: vectored write through do_writev() with no per-call flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const rwf_t no_flags = 0;

	return do_writev(fd, vec, vlen, no_flags);
}
1136 
/*
 * preadv: positional vectored read.  The offset arrives split into
 * @pos_l/@pos_h and is reassembled by pos_from_hilo(); flags are 0.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1144 
/*
 * preadv2: like preadv but with caller-supplied @flags.  An offset of -1
 * selects the non-positional path (do_readv()); any other offset goes to
 * do_preadv().
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos != -1)
		return do_preadv(fd, vec, vlen, pos, flags);

	return do_readv(fd, vec, vlen, flags);
}
1156 
/*
 * pwritev: positional vectored write.  The offset arrives split into
 * @pos_l/@pos_h and is reassembled by pos_from_hilo(); flags are 0.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1164 
/*
 * pwritev2: like pwritev but with caller-supplied @flags.  An offset of -1
 * selects the non-positional path (do_writev()); any other offset goes to
 * do_pwritev().
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos != -1)
		return do_pwritev(fd, vec, vlen, pos, flags);

	return do_writev(fd, vec, vlen, flags);
}
1176 
1177 #ifdef CONFIG_COMPAT
compat_readv(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)1178 static size_t compat_readv(struct file *file,
1179 			   const struct compat_iovec __user *vec,
1180 			   unsigned long vlen, loff_t *pos, rwf_t flags)
1181 {
1182 	struct iovec iovstack[UIO_FASTIOV];
1183 	struct iovec *iov = iovstack;
1184 	struct iov_iter iter;
1185 	ssize_t ret;
1186 
1187 	ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1188 	if (ret >= 0) {
1189 		ret = do_iter_read(file, &iter, pos, flags);
1190 		kfree(iov);
1191 	}
1192 	if (ret > 0)
1193 		add_rchar(current, ret);
1194 	inc_syscr(current);
1195 	return ret;
1196 }
1197 
do_compat_readv(compat_ulong_t fd,const struct compat_iovec __user * vec,compat_ulong_t vlen,rwf_t flags)1198 static size_t do_compat_readv(compat_ulong_t fd,
1199 				 const struct compat_iovec __user *vec,
1200 				 compat_ulong_t vlen, rwf_t flags)
1201 {
1202 	struct fd f = fdget_pos(fd);
1203 	ssize_t ret;
1204 	loff_t pos;
1205 
1206 	if (!f.file)
1207 		return -EBADF;
1208 	pos = f.file->f_pos;
1209 	ret = compat_readv(f.file, vec, vlen, &pos, flags);
1210 	if (ret >= 0)
1211 		f.file->f_pos = pos;
1212 	fdput_pos(f);
1213 	return ret;
1214 
1215 }
1216 
/* Compat readv: vectored read through do_compat_readv() with flags == 0. */
COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen)
{
	const rwf_t no_flags = 0;

	return do_compat_readv(fd, vec, vlen, no_flags);
}
1223 
/*
 * Compat positional vectored read backend.
 *
 * Fails with -EINVAL for a negative @pos, -EBADF for a bad descriptor and
 * -ESPIPE when the file lacks FMODE_PREAD; otherwise returns whatever
 * compat_readv() returns.  The file's f_pos is not modified here.
 */
static long do_compat_preadv64(unsigned long fd,
				  const struct compat_iovec __user *vec,
				  unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t nread;
	struct fd f;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PREAD)
		nread = compat_readv(f.file, vec, vlen, &pos, flags);
	else
		nread = -ESPIPE;

	fdput(f);
	return nread;
}
1242 
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Compat preadv64: the offset is passed as a single loff_t; flags are 0. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
#endif
1251 
/*
 * Compat preadv: the 64-bit offset is passed as two 32-bit halves and
 * reassembled here; flags are 0.
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
1260 
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2: single loff_t offset plus caller flags.  An offset of
 * -1 selects the non-positional path.
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_preadv64(fd, vec, vlen, pos, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
#endif
1272 
/*
 * Compat preadv2: offset passed as two 32-bit halves plus caller flags.
 * A reassembled offset of -1 selects the non-positional path.
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		rwf_t, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	if (pos != -1)
		return do_compat_preadv64(fd, vec, vlen, pos, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
1285 
compat_writev(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)1286 static size_t compat_writev(struct file *file,
1287 			    const struct compat_iovec __user *vec,
1288 			    unsigned long vlen, loff_t *pos, rwf_t flags)
1289 {
1290 	struct iovec iovstack[UIO_FASTIOV];
1291 	struct iovec *iov = iovstack;
1292 	struct iov_iter iter;
1293 	ssize_t ret;
1294 
1295 	ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1296 	if (ret >= 0) {
1297 		file_start_write(file);
1298 		ret = do_iter_write(file, &iter, pos, flags);
1299 		file_end_write(file);
1300 		kfree(iov);
1301 	}
1302 	if (ret > 0)
1303 		add_wchar(current, ret);
1304 	inc_syscw(current);
1305 	return ret;
1306 }
1307 
do_compat_writev(compat_ulong_t fd,const struct compat_iovec __user * vec,compat_ulong_t vlen,rwf_t flags)1308 static size_t do_compat_writev(compat_ulong_t fd,
1309 				  const struct compat_iovec __user* vec,
1310 				  compat_ulong_t vlen, rwf_t flags)
1311 {
1312 	struct fd f = fdget_pos(fd);
1313 	ssize_t ret;
1314 	loff_t pos;
1315 
1316 	if (!f.file)
1317 		return -EBADF;
1318 	pos = f.file->f_pos;
1319 	ret = compat_writev(f.file, vec, vlen, &pos, flags);
1320 	if (ret >= 0)
1321 		f.file->f_pos = pos;
1322 	fdput_pos(f);
1323 	return ret;
1324 }
1325 
/* Compat writev: vectored write through do_compat_writev() with flags == 0. */
COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen)
{
	const rwf_t no_flags = 0;

	return do_compat_writev(fd, vec, vlen, no_flags);
}
1332 
/*
 * Compat positional vectored write backend.
 *
 * Fails with -EINVAL for a negative @pos, -EBADF for a bad descriptor and
 * -ESPIPE when the file lacks FMODE_PWRITE; otherwise returns whatever
 * compat_writev() returns.  The file's f_pos is not modified here.
 */
static long do_compat_pwritev64(unsigned long fd,
				   const struct compat_iovec __user *vec,
				   unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t nwritten;
	struct fd f;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PWRITE)
		nwritten = compat_writev(f.file, vec, vlen, &pos, flags);
	else
		nwritten = -ESPIPE;

	fdput(f);
	return nwritten;
}
1351 
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Compat pwritev64: the offset is passed as a single loff_t; flags are 0. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
#endif
1360 
/*
 * Compat pwritev: the 64-bit offset is passed as two 32-bit halves and
 * reassembled here; flags are 0.
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
1369 
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2: single loff_t offset plus caller flags.  An offset of
 * -1 selects the non-positional path.
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
#endif
1381 
/*
 * Compat pwritev2: offset passed as two 32-bit halves plus caller flags.
 * A reassembled offset of -1 selects the non-positional path.
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	if (pos != -1)
		return do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
1393 
1394 #endif
1395 
do_sendfile(int out_fd,int in_fd,loff_t * ppos,size_t count,loff_t max)1396 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1397 		  	   size_t count, loff_t max)
1398 {
1399 	struct fd in, out;
1400 	struct inode *in_inode, *out_inode;
1401 	loff_t pos;
1402 	loff_t out_pos;
1403 	ssize_t retval;
1404 	int fl;
1405 
1406 	/*
1407 	 * Get input file, and verify that it is ok..
1408 	 */
1409 	retval = -EBADF;
1410 	in = fdget(in_fd);
1411 	if (!in.file)
1412 		goto out;
1413 	if (!(in.file->f_mode & FMODE_READ))
1414 		goto fput_in;
1415 	retval = -ESPIPE;
1416 	if (!ppos) {
1417 		pos = in.file->f_pos;
1418 	} else {
1419 		pos = *ppos;
1420 		if (!(in.file->f_mode & FMODE_PREAD))
1421 			goto fput_in;
1422 	}
1423 	retval = rw_verify_area(READ, in.file, &pos, count);
1424 	if (retval < 0)
1425 		goto fput_in;
1426 	if (count > MAX_RW_COUNT)
1427 		count =  MAX_RW_COUNT;
1428 
1429 	/*
1430 	 * Get output file, and verify that it is ok..
1431 	 */
1432 	retval = -EBADF;
1433 	out = fdget(out_fd);
1434 	if (!out.file)
1435 		goto fput_in;
1436 	if (!(out.file->f_mode & FMODE_WRITE))
1437 		goto fput_out;
1438 	in_inode = file_inode(in.file);
1439 	out_inode = file_inode(out.file);
1440 	out_pos = out.file->f_pos;
1441 	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1442 	if (retval < 0)
1443 		goto fput_out;
1444 
1445 	if (!max)
1446 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1447 
1448 	if (unlikely(pos + count > max)) {
1449 		retval = -EOVERFLOW;
1450 		if (pos >= max)
1451 			goto fput_out;
1452 		count = max - pos;
1453 	}
1454 
1455 	fl = 0;
1456 #if 0
1457 	/*
1458 	 * We need to debate whether we can enable this or not. The
1459 	 * man page documents EAGAIN return for the output at least,
1460 	 * and the application is arguably buggy if it doesn't expect
1461 	 * EAGAIN on a non-blocking file descriptor.
1462 	 */
1463 	if (in.file->f_flags & O_NONBLOCK)
1464 		fl = SPLICE_F_NONBLOCK;
1465 #endif
1466 	file_start_write(out.file);
1467 	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1468 	file_end_write(out.file);
1469 
1470 	if (retval > 0) {
1471 		add_rchar(current, retval);
1472 		add_wchar(current, retval);
1473 		fsnotify_access(in.file);
1474 		fsnotify_modify(out.file);
1475 		out.file->f_pos = out_pos;
1476 		if (ppos)
1477 			*ppos = pos;
1478 		else
1479 			in.file->f_pos = pos;
1480 	}
1481 
1482 	inc_syscr(current);
1483 	inc_syscw(current);
1484 	if (pos > max)
1485 		retval = -EOVERFLOW;
1486 
1487 fput_out:
1488 	fdput(out);
1489 fput_in:
1490 	fdput(in);
1491 out:
1492 	return retval;
1493 }
1494 
/*
 * sendfile with an off_t user offset.
 *
 * Without @offset the input file's own position is used and no limit is
 * passed.  With @offset the transfer is capped at MAX_NON_LFS and the
 * resulting position is copied back to user space regardless of the
 * transfer result; a fault on either copy yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(off, offset)))
		return -EFAULT;

	pos = off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1513 
/*
 * sendfile with a loff_t user offset.
 *
 * Without @offset the input file's own position is used.  With @offset the
 * position is read from user space, handed to do_sendfile() with no explicit
 * limit, and copied back regardless of the transfer result; a fault on
 * either copy yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1530 
1531 #ifdef CONFIG_COMPAT
/*
 * Compat sendfile with a compat_off_t user offset.
 *
 * Without @offset the input file's own position is used and no limit is
 * passed.  With @offset the transfer is capped at MAX_NON_LFS and the
 * resulting position is copied back to user space regardless of the
 * transfer result; a fault on either copy yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(off, offset)))
		return -EFAULT;

	pos = off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1551 
/*
 * Compat sendfile with a compat_loff_t user offset.
 *
 * Without @offset the input file's own position is used.  With @offset the
 * position is read from user space, handed to do_sendfile() with no explicit
 * limit, and copied back regardless of the transfer result; a fault on
 * either copy yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1569 #endif
1570 
1571 /**
1572  * generic_copy_file_range - copy data between two files
1573  * @file_in:	file structure to read from
1574  * @pos_in:	file offset to read from
1575  * @file_out:	file structure to write data to
1576  * @pos_out:	file offset to write data to
1577  * @len:	amount of data to copy
1578  * @flags:	copy flags
1579  *
1580  * This is a generic filesystem helper to copy data from one file to another.
1581  * It has no constraints on the source or destination file owners - the files
1582  * can belong to different superblocks and different filesystem types. Short
1583  * copies are allowed.
1584  *
1585  * This should be called from the @file_out filesystem, as per the
1586  * ->copy_file_range() method.
1587  *
1588  * Returns the number of bytes copied or a negative error indicating the
1589  * failure.
1590  */
1591 
generic_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1592 ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1593 				struct file *file_out, loff_t pos_out,
1594 				size_t len, unsigned int flags)
1595 {
1596 	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1597 				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1598 }
1599 EXPORT_SYMBOL(generic_copy_file_range);
1600 
do_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1601 static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1602 				  struct file *file_out, loff_t pos_out,
1603 				  size_t len, unsigned int flags)
1604 {
1605 	/*
1606 	 * Although we now allow filesystems to handle cross sb copy, passing
1607 	 * a file of the wrong filesystem type to filesystem driver can result
1608 	 * in an attempt to dereference the wrong type of ->private_data, so
1609 	 * avoid doing that until we really have a good reason.  NFS defines
1610 	 * several different file_system_type structures, but they all end up
1611 	 * using the same ->copy_file_range() function pointer.
1612 	 */
1613 	if (file_out->f_op->copy_file_range &&
1614 	    file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
1615 		return file_out->f_op->copy_file_range(file_in, pos_in,
1616 						       file_out, pos_out,
1617 						       len, flags);
1618 
1619 	return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1620 				       flags);
1621 }
1622 
1623 /*
1624  * copy_file_range() differs from regular file read and write in that it
1625  * specifically allows return partial success.  When it does so is up to
1626  * the copy_file_range method.
1627  */
vfs_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1628 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1629 			    struct file *file_out, loff_t pos_out,
1630 			    size_t len, unsigned int flags)
1631 {
1632 	ssize_t ret;
1633 
1634 	if (flags != 0)
1635 		return -EINVAL;
1636 
1637 	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1638 				       flags);
1639 	if (unlikely(ret))
1640 		return ret;
1641 
1642 	ret = rw_verify_area(READ, file_in, &pos_in, len);
1643 	if (unlikely(ret))
1644 		return ret;
1645 
1646 	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1647 	if (unlikely(ret))
1648 		return ret;
1649 
1650 	if (len == 0)
1651 		return 0;
1652 
1653 	file_start_write(file_out);
1654 
1655 	/*
1656 	 * Try cloning first, this is supported by more file systems, and
1657 	 * more efficient if both clone and copy are supported (e.g. NFS).
1658 	 */
1659 	if (file_in->f_op->remap_file_range &&
1660 	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1661 		loff_t cloned;
1662 
1663 		cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1664 				file_out, pos_out,
1665 				min_t(loff_t, MAX_RW_COUNT, len),
1666 				REMAP_FILE_CAN_SHORTEN);
1667 		if (cloned > 0) {
1668 			ret = cloned;
1669 			goto done;
1670 		}
1671 	}
1672 
1673 	ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1674 				flags);
1675 	WARN_ON_ONCE(ret == -EOPNOTSUPP);
1676 done:
1677 	if (ret > 0) {
1678 		fsnotify_access(file_in);
1679 		add_rchar(current, ret);
1680 		fsnotify_modify(file_out);
1681 		add_wchar(current, ret);
1682 	}
1683 
1684 	inc_syscr(current);
1685 	inc_syscw(current);
1686 
1687 	file_end_write(file_out);
1688 
1689 	return ret;
1690 }
1691 EXPORT_SYMBOL(vfs_copy_file_range);
1692 
/*
 * copy_file_range syscall entry.
 *
 * For each side, a non-NULL user offset pointer supplies the start position
 * and receives the advanced position after a successful copy; a NULL pointer
 * means the file's own f_pos is used and advanced instead.  Faults while
 * reading or writing the user offsets turn the result into -EFAULT.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	loff_t pos_in;
	loff_t pos_out;
	struct fd f_in;
	struct fd f_out;
	ssize_t ret;

	f_in = fdget(fd_in);
	if (!f_in.file)
		return -EBADF;

	f_out = fdget(fd_out);
	if (!f_out.file) {
		ret = -EBADF;
		goto put_in;
	}

	if (!off_in) {
		pos_in = f_in.file->f_pos;
	} else if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) {
		ret = -EFAULT;
		goto put_out;
	}

	if (!off_out) {
		pos_out = f_out.file->f_pos;
	} else if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) {
		ret = -EFAULT;
		goto put_out;
	}

	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
				  flags);
	if (ret <= 0)
		goto put_out;

	/* Progress was made: publish the advanced positions on both sides. */
	pos_in += ret;
	pos_out += ret;

	if (!off_in)
		f_in.file->f_pos = pos_in;
	else if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
		ret = -EFAULT;

	if (!off_out)
		f_out.file->f_pos = pos_out;
	else if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
		ret = -EFAULT;

put_out:
	fdput(f_out);
put_in:
	fdput(f_in);
	return ret;
}
1754 
/*
 * Validate a remap request covering @len bytes starting at @pos of @file.
 *
 * Negative offsets or lengths, and ranges whose end would exceed OFFSET_MAX,
 * are rejected with -EINVAL.  If the inode has a lock context and mandatory
 * locking applies, the range is checked with locks_mandatory_area() (a
 * write lock if @write, a read lock otherwise) and a negative result is
 * returned.  Otherwise the result of security_file_permission() is returned.
 */
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
			     bool write)
{
	struct inode *inode = file_inode(file);

	if (unlikely(pos < 0 || len < 0))
		return -EINVAL;

	/*
	 * Both values are non-negative here, so compare @len against the
	 * remaining headroom instead of adding first: pos + len could
	 * overflow loff_t before the result is ever tested.
	 */
	if (unlikely(len > OFFSET_MAX - pos))
		return -EINVAL;

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		/* A zero length is checked as "from pos to OFFSET_MAX". */
		loff_t end = len ? pos + len - 1 : OFFSET_MAX;
		int retval;

		retval = locks_mandatory_area(inode, file, pos, end,
				write ? F_WRLCK : F_RDLCK);
		if (retval < 0)
			return retval;
	}

	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
1778 /*
1779  * Ensure that we don't remap a partial EOF block in the middle of something
1780  * else.  Assume that the offsets have already been checked for block
1781  * alignment.
1782  *
1783  * For deduplication we always scale down to the previous block because we
1784  * can't meaningfully compare post-EOF contents.
1785  *
1786  * For clone we only link a partial EOF block above the destination file's EOF.
1787  *
1788  * Shorten the request if possible.
1789  */
generic_remap_check_len(struct inode * inode_in,struct inode * inode_out,loff_t pos_out,loff_t * len,unsigned int remap_flags)1790 static int generic_remap_check_len(struct inode *inode_in,
1791 				   struct inode *inode_out,
1792 				   loff_t pos_out,
1793 				   loff_t *len,
1794 				   unsigned int remap_flags)
1795 {
1796 	u64 blkmask = i_blocksize(inode_in) - 1;
1797 	loff_t new_len = *len;
1798 
1799 	if ((*len & blkmask) == 0)
1800 		return 0;
1801 
1802 	if ((remap_flags & REMAP_FILE_DEDUP) ||
1803 	    pos_out + *len < i_size_read(inode_out))
1804 		new_len &= ~blkmask;
1805 
1806 	if (new_len == *len)
1807 		return 0;
1808 
1809 	if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
1810 		*len = new_len;
1811 		return 0;
1812 	}
1813 
1814 	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
1815 }
1816 
1817 /* Read a page's worth of file data into the page cache. */
vfs_dedupe_get_page(struct inode * inode,loff_t offset)1818 static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1819 {
1820 	struct page *page;
1821 
1822 	page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
1823 	if (IS_ERR(page))
1824 		return page;
1825 	if (!PageUptodate(page)) {
1826 		put_page(page);
1827 		return ERR_PTR(-EIO);
1828 	}
1829 	return page;
1830 }
1831 
1832 /*
1833  * Lock two pages, ensuring that we lock in offset order if the pages are from
1834  * the same file.
1835  */
vfs_lock_two_pages(struct page * page1,struct page * page2)1836 static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1837 {
1838 	/* Always lock in order of increasing index. */
1839 	if (page1->index > page2->index)
1840 		swap(page1, page2);
1841 
1842 	lock_page(page1);
1843 	if (page1 != page2)
1844 		lock_page(page2);
1845 }
1846 
/* Unlock two pages; when both arguments are one page, unlock it only once. */
static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
{
	unlock_page(page1);
	if (page1 == page2)
		return;

	unlock_page(page2);
}
1854 
1855 /*
1856  * Compare extents of two files to see if they are the same.
1857  * Caller must have locked both inodes to prevent write races.
1858  */
vfs_dedupe_file_range_compare(struct inode * src,loff_t srcoff,struct inode * dest,loff_t destoff,loff_t len,bool * is_same)1859 static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1860 					 struct inode *dest, loff_t destoff,
1861 					 loff_t len, bool *is_same)
1862 {
1863 	loff_t src_poff;
1864 	loff_t dest_poff;
1865 	void *src_addr;
1866 	void *dest_addr;
1867 	struct page *src_page;
1868 	struct page *dest_page;
1869 	loff_t cmp_len;
1870 	bool same;
1871 	int error;
1872 
1873 	error = -EINVAL;
1874 	same = true;
1875 	while (len) {
1876 		src_poff = srcoff & (PAGE_SIZE - 1);
1877 		dest_poff = destoff & (PAGE_SIZE - 1);
1878 		cmp_len = min(PAGE_SIZE - src_poff,
1879 			      PAGE_SIZE - dest_poff);
1880 		cmp_len = min(cmp_len, len);
1881 		if (cmp_len <= 0)
1882 			goto out_error;
1883 
1884 		src_page = vfs_dedupe_get_page(src, srcoff);
1885 		if (IS_ERR(src_page)) {
1886 			error = PTR_ERR(src_page);
1887 			goto out_error;
1888 		}
1889 		dest_page = vfs_dedupe_get_page(dest, destoff);
1890 		if (IS_ERR(dest_page)) {
1891 			error = PTR_ERR(dest_page);
1892 			put_page(src_page);
1893 			goto out_error;
1894 		}
1895 
1896 		vfs_lock_two_pages(src_page, dest_page);
1897 
1898 		/*
1899 		 * Now that we've locked both pages, make sure they're still
1900 		 * mapped to the file data we're interested in.  If not,
1901 		 * someone is invalidating pages on us and we lose.
1902 		 */
1903 		if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
1904 		    src_page->mapping != src->i_mapping ||
1905 		    dest_page->mapping != dest->i_mapping) {
1906 			same = false;
1907 			goto unlock;
1908 		}
1909 
1910 		src_addr = kmap_atomic(src_page);
1911 		dest_addr = kmap_atomic(dest_page);
1912 
1913 		flush_dcache_page(src_page);
1914 		flush_dcache_page(dest_page);
1915 
1916 		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1917 			same = false;
1918 
1919 		kunmap_atomic(dest_addr);
1920 		kunmap_atomic(src_addr);
1921 unlock:
1922 		vfs_unlock_two_pages(src_page, dest_page);
1923 		put_page(dest_page);
1924 		put_page(src_page);
1925 
1926 		if (!same)
1927 			break;
1928 
1929 		srcoff += cmp_len;
1930 		destoff += cmp_len;
1931 		len -= cmp_len;
1932 	}
1933 
1934 	*is_same = same;
1935 	return 0;
1936 
1937 out_error:
1938 	return error;
1939 }
1940 
1941 /*
1942  * Check that the two inodes are eligible for cloning, the ranges make
1943  * sense, and then flush all dirty data.  Caller must ensure that the
1944  * inodes have been locked against any other modifications.
1945  *
1946  * If there's an error, then the usual negative error code is returned.
1947  * Otherwise returns 0 with *len set to the request length.
1948  */
generic_remap_file_range_prep(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,loff_t * len,unsigned int remap_flags)1949 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
1950 				  struct file *file_out, loff_t pos_out,
1951 				  loff_t *len, unsigned int remap_flags)
1952 {
1953 	struct inode *inode_in = file_inode(file_in);
1954 	struct inode *inode_out = file_inode(file_out);
1955 	bool same_inode = (inode_in == inode_out);
1956 	int ret;
1957 
1958 	/* Don't touch certain kinds of inodes */
1959 	if (IS_IMMUTABLE(inode_out))
1960 		return -EPERM;
1961 
1962 	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1963 		return -ETXTBSY;
1964 
1965 	/* Don't reflink dirs, pipes, sockets... */
1966 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1967 		return -EISDIR;
1968 	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1969 		return -EINVAL;
1970 
1971 	/* Zero length dedupe exits immediately; reflink goes to EOF. */
1972 	if (*len == 0) {
1973 		loff_t isize = i_size_read(inode_in);
1974 
1975 		if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
1976 			return 0;
1977 		if (pos_in > isize)
1978 			return -EINVAL;
1979 		*len = isize - pos_in;
1980 		if (*len == 0)
1981 			return 0;
1982 	}
1983 
1984 	/* Check that we don't violate system file offset limits. */
1985 	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
1986 			remap_flags);
1987 	if (ret)
1988 		return ret;
1989 
1990 	/* Wait for the completion of any pending IOs on both files */
1991 	inode_dio_wait(inode_in);
1992 	if (!same_inode)
1993 		inode_dio_wait(inode_out);
1994 
1995 	ret = filemap_write_and_wait_range(inode_in->i_mapping,
1996 			pos_in, pos_in + *len - 1);
1997 	if (ret)
1998 		return ret;
1999 
2000 	ret = filemap_write_and_wait_range(inode_out->i_mapping,
2001 			pos_out, pos_out + *len - 1);
2002 	if (ret)
2003 		return ret;
2004 
2005 	/*
2006 	 * Check that the extents are the same.
2007 	 */
2008 	if (remap_flags & REMAP_FILE_DEDUP) {
2009 		bool		is_same = false;
2010 
2011 		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
2012 				inode_out, pos_out, *len, &is_same);
2013 		if (ret)
2014 			return ret;
2015 		if (!is_same)
2016 			return -EBADE;
2017 	}
2018 
2019 	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
2020 			remap_flags);
2021 	if (ret)
2022 		return ret;
2023 
2024 	/* If can't alter the file contents, we're done. */
2025 	if (!(remap_flags & REMAP_FILE_DEDUP))
2026 		ret = file_modified(file_out);
2027 
2028 	return ret;
2029 }
2030 EXPORT_SYMBOL(generic_remap_file_range_prep);
2031 
/*
 * Clone @len bytes from @file_in at @pos_in into @file_out at @pos_out by
 * handing the request to the source file's ->remap_file_range() method.
 *
 * REMAP_FILE_DEDUP must not be set in @remap_flags; doing so triggers a
 * one-time warning.
 *
 * Returns the (non-negative) value produced by ->remap_file_range() on
 * success, or a negative error code: -EXDEV when the files live on
 * different superblocks, -EOPNOTSUPP when the source has no
 * ->remap_file_range() method, or an error from one of the checks below.
 */
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
			   struct file *file_out, loff_t pos_out,
			   loff_t len, unsigned int remap_flags)
{
	struct inode *src_inode = file_inode(file_in);
	struct inode *dst_inode = file_inode(file_out);
	loff_t rc;

	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);

	/*
	 * The FICLONE/FICLONERANGE ioctls insist on a common mount for the
	 * source and destination.  In practice sharing a file system is
	 * all that is required, so that is what gets checked here.
	 */
	if (src_inode->i_sb != dst_inode->i_sb)
		return -EXDEV;

	rc = generic_file_rw_checks(file_in, file_out);
	if (rc < 0)
		return rc;

	if (!file_in->f_op->remap_file_range)
		return -EOPNOTSUPP;

	/* Verify the source range for reading, then the target for writing. */
	rc = remap_verify_area(file_in, pos_in, len, false);
	if (!rc)
		rc = remap_verify_area(file_out, pos_out, len, true);
	if (rc)
		return rc;

	rc = file_in->f_op->remap_file_range(file_in, pos_in,
					     file_out, pos_out, len,
					     remap_flags);
	if (rc >= 0) {
		/* Only a successful remap generates fsnotify events. */
		fsnotify_access(file_in);
		fsnotify_modify(file_out);
	}
	return rc;
}
EXPORT_SYMBOL(do_clone_file_range);
2073 
/*
 * Wrapper around do_clone_file_range() that brackets the operation with
 * file_start_write()/file_end_write() on the destination file.
 *
 * All arguments are passed through unchanged and the result of
 * do_clone_file_range() is returned as is.
 */
loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
			    struct file *file_out, loff_t pos_out,
			    loff_t len, unsigned int remap_flags)
{
	loff_t cloned;

	file_start_write(file_out);
	cloned = do_clone_file_range(file_in, pos_in, file_out, pos_out,
				     len, remap_flags);
	file_end_write(file_out);

	return cloned;
}
EXPORT_SYMBOL(vfs_clone_file_range);
2088 
2089 /* Check whether we are allowed to dedupe the destination file */
allow_file_dedupe(struct file * file)2090 static bool allow_file_dedupe(struct file *file)
2091 {
2092 	if (capable(CAP_SYS_ADMIN))
2093 		return true;
2094 	if (file->f_mode & FMODE_WRITE)
2095 		return true;
2096 	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
2097 		return true;
2098 	if (!inode_permission(file_inode(file), MAY_WRITE))
2099 		return true;
2100 	return false;
2101 }
2102 
/*
 * Dedupe a single destination range against a source range.
 *
 * Only REMAP_FILE_DEDUP and REMAP_FILE_CAN_SHORTEN are expected in
 * @remap_flags (anything else triggers a one-time warning);
 * REMAP_FILE_DEDUP is always added before the request is handed to the
 * destination file's ->remap_file_range() method.
 *
 * Write access to the destination's mount is taken for the duration of the
 * call and dropped again on every path that obtained it.
 *
 * Returns the value produced by ->remap_file_range(), 0 for a zero-length
 * request that passed all checks, or a negative error code:
 *   -EPERM   the caller may not dedupe into @dst_file
 *   -EXDEV   the two files are not on the same mount
 *   -EISDIR  the destination is a directory
 *   -EINVAL  the destination has no ->remap_file_range() method
 * or an error from mnt_want_write_file() / remap_verify_area().
 */
loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
				 struct file *dst_file, loff_t dst_pos,
				 loff_t len, unsigned int remap_flags)
{
	loff_t ret;

	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
				     REMAP_FILE_CAN_SHORTEN));

	ret = mnt_want_write_file(dst_file);
	if (ret)
		return ret;

	ret = remap_verify_area(dst_file, dst_pos, len, true);
	if (ret < 0)
		goto drop_write;

	/* The first failing check, in this order, decides the result. */
	if (!allow_file_dedupe(dst_file))
		ret = -EPERM;
	else if (src_file->f_path.mnt != dst_file->f_path.mnt)
		ret = -EXDEV;
	else if (S_ISDIR(file_inode(dst_file)->i_mode))
		ret = -EISDIR;
	else if (!dst_file->f_op->remap_file_range)
		ret = -EINVAL;
	else if (len == 0)
		ret = 0;	/* nothing to do once the checks have passed */
	else
		ret = dst_file->f_op->remap_file_range(src_file, src_pos,
				dst_file, dst_pos, len,
				remap_flags | REMAP_FILE_DEDUP);

drop_write:
	mnt_drop_write_file(dst_file);

	return ret;
}
EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2149 
vfs_dedupe_file_range(struct file * file,struct file_dedupe_range * same)2150 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2151 {
2152 	struct file_dedupe_range_info *info;
2153 	struct inode *src = file_inode(file);
2154 	u64 off;
2155 	u64 len;
2156 	int i;
2157 	int ret;
2158 	u16 count = same->dest_count;
2159 	loff_t deduped;
2160 
2161 	if (!(file->f_mode & FMODE_READ))
2162 		return -EINVAL;
2163 
2164 	if (same->reserved1 || same->reserved2)
2165 		return -EINVAL;
2166 
2167 	off = same->src_offset;
2168 	len = same->src_length;
2169 
2170 	if (S_ISDIR(src->i_mode))
2171 		return -EISDIR;
2172 
2173 	if (!S_ISREG(src->i_mode))
2174 		return -EINVAL;
2175 
2176 	if (!file->f_op->remap_file_range)
2177 		return -EOPNOTSUPP;
2178 
2179 	ret = remap_verify_area(file, off, len, false);
2180 	if (ret < 0)
2181 		return ret;
2182 	ret = 0;
2183 
2184 	if (off + len > i_size_read(src))
2185 		return -EINVAL;
2186 
2187 	/* Arbitrary 1G limit on a single dedupe request, can be raised. */
2188 	len = min_t(u64, len, 1 << 30);
2189 
2190 	/* pre-format output fields to sane values */
2191 	for (i = 0; i < count; i++) {
2192 		same->info[i].bytes_deduped = 0ULL;
2193 		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2194 	}
2195 
2196 	for (i = 0, info = same->info; i < count; i++, info++) {
2197 		struct fd dst_fd = fdget(info->dest_fd);
2198 		struct file *dst_file = dst_fd.file;
2199 
2200 		if (!dst_file) {
2201 			info->status = -EBADF;
2202 			goto next_loop;
2203 		}
2204 
2205 		if (info->reserved) {
2206 			info->status = -EINVAL;
2207 			goto next_fdput;
2208 		}
2209 
2210 		deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2211 						    info->dest_offset, len,
2212 						    REMAP_FILE_CAN_SHORTEN);
2213 		if (deduped == -EBADE)
2214 			info->status = FILE_DEDUPE_RANGE_DIFFERS;
2215 		else if (deduped < 0)
2216 			info->status = deduped;
2217 		else
2218 			info->bytes_deduped = len;
2219 
2220 next_fdput:
2221 		fdput(dst_fd);
2222 next_loop:
2223 		if (fatal_signal_pending(current))
2224 			break;
2225 	}
2226 	return ret;
2227 }
2228 EXPORT_SYMBOL(vfs_dedupe_file_range);
2229