• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  linux/fs/read_write.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/fsnotify.h>
13 #include <linux/security.h>
14 #include <linux/export.h>
15 #include <linux/syscalls.h>
16 #include <linux/pagemap.h>
17 #include <linux/splice.h>
18 #include <linux/compat.h>
19 #include <linux/mount.h>
20 #include <linux/fs.h>
21 #include "internal.h"
22 
23 #include <asm/uaccess.h>
24 #include <asm/unistd.h>
25 
26 const struct file_operations generic_ro_fops = {
27 	.llseek		= generic_file_llseek,
28 	.read_iter	= generic_file_read_iter,
29 	.mmap		= generic_file_readonly_mmap,
30 	.splice_read	= generic_file_splice_read,
31 };
32 
33 EXPORT_SYMBOL(generic_ro_fops);
34 
unsigned_offsets(struct file * file)35 static inline int unsigned_offsets(struct file *file)
36 {
37 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
38 }
39 
40 /**
41  * vfs_setpos - update the file offset for lseek
42  * @file:	file structure in question
43  * @offset:	file offset to seek to
44  * @maxsize:	maximum file size
45  *
46  * This is a low-level filesystem helper for updating the file offset to
47  * the value specified by @offset if the given offset is valid and it is
48  * not equal to the current file offset.
49  *
50  * Return the specified offset on success and -EINVAL on invalid offset.
51  */
vfs_setpos(struct file * file,loff_t offset,loff_t maxsize)52 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
53 {
54 	if (offset < 0 && !unsigned_offsets(file))
55 		return -EINVAL;
56 	if (offset > maxsize)
57 		return -EINVAL;
58 
59 	if (offset != file->f_pos) {
60 		file->f_pos = offset;
61 		file->f_version = 0;
62 	}
63 	return offset;
64 }
65 EXPORT_SYMBOL(vfs_setpos);
66 
67 /**
68  * generic_file_llseek_size - generic llseek implementation for regular files
69  * @file:	file structure to seek on
70  * @offset:	file offset to seek to
71  * @whence:	type of seek
72  * @size:	max size of this file in file system
73  * @eof:	offset used for SEEK_END position
74  *
75  * This is a variant of generic_file_llseek that allows passing in a custom
76  * maximum file size and a custom EOF position, for e.g. hashed directories
77  *
78  * Synchronization:
79  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
80  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
81  * read/writes behave like SEEK_SET against seeks.
82  */
83 loff_t
generic_file_llseek_size(struct file * file,loff_t offset,int whence,loff_t maxsize,loff_t eof)84 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
85 		loff_t maxsize, loff_t eof)
86 {
87 	switch (whence) {
88 	case SEEK_END:
89 		offset += eof;
90 		break;
91 	case SEEK_CUR:
92 		/*
93 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
94 		 * position-querying operation.  Avoid rewriting the "same"
95 		 * f_pos value back to the file because a concurrent read(),
96 		 * write() or lseek() might have altered it
97 		 */
98 		if (offset == 0)
99 			return file->f_pos;
100 		/*
101 		 * f_lock protects against read/modify/write race with other
102 		 * SEEK_CURs. Note that parallel writes and reads behave
103 		 * like SEEK_SET.
104 		 */
105 		spin_lock(&file->f_lock);
106 		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
107 		spin_unlock(&file->f_lock);
108 		return offset;
109 	case SEEK_DATA:
110 		/*
111 		 * In the generic case the entire file is data, so as long as
112 		 * offset isn't at the end of the file then the offset is data.
113 		 */
114 		if ((unsigned long long)offset >= eof)
115 			return -ENXIO;
116 		break;
117 	case SEEK_HOLE:
118 		/*
119 		 * There is a virtual hole at the end of the file, so as long as
120 		 * offset isn't i_size or larger, return i_size.
121 		 */
122 		if ((unsigned long long)offset >= eof)
123 			return -ENXIO;
124 		offset = eof;
125 		break;
126 	}
127 
128 	return vfs_setpos(file, offset, maxsize);
129 }
130 EXPORT_SYMBOL(generic_file_llseek_size);
131 
132 /**
133  * generic_file_llseek - generic llseek implementation for regular files
134  * @file:	file structure to seek on
135  * @offset:	file offset to seek to
136  * @whence:	type of seek
137  *
138  * This is a generic implemenation of ->llseek useable for all normal local
139  * filesystems.  It just updates the file offset to the value specified by
140  * @offset and @whence.
141  */
generic_file_llseek(struct file * file,loff_t offset,int whence)142 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
143 {
144 	struct inode *inode = file->f_mapping->host;
145 
146 	return generic_file_llseek_size(file, offset, whence,
147 					inode->i_sb->s_maxbytes,
148 					i_size_read(inode));
149 }
150 EXPORT_SYMBOL(generic_file_llseek);
151 
152 /**
153  * fixed_size_llseek - llseek implementation for fixed-sized devices
154  * @file:	file structure to seek on
155  * @offset:	file offset to seek to
156  * @whence:	type of seek
157  * @size:	size of the file
158  *
159  */
fixed_size_llseek(struct file * file,loff_t offset,int whence,loff_t size)160 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
161 {
162 	switch (whence) {
163 	case SEEK_SET: case SEEK_CUR: case SEEK_END:
164 		return generic_file_llseek_size(file, offset, whence,
165 						size, size);
166 	default:
167 		return -EINVAL;
168 	}
169 }
170 EXPORT_SYMBOL(fixed_size_llseek);
171 
172 /**
173  * no_seek_end_llseek - llseek implementation for fixed-sized devices
174  * @file:	file structure to seek on
175  * @offset:	file offset to seek to
176  * @whence:	type of seek
177  *
178  */
no_seek_end_llseek(struct file * file,loff_t offset,int whence)179 loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
180 {
181 	switch (whence) {
182 	case SEEK_SET: case SEEK_CUR:
183 		return generic_file_llseek_size(file, offset, whence,
184 						OFFSET_MAX, 0);
185 	default:
186 		return -EINVAL;
187 	}
188 }
189 EXPORT_SYMBOL(no_seek_end_llseek);
190 
191 /**
192  * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
193  * @file:	file structure to seek on
194  * @offset:	file offset to seek to
195  * @whence:	type of seek
196  * @size:	maximal offset allowed
197  *
198  */
no_seek_end_llseek_size(struct file * file,loff_t offset,int whence,loff_t size)199 loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
200 {
201 	switch (whence) {
202 	case SEEK_SET: case SEEK_CUR:
203 		return generic_file_llseek_size(file, offset, whence,
204 						size, 0);
205 	default:
206 		return -EINVAL;
207 	}
208 }
209 EXPORT_SYMBOL(no_seek_end_llseek_size);
210 
211 /**
212  * noop_llseek - No Operation Performed llseek implementation
213  * @file:	file structure to seek on
214  * @offset:	file offset to seek to
215  * @whence:	type of seek
216  *
217  * This is an implementation of ->llseek useable for the rare special case when
218  * userspace expects the seek to succeed but the (device) file is actually not
219  * able to perform the seek. In this case you use noop_llseek() instead of
220  * falling back to the default implementation of ->llseek.
221  */
noop_llseek(struct file * file,loff_t offset,int whence)222 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
223 {
224 	return file->f_pos;
225 }
226 EXPORT_SYMBOL(noop_llseek);
227 
/*
 * llseek implementation for files that cannot be seeked at all:
 * every request fails with -ESPIPE, regardless of the arguments.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
233 
/*
 * default_llseek - llseek implementation serialised by the inode lock
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * The whole operation runs under inode_lock().  The file size is always
 * obtained through i_size_read(), and only once per request, so that all
 * whence cases use the same accessor and SEEK_HOLE tests and returns the
 * same value.
 *
 * Returns the new position, -ENXIO for SEEK_DATA/SEEK_HOLE at or beyond the
 * file size, or -EINVAL when the result is negative and the file does not
 * allow unsigned offsets.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval;

	inode_lock(inode);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		/* Pure position query: report f_pos without rewriting it. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is
		 * data.
		 */
		if (offset >= i_size_read(inode)) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE: {
		/*
		 * There is a virtual hole at the end of the file, so as long
		 * as offset isn't i_size or larger, return i_size.
		 */
		loff_t size = i_size_read(inode);

		if (offset >= size) {
			retval = -ENXIO;
			goto out;
		}
		offset = size;
		break;
	}
	}

	retval = -EINVAL;
	if (offset >= 0 || unsigned_offsets(file)) {
		/* Only touch f_pos/f_version when the position changes. */
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);
288 
/*
 * Dispatch a seek request to the file's ->llseek method.  Files without
 * FMODE_LSEEK, or without an ->llseek method, are handled by no_llseek().
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t (*seek)(struct file *, loff_t, int) = NULL;

	if (file->f_mode & FMODE_LSEEK)
		seek = file->f_op->llseek;
	if (!seek)
		seek = no_llseek;

	return seek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);
301 
/*
 * lseek system call: seek on @fd and return the resulting position as an
 * off_t.  Fails with -EBADF for a bad descriptor, -EINVAL for a @whence
 * above SEEK_MAX and -EOVERFLOW when the result does not fit in off_t.
 */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	struct fd f = fdget_pos(fd);
	off_t retval = -EINVAL;
	loff_t res;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX)
		goto out;

	res = vfs_llseek(f.file, offset, whence);
	retval = res;
	/* LFS: truncation should only happen on 32 bit platforms */
	if ((loff_t)retval != res)
		retval = -EOVERFLOW;
out:
	fdput_pos(f);
	return retval;
}
319 
320 #ifdef CONFIG_COMPAT
/* Compat entry point for lseek: hands its arguments straight to sys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
325 #endif
326 
327 #ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek system call: the offset is passed as two halves (@offset_high is
 * shifted left by 32 and OR-ed with @offset_low) and the resulting position
 * is copied to the user pointer @result.
 *
 * Returns 0 on success, -EBADF for a bad descriptor, -EINVAL for a @whence
 * above SEEK_MAX, the (truncated to int) error from vfs_llseek(), or
 * -EFAULT when @result cannot be written.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	struct fd f = fdget_pos(fd);
	loff_t offset;
	int retval;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX) {
		retval = -EINVAL;
		goto out_putf;
	}

	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
			whence);

	if (offset < 0)
		retval = (int)offset;
	else if (copy_to_user(result, &offset, sizeof(offset)))
		retval = -EFAULT;
	else
		retval = 0;
out_putf:
	fdput_pos(f);
	return retval;
}
356 #endif
357 
vfs_iter_read(struct file * file,struct iov_iter * iter,loff_t * ppos)358 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
359 {
360 	struct kiocb kiocb;
361 	ssize_t ret;
362 
363 	if (!file->f_op->read_iter)
364 		return -EINVAL;
365 
366 	init_sync_kiocb(&kiocb, file);
367 	kiocb.ki_pos = *ppos;
368 
369 	iter->type |= READ;
370 	ret = file->f_op->read_iter(&kiocb, iter);
371 	BUG_ON(ret == -EIOCBQUEUED);
372 	if (ret > 0)
373 		*ppos = kiocb.ki_pos;
374 	return ret;
375 }
376 EXPORT_SYMBOL(vfs_iter_read);
377 
vfs_iter_write(struct file * file,struct iov_iter * iter,loff_t * ppos)378 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
379 {
380 	struct kiocb kiocb;
381 	ssize_t ret;
382 
383 	if (!file->f_op->write_iter)
384 		return -EINVAL;
385 
386 	init_sync_kiocb(&kiocb, file);
387 	kiocb.ki_pos = *ppos;
388 
389 	iter->type |= WRITE;
390 	ret = file->f_op->write_iter(&kiocb, iter);
391 	BUG_ON(ret == -EIOCBQUEUED);
392 	if (ret > 0)
393 		*ppos = kiocb.ki_pos;
394 	return ret;
395 }
396 EXPORT_SYMBOL(vfs_iter_write);
397 
/*
 * Validate a read or write of @count bytes at *@ppos on @file.
 *
 * Rejects counts that are negative when viewed as ssize_t, and positions or
 * ranges that are negative or wrap unless the file allows unsigned offsets.
 * When the inode has a lock context and mandatory locking applies, the
 * range is checked against mandatory locks.  Finally the security hook is
 * asked for MAY_READ or MAY_WRITE permission.
 *
 * Returns 0 (or whatever the security hook returns) on success, -EINVAL,
 * -EOVERFLOW or the mandatory-lock error otherwise.
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode = file_inode(file);
	const int reading = (read_write == READ);
	loff_t pos;
	int err;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	pos = *ppos;
	if (unlikely(pos < 0)) {
		if (!unsigned_offsets(file))
			return -EINVAL;
		if (count >= -pos) /* both values are in 0..LLONG_MAX */
			return -EOVERFLOW;
	} else if (unlikely((loff_t) (pos + count) < 0)) {
		if (!unsigned_offsets(file))
			return -EINVAL;
	}

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		err = locks_mandatory_area(inode, file, pos, pos + count - 1,
				reading ? F_RDLCK : F_WRLCK);
		if (err < 0)
			return err;
	}

	return security_file_permission(file, reading ? MAY_READ : MAY_WRITE);
}
427 
new_sync_read(struct file * filp,char __user * buf,size_t len,loff_t * ppos)428 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
429 {
430 	struct iovec iov = { .iov_base = buf, .iov_len = len };
431 	struct kiocb kiocb;
432 	struct iov_iter iter;
433 	ssize_t ret;
434 
435 	init_sync_kiocb(&kiocb, filp);
436 	kiocb.ki_pos = *ppos;
437 	iov_iter_init(&iter, READ, &iov, 1, len);
438 
439 	ret = filp->f_op->read_iter(&kiocb, &iter);
440 	BUG_ON(ret == -EIOCBQUEUED);
441 	*ppos = kiocb.ki_pos;
442 	return ret;
443 }
444 
__vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)445 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
446 		   loff_t *pos)
447 {
448 	if (file->f_op->read)
449 		return file->f_op->read(file, buf, count, pos);
450 	else if (file->f_op->read_iter)
451 		return new_sync_read(file, buf, count, pos);
452 	else
453 		return -EINVAL;
454 }
455 EXPORT_SYMBOL(__vfs_read);
456 
vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)457 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
458 {
459 	ssize_t ret;
460 
461 	if (!(file->f_mode & FMODE_READ))
462 		return -EBADF;
463 	if (!(file->f_mode & FMODE_CAN_READ))
464 		return -EINVAL;
465 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
466 		return -EFAULT;
467 
468 	ret = rw_verify_area(READ, file, pos, count);
469 	if (!ret) {
470 		if (count > MAX_RW_COUNT)
471 			count =  MAX_RW_COUNT;
472 		ret = __vfs_read(file, buf, count, pos);
473 		if (ret > 0) {
474 			fsnotify_access(file);
475 			add_rchar(current, ret);
476 		}
477 		inc_syscr(current);
478 	}
479 
480 	return ret;
481 }
482 
483 EXPORT_SYMBOL(vfs_read);
484 
new_sync_write(struct file * filp,const char __user * buf,size_t len,loff_t * ppos)485 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
486 {
487 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
488 	struct kiocb kiocb;
489 	struct iov_iter iter;
490 	ssize_t ret;
491 
492 	init_sync_kiocb(&kiocb, filp);
493 	kiocb.ki_pos = *ppos;
494 	iov_iter_init(&iter, WRITE, &iov, 1, len);
495 
496 	ret = filp->f_op->write_iter(&kiocb, &iter);
497 	BUG_ON(ret == -EIOCBQUEUED);
498 	if (ret > 0)
499 		*ppos = kiocb.ki_pos;
500 	return ret;
501 }
502 
__vfs_write(struct file * file,const char __user * p,size_t count,loff_t * pos)503 ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
504 		    loff_t *pos)
505 {
506 	if (file->f_op->write)
507 		return file->f_op->write(file, p, count, pos);
508 	else if (file->f_op->write_iter)
509 		return new_sync_write(file, p, count, pos);
510 	else
511 		return -EINVAL;
512 }
513 EXPORT_SYMBOL(__vfs_write);
514 
__kernel_write(struct file * file,const char * buf,size_t count,loff_t * pos)515 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
516 {
517 	mm_segment_t old_fs;
518 	const char __user *p;
519 	ssize_t ret;
520 
521 	if (!(file->f_mode & FMODE_CAN_WRITE))
522 		return -EINVAL;
523 
524 	old_fs = get_fs();
525 	set_fs(get_ds());
526 	p = (__force const char __user *)buf;
527 	if (count > MAX_RW_COUNT)
528 		count =  MAX_RW_COUNT;
529 	ret = __vfs_write(file, p, count, pos);
530 	set_fs(old_fs);
531 	if (ret > 0) {
532 		fsnotify_modify(file);
533 		add_wchar(current, ret);
534 	}
535 	inc_syscw(current);
536 	return ret;
537 }
538 
539 EXPORT_SYMBOL(__kernel_write);
540 
vfs_write(struct file * file,const char __user * buf,size_t count,loff_t * pos)541 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
542 {
543 	ssize_t ret;
544 
545 	if (!(file->f_mode & FMODE_WRITE))
546 		return -EBADF;
547 	if (!(file->f_mode & FMODE_CAN_WRITE))
548 		return -EINVAL;
549 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
550 		return -EFAULT;
551 
552 	ret = rw_verify_area(WRITE, file, pos, count);
553 	if (!ret) {
554 		if (count > MAX_RW_COUNT)
555 			count =  MAX_RW_COUNT;
556 		file_start_write(file);
557 		ret = __vfs_write(file, buf, count, pos);
558 		if (ret > 0) {
559 			fsnotify_modify(file);
560 			add_wchar(current, ret);
561 		}
562 		inc_syscw(current);
563 		file_end_write(file);
564 	}
565 
566 	return ret;
567 }
568 
569 EXPORT_SYMBOL(vfs_write);
570 
file_pos_read(struct file * file)571 static inline loff_t file_pos_read(struct file *file)
572 {
573 	return file->f_pos;
574 }
575 
/* Store @pos as the file's current position (f_pos). */
static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
}
580 
/*
 * read system call: read from @fd at the file's current position and store
 * the updated position back when vfs_read() did not fail.
 * Returns -EBADF for a bad descriptor, otherwise the vfs_read() result.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	loff_t pos;
	ssize_t ret;

	if (!f.file)
		return -EBADF;

	pos = file_pos_read(f.file);
	ret = vfs_read(f.file, buf, count, &pos);
	if (ret >= 0)
		file_pos_write(f.file, pos);
	fdput_pos(f);

	return ret;
}
595 
/*
 * write system call: write to @fd at the file's current position and store
 * the updated position back when vfs_write() did not fail.
 * Returns -EBADF for a bad descriptor, otherwise the vfs_write() result.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget_pos(fd);
	loff_t pos;
	ssize_t ret;

	if (!f.file)
		return -EBADF;

	pos = file_pos_read(f.file);
	ret = vfs_write(f.file, buf, count, &pos);
	if (ret >= 0)
		file_pos_write(f.file, pos);
	fdput_pos(f);

	return ret;
}
612 
/*
 * pread64 system call: read from @fd at the explicit offset @pos without
 * touching the file's own position.
 * Returns -EINVAL for a negative @pos, -EBADF for a bad descriptor and
 * -ESPIPE when the file lacks FMODE_PREAD.
 */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PREAD)
		ret = vfs_read(f.file, buf, count, &pos);
	else
		ret = -ESPIPE;
	fdput(f);

	return ret;
}
632 
/*
 * pwrite64 system call: write to @fd at the explicit offset @pos without
 * touching the file's own position.
 * Returns -EINVAL for a negative @pos, -EBADF for a bad descriptor and
 * -ESPIPE when the file lacks FMODE_PWRITE.
 */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PWRITE)
		ret = vfs_write(f.file, buf, count, &pos);
	else
		ret = -ESPIPE;
	fdput(f);

	return ret;
}
652 
/*
 * Trim an iovec array in place so that it describes at most @to bytes.
 * The segment in which the running total reaches @to has its length cut
 * down to the remainder; later segments are left untouched.
 * Returns the number of segments that remain in use (all @nr_segs when the
 * array is shorter than @to).
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long used;
	size_t consumed = 0;

	for (used = 0; used < nr_segs; used++) {
		struct iovec *cur = &iov[used];

		if (consumed + cur->iov_len >= to) {
			cur->iov_len = to - consumed;
			return used + 1;
		}
		consumed += cur->iov_len;
	}
	return used;
}
EXPORT_SYMBOL(iov_shorten);
673 
do_iter_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,int flags)674 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
675 		loff_t *ppos, int type, int flags)
676 {
677 	struct kiocb kiocb;
678 	ssize_t ret;
679 
680 	if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
681 		return -EOPNOTSUPP;
682 
683 	init_sync_kiocb(&kiocb, filp);
684 	if (flags & RWF_HIPRI)
685 		kiocb.ki_flags |= IOCB_HIPRI;
686 	if (flags & RWF_DSYNC)
687 		kiocb.ki_flags |= IOCB_DSYNC;
688 	if (flags & RWF_SYNC)
689 		kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
690 	kiocb.ki_pos = *ppos;
691 
692 	if (type == READ)
693 		ret = filp->f_op->read_iter(&kiocb, iter);
694 	else
695 		ret = filp->f_op->write_iter(&kiocb, iter);
696 	BUG_ON(ret == -EIOCBQUEUED);
697 	*ppos = kiocb.ki_pos;
698 	return ret;
699 }
700 
701 /* Do it by hand, with file-ops */
do_loop_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,int flags)702 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
703 		loff_t *ppos, int type, int flags)
704 {
705 	ssize_t ret = 0;
706 
707 	if (flags & ~RWF_HIPRI)
708 		return -EOPNOTSUPP;
709 
710 	while (iov_iter_count(iter)) {
711 		struct iovec iovec = iov_iter_iovec(iter);
712 		ssize_t nr;
713 
714 		if (type == READ) {
715 			nr = filp->f_op->read(filp, iovec.iov_base,
716 					      iovec.iov_len, ppos);
717 		} else {
718 			nr = filp->f_op->write(filp, iovec.iov_base,
719 					       iovec.iov_len, ppos);
720 		}
721 
722 		if (nr < 0) {
723 			if (!ret)
724 				ret = nr;
725 			break;
726 		}
727 		ret += nr;
728 		if (nr != iovec.iov_len)
729 			break;
730 		iov_iter_advance(iter, nr);
731 	}
732 
733 	return ret;
734 }
735 
736 /* A write operation does a read from user space and vice versa */
737 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
738 
739 /**
740  * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
741  *     into the kernel and check that it is valid.
742  *
743  * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
744  * @uvector: Pointer to the userspace array.
745  * @nr_segs: Number of elements in userspace array.
746  * @fast_segs: Number of elements in @fast_pointer.
747  * @fast_pointer: Pointer to (usually small on-stack) kernel array.
748  * @ret_pointer: (output parameter) Pointer to a variable that will point to
749  *     either @fast_pointer, a newly allocated kernel array, or NULL,
750  *     depending on which array was used.
751  *
752  * This function copies an array of &struct iovec of @nr_segs from
753  * userspace into the kernel and checks that each element is valid (e.g.
754  * it does not point to a kernel address or cause overflow by being too
755  * large, etc.).
756  *
757  * As an optimization, the caller may provide a pointer to a small
758  * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
759  * (the size of this array, or 0 if unused, should be given in @fast_segs).
760  *
761  * @ret_pointer will always point to the array that was used, so the
762  * caller must take care not to call kfree() on it e.g. in case the
763  * @fast_pointer array was used and it was allocated on the stack.
764  *
765  * Return: The total number of bytes covered by the iovec array on success
766  *   or a negative error code on error.
767  */
rw_copy_check_uvector(int type,const struct iovec __user * uvector,unsigned long nr_segs,unsigned long fast_segs,struct iovec * fast_pointer,struct iovec ** ret_pointer)768 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
769 			      unsigned long nr_segs, unsigned long fast_segs,
770 			      struct iovec *fast_pointer,
771 			      struct iovec **ret_pointer)
772 {
773 	unsigned long seg;
774 	ssize_t ret;
775 	struct iovec *iov = fast_pointer;
776 
777 	/*
778 	 * SuS says "The readv() function *may* fail if the iovcnt argument
779 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
780 	 * traditionally returned zero for zero segments, so...
781 	 */
782 	if (nr_segs == 0) {
783 		ret = 0;
784 		goto out;
785 	}
786 
787 	/*
788 	 * First get the "struct iovec" from user memory and
789 	 * verify all the pointers
790 	 */
791 	if (nr_segs > UIO_MAXIOV) {
792 		ret = -EINVAL;
793 		goto out;
794 	}
795 	if (nr_segs > fast_segs) {
796 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
797 		if (iov == NULL) {
798 			ret = -ENOMEM;
799 			goto out;
800 		}
801 	}
802 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
803 		ret = -EFAULT;
804 		goto out;
805 	}
806 
807 	/*
808 	 * According to the Single Unix Specification we should return EINVAL
809 	 * if an element length is < 0 when cast to ssize_t or if the
810 	 * total length would overflow the ssize_t return value of the
811 	 * system call.
812 	 *
813 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
814 	 * overflow case.
815 	 */
816 	ret = 0;
817 	for (seg = 0; seg < nr_segs; seg++) {
818 		void __user *buf = iov[seg].iov_base;
819 		ssize_t len = (ssize_t)iov[seg].iov_len;
820 
821 		/* see if we we're about to use an invalid len or if
822 		 * it's about to overflow ssize_t */
823 		if (len < 0) {
824 			ret = -EINVAL;
825 			goto out;
826 		}
827 		if (type >= 0
828 		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
829 			ret = -EFAULT;
830 			goto out;
831 		}
832 		if (len > MAX_RW_COUNT - ret) {
833 			len = MAX_RW_COUNT - ret;
834 			iov[seg].iov_len = len;
835 		}
836 		ret += len;
837 	}
838 out:
839 	*ret_pointer = iov;
840 	return ret;
841 }
842 
do_readv_writev(int type,struct file * file,const struct iovec __user * uvector,unsigned long nr_segs,loff_t * pos,int flags)843 static ssize_t do_readv_writev(int type, struct file *file,
844 			       const struct iovec __user * uvector,
845 			       unsigned long nr_segs, loff_t *pos,
846 			       int flags)
847 {
848 	size_t tot_len;
849 	struct iovec iovstack[UIO_FASTIOV];
850 	struct iovec *iov = iovstack;
851 	struct iov_iter iter;
852 	ssize_t ret;
853 
854 	ret = import_iovec(type, uvector, nr_segs,
855 			   ARRAY_SIZE(iovstack), &iov, &iter);
856 	if (ret < 0)
857 		return ret;
858 
859 	tot_len = iov_iter_count(&iter);
860 	if (!tot_len)
861 		goto out;
862 	ret = rw_verify_area(type, file, pos, tot_len);
863 	if (ret < 0)
864 		goto out;
865 
866 	if (type != READ)
867 		file_start_write(file);
868 
869 	if ((type == READ && file->f_op->read_iter) ||
870 	    (type == WRITE && file->f_op->write_iter))
871 		ret = do_iter_readv_writev(file, &iter, pos, type, flags);
872 	else
873 		ret = do_loop_readv_writev(file, &iter, pos, type, flags);
874 
875 	if (type != READ)
876 		file_end_write(file);
877 
878 out:
879 	kfree(iov);
880 	if ((ret + (type == READ)) > 0) {
881 		if (type == READ)
882 			fsnotify_access(file);
883 		else
884 			fsnotify_modify(file);
885 	}
886 	return ret;
887 }
888 
vfs_readv(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,int flags)889 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
890 		  unsigned long vlen, loff_t *pos, int flags)
891 {
892 	if (!(file->f_mode & FMODE_READ))
893 		return -EBADF;
894 	if (!(file->f_mode & FMODE_CAN_READ))
895 		return -EINVAL;
896 
897 	return do_readv_writev(READ, file, vec, vlen, pos, flags);
898 }
899 
900 EXPORT_SYMBOL(vfs_readv);
901 
vfs_writev(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,int flags)902 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
903 		   unsigned long vlen, loff_t *pos, int flags)
904 {
905 	if (!(file->f_mode & FMODE_WRITE))
906 		return -EBADF;
907 	if (!(file->f_mode & FMODE_CAN_WRITE))
908 		return -EINVAL;
909 
910 	return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
911 }
912 
913 EXPORT_SYMBOL(vfs_writev);
914 
do_readv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,int flags)915 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
916 			unsigned long vlen, int flags)
917 {
918 	struct fd f = fdget_pos(fd);
919 	ssize_t ret = -EBADF;
920 
921 	if (f.file) {
922 		loff_t pos = file_pos_read(f.file);
923 		ret = vfs_readv(f.file, vec, vlen, &pos, flags);
924 		if (ret >= 0)
925 			file_pos_write(f.file, pos);
926 		fdput_pos(f);
927 	}
928 
929 	if (ret > 0)
930 		add_rchar(current, ret);
931 	inc_syscr(current);
932 	return ret;
933 }
934 
do_writev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,int flags)935 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
936 			 unsigned long vlen, int flags)
937 {
938 	struct fd f = fdget_pos(fd);
939 	ssize_t ret = -EBADF;
940 
941 	if (f.file) {
942 		loff_t pos = file_pos_read(f.file);
943 		ret = vfs_writev(f.file, vec, vlen, &pos, flags);
944 		if (ret >= 0)
945 			file_pos_write(f.file, pos);
946 		fdput_pos(f);
947 	}
948 
949 	if (ret > 0)
950 		add_wchar(current, ret);
951 	inc_syscw(current);
952 	return ret;
953 }
954 
pos_from_hilo(unsigned long high,unsigned long low)955 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
956 {
957 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
958 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
959 }
960 
do_preadv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,int flags)961 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
962 			 unsigned long vlen, loff_t pos, int flags)
963 {
964 	struct fd f;
965 	ssize_t ret = -EBADF;
966 
967 	if (pos < 0)
968 		return -EINVAL;
969 
970 	f = fdget(fd);
971 	if (f.file) {
972 		ret = -ESPIPE;
973 		if (f.file->f_mode & FMODE_PREAD)
974 			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
975 		fdput(f);
976 	}
977 
978 	if (ret > 0)
979 		add_rchar(current, ret);
980 	inc_syscr(current);
981 	return ret;
982 }
983 
do_pwritev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,int flags)984 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
985 			  unsigned long vlen, loff_t pos, int flags)
986 {
987 	struct fd f;
988 	ssize_t ret = -EBADF;
989 
990 	if (pos < 0)
991 		return -EINVAL;
992 
993 	f = fdget(fd);
994 	if (f.file) {
995 		ret = -ESPIPE;
996 		if (f.file->f_mode & FMODE_PWRITE)
997 			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
998 		fdput(f);
999 	}
1000 
1001 	if (ret > 0)
1002 		add_wchar(current, ret);
1003 	inc_syscw(current);
1004 	return ret;
1005 }
1006 
/* readv system call: vectored read at the current position, no flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const int no_flags = 0;

	return do_readv(fd, vec, vlen, no_flags);
}
1012 
/* writev system call: vectored write at the current position, no flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const int no_flags = 0;

	return do_writev(fd, vec, vlen, no_flags);
}
1018 
/*
 * preadv system call: vectored read at the offset assembled from
 * @pos_h/@pos_l, with no flags.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1026 
/*
 * preadv2 system call: like preadv but with @flags.  An assembled offset
 * of -1 selects the file's current position (handled by do_readv());
 * any other offset is passed to do_preadv().
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		int, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	return (pos == -1) ? do_readv(fd, vec, vlen, flags)
			   : do_preadv(fd, vec, vlen, pos, flags);
}
1038 
/*
 * pwritev entry point: the offset arrives as two words (@pos_l, @pos_h) that
 * pos_from_hilo() combines before do_pwritev() is called with flags 0.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1046 
/*
 * pwritev2 entry point: like pwritev but with caller-supplied @flags.
 *
 * A combined offset of -1 selects do_writev() instead of the positional
 * do_pwritev().
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		int, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos != -1)
		return do_pwritev(fd, vec, vlen, pos, flags);

	return do_writev(fd, vec, vlen, flags);
}
1058 
1059 #ifdef CONFIG_COMPAT
1060 
compat_do_readv_writev(int type,struct file * file,const struct compat_iovec __user * uvector,unsigned long nr_segs,loff_t * pos,int flags)1061 static ssize_t compat_do_readv_writev(int type, struct file *file,
1062 			       const struct compat_iovec __user *uvector,
1063 			       unsigned long nr_segs, loff_t *pos,
1064 			       int flags)
1065 {
1066 	compat_ssize_t tot_len;
1067 	struct iovec iovstack[UIO_FASTIOV];
1068 	struct iovec *iov = iovstack;
1069 	struct iov_iter iter;
1070 	ssize_t ret;
1071 
1072 	ret = compat_import_iovec(type, uvector, nr_segs,
1073 				  UIO_FASTIOV, &iov, &iter);
1074 	if (ret < 0)
1075 		return ret;
1076 
1077 	tot_len = iov_iter_count(&iter);
1078 	if (!tot_len)
1079 		goto out;
1080 	ret = rw_verify_area(type, file, pos, tot_len);
1081 	if (ret < 0)
1082 		goto out;
1083 
1084 	if (type != READ)
1085 		file_start_write(file);
1086 
1087 	if ((type == READ && file->f_op->read_iter) ||
1088 	    (type == WRITE && file->f_op->write_iter))
1089 		ret = do_iter_readv_writev(file, &iter, pos, type, flags);
1090 	else
1091 		ret = do_loop_readv_writev(file, &iter, pos, type, flags);
1092 
1093 	if (type != READ)
1094 		file_end_write(file);
1095 
1096 out:
1097 	kfree(iov);
1098 	if ((ret + (type == READ)) > 0) {
1099 		if (type == READ)
1100 			fsnotify_access(file);
1101 		else
1102 			fsnotify_modify(file);
1103 	}
1104 	return ret;
1105 }
1106 
compat_readv(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos,int flags)1107 static size_t compat_readv(struct file *file,
1108 			   const struct compat_iovec __user *vec,
1109 			   unsigned long vlen, loff_t *pos, int flags)
1110 {
1111 	ssize_t ret = -EBADF;
1112 
1113 	if (!(file->f_mode & FMODE_READ))
1114 		goto out;
1115 
1116 	ret = -EINVAL;
1117 	if (!(file->f_mode & FMODE_CAN_READ))
1118 		goto out;
1119 
1120 	ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
1121 
1122 out:
1123 	if (ret > 0)
1124 		add_rchar(current, ret);
1125 	inc_syscr(current);
1126 	return ret;
1127 }
1128 
do_compat_readv(compat_ulong_t fd,const struct compat_iovec __user * vec,compat_ulong_t vlen,int flags)1129 static size_t do_compat_readv(compat_ulong_t fd,
1130 				 const struct compat_iovec __user *vec,
1131 				 compat_ulong_t vlen, int flags)
1132 {
1133 	struct fd f = fdget_pos(fd);
1134 	ssize_t ret;
1135 	loff_t pos;
1136 
1137 	if (!f.file)
1138 		return -EBADF;
1139 	pos = f.file->f_pos;
1140 	ret = compat_readv(f.file, vec, vlen, &pos, flags);
1141 	if (ret >= 0)
1142 		f.file->f_pos = pos;
1143 	fdput_pos(f);
1144 	return ret;
1145 
1146 }
1147 
/* Compat readv entry point: forwards to do_compat_readv() with flags 0. */
COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen)
{
	const int no_flags = 0;

	return do_compat_readv(fd, vec, vlen, no_flags);
}
1154 
/*
 * Compat positional vectored read starting at @pos.
 *
 * Returns the result of compat_readv(), -EINVAL for a negative @pos, -EBADF
 * when @fd does not resolve to a file, or -ESPIPE when the file lacks
 * FMODE_PREAD.  The file's f_pos is not read or written here.
 */
static long do_compat_preadv64(unsigned long fd,
				  const struct compat_iovec __user *vec,
				  unsigned long vlen, loff_t pos, int flags)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PREAD)
		ret = compat_readv(f.file, vec, vlen, &pos, flags);
	else
		ret = -ESPIPE;

	fdput(f);
	return ret;
}
1173 
1174 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64 entry point: the offset is passed as a single loff_t and
 * handed to do_compat_preadv64() with flags 0.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const int no_flags = 0;

	return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
1181 #endif
1182 
/*
 * Compat preadv entry point: builds the 64-bit offset from its two 32-bit
 * halves and calls do_compat_preadv64() with flags 0.
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
1191 
1192 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2 entry point: single loff_t offset plus caller-supplied
 * @flags, both passed straight to do_compat_preadv64().
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, int, flags)
{
	long ret = do_compat_preadv64(fd, vec, vlen, pos, flags);

	return ret;
}
1199 #endif
1200 
/*
 * Compat preadv2 entry point.
 *
 * The offset is assembled from two 32-bit halves; a combined value of -1
 * selects do_compat_readv() instead of the positional do_compat_preadv64().
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		int, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	if (pos != -1)
		return do_compat_preadv64(fd, vec, vlen, pos, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
1213 
compat_writev(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos,int flags)1214 static size_t compat_writev(struct file *file,
1215 			    const struct compat_iovec __user *vec,
1216 			    unsigned long vlen, loff_t *pos, int flags)
1217 {
1218 	ssize_t ret = -EBADF;
1219 
1220 	if (!(file->f_mode & FMODE_WRITE))
1221 		goto out;
1222 
1223 	ret = -EINVAL;
1224 	if (!(file->f_mode & FMODE_CAN_WRITE))
1225 		goto out;
1226 
1227 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
1228 
1229 out:
1230 	if (ret > 0)
1231 		add_wchar(current, ret);
1232 	inc_syscw(current);
1233 	return ret;
1234 }
1235 
do_compat_writev(compat_ulong_t fd,const struct compat_iovec __user * vec,compat_ulong_t vlen,int flags)1236 static size_t do_compat_writev(compat_ulong_t fd,
1237 				  const struct compat_iovec __user* vec,
1238 				  compat_ulong_t vlen, int flags)
1239 {
1240 	struct fd f = fdget_pos(fd);
1241 	ssize_t ret;
1242 	loff_t pos;
1243 
1244 	if (!f.file)
1245 		return -EBADF;
1246 	pos = f.file->f_pos;
1247 	ret = compat_writev(f.file, vec, vlen, &pos, flags);
1248 	if (ret >= 0)
1249 		f.file->f_pos = pos;
1250 	fdput_pos(f);
1251 	return ret;
1252 }
1253 
/* Compat writev entry point: forwards to do_compat_writev() with flags 0. */
COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen)
{
	const int no_flags = 0;

	return do_compat_writev(fd, vec, vlen, no_flags);
}
1260 
/*
 * Compat positional vectored write starting at @pos.
 *
 * Returns the result of compat_writev(), -EINVAL for a negative @pos, -EBADF
 * when @fd does not resolve to a file, or -ESPIPE when the file lacks
 * FMODE_PWRITE.  The file's f_pos is not read or written here.
 */
static long do_compat_pwritev64(unsigned long fd,
				   const struct compat_iovec __user *vec,
				   unsigned long vlen, loff_t pos, int flags)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PWRITE)
		ret = compat_writev(f.file, vec, vlen, &pos, flags);
	else
		ret = -ESPIPE;

	fdput(f);
	return ret;
}
1279 
1280 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64 entry point: the offset is passed as a single loff_t and
 * handed to do_compat_pwritev64() with flags 0.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const int no_flags = 0;

	return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
1287 #endif
1288 
/*
 * Compat pwritev entry point: builds the 64-bit offset from its two 32-bit
 * halves and calls do_compat_pwritev64() with flags 0.
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
1297 
1298 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2 entry point: single loff_t offset plus caller-supplied
 * @flags, both passed straight to do_compat_pwritev64().
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, int, flags)
{
	long ret = do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return ret;
}
1305 #endif
1306 
/*
 * Compat pwritev2 entry point.
 *
 * The offset is assembled from two 32-bit halves; a combined value of -1
 * selects do_compat_writev() instead of the positional do_compat_pwritev64().
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	if (pos != -1)
		return do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
1318 
1319 #endif
1320 
do_sendfile(int out_fd,int in_fd,loff_t * ppos,size_t count,loff_t max)1321 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1322 		  	   size_t count, loff_t max)
1323 {
1324 	struct fd in, out;
1325 	struct inode *in_inode, *out_inode;
1326 	loff_t pos;
1327 	loff_t out_pos;
1328 	ssize_t retval;
1329 	int fl;
1330 
1331 	/*
1332 	 * Get input file, and verify that it is ok..
1333 	 */
1334 	retval = -EBADF;
1335 	in = fdget(in_fd);
1336 	if (!in.file)
1337 		goto out;
1338 	if (!(in.file->f_mode & FMODE_READ))
1339 		goto fput_in;
1340 	retval = -ESPIPE;
1341 	if (!ppos) {
1342 		pos = in.file->f_pos;
1343 	} else {
1344 		pos = *ppos;
1345 		if (!(in.file->f_mode & FMODE_PREAD))
1346 			goto fput_in;
1347 	}
1348 	retval = rw_verify_area(READ, in.file, &pos, count);
1349 	if (retval < 0)
1350 		goto fput_in;
1351 	if (count > MAX_RW_COUNT)
1352 		count =  MAX_RW_COUNT;
1353 
1354 	/*
1355 	 * Get output file, and verify that it is ok..
1356 	 */
1357 	retval = -EBADF;
1358 	out = fdget(out_fd);
1359 	if (!out.file)
1360 		goto fput_in;
1361 	if (!(out.file->f_mode & FMODE_WRITE))
1362 		goto fput_out;
1363 	retval = -EINVAL;
1364 	in_inode = file_inode(in.file);
1365 	out_inode = file_inode(out.file);
1366 	out_pos = out.file->f_pos;
1367 	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1368 	if (retval < 0)
1369 		goto fput_out;
1370 
1371 	if (!max)
1372 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1373 
1374 	if (unlikely(pos + count > max)) {
1375 		retval = -EOVERFLOW;
1376 		if (pos >= max)
1377 			goto fput_out;
1378 		count = max - pos;
1379 	}
1380 
1381 	fl = 0;
1382 #if 0
1383 	/*
1384 	 * We need to debate whether we can enable this or not. The
1385 	 * man page documents EAGAIN return for the output at least,
1386 	 * and the application is arguably buggy if it doesn't expect
1387 	 * EAGAIN on a non-blocking file descriptor.
1388 	 */
1389 	if (in.file->f_flags & O_NONBLOCK)
1390 		fl = SPLICE_F_NONBLOCK;
1391 #endif
1392 	file_start_write(out.file);
1393 	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1394 	file_end_write(out.file);
1395 
1396 	if (retval > 0) {
1397 		add_rchar(current, retval);
1398 		add_wchar(current, retval);
1399 		fsnotify_access(in.file);
1400 		fsnotify_modify(out.file);
1401 		out.file->f_pos = out_pos;
1402 		if (ppos)
1403 			*ppos = pos;
1404 		else
1405 			in.file->f_pos = pos;
1406 	}
1407 
1408 	inc_syscr(current);
1409 	inc_syscw(current);
1410 	if (pos > max)
1411 		retval = -EOVERFLOW;
1412 
1413 fput_out:
1414 	fdput(out);
1415 fput_in:
1416 	fdput(in);
1417 out:
1418 	return retval;
1419 }
1420 
/*
 * sendfile entry point with an off_t-sized user offset.
 *
 * Without @offset the transfer uses the input file's own position and no
 * explicit limit.  With @offset the value is read from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting offset is written
 * back to user space whatever do_sendfile() returned.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(off, offset)))
		return -EFAULT;

	pos = off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1439 
/*
 * sendfile64 entry point with a loff_t-sized user offset.
 *
 * Without @offset the transfer uses the input file's own position.  With
 * @offset the value is copied in from user space and the resulting offset is
 * written back whatever do_sendfile() returned.  No explicit limit is passed
 * in either case (max == 0).
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1456 
1457 #ifdef CONFIG_COMPAT
/*
 * Compat sendfile entry point with a compat_off_t-sized user offset.
 *
 * Without @offset the transfer uses the input file's own position and no
 * explicit limit.  With @offset the value is read from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting offset is written
 * back to user space whatever do_sendfile() returned.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(off, offset)))
		return -EFAULT;

	pos = off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1477 
/*
 * Compat sendfile64 entry point with a compat_loff_t user offset.
 *
 * Without @offset the transfer uses the input file's own position.  With
 * @offset the value is copied in from user space and the resulting offset is
 * written back whatever do_sendfile() returned.  No explicit limit is passed
 * in either case (max == 0).
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
	if (unlikely(put_user(pos, offset)))
		return -EFAULT;

	return ret;
}
1495 #endif
1496 
1497 /*
1498  * copy_file_range() differs from regular file read and write in that it
1499  * specifically allows return partial success.  When it does so is up to
1500  * the copy_file_range method.
1501  */
vfs_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1502 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1503 			    struct file *file_out, loff_t pos_out,
1504 			    size_t len, unsigned int flags)
1505 {
1506 	struct inode *inode_in = file_inode(file_in);
1507 	struct inode *inode_out = file_inode(file_out);
1508 	ssize_t ret;
1509 
1510 	if (flags != 0)
1511 		return -EINVAL;
1512 
1513 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1514 		return -EISDIR;
1515 	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1516 		return -EINVAL;
1517 
1518 	ret = rw_verify_area(READ, file_in, &pos_in, len);
1519 	if (unlikely(ret))
1520 		return ret;
1521 
1522 	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1523 	if (unlikely(ret))
1524 		return ret;
1525 
1526 	if (!(file_in->f_mode & FMODE_READ) ||
1527 	    !(file_out->f_mode & FMODE_WRITE) ||
1528 	    (file_out->f_flags & O_APPEND))
1529 		return -EBADF;
1530 
1531 	/* this could be relaxed once a method supports cross-fs copies */
1532 	if (inode_in->i_sb != inode_out->i_sb)
1533 		return -EXDEV;
1534 
1535 	if (len == 0)
1536 		return 0;
1537 
1538 	ret = mnt_want_write_file(file_out);
1539 	if (ret)
1540 		return ret;
1541 
1542 	ret = -EOPNOTSUPP;
1543 	if (file_out->f_op->copy_file_range)
1544 		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
1545 						      pos_out, len, flags);
1546 	if (ret == -EOPNOTSUPP)
1547 		ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1548 				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1549 
1550 	if (ret > 0) {
1551 		fsnotify_access(file_in);
1552 		add_rchar(current, ret);
1553 		fsnotify_modify(file_out);
1554 		add_wchar(current, ret);
1555 	}
1556 	inc_syscr(current);
1557 	inc_syscw(current);
1558 
1559 	mnt_drop_write_file(file_out);
1560 
1561 	return ret;
1562 }
1563 EXPORT_SYMBOL(vfs_copy_file_range);
1564 
/*
 * copy_file_range entry point.
 *
 * For each side, a non-NULL user offset pointer supplies the starting offset
 * and receives the advanced offset after a successful copy; a NULL pointer
 * means the file's own f_pos is used and advanced instead.
 *
 * Returns the result of vfs_copy_file_range(), -EBADF when either descriptor
 * does not resolve to a file, or -EFAULT when a user offset cannot be read
 * or written.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	loff_t pos_in;
	loff_t pos_out;
	struct fd f_in;
	struct fd f_out;
	ssize_t ret;

	f_in = fdget(fd_in);
	if (!f_in.file) {
		ret = -EBADF;
		goto out2;
	}

	f_out = fdget(fd_out);
	if (!f_out.file) {
		ret = -EBADF;
		goto out1;
	}

	if (!off_in) {
		pos_in = f_in.file->f_pos;
	} else if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) {
		ret = -EFAULT;
		goto out;
	}

	if (!off_out) {
		pos_out = f_out.file->f_pos;
	} else if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) {
		ret = -EFAULT;
		goto out;
	}

	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
				  flags);
	if (ret <= 0)
		goto out;

	/* Advance both offsets by the number of bytes actually copied. */
	pos_in += ret;
	pos_out += ret;

	if (!off_in)
		f_in.file->f_pos = pos_in;
	else if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
		ret = -EFAULT;

	if (!off_out)
		f_out.file->f_pos = pos_out;
	else if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
		ret = -EFAULT;

out:
	fdput(f_out);
out1:
	fdput(f_in);
out2:
	return ret;
}
1626 
/*
 * Validate a byte range for a clone/dedupe operation on @file.
 *
 * Rejects a negative @pos and a range whose end (@pos + @len) is negative
 * when viewed as a loff_t.  If the inode has a lock context and mandatory
 * locking applies, the range is checked with locks_mandatory_area(); a @len
 * of 0 is treated as "through OFFSET_MAX".  Finally the LSM is asked for
 * read or write permission according to @write.
 *
 * Returns -EINVAL for a bad range, a negative value from
 * locks_mandatory_area(), or the result of security_file_permission().
 */
static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
{
	struct inode *inode = file_inode(file);

	if (unlikely(pos < 0))
		return -EINVAL;
	if (unlikely((loff_t) (pos + len) < 0))
		return -EINVAL;

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		loff_t last;
		int err;

		if (len)
			last = pos + len - 1;
		else
			last = OFFSET_MAX;

		err = locks_mandatory_area(inode, file, pos, last,
					   write ? F_WRLCK : F_RDLCK);
		if (err < 0)
			return err;
	}

	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}
1649 
/*
 * Clone @len bytes from @file_in at @pos_in into @file_out at @pos_out
 * through the input file's ->clone_file_range method.
 *
 * Both files must be regular files on the same superblock and the same
 * mount, @file_in must be open for reading and @file_out for writing without
 * O_APPEND, and the source range must not extend past the source inode's
 * size.
 *
 * Returns the method's result (fsnotify events are raised when it is 0), or
 * a negative errno: -EXDEV, -EISDIR, -EINVAL, -EBADF, -EOPNOTSUPP when the
 * method is missing, or an error from clone_verify_area() or
 * mnt_want_write_file().
 */
int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
		struct file *file_out, loff_t pos_out, u64 len)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	int ret;

	if (inode_in->i_sb != inode_out->i_sb)
		return -EXDEV;
	if (file_in->f_path.mnt != file_out->f_path.mnt)
		return -EXDEV;

	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
		return -EISDIR;
	if (!(S_ISREG(inode_in->i_mode) && S_ISREG(inode_out->i_mode)))
		return -EINVAL;

	if (!(file_in->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file_out->f_mode & FMODE_WRITE))
		return -EBADF;
	if (file_out->f_flags & O_APPEND)
		return -EBADF;

	if (!file_in->f_op->clone_file_range)
		return -EOPNOTSUPP;

	ret = clone_verify_area(file_in, pos_in, len, false);
	if (ret)
		return ret;

	ret = clone_verify_area(file_out, pos_out, len, true);
	if (ret)
		return ret;

	/* The source range has to lie entirely within the source file. */
	if (pos_in + len > i_size_read(inode_in))
		return -EINVAL;

	ret = mnt_want_write_file(file_out);
	if (ret)
		return ret;

	ret = file_in->f_op->clone_file_range(file_in, pos_in,
			file_out, pos_out, len);
	if (ret == 0) {
		fsnotify_access(file_in);
		fsnotify_modify(file_out);
	}

	mnt_drop_write_file(file_out);
	return ret;
}
EXPORT_SYMBOL(vfs_clone_file_range);
1700 
vfs_dedupe_file_range(struct file * file,struct file_dedupe_range * same)1701 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
1702 {
1703 	struct file_dedupe_range_info *info;
1704 	struct inode *src = file_inode(file);
1705 	u64 off;
1706 	u64 len;
1707 	int i;
1708 	int ret;
1709 	bool is_admin = capable(CAP_SYS_ADMIN);
1710 	u16 count = same->dest_count;
1711 	struct file *dst_file;
1712 	loff_t dst_off;
1713 	ssize_t deduped;
1714 
1715 	if (!(file->f_mode & FMODE_READ))
1716 		return -EINVAL;
1717 
1718 	if (same->reserved1 || same->reserved2)
1719 		return -EINVAL;
1720 
1721 	off = same->src_offset;
1722 	len = same->src_length;
1723 
1724 	ret = -EISDIR;
1725 	if (S_ISDIR(src->i_mode))
1726 		goto out;
1727 
1728 	ret = -EINVAL;
1729 	if (!S_ISREG(src->i_mode))
1730 		goto out;
1731 
1732 	ret = clone_verify_area(file, off, len, false);
1733 	if (ret < 0)
1734 		goto out;
1735 	ret = 0;
1736 
1737 	/* pre-format output fields to sane values */
1738 	for (i = 0; i < count; i++) {
1739 		same->info[i].bytes_deduped = 0ULL;
1740 		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
1741 	}
1742 
1743 	for (i = 0, info = same->info; i < count; i++, info++) {
1744 		struct inode *dst;
1745 		struct fd dst_fd = fdget(info->dest_fd);
1746 
1747 		dst_file = dst_fd.file;
1748 		if (!dst_file) {
1749 			info->status = -EBADF;
1750 			goto next_loop;
1751 		}
1752 		dst = file_inode(dst_file);
1753 
1754 		ret = mnt_want_write_file(dst_file);
1755 		if (ret) {
1756 			info->status = ret;
1757 			goto next_loop;
1758 		}
1759 
1760 		dst_off = info->dest_offset;
1761 		ret = clone_verify_area(dst_file, dst_off, len, true);
1762 		if (ret < 0) {
1763 			info->status = ret;
1764 			goto next_file;
1765 		}
1766 		ret = 0;
1767 
1768 		if (info->reserved) {
1769 			info->status = -EINVAL;
1770 		} else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
1771 			info->status = -EINVAL;
1772 		} else if (file->f_path.mnt != dst_file->f_path.mnt) {
1773 			info->status = -EXDEV;
1774 		} else if (S_ISDIR(dst->i_mode)) {
1775 			info->status = -EISDIR;
1776 		} else if (dst_file->f_op->dedupe_file_range == NULL) {
1777 			info->status = -EINVAL;
1778 		} else {
1779 			deduped = dst_file->f_op->dedupe_file_range(file, off,
1780 							len, dst_file,
1781 							info->dest_offset);
1782 			if (deduped == -EBADE)
1783 				info->status = FILE_DEDUPE_RANGE_DIFFERS;
1784 			else if (deduped < 0)
1785 				info->status = deduped;
1786 			else
1787 				info->bytes_deduped += deduped;
1788 		}
1789 
1790 next_file:
1791 		mnt_drop_write_file(dst_file);
1792 next_loop:
1793 		fdput(dst_fd);
1794 
1795 		if (fatal_signal_pending(current))
1796 			goto out;
1797 	}
1798 
1799 out:
1800 	return ret;
1801 }
1802 EXPORT_SYMBOL(vfs_dedupe_file_range);
1803