• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  linux/fs/read_write.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/aio.h>
13 #include <linux/fsnotify.h>
14 #include <linux/security.h>
15 #include <linux/export.h>
16 #include <linux/syscalls.h>
17 #include <linux/pagemap.h>
18 #include <linux/splice.h>
19 #include <linux/compat.h>
20 #include "internal.h"
21 
22 #include <asm/uaccess.h>
23 #include <asm/unistd.h>
24 
25 typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
26 typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
27 		unsigned long, loff_t);
28 
29 const struct file_operations generic_ro_fops = {
30 	.llseek		= generic_file_llseek,
31 	.read		= do_sync_read,
32 	.aio_read	= generic_file_aio_read,
33 	.mmap		= generic_file_readonly_mmap,
34 	.splice_read	= generic_file_splice_read,
35 };
36 
37 EXPORT_SYMBOL(generic_ro_fops);
38 
unsigned_offsets(struct file * file)39 static inline int unsigned_offsets(struct file *file)
40 {
41 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
42 }
43 
/*
 * Validate @offset and, if it differs from the current position, store
 * it in @file (resetting f_version to 0).
 *
 * Returns @offset on success, or -EINVAL when @offset is negative on a
 * file without unsigned offsets or when it exceeds @maxsize.
 * @inode is accepted but not used here.
 */
static loff_t lseek_execute(struct file *file, struct inode *inode,
		loff_t offset, loff_t maxsize)
{
	if ((offset < 0 && !unsigned_offsets(file)) || offset > maxsize)
		return -EINVAL;

	/* Nothing to update when the position is already there. */
	if (file->f_pos == offset)
		return offset;

	file->f_pos = offset;
	file->f_version = 0;
	return offset;
}
58 
59 /**
60  * generic_file_llseek_size - generic llseek implementation for regular files
61  * @file:	file structure to seek on
62  * @offset:	file offset to seek to
63  * @whence:	type of seek
64  * @size:	max size of this file in file system
65  * @eof:	offset used for SEEK_END position
66  *
67  * This is a variant of generic_file_llseek that allows passing in a custom
68  * maximum file size and a custom EOF position, for e.g. hashed directories
69  *
70  * Synchronization:
71  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
72  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
73  * read/writes behave like SEEK_SET against seeks.
74  */
75 loff_t
generic_file_llseek_size(struct file * file,loff_t offset,int whence,loff_t maxsize,loff_t eof)76 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
77 		loff_t maxsize, loff_t eof)
78 {
79 	struct inode *inode = file->f_mapping->host;
80 
81 	switch (whence) {
82 	case SEEK_END:
83 		offset += eof;
84 		break;
85 	case SEEK_CUR:
86 		/*
87 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
88 		 * position-querying operation.  Avoid rewriting the "same"
89 		 * f_pos value back to the file because a concurrent read(),
90 		 * write() or lseek() might have altered it
91 		 */
92 		if (offset == 0)
93 			return file->f_pos;
94 		/*
95 		 * f_lock protects against read/modify/write race with other
96 		 * SEEK_CURs. Note that parallel writes and reads behave
97 		 * like SEEK_SET.
98 		 */
99 		spin_lock(&file->f_lock);
100 		offset = lseek_execute(file, inode, file->f_pos + offset,
101 				       maxsize);
102 		spin_unlock(&file->f_lock);
103 		return offset;
104 	case SEEK_DATA:
105 		/*
106 		 * In the generic case the entire file is data, so as long as
107 		 * offset isn't at the end of the file then the offset is data.
108 		 */
109 		if (offset >= eof)
110 			return -ENXIO;
111 		break;
112 	case SEEK_HOLE:
113 		/*
114 		 * There is a virtual hole at the end of the file, so as long as
115 		 * offset isn't i_size or larger, return i_size.
116 		 */
117 		if (offset >= eof)
118 			return -ENXIO;
119 		offset = eof;
120 		break;
121 	}
122 
123 	return lseek_execute(file, inode, offset, maxsize);
124 }
125 EXPORT_SYMBOL(generic_file_llseek_size);
126 
127 /**
128  * generic_file_llseek - generic llseek implementation for regular files
129  * @file:	file structure to seek on
130  * @offset:	file offset to seek to
131  * @whence:	type of seek
132  *
133  * This is a generic implemenation of ->llseek useable for all normal local
134  * filesystems.  It just updates the file offset to the value specified by
135  * @offset and @whence.
136  */
generic_file_llseek(struct file * file,loff_t offset,int whence)137 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
138 {
139 	struct inode *inode = file->f_mapping->host;
140 
141 	return generic_file_llseek_size(file, offset, whence,
142 					inode->i_sb->s_maxbytes,
143 					i_size_read(inode));
144 }
145 EXPORT_SYMBOL(generic_file_llseek);
146 
147 /**
148  * noop_llseek - No Operation Performed llseek implementation
149  * @file:	file structure to seek on
150  * @offset:	file offset to seek to
151  * @whence:	type of seek
152  *
153  * This is an implementation of ->llseek useable for the rare special case when
154  * userspace expects the seek to succeed but the (device) file is actually not
155  * able to perform the seek. In this case you use noop_llseek() instead of
156  * falling back to the default implementation of ->llseek.
157  */
noop_llseek(struct file * file,loff_t offset,int whence)158 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
159 {
160 	return file->f_pos;
161 }
162 EXPORT_SYMBOL(noop_llseek);
163 
/*
 * llseek implementation that always fails: every argument is ignored
 * and -ESPIPE is returned.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	return (loff_t)-ESPIPE;
}
168 EXPORT_SYMBOL(no_llseek);
169 
/*
 * llseek implementation that runs entirely under the inode's i_mutex.
 *
 * Computes the target offset from @whence, then stores it in @file when
 * it is non-negative (or the file allows unsigned offsets).
 * Returns the resulting offset, -ENXIO for SEEK_DATA/SEEK_HOLE at or past
 * the inode size, or -EINVAL for a rejected negative offset.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval = -EINVAL;

	mutex_lock(&inode->i_mutex);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		/* A zero displacement is a pure position query. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
	case SEEK_HOLE:
		/*
		 * Generic case: the whole file is data, followed by a
		 * virtual hole at i_size.  SEEK_DATA keeps the offset,
		 * SEEK_HOLE moves to i_size; both fail at or past i_size.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		if (whence == SEEK_HOLE)
			offset = inode->i_size;
		break;
	}

	/* retval is still -EINVAL when the offset is rejected here. */
	if (offset < 0 && !unsigned_offsets(file))
		goto out;

	if (file->f_pos != offset) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	retval = offset;
out:
	mutex_unlock(&inode->i_mutex);
	return retval;
}
223 EXPORT_SYMBOL(default_llseek);
224 
/*
 * Dispatch a seek on @file.  The file's own ->llseek is used only when
 * the file has FMODE_LSEEK set and provides one; otherwise no_llseek()
 * handles the call.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t (*seek)(struct file *, loff_t, int) = no_llseek;

	if ((file->f_mode & FMODE_LSEEK) && file->f_op && file->f_op->llseek)
		seek = file->f_op->llseek;

	return seek(file, offset, whence);
}
236 EXPORT_SYMBOL(vfs_llseek);
237 
/*
 * lseek(2): reposition the offset of @fd through vfs_llseek().
 *
 * Returns the new offset, -EBADF when @fd has no open file, -EINVAL when
 * @whence is above SEEK_MAX, or -EOVERFLOW when the result does not
 * survive the conversion to off_t.
 */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	struct fd f = fdget(fd);
	off_t retval = -EINVAL;
	loff_t res;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX)
		goto out;

	res = vfs_llseek(f.file, offset, whence);
	retval = res;
	/* LFS: the narrowing should only lose bits on 32 bit platforms */
	if ((loff_t)retval != res)
		retval = -EOVERFLOW;
out:
	fdput(f);
	return retval;
}
255 
256 #ifdef CONFIG_COMPAT
/* Compat lseek entry point: forwards all three arguments to sys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
261 #endif
262 
263 #ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek: seek to the 64-bit offset built from @offset_high and
 * @offset_low, and copy the resulting position to user space at @result.
 *
 * Returns 0 on success, -EBADF when @fd has no open file, -EINVAL when
 * @whence is above SEEK_MAX, the (negative) error from vfs_llseek(), or
 * -EFAULT when the result cannot be copied out.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	struct fd f = fdget(fd);
	loff_t offset;
	int retval = -EINVAL;

	if (!f.file)
		return -EBADF;

	if (whence <= SEEK_MAX) {
		loff_t target = ((loff_t) offset_high << 32) | offset_low;

		offset = vfs_llseek(f.file, target, whence);
		if (offset < 0)
			retval = (int)offset;
		else if (copy_to_user(result, &offset, sizeof(offset)))
			retval = -EFAULT;
		else
			retval = 0;
	}

	fdput(f);
	return retval;
}
292 #endif
293 
294 /*
295  * rw_verify_area doesn't like huge counts. We limit
296  * them to something that fits in "int" so that others
297  * won't have to do range checks all the time.
298  */
rw_verify_area(int read_write,struct file * file,loff_t * ppos,size_t count)299 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
300 {
301 	struct inode *inode;
302 	loff_t pos;
303 	int retval = -EINVAL;
304 
305 	inode = file_inode(file);
306 	if (unlikely((ssize_t) count < 0))
307 		return retval;
308 	pos = *ppos;
309 	if (unlikely(pos < 0)) {
310 		if (!unsigned_offsets(file))
311 			return retval;
312 		if (count >= -pos) /* both values are in 0..LLONG_MAX */
313 			return -EOVERFLOW;
314 	} else if (unlikely((loff_t) (pos + count) < 0)) {
315 		if (!unsigned_offsets(file))
316 			return retval;
317 	}
318 
319 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
320 		retval = locks_mandatory_area(
321 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
322 			inode, file, pos, count);
323 		if (retval < 0)
324 			return retval;
325 	}
326 	retval = security_file_permission(file,
327 				read_write == READ ? MAY_READ : MAY_WRITE);
328 	if (retval)
329 		return retval;
330 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
331 }
332 
do_sync_read(struct file * filp,char __user * buf,size_t len,loff_t * ppos)333 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
334 {
335 	struct iovec iov = { .iov_base = buf, .iov_len = len };
336 	struct kiocb kiocb;
337 	ssize_t ret;
338 
339 	init_sync_kiocb(&kiocb, filp);
340 	kiocb.ki_pos = *ppos;
341 	kiocb.ki_left = len;
342 	kiocb.ki_nbytes = len;
343 
344 	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
345 	if (-EIOCBQUEUED == ret)
346 		ret = wait_on_sync_kiocb(&kiocb);
347 	*ppos = kiocb.ki_pos;
348 	return ret;
349 }
350 
351 EXPORT_SYMBOL(do_sync_read);
352 
vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)353 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
354 {
355 	ssize_t ret;
356 
357 	if (!(file->f_mode & FMODE_READ))
358 		return -EBADF;
359 	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
360 		return -EINVAL;
361 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
362 		return -EFAULT;
363 
364 	ret = rw_verify_area(READ, file, pos, count);
365 	if (ret >= 0) {
366 		count = ret;
367 		if (file->f_op->read)
368 			ret = file->f_op->read(file, buf, count, pos);
369 		else
370 			ret = do_sync_read(file, buf, count, pos);
371 		if (ret > 0) {
372 			fsnotify_access(file);
373 			add_rchar(current, ret);
374 		}
375 		inc_syscr(current);
376 	}
377 
378 	return ret;
379 }
380 
381 EXPORT_SYMBOL(vfs_read);
382 
do_sync_write(struct file * filp,const char __user * buf,size_t len,loff_t * ppos)383 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
384 {
385 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
386 	struct kiocb kiocb;
387 	ssize_t ret;
388 
389 	init_sync_kiocb(&kiocb, filp);
390 	kiocb.ki_pos = *ppos;
391 	kiocb.ki_left = len;
392 	kiocb.ki_nbytes = len;
393 
394 	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
395 	if (-EIOCBQUEUED == ret)
396 		ret = wait_on_sync_kiocb(&kiocb);
397 	*ppos = kiocb.ki_pos;
398 	return ret;
399 }
400 
401 EXPORT_SYMBOL(do_sync_write);
402 
__kernel_write(struct file * file,const char * buf,size_t count,loff_t * pos)403 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
404 {
405 	mm_segment_t old_fs;
406 	const char __user *p;
407 	ssize_t ret;
408 
409 	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
410 		return -EINVAL;
411 
412 	old_fs = get_fs();
413 	set_fs(get_ds());
414 	p = (__force const char __user *)buf;
415 	if (count > MAX_RW_COUNT)
416 		count =  MAX_RW_COUNT;
417 	if (file->f_op->write)
418 		ret = file->f_op->write(file, p, count, pos);
419 	else
420 		ret = do_sync_write(file, p, count, pos);
421 	set_fs(old_fs);
422 	if (ret > 0) {
423 		fsnotify_modify(file);
424 		add_wchar(current, ret);
425 	}
426 	inc_syscw(current);
427 	return ret;
428 }
429 
vfs_write(struct file * file,const char __user * buf,size_t count,loff_t * pos)430 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
431 {
432 	ssize_t ret;
433 
434 	if (!(file->f_mode & FMODE_WRITE))
435 		return -EBADF;
436 	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
437 		return -EINVAL;
438 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
439 		return -EFAULT;
440 
441 	ret = rw_verify_area(WRITE, file, pos, count);
442 	if (ret >= 0) {
443 		count = ret;
444 		file_start_write(file);
445 		if (file->f_op->write)
446 			ret = file->f_op->write(file, buf, count, pos);
447 		else
448 			ret = do_sync_write(file, buf, count, pos);
449 		if (ret > 0) {
450 			fsnotify_modify(file);
451 			add_wchar(current, ret);
452 		}
453 		inc_syscw(current);
454 		file_end_write(file);
455 	}
456 
457 	return ret;
458 }
459 
460 EXPORT_SYMBOL(vfs_write);
461 
file_pos_read(struct file * file)462 static inline loff_t file_pos_read(struct file *file)
463 {
464 	return file->f_pos;
465 }
466 
/* Store @pos as the current position (f_pos) of @file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
	return;
}
471 
/*
 * read(2): read up to @count bytes from @fd into @buf at the file's
 * current position.
 *
 * The position is copied into a local, passed to vfs_read(), and written
 * back only when vfs_read() did not fail, so an error return never stores
 * a position into the file.
 *
 * Returns the result of vfs_read(), or -EBADF when @fd has no open file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);

		ret = vfs_read(f.file, buf, count, &pos);
		/* Do not rewrite f_pos with a stale copy after a failure. */
		if (ret >= 0)
			file_pos_write(f.file, pos);
		fdput(f);
	}
	return ret;
}
485 
/*
 * write(2): write up to @count bytes from @buf to @fd at the file's
 * current position.
 *
 * The position is copied into a local, passed to vfs_write(), and written
 * back only when vfs_write() did not fail, so an error return never
 * stores a position into the file.
 *
 * Returns the result of vfs_write(), or -EBADF when @fd has no open file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);

		ret = vfs_write(f.file, buf, count, &pos);
		/* Do not rewrite f_pos with a stale copy after a failure. */
		if (ret >= 0)
			file_pos_write(f.file, pos);
		fdput(f);
	}

	return ret;
}
501 
/*
 * pread64: read up to @count bytes from @fd into @buf at the explicit
 * offset @pos; the file's own position is not touched here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd has no open file,
 * -ESPIPE when the file lacks FMODE_PREAD, otherwise vfs_read()'s result.
 */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PREAD)
		ret = vfs_read(f.file, buf, count, &pos);
	else
		ret = -ESPIPE;

	fdput(f);
	return ret;
}
521 
/*
 * pwrite64: write up to @count bytes from @buf to @fd at the explicit
 * offset @pos; the file's own position is not touched here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd has no open file,
 * -ESPIPE when the file lacks FMODE_PWRITE, otherwise vfs_write()'s
 * result.
 */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PWRITE)
		ret = vfs_write(f.file, buf, count, &pos);
	else
		ret = -ESPIPE;

	fdput(f);
	return ret;
}
541 
/*
 * Reduce an iovec's total length in-place to at most @to bytes.
 *
 * Segments are walked in order; the first segment at which the running
 * total reaches @to is trimmed so the total is exactly @to, and the
 * number of segments up to and including it is returned.  If the total
 * never reaches @to nothing is modified and @nr_segs is returned.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	size_t consumed = 0;
	unsigned long i;

	for (i = 0; i < nr_segs; i++) {
		if (consumed + iov[i].iov_len >= to) {
			iov[i].iov_len = to - consumed;
			return i + 1;
		}
		consumed += iov[i].iov_len;
	}
	return nr_segs;
}
561 EXPORT_SYMBOL(iov_shorten);
562 
do_sync_readv_writev(struct file * filp,const struct iovec * iov,unsigned long nr_segs,size_t len,loff_t * ppos,iov_fn_t fn)563 static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
564 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
565 {
566 	struct kiocb kiocb;
567 	ssize_t ret;
568 
569 	init_sync_kiocb(&kiocb, filp);
570 	kiocb.ki_pos = *ppos;
571 	kiocb.ki_left = len;
572 	kiocb.ki_nbytes = len;
573 
574 	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
575 	if (ret == -EIOCBQUEUED)
576 		ret = wait_on_sync_kiocb(&kiocb);
577 	*ppos = kiocb.ki_pos;
578 	return ret;
579 }
580 
581 /* Do it by hand, with file-ops */
do_loop_readv_writev(struct file * filp,struct iovec * iov,unsigned long nr_segs,loff_t * ppos,io_fn_t fn)582 static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
583 		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
584 {
585 	struct iovec *vector = iov;
586 	ssize_t ret = 0;
587 
588 	while (nr_segs > 0) {
589 		void __user *base;
590 		size_t len;
591 		ssize_t nr;
592 
593 		base = vector->iov_base;
594 		len = vector->iov_len;
595 		vector++;
596 		nr_segs--;
597 
598 		nr = fn(filp, base, len, ppos);
599 
600 		if (nr < 0) {
601 			if (!ret)
602 				ret = nr;
603 			break;
604 		}
605 		ret += nr;
606 		if (nr != len)
607 			break;
608 	}
609 
610 	return ret;
611 }
612 
613 /* A write operation does a read from user space and vice versa */
614 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
615 
rw_copy_check_uvector(int type,const struct iovec __user * uvector,unsigned long nr_segs,unsigned long fast_segs,struct iovec * fast_pointer,struct iovec ** ret_pointer)616 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
617 			      unsigned long nr_segs, unsigned long fast_segs,
618 			      struct iovec *fast_pointer,
619 			      struct iovec **ret_pointer)
620 {
621 	unsigned long seg;
622 	ssize_t ret;
623 	struct iovec *iov = fast_pointer;
624 
625 	/*
626 	 * SuS says "The readv() function *may* fail if the iovcnt argument
627 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
628 	 * traditionally returned zero for zero segments, so...
629 	 */
630 	if (nr_segs == 0) {
631 		ret = 0;
632 		goto out;
633 	}
634 
635 	/*
636 	 * First get the "struct iovec" from user memory and
637 	 * verify all the pointers
638 	 */
639 	if (nr_segs > UIO_MAXIOV) {
640 		ret = -EINVAL;
641 		goto out;
642 	}
643 	if (nr_segs > fast_segs) {
644 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
645 		if (iov == NULL) {
646 			ret = -ENOMEM;
647 			goto out;
648 		}
649 	}
650 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
651 		ret = -EFAULT;
652 		goto out;
653 	}
654 
655 	/*
656 	 * According to the Single Unix Specification we should return EINVAL
657 	 * if an element length is < 0 when cast to ssize_t or if the
658 	 * total length would overflow the ssize_t return value of the
659 	 * system call.
660 	 *
661 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
662 	 * overflow case.
663 	 */
664 	ret = 0;
665 	for (seg = 0; seg < nr_segs; seg++) {
666 		void __user *buf = iov[seg].iov_base;
667 		ssize_t len = (ssize_t)iov[seg].iov_len;
668 
669 		/* see if we we're about to use an invalid len or if
670 		 * it's about to overflow ssize_t */
671 		if (len < 0) {
672 			ret = -EINVAL;
673 			goto out;
674 		}
675 		if (type >= 0
676 		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
677 			ret = -EFAULT;
678 			goto out;
679 		}
680 		if (len > MAX_RW_COUNT - ret) {
681 			len = MAX_RW_COUNT - ret;
682 			iov[seg].iov_len = len;
683 		}
684 		ret += len;
685 	}
686 out:
687 	*ret_pointer = iov;
688 	return ret;
689 }
690 
do_readv_writev(int type,struct file * file,const struct iovec __user * uvector,unsigned long nr_segs,loff_t * pos)691 static ssize_t do_readv_writev(int type, struct file *file,
692 			       const struct iovec __user * uvector,
693 			       unsigned long nr_segs, loff_t *pos)
694 {
695 	size_t tot_len;
696 	struct iovec iovstack[UIO_FASTIOV];
697 	struct iovec *iov = iovstack;
698 	ssize_t ret;
699 	io_fn_t fn;
700 	iov_fn_t fnv;
701 
702 	if (!file->f_op) {
703 		ret = -EINVAL;
704 		goto out;
705 	}
706 
707 	ret = rw_copy_check_uvector(type, uvector, nr_segs,
708 				    ARRAY_SIZE(iovstack), iovstack, &iov);
709 	if (ret <= 0)
710 		goto out;
711 
712 	tot_len = ret;
713 	ret = rw_verify_area(type, file, pos, tot_len);
714 	if (ret < 0)
715 		goto out;
716 
717 	fnv = NULL;
718 	if (type == READ) {
719 		fn = file->f_op->read;
720 		fnv = file->f_op->aio_read;
721 	} else {
722 		fn = (io_fn_t)file->f_op->write;
723 		fnv = file->f_op->aio_write;
724 		file_start_write(file);
725 	}
726 
727 	if (fnv)
728 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
729 						pos, fnv);
730 	else
731 		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
732 
733 	if (type != READ)
734 		file_end_write(file);
735 
736 out:
737 	if (iov != iovstack)
738 		kfree(iov);
739 	if ((ret + (type == READ)) > 0) {
740 		if (type == READ)
741 			fsnotify_access(file);
742 		else
743 			fsnotify_modify(file);
744 	}
745 	return ret;
746 }
747 
vfs_readv(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos)748 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
749 		  unsigned long vlen, loff_t *pos)
750 {
751 	if (!(file->f_mode & FMODE_READ))
752 		return -EBADF;
753 	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
754 		return -EINVAL;
755 
756 	return do_readv_writev(READ, file, vec, vlen, pos);
757 }
758 
759 EXPORT_SYMBOL(vfs_readv);
760 
vfs_writev(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos)761 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
762 		   unsigned long vlen, loff_t *pos)
763 {
764 	if (!(file->f_mode & FMODE_WRITE))
765 		return -EBADF;
766 	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
767 		return -EINVAL;
768 
769 	return do_readv_writev(WRITE, file, vec, vlen, pos);
770 }
771 
772 EXPORT_SYMBOL(vfs_writev);
773 
/*
 * readv(2): vectored read from @fd at the file's current position.
 *
 * The position is copied into a local, passed to vfs_readv(), and written
 * back only when vfs_readv() did not fail, so an error return never
 * stores a position into the file.  A positive result is added to the
 * task's rchar accounting; the read syscall counter is always bumped.
 *
 * Returns the result of vfs_readv(), or -EBADF when @fd has no open file.
 */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);

		ret = vfs_readv(f.file, vec, vlen, &pos);
		/* Do not rewrite f_pos with a stale copy after a failure. */
		if (ret >= 0)
			file_pos_write(f.file, pos);
		fdput(f);
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}
792 
/*
 * writev(2): vectored write to @fd at the file's current position.
 *
 * The position is copied into a local, passed to vfs_writev(), and
 * written back only when vfs_writev() did not fail, so an error return
 * never stores a position into the file.  A positive result is added to
 * the task's wchar accounting; the write syscall counter is always
 * bumped.
 *
 * Returns the result of vfs_writev(), or -EBADF when @fd has no open file.
 */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	struct fd f = fdget(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);

		ret = vfs_writev(f.file, vec, vlen, &pos);
		/* Do not rewrite f_pos with a stale copy after a failure. */
		if (ret >= 0)
			file_pos_write(f.file, pos);
		fdput(f);
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}
811 
pos_from_hilo(unsigned long high,unsigned long low)812 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
813 {
814 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
815 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
816 }
817 
/*
 * preadv: vectored read from @fd at the explicit offset assembled from
 * @pos_h and @pos_l; the file's own position is not touched here.
 *
 * Returns -EINVAL for a negative offset (before any accounting), -EBADF
 * when @fd has no open file, -ESPIPE when the file lacks FMODE_PREAD,
 * otherwise vfs_readv()'s result.  A positive result is added to the
 * task's rchar accounting and the read syscall counter is bumped.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file) {
		ret = -EBADF;
	} else {
		if (f.file->f_mode & FMODE_PREAD)
			ret = vfs_readv(f.file, vec, vlen, &pos);
		else
			ret = -ESPIPE;
		fdput(f);
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}
841 
/*
 * pwritev: vectored write to @fd at the explicit offset assembled from
 * @pos_h and @pos_l; the file's own position is not touched here.
 *
 * Returns -EINVAL for a negative offset (before any accounting), -EBADF
 * when @fd has no open file, -ESPIPE when the file lacks FMODE_PWRITE,
 * otherwise vfs_writev()'s result.  A positive result is added to the
 * task's wchar accounting and the write syscall counter is bumped.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);
	struct fd f;
	ssize_t ret;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file) {
		ret = -EBADF;
	} else {
		if (f.file->f_mode & FMODE_PWRITE)
			ret = vfs_writev(f.file, vec, vlen, &pos);
		else
			ret = -ESPIPE;
		fdput(f);
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}
865 
866 #ifdef CONFIG_COMPAT
867 
compat_do_readv_writev(int type,struct file * file,const struct compat_iovec __user * uvector,unsigned long nr_segs,loff_t * pos)868 static ssize_t compat_do_readv_writev(int type, struct file *file,
869 			       const struct compat_iovec __user *uvector,
870 			       unsigned long nr_segs, loff_t *pos)
871 {
872 	compat_ssize_t tot_len;
873 	struct iovec iovstack[UIO_FASTIOV];
874 	struct iovec *iov = iovstack;
875 	ssize_t ret;
876 	io_fn_t fn;
877 	iov_fn_t fnv;
878 
879 	ret = -EINVAL;
880 	if (!file->f_op)
881 		goto out;
882 
883 	ret = -EFAULT;
884 	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
885 		goto out;
886 
887 	ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
888 					       UIO_FASTIOV, iovstack, &iov);
889 	if (ret <= 0)
890 		goto out;
891 
892 	tot_len = ret;
893 	ret = rw_verify_area(type, file, pos, tot_len);
894 	if (ret < 0)
895 		goto out;
896 
897 	fnv = NULL;
898 	if (type == READ) {
899 		fn = file->f_op->read;
900 		fnv = file->f_op->aio_read;
901 	} else {
902 		fn = (io_fn_t)file->f_op->write;
903 		fnv = file->f_op->aio_write;
904 		file_start_write(file);
905 	}
906 
907 	if (fnv)
908 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
909 						pos, fnv);
910 	else
911 		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
912 
913 	if (type != READ)
914 		file_end_write(file);
915 
916 out:
917 	if (iov != iovstack)
918 		kfree(iov);
919 	if ((ret + (type == READ)) > 0) {
920 		if (type == READ)
921 			fsnotify_access(file);
922 		else
923 			fsnotify_modify(file);
924 	}
925 	return ret;
926 }
927 
compat_readv(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos)928 static size_t compat_readv(struct file *file,
929 			   const struct compat_iovec __user *vec,
930 			   unsigned long vlen, loff_t *pos)
931 {
932 	ssize_t ret = -EBADF;
933 
934 	if (!(file->f_mode & FMODE_READ))
935 		goto out;
936 
937 	ret = -EINVAL;
938 	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
939 		goto out;
940 
941 	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
942 
943 out:
944 	if (ret > 0)
945 		add_rchar(current, ret);
946 	inc_syscr(current);
947 	return ret;
948 }
949 
/*
 * Compat readv: vectored read from @fd at the file's current position.
 *
 * The position is copied into a local, passed to compat_readv(), and
 * written back only when compat_readv() did not fail, so an error return
 * never stores a position into the file.
 *
 * Returns the result of compat_readv(), or -EBADF when @fd has no open
 * file.
 */
COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen)
{
	struct fd f = fdget(fd);
	ssize_t ret;
	loff_t pos;

	if (!f.file)
		return -EBADF;

	pos = file_pos_read(f.file);
	ret = compat_readv(f.file, vec, vlen, &pos);
	/* Do not rewrite f_pos with a stale copy after a failure. */
	if (ret >= 0)
		file_pos_write(f.file, pos);
	fdput(f);
	return ret;
}
966 
/*
 * Compat preadv64: vectored read from @fd at the explicit offset @pos;
 * the file's own position is not touched here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd has no open file,
 * -ESPIPE when the file lacks FMODE_PREAD, otherwise compat_readv()'s
 * result.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	struct fd f;
	ssize_t ret = -ESPIPE;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PREAD)
		ret = compat_readv(f.file, vec, vlen, &pos);

	fdput(f);
	return ret;
}
985 
/*
 * Compat preadv: build the 64-bit offset from its two 32-bit halves and
 * forward to compat_sys_preadv64().
 */
COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return compat_sys_preadv64(fd, vec, vlen, pos);
}
993 
compat_writev(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos)994 static size_t compat_writev(struct file *file,
995 			    const struct compat_iovec __user *vec,
996 			    unsigned long vlen, loff_t *pos)
997 {
998 	ssize_t ret = -EBADF;
999 
1000 	if (!(file->f_mode & FMODE_WRITE))
1001 		goto out;
1002 
1003 	ret = -EINVAL;
1004 	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1005 		goto out;
1006 
1007 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1008 
1009 out:
1010 	if (ret > 0)
1011 		add_wchar(current, ret);
1012 	inc_syscw(current);
1013 	return ret;
1014 }
1015 
/*
 * Compat writev: vectored write to @fd at the file's current position.
 *
 * The position is copied into a local, passed to compat_writev(), and
 * written back only when compat_writev() did not fail, so an error return
 * never stores a position into the file.
 *
 * Returns the result of compat_writev(), or -EBADF when @fd has no open
 * file.
 */
COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
		const struct compat_iovec __user *, vec,
		unsigned long, vlen)
{
	struct fd f = fdget(fd);
	ssize_t ret;
	loff_t pos;

	if (!f.file)
		return -EBADF;

	pos = file_pos_read(f.file);
	ret = compat_writev(f.file, vec, vlen, &pos);
	/* Do not rewrite f_pos with a stale copy after a failure. */
	if (ret >= 0)
		file_pos_write(f.file, pos);
	fdput(f);
	return ret;
}
1032 
/*
 * Compat pwritev64: vectored write to @fd at the explicit offset @pos;
 * the file's own position is not touched here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd has no open file,
 * -ESPIPE when the file lacks FMODE_PWRITE, otherwise compat_writev()'s
 * result.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	struct fd f;
	ssize_t ret = -ESPIPE;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PWRITE)
		ret = compat_writev(f.file, vec, vlen, &pos);

	fdput(f);
	return ret;
}
1051 
/*
 * Compat pwritev: build the 64-bit offset from its two 32-bit halves and
 * forward to compat_sys_pwritev64().
 */
COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return compat_sys_pwritev64(fd, vec, vlen, pos);
}
1059 #endif
1060 
do_sendfile(int out_fd,int in_fd,loff_t * ppos,size_t count,loff_t max)1061 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1062 		  	   size_t count, loff_t max)
1063 {
1064 	struct fd in, out;
1065 	struct inode *in_inode, *out_inode;
1066 	loff_t pos;
1067 	loff_t out_pos;
1068 	ssize_t retval;
1069 	int fl;
1070 
1071 	/*
1072 	 * Get input file, and verify that it is ok..
1073 	 */
1074 	retval = -EBADF;
1075 	in = fdget(in_fd);
1076 	if (!in.file)
1077 		goto out;
1078 	if (!(in.file->f_mode & FMODE_READ))
1079 		goto fput_in;
1080 	retval = -ESPIPE;
1081 	if (!ppos) {
1082 		pos = in.file->f_pos;
1083 	} else {
1084 		pos = *ppos;
1085 		if (!(in.file->f_mode & FMODE_PREAD))
1086 			goto fput_in;
1087 	}
1088 	retval = rw_verify_area(READ, in.file, &pos, count);
1089 	if (retval < 0)
1090 		goto fput_in;
1091 	count = retval;
1092 
1093 	/*
1094 	 * Get output file, and verify that it is ok..
1095 	 */
1096 	retval = -EBADF;
1097 	out = fdget(out_fd);
1098 	if (!out.file)
1099 		goto fput_in;
1100 	if (!(out.file->f_mode & FMODE_WRITE))
1101 		goto fput_out;
1102 	retval = -EINVAL;
1103 	in_inode = file_inode(in.file);
1104 	out_inode = file_inode(out.file);
1105 	out_pos = out.file->f_pos;
1106 	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1107 	if (retval < 0)
1108 		goto fput_out;
1109 	count = retval;
1110 
1111 	if (!max)
1112 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1113 
1114 	if (unlikely(pos + count > max)) {
1115 		retval = -EOVERFLOW;
1116 		if (pos >= max)
1117 			goto fput_out;
1118 		count = max - pos;
1119 	}
1120 
1121 	fl = 0;
1122 #if 0
1123 	/*
1124 	 * We need to debate whether we can enable this or not. The
1125 	 * man page documents EAGAIN return for the output at least,
1126 	 * and the application is arguably buggy if it doesn't expect
1127 	 * EAGAIN on a non-blocking file descriptor.
1128 	 */
1129 	if (in.file->f_flags & O_NONBLOCK)
1130 		fl = SPLICE_F_NONBLOCK;
1131 #endif
1132 	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1133 
1134 	if (retval > 0) {
1135 		add_rchar(current, retval);
1136 		add_wchar(current, retval);
1137 		fsnotify_access(in.file);
1138 		fsnotify_modify(out.file);
1139 		out.file->f_pos = out_pos;
1140 		if (ppos)
1141 			*ppos = pos;
1142 		else
1143 			in.file->f_pos = pos;
1144 	}
1145 
1146 	inc_syscr(current);
1147 	inc_syscw(current);
1148 	if (pos > max)
1149 		retval = -EOVERFLOW;
1150 
1151 fput_out:
1152 	fdput(out);
1153 fput_in:
1154 	fdput(in);
1155 out:
1156 	return retval;
1157 }
1158 
/*
 * sendfile with an off_t-sized user offset.
 *
 * With a NULL @offset, do_sendfile() is given a NULL position and works
 * from in_fd's own f_pos, with no explicit offset limit (max == 0).
 * Otherwise the offset is read from user space, the transfer is limited to
 * MAX_NON_LFS, and the resulting offset is copied back to @offset.
 *
 * Returns -EFAULT if @offset cannot be read or written, else the result
 * of do_sendfile().
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	off_t start;
	loff_t cursor;
	ssize_t nbytes;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(start, offset)))
		return -EFAULT;

	cursor = start;
	nbytes = do_sendfile(out_fd, in_fd, &cursor, count, MAX_NON_LFS);

	/* The offset is copied back even when the transfer itself failed. */
	if (unlikely(put_user(cursor, offset)))
		return -EFAULT;

	return nbytes;
}
1177 
/*
 * sendfile with a loff_t-sized user offset.
 *
 * With a NULL @offset, do_sendfile() is given a NULL position and works
 * from in_fd's own f_pos. Otherwise the offset is copied in from user
 * space, used as the starting position, and the resulting offset is copied
 * back to @offset. No explicit offset limit is passed in either case
 * (max == 0).
 *
 * Returns -EFAULT if @offset cannot be read or written, else the result
 * of do_sendfile().
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t cursor;
	ssize_t nbytes;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&cursor, offset, sizeof(loff_t))))
		return -EFAULT;

	nbytes = do_sendfile(out_fd, in_fd, &cursor, count, 0);

	/* The offset is copied back even when the transfer itself failed. */
	if (unlikely(put_user(cursor, offset)))
		return -EFAULT;

	return nbytes;
}
1194 
1195 #ifdef CONFIG_COMPAT
/*
 * Compat entry point for sendfile; @offset points at a compat_off_t.
 *
 * With a NULL @offset, do_sendfile() is given a NULL position and works
 * from in_fd's own f_pos, with no explicit offset limit (max == 0).
 * Otherwise the offset is read from user space, the transfer is limited to
 * MAX_NON_LFS, and the resulting offset is copied back to @offset.
 *
 * Returns -EFAULT if @offset cannot be read or written, else the result
 * of do_sendfile().
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	off_t start;
	loff_t cursor;
	ssize_t nbytes;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(start, offset)))
		return -EFAULT;

	cursor = start;
	nbytes = do_sendfile(out_fd, in_fd, &cursor, count, MAX_NON_LFS);

	/* The offset is copied back even when the transfer itself failed. */
	if (unlikely(put_user(cursor, offset)))
		return -EFAULT;

	return nbytes;
}
1215 
/*
 * Compat entry point for sendfile64; @offset points at a compat_loff_t.
 *
 * With a NULL @offset, do_sendfile() is given a NULL position and works
 * from in_fd's own f_pos. Otherwise sizeof(loff_t) bytes are copied in
 * from @offset as the starting position, and the resulting offset is
 * copied back. No explicit offset limit is passed in either case
 * (max == 0).
 *
 * Returns -EFAULT if @offset cannot be read or written, else the result
 * of do_sendfile().
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t cursor;
	ssize_t nbytes;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&cursor, offset, sizeof(loff_t))))
		return -EFAULT;

	nbytes = do_sendfile(out_fd, in_fd, &cursor, count, 0);

	/* The offset is copied back even when the transfer itself failed. */
	if (unlikely(put_user(cursor, offset)))
		return -EFAULT;

	return nbytes;
}
1233 #endif
1234