1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * linux/fs/read_write.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 */
7
8 #include <linux/slab.h>
9 #include <linux/stat.h>
10 #include <linux/sched/xacct.h>
11 #include <linux/fcntl.h>
12 #include <linux/file.h>
13 #include <linux/uio.h>
14 #include <linux/fsnotify.h>
15 #include <linux/security.h>
16 #include <linux/export.h>
17 #include <linux/syscalls.h>
18 #include <linux/pagemap.h>
19 #include <linux/splice.h>
20 #include <linux/compat.h>
21 #include <linux/mount.h>
22 #include <linux/fs.h>
23 #include "internal.h"
24
25 #include <linux/uaccess.h>
26 #include <asm/unistd.h>
27
28 const struct file_operations generic_ro_fops = {
29 .llseek = generic_file_llseek,
30 .read_iter = generic_file_read_iter,
31 .mmap = generic_file_readonly_mmap,
32 .splice_read = generic_file_splice_read,
33 };
34
35 EXPORT_SYMBOL(generic_ro_fops);
36
unsigned_offsets(struct file * file)37 static inline bool unsigned_offsets(struct file *file)
38 {
39 return file->f_mode & FMODE_UNSIGNED_OFFSET;
40 }
41
42 /**
43 * vfs_setpos - update the file offset for lseek
44 * @file: file structure in question
45 * @offset: file offset to seek to
46 * @maxsize: maximum file size
47 *
48 * This is a low-level filesystem helper for updating the file offset to
49 * the value specified by @offset if the given offset is valid and it is
50 * not equal to the current file offset.
51 *
52 * Return the specified offset on success and -EINVAL on invalid offset.
53 */
vfs_setpos(struct file * file,loff_t offset,loff_t maxsize)54 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
55 {
56 if (offset < 0 && !unsigned_offsets(file))
57 return -EINVAL;
58 if (offset > maxsize)
59 return -EINVAL;
60
61 if (offset != file->f_pos) {
62 file->f_pos = offset;
63 file->f_version = 0;
64 }
65 return offset;
66 }
67 EXPORT_SYMBOL(vfs_setpos);
68
69 /**
70 * generic_file_llseek_size - generic llseek implementation for regular files
71 * @file: file structure to seek on
72 * @offset: file offset to seek to
73 * @whence: type of seek
74 * @size: max size of this file in file system
75 * @eof: offset used for SEEK_END position
76 *
77 * This is a variant of generic_file_llseek that allows passing in a custom
78 * maximum file size and a custom EOF position, for e.g. hashed directories
79 *
80 * Synchronization:
81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
83 * read/writes behave like SEEK_SET against seeks.
84 */
85 loff_t
generic_file_llseek_size(struct file * file,loff_t offset,int whence,loff_t maxsize,loff_t eof)86 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
87 loff_t maxsize, loff_t eof)
88 {
89 switch (whence) {
90 case SEEK_END:
91 offset += eof;
92 break;
93 case SEEK_CUR:
94 /*
95 * Here we special-case the lseek(fd, 0, SEEK_CUR)
96 * position-querying operation. Avoid rewriting the "same"
97 * f_pos value back to the file because a concurrent read(),
98 * write() or lseek() might have altered it
99 */
100 if (offset == 0)
101 return file->f_pos;
102 /*
103 * f_lock protects against read/modify/write race with other
104 * SEEK_CURs. Note that parallel writes and reads behave
105 * like SEEK_SET.
106 */
107 spin_lock(&file->f_lock);
108 offset = vfs_setpos(file, file->f_pos + offset, maxsize);
109 spin_unlock(&file->f_lock);
110 return offset;
111 case SEEK_DATA:
112 /*
113 * In the generic case the entire file is data, so as long as
114 * offset isn't at the end of the file then the offset is data.
115 */
116 if ((unsigned long long)offset >= eof)
117 return -ENXIO;
118 break;
119 case SEEK_HOLE:
120 /*
121 * There is a virtual hole at the end of the file, so as long as
122 * offset isn't i_size or larger, return i_size.
123 */
124 if ((unsigned long long)offset >= eof)
125 return -ENXIO;
126 offset = eof;
127 break;
128 }
129
130 return vfs_setpos(file, offset, maxsize);
131 }
132 EXPORT_SYMBOL(generic_file_llseek_size);
133
134 /**
135 * generic_file_llseek - generic llseek implementation for regular files
136 * @file: file structure to seek on
137 * @offset: file offset to seek to
138 * @whence: type of seek
139 *
140 * This is a generic implemenation of ->llseek useable for all normal local
141 * filesystems. It just updates the file offset to the value specified by
142 * @offset and @whence.
143 */
generic_file_llseek(struct file * file,loff_t offset,int whence)144 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145 {
146 struct inode *inode = file->f_mapping->host;
147
148 return generic_file_llseek_size(file, offset, whence,
149 inode->i_sb->s_maxbytes,
150 i_size_read(inode));
151 }
152 EXPORT_SYMBOL(generic_file_llseek);
153
154 /**
155 * fixed_size_llseek - llseek implementation for fixed-sized devices
156 * @file: file structure to seek on
157 * @offset: file offset to seek to
158 * @whence: type of seek
159 * @size: size of the file
160 *
161 */
fixed_size_llseek(struct file * file,loff_t offset,int whence,loff_t size)162 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163 {
164 switch (whence) {
165 case SEEK_SET: case SEEK_CUR: case SEEK_END:
166 return generic_file_llseek_size(file, offset, whence,
167 size, size);
168 default:
169 return -EINVAL;
170 }
171 }
172 EXPORT_SYMBOL(fixed_size_llseek);
173
174 /**
175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
176 * @file: file structure to seek on
177 * @offset: file offset to seek to
178 * @whence: type of seek
179 *
180 */
no_seek_end_llseek(struct file * file,loff_t offset,int whence)181 loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182 {
183 switch (whence) {
184 case SEEK_SET: case SEEK_CUR:
185 return generic_file_llseek_size(file, offset, whence,
186 OFFSET_MAX, 0);
187 default:
188 return -EINVAL;
189 }
190 }
191 EXPORT_SYMBOL(no_seek_end_llseek);
192
193 /**
194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195 * @file: file structure to seek on
196 * @offset: file offset to seek to
197 * @whence: type of seek
198 * @size: maximal offset allowed
199 *
200 */
no_seek_end_llseek_size(struct file * file,loff_t offset,int whence,loff_t size)201 loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202 {
203 switch (whence) {
204 case SEEK_SET: case SEEK_CUR:
205 return generic_file_llseek_size(file, offset, whence,
206 size, 0);
207 default:
208 return -EINVAL;
209 }
210 }
211 EXPORT_SYMBOL(no_seek_end_llseek_size);
212
213 /**
214 * noop_llseek - No Operation Performed llseek implementation
215 * @file: file structure to seek on
216 * @offset: file offset to seek to
217 * @whence: type of seek
218 *
219 * This is an implementation of ->llseek useable for the rare special case when
220 * userspace expects the seek to succeed but the (device) file is actually not
221 * able to perform the seek. In this case you use noop_llseek() instead of
222 * falling back to the default implementation of ->llseek.
223 */
noop_llseek(struct file * file,loff_t offset,int whence)224 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
225 {
226 return file->f_pos;
227 }
228 EXPORT_SYMBOL(noop_llseek);
229
/*
 * llseek implementation that refuses every request: all arguments are
 * ignored and -ESPIPE is returned unconditionally.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	const loff_t refused = -ESPIPE;

	return refused;
}
EXPORT_SYMBOL(no_llseek);
235
/*
 * llseek implementation that runs entirely under the inode lock.
 *
 * SEEK_END is relative to the inode size and SEEK_CUR to the current f_pos
 * (a zero-offset SEEK_CUR only reports f_pos).  SEEK_DATA and SEEK_HOLE
 * treat the whole file as data followed by a virtual hole at i_size and
 * fail with -ENXIO when @offset is at or beyond i_size.  The resulting
 * offset is rejected with -EINVAL when it is negative and the file does not
 * use unsigned offsets.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval = -EINVAL;

	inode_lock(inode);

	if (whence == SEEK_END) {
		offset += i_size_read(inode);
	} else if (whence == SEEK_CUR) {
		if (offset == 0) {
			/* Position query only: do not touch f_pos. */
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
	} else if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		/* The only hole is the virtual one at the end of the file. */
		if (whence == SEEK_HOLE)
			offset = inode->i_size;
	}

	/* retval is still -EINVAL here. */
	if (offset < 0 && !unsigned_offsets(file))
		goto out;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	retval = offset;
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);
290
/*
 * Dispatch a seek request to the file's ->llseek method.  Files without
 * FMODE_LSEEK, or without an ->llseek method, are handled by no_llseek().
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	if ((file->f_mode & FMODE_LSEEK) && file->f_op->llseek)
		return file->f_op->llseek(file, offset, whence);

	return no_llseek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);
303
/*
 * Kernel-side implementation of lseek(): look up @fd, validate @whence and
 * seek through vfs_llseek().
 *
 * Returns the new offset, -EBADF for an invalid descriptor, -EINVAL when
 * @whence exceeds SEEK_MAX, or -EOVERFLOW when the resulting loff_t does
 * not survive the conversion to off_t.
 */
off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	struct fd f = fdget_pos(fd);
	off_t retval = -EINVAL;
	loff_t res;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX)
		goto out;

	res = vfs_llseek(f.file, offset, whence);
	retval = res;
	/* LFS: truncation should only happen on 32 bit platforms */
	if ((loff_t)retval != res)
		retval = -EOVERFLOW;
out:
	fdput_pos(f);
	return retval;
}
321
/* lseek system call entry point: all work is done by ksys_lseek(). */
SYSCALL_DEFINE3(lseek,
		unsigned int, fd,
		off_t, offset,
		unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
326
#ifdef CONFIG_COMPAT
/*
 * Compat lseek entry point: takes the offset as compat_off_t and hands it
 * to ksys_lseek().
 */
COMPAT_SYSCALL_DEFINE3(lseek,
		       unsigned int, fd,
		       compat_off_t, offset,
		       unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif
333
334 #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
SYSCALL_DEFINE5(llseek,unsigned int,fd,unsigned long,offset_high,unsigned long,offset_low,loff_t __user *,result,unsigned int,whence)335 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
336 unsigned long, offset_low, loff_t __user *, result,
337 unsigned int, whence)
338 {
339 int retval;
340 struct fd f = fdget_pos(fd);
341 loff_t offset;
342
343 if (!f.file)
344 return -EBADF;
345
346 retval = -EINVAL;
347 if (whence > SEEK_MAX)
348 goto out_putf;
349
350 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
351 whence);
352
353 retval = (int)offset;
354 if (offset >= 0) {
355 retval = -EFAULT;
356 if (!copy_to_user(result, &offset, sizeof(offset)))
357 retval = 0;
358 }
359 out_putf:
360 fdput_pos(f);
361 return retval;
362 }
363 #endif
364
/*
 * Sanity-check a read or write of @count bytes at *@ppos on @file.
 *
 * @read_write selects READ or WRITE semantics for the mandatory-lock and
 * security checks.  @ppos may be NULL (stream), in which case the position
 * and mandatory-lock checks are skipped.
 *
 * Returns -EINVAL when @count is negative as an ssize_t or the position
 * range is not acceptable for a file with signed offsets, -EOVERFLOW when a
 * negative (unsigned-offset) position plus @count would wrap, a negative
 * error from locks_mandatory_area(), or otherwise the result of
 * security_file_permission().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode = file_inode(file);
	const bool reading = (read_write == READ);

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	/*
	 * ranged mandatory locking does not apply to streams - it makes sense
	 * only for files where position has a meaning.
	 */
	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
		}

		if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
			int err;

			err = locks_mandatory_area(inode, file, pos, pos + count - 1,
					reading ? F_RDLCK : F_WRLCK);
			if (err < 0)
				return err;
		}
	}

	return security_file_permission(file, reading ? MAY_READ : MAY_WRITE);
}
402
new_sync_read(struct file * filp,char __user * buf,size_t len,loff_t * ppos)403 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
404 {
405 struct iovec iov = { .iov_base = buf, .iov_len = len };
406 struct kiocb kiocb;
407 struct iov_iter iter;
408 ssize_t ret;
409
410 init_sync_kiocb(&kiocb, filp);
411 kiocb.ki_pos = (ppos ? *ppos : 0);
412 iov_iter_init(&iter, READ, &iov, 1, len);
413
414 ret = call_read_iter(filp, &kiocb, &iter);
415 BUG_ON(ret == -EIOCBQUEUED);
416 if (ppos)
417 *ppos = kiocb.ki_pos;
418 return ret;
419 }
420
__vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)421 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
422 loff_t *pos)
423 {
424 if (file->f_op->read)
425 return file->f_op->read(file, buf, count, pos);
426 else if (file->f_op->read_iter)
427 return new_sync_read(file, buf, count, pos);
428 else
429 return -EINVAL;
430 }
431
kernel_read(struct file * file,void * buf,size_t count,loff_t * pos)432 ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
433 {
434 mm_segment_t old_fs;
435 ssize_t result;
436
437 old_fs = get_fs();
438 set_fs(KERNEL_DS);
439 /* The cast to a user pointer is valid due to the set_fs() */
440 result = vfs_read(file, (void __user *)buf, count, pos);
441 set_fs(old_fs);
442 return result;
443 }
444 EXPORT_SYMBOL(kernel_read);
445
vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)446 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
447 {
448 ssize_t ret;
449
450 if (!(file->f_mode & FMODE_READ))
451 return -EBADF;
452 if (!(file->f_mode & FMODE_CAN_READ))
453 return -EINVAL;
454 if (unlikely(!access_ok(buf, count)))
455 return -EFAULT;
456
457 ret = rw_verify_area(READ, file, pos, count);
458 if (!ret) {
459 if (count > MAX_RW_COUNT)
460 count = MAX_RW_COUNT;
461 ret = __vfs_read(file, buf, count, pos);
462 if (ret > 0) {
463 fsnotify_access(file);
464 add_rchar(current, ret);
465 }
466 inc_syscr(current);
467 }
468
469 return ret;
470 }
471
472 EXPORT_SYMBOL(vfs_read);
473
new_sync_write(struct file * filp,const char __user * buf,size_t len,loff_t * ppos)474 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
475 {
476 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
477 struct kiocb kiocb;
478 struct iov_iter iter;
479 ssize_t ret;
480
481 init_sync_kiocb(&kiocb, filp);
482 kiocb.ki_pos = (ppos ? *ppos : 0);
483 iov_iter_init(&iter, WRITE, &iov, 1, len);
484
485 ret = call_write_iter(filp, &kiocb, &iter);
486 BUG_ON(ret == -EIOCBQUEUED);
487 if (ret > 0 && ppos)
488 *ppos = kiocb.ki_pos;
489 return ret;
490 }
491
__vfs_write(struct file * file,const char __user * p,size_t count,loff_t * pos)492 static ssize_t __vfs_write(struct file *file, const char __user *p,
493 size_t count, loff_t *pos)
494 {
495 if (file->f_op->write)
496 return file->f_op->write(file, p, count, pos);
497 else if (file->f_op->write_iter)
498 return new_sync_write(file, p, count, pos);
499 else
500 return -EINVAL;
501 }
502
__kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)503 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
504 {
505 mm_segment_t old_fs;
506 const char __user *p;
507 ssize_t ret;
508
509 if (!(file->f_mode & FMODE_CAN_WRITE))
510 return -EINVAL;
511
512 old_fs = get_fs();
513 set_fs(KERNEL_DS);
514 p = (__force const char __user *)buf;
515 if (count > MAX_RW_COUNT)
516 count = MAX_RW_COUNT;
517 ret = __vfs_write(file, p, count, pos);
518 set_fs(old_fs);
519 if (ret > 0) {
520 fsnotify_modify(file);
521 add_wchar(current, ret);
522 }
523 inc_syscw(current);
524 return ret;
525 }
526 EXPORT_SYMBOL(__kernel_write);
527
kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)528 ssize_t kernel_write(struct file *file, const void *buf, size_t count,
529 loff_t *pos)
530 {
531 mm_segment_t old_fs;
532 ssize_t res;
533
534 old_fs = get_fs();
535 set_fs(KERNEL_DS);
536 /* The cast to a user pointer is valid due to the set_fs() */
537 res = vfs_write(file, (__force const char __user *)buf, count, pos);
538 set_fs(old_fs);
539
540 return res;
541 }
542 EXPORT_SYMBOL(kernel_write);
543
vfs_write(struct file * file,const char __user * buf,size_t count,loff_t * pos)544 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
545 {
546 ssize_t ret;
547
548 if (!(file->f_mode & FMODE_WRITE))
549 return -EBADF;
550 if (!(file->f_mode & FMODE_CAN_WRITE))
551 return -EINVAL;
552 if (unlikely(!access_ok(buf, count)))
553 return -EFAULT;
554
555 ret = rw_verify_area(WRITE, file, pos, count);
556 if (!ret) {
557 if (count > MAX_RW_COUNT)
558 count = MAX_RW_COUNT;
559 file_start_write(file);
560 ret = __vfs_write(file, buf, count, pos);
561 if (ret > 0) {
562 fsnotify_modify(file);
563 add_wchar(current, ret);
564 }
565 inc_syscw(current);
566 file_end_write(file);
567 }
568
569 return ret;
570 }
571 EXPORT_SYMBOL(vfs_write);
572
573 /* file_ppos returns &file->f_pos or NULL if file is stream */
file_ppos(struct file * file)574 static inline loff_t *file_ppos(struct file *file)
575 {
576 return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
577 }
578
ksys_read(unsigned int fd,char __user * buf,size_t count)579 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
580 {
581 struct fd f = fdget_pos(fd);
582 ssize_t ret = -EBADF;
583
584 if (f.file) {
585 loff_t pos, *ppos = file_ppos(f.file);
586 if (ppos) {
587 pos = *ppos;
588 ppos = &pos;
589 }
590 ret = vfs_read(f.file, buf, count, ppos);
591 if (ret >= 0 && ppos)
592 f.file->f_pos = pos;
593 fdput_pos(f);
594 }
595 return ret;
596 }
597
/* read system call entry point: all work is done by ksys_read(). */
SYSCALL_DEFINE3(read,
		unsigned int, fd,
		char __user *, buf,
		size_t, count)
{
	return ksys_read(fd, buf, count);
}
602
ksys_write(unsigned int fd,const char __user * buf,size_t count)603 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
604 {
605 struct fd f = fdget_pos(fd);
606 ssize_t ret = -EBADF;
607
608 if (f.file) {
609 loff_t pos, *ppos = file_ppos(f.file);
610 if (ppos) {
611 pos = *ppos;
612 ppos = &pos;
613 }
614 ret = vfs_write(f.file, buf, count, ppos);
615 if (ret >= 0 && ppos)
616 f.file->f_pos = pos;
617 fdput_pos(f);
618 }
619
620 return ret;
621 }
622
/* write system call entry point: all work is done by ksys_write(). */
SYSCALL_DEFINE3(write,
		unsigned int, fd,
		const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}
628
ksys_pread64(unsigned int fd,char __user * buf,size_t count,loff_t pos)629 ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
630 loff_t pos)
631 {
632 struct fd f;
633 ssize_t ret = -EBADF;
634
635 if (pos < 0)
636 return -EINVAL;
637
638 f = fdget(fd);
639 if (f.file) {
640 ret = -ESPIPE;
641 if (f.file->f_mode & FMODE_PREAD)
642 ret = vfs_read(f.file, buf, count, &pos);
643 fdput(f);
644 }
645
646 return ret;
647 }
648
/* pread64 system call entry point: all work is done by ksys_pread64(). */
SYSCALL_DEFINE4(pread64,
		unsigned int, fd,
		char __user *, buf,
		size_t, count,
		loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}
654
ksys_pwrite64(unsigned int fd,const char __user * buf,size_t count,loff_t pos)655 ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
656 size_t count, loff_t pos)
657 {
658 struct fd f;
659 ssize_t ret = -EBADF;
660
661 if (pos < 0)
662 return -EINVAL;
663
664 f = fdget(fd);
665 if (f.file) {
666 ret = -ESPIPE;
667 if (f.file->f_mode & FMODE_PWRITE)
668 ret = vfs_write(f.file, buf, count, &pos);
669 fdput(f);
670 }
671
672 return ret;
673 }
674
/* pwrite64 system call entry point: all work is done by ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64,
		unsigned int, fd,
		const char __user *, buf,
		size_t, count,
		loff_t, pos)
{
	return ksys_pwrite64(fd, buf, count, pos);
}
680
do_iter_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)681 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
682 loff_t *ppos, int type, rwf_t flags)
683 {
684 struct kiocb kiocb;
685 ssize_t ret;
686
687 init_sync_kiocb(&kiocb, filp);
688 ret = kiocb_set_rw_flags(&kiocb, flags);
689 if (ret)
690 return ret;
691 kiocb.ki_pos = (ppos ? *ppos : 0);
692
693 if (type == READ)
694 ret = call_read_iter(filp, &kiocb, iter);
695 else
696 ret = call_write_iter(filp, &kiocb, iter);
697 BUG_ON(ret == -EIOCBQUEUED);
698 if (ppos)
699 *ppos = kiocb.ki_pos;
700 return ret;
701 }
702
703 /* Do it by hand, with file-ops */
do_loop_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)704 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
705 loff_t *ppos, int type, rwf_t flags)
706 {
707 ssize_t ret = 0;
708
709 if (flags & ~RWF_HIPRI)
710 return -EOPNOTSUPP;
711
712 while (iov_iter_count(iter)) {
713 struct iovec iovec = iov_iter_iovec(iter);
714 ssize_t nr;
715
716 if (type == READ) {
717 nr = filp->f_op->read(filp, iovec.iov_base,
718 iovec.iov_len, ppos);
719 } else {
720 nr = filp->f_op->write(filp, iovec.iov_base,
721 iovec.iov_len, ppos);
722 }
723
724 if (nr < 0) {
725 if (!ret)
726 ret = nr;
727 break;
728 }
729 ret += nr;
730 if (nr != iovec.iov_len)
731 break;
732 iov_iter_advance(iter, nr);
733 }
734
735 return ret;
736 }
737
738 /**
739 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
740 * into the kernel and check that it is valid.
741 *
742 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
743 * @uvector: Pointer to the userspace array.
744 * @nr_segs: Number of elements in userspace array.
745 * @fast_segs: Number of elements in @fast_pointer.
746 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
747 * @ret_pointer: (output parameter) Pointer to a variable that will point to
748 * either @fast_pointer, a newly allocated kernel array, or NULL,
749 * depending on which array was used.
750 *
751 * This function copies an array of &struct iovec of @nr_segs from
752 * userspace into the kernel and checks that each element is valid (e.g.
753 * it does not point to a kernel address or cause overflow by being too
754 * large, etc.).
755 *
756 * As an optimization, the caller may provide a pointer to a small
757 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
758 * (the size of this array, or 0 if unused, should be given in @fast_segs).
759 *
760 * @ret_pointer will always point to the array that was used, so the
761 * caller must take care not to call kfree() on it e.g. in case the
762 * @fast_pointer array was used and it was allocated on the stack.
763 *
764 * Return: The total number of bytes covered by the iovec array on success
765 * or a negative error code on error.
766 */
rw_copy_check_uvector(int type,const struct iovec __user * uvector,unsigned long nr_segs,unsigned long fast_segs,struct iovec * fast_pointer,struct iovec ** ret_pointer)767 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
768 unsigned long nr_segs, unsigned long fast_segs,
769 struct iovec *fast_pointer,
770 struct iovec **ret_pointer)
771 {
772 unsigned long seg;
773 ssize_t ret;
774 struct iovec *iov = fast_pointer;
775
776 /*
777 * SuS says "The readv() function *may* fail if the iovcnt argument
778 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
779 * traditionally returned zero for zero segments, so...
780 */
781 if (nr_segs == 0) {
782 ret = 0;
783 goto out;
784 }
785
786 /*
787 * First get the "struct iovec" from user memory and
788 * verify all the pointers
789 */
790 if (nr_segs > UIO_MAXIOV) {
791 ret = -EINVAL;
792 goto out;
793 }
794 if (nr_segs > fast_segs) {
795 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
796 if (iov == NULL) {
797 ret = -ENOMEM;
798 goto out;
799 }
800 }
801 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
802 ret = -EFAULT;
803 goto out;
804 }
805
806 /*
807 * According to the Single Unix Specification we should return EINVAL
808 * if an element length is < 0 when cast to ssize_t or if the
809 * total length would overflow the ssize_t return value of the
810 * system call.
811 *
812 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
813 * overflow case.
814 */
815 ret = 0;
816 for (seg = 0; seg < nr_segs; seg++) {
817 void __user *buf = iov[seg].iov_base;
818 ssize_t len = (ssize_t)iov[seg].iov_len;
819
820 /* see if we we're about to use an invalid len or if
821 * it's about to overflow ssize_t */
822 if (len < 0) {
823 ret = -EINVAL;
824 goto out;
825 }
826 if (type >= 0
827 && unlikely(!access_ok(buf, len))) {
828 ret = -EFAULT;
829 goto out;
830 }
831 if (len > MAX_RW_COUNT - ret) {
832 len = MAX_RW_COUNT - ret;
833 iov[seg].iov_len = len;
834 }
835 ret += len;
836 }
837 out:
838 *ret_pointer = iov;
839 return ret;
840 }
841
#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of rw_copy_check_uvector(): reads an array of
 * struct compat_iovec from userspace element by element and converts it
 * into native struct iovec entries.
 *
 * *@ret_pointer is set to @fast_pointer on entry and replaced by the
 * kmalloc'ed array once that allocation succeeded; it therefore still
 * points at @fast_pointer when the allocation fails.
 *
 * Returns the total byte count (capped at MAX_RW_COUNT), 0 for
 * @nr_segs == 0, -EINVAL for too many segments or a negative length,
 * -ENOMEM on allocation failure, or -EFAULT when the array or one of its
 * buffers is not accessible.  The buffer check is only done when
 * @type >= 0.
 */
ssize_t compat_rw_copy_check_uvector(int type,
		const struct compat_iovec __user *uvector, unsigned long nr_segs,
		unsigned long fast_segs, struct iovec *fast_pointer,
		struct iovec **ret_pointer)
{
	struct iovec *iov = fast_pointer;
	compat_ssize_t tot_len = 0;
	int seg;

	*ret_pointer = fast_pointer;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return 0;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL)
			return -ENOMEM;
	}
	*ret_pointer = iov;

	if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
		return -EFAULT;

	/*
	 * Single unix specification:
	 * We should -EINVAL if an element length is not >= 0 and fitting an
	 * ssize_t.
	 *
	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
	 * no overflow possibility.
	 */
	for (seg = 0; seg < nr_segs; seg++, uvector++, iov++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		if (__get_user(len, &uvector->iov_len) ||
		    __get_user(buf, &uvector->iov_base))
			return -EFAULT;

		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
			return -EINVAL;

		if (type >= 0 && !access_ok(compat_ptr(buf), len))
			return -EFAULT;

		/* Trim this segment so the total stays within MAX_RW_COUNT. */
		if (len > MAX_RW_COUNT - tot_len)
			len = MAX_RW_COUNT - tot_len;
		tot_len += len;

		iov->iov_base = compat_ptr(buf);
		iov->iov_len = (compat_size_t) len;
	}

	return tot_len;
}
#endif
916
do_iter_read(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)917 static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
918 loff_t *pos, rwf_t flags)
919 {
920 size_t tot_len;
921 ssize_t ret = 0;
922
923 if (!(file->f_mode & FMODE_READ))
924 return -EBADF;
925 if (!(file->f_mode & FMODE_CAN_READ))
926 return -EINVAL;
927
928 tot_len = iov_iter_count(iter);
929 if (!tot_len)
930 goto out;
931 ret = rw_verify_area(READ, file, pos, tot_len);
932 if (ret < 0)
933 return ret;
934
935 if (file->f_op->read_iter)
936 ret = do_iter_readv_writev(file, iter, pos, READ, flags);
937 else
938 ret = do_loop_readv_writev(file, iter, pos, READ, flags);
939 out:
940 if (ret >= 0)
941 fsnotify_access(file);
942 return ret;
943 }
944
vfs_iter_read(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)945 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
946 rwf_t flags)
947 {
948 if (!file->f_op->read_iter)
949 return -EINVAL;
950 return do_iter_read(file, iter, ppos, flags);
951 }
952 EXPORT_SYMBOL(vfs_iter_read);
953
do_iter_write(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)954 static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
955 loff_t *pos, rwf_t flags)
956 {
957 size_t tot_len;
958 ssize_t ret = 0;
959
960 if (!(file->f_mode & FMODE_WRITE))
961 return -EBADF;
962 if (!(file->f_mode & FMODE_CAN_WRITE))
963 return -EINVAL;
964
965 tot_len = iov_iter_count(iter);
966 if (!tot_len)
967 return 0;
968 ret = rw_verify_area(WRITE, file, pos, tot_len);
969 if (ret < 0)
970 return ret;
971
972 if (file->f_op->write_iter)
973 ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
974 else
975 ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
976 if (ret > 0)
977 fsnotify_modify(file);
978 return ret;
979 }
980
vfs_iter_write(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)981 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
982 rwf_t flags)
983 {
984 if (!file->f_op->write_iter)
985 return -EINVAL;
986 return do_iter_write(file, iter, ppos, flags);
987 }
988 EXPORT_SYMBOL(vfs_iter_write);
989
vfs_readv(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)990 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
991 unsigned long vlen, loff_t *pos, rwf_t flags)
992 {
993 struct iovec iovstack[UIO_FASTIOV];
994 struct iovec *iov = iovstack;
995 struct iov_iter iter;
996 ssize_t ret;
997
998 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
999 if (ret >= 0) {
1000 ret = do_iter_read(file, &iter, pos, flags);
1001 kfree(iov);
1002 }
1003
1004 return ret;
1005 }
1006
vfs_writev(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)1007 static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
1008 unsigned long vlen, loff_t *pos, rwf_t flags)
1009 {
1010 struct iovec iovstack[UIO_FASTIOV];
1011 struct iovec *iov = iovstack;
1012 struct iov_iter iter;
1013 ssize_t ret;
1014
1015 ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1016 if (ret >= 0) {
1017 file_start_write(file);
1018 ret = do_iter_write(file, &iter, pos, flags);
1019 file_end_write(file);
1020 kfree(iov);
1021 }
1022 return ret;
1023 }
1024
do_readv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)1025 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1026 unsigned long vlen, rwf_t flags)
1027 {
1028 struct fd f = fdget_pos(fd);
1029 ssize_t ret = -EBADF;
1030
1031 if (f.file) {
1032 loff_t pos, *ppos = file_ppos(f.file);
1033 if (ppos) {
1034 pos = *ppos;
1035 ppos = &pos;
1036 }
1037 ret = vfs_readv(f.file, vec, vlen, ppos, flags);
1038 if (ret >= 0 && ppos)
1039 f.file->f_pos = pos;
1040 fdput_pos(f);
1041 }
1042
1043 if (ret > 0)
1044 add_rchar(current, ret);
1045 inc_syscr(current);
1046 return ret;
1047 }
1048
do_writev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)1049 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1050 unsigned long vlen, rwf_t flags)
1051 {
1052 struct fd f = fdget_pos(fd);
1053 ssize_t ret = -EBADF;
1054
1055 if (f.file) {
1056 loff_t pos, *ppos = file_ppos(f.file);
1057 if (ppos) {
1058 pos = *ppos;
1059 ppos = &pos;
1060 }
1061 ret = vfs_writev(f.file, vec, vlen, ppos, flags);
1062 if (ret >= 0 && ppos)
1063 f.file->f_pos = pos;
1064 fdput_pos(f);
1065 }
1066
1067 if (ret > 0)
1068 add_wchar(current, ret);
1069 inc_syscw(current);
1070 return ret;
1071 }
1072
pos_from_hilo(unsigned long high,unsigned long low)1073 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1074 {
1075 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
1076 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1077 }
1078
do_preadv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)1079 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1080 unsigned long vlen, loff_t pos, rwf_t flags)
1081 {
1082 struct fd f;
1083 ssize_t ret = -EBADF;
1084
1085 if (pos < 0)
1086 return -EINVAL;
1087
1088 f = fdget(fd);
1089 if (f.file) {
1090 ret = -ESPIPE;
1091 if (f.file->f_mode & FMODE_PREAD)
1092 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1093 fdput(f);
1094 }
1095
1096 if (ret > 0)
1097 add_rchar(current, ret);
1098 inc_syscr(current);
1099 return ret;
1100 }
1101
do_pwritev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)1102 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1103 unsigned long vlen, loff_t pos, rwf_t flags)
1104 {
1105 struct fd f;
1106 ssize_t ret = -EBADF;
1107
1108 if (pos < 0)
1109 return -EINVAL;
1110
1111 f = fdget(fd);
1112 if (f.file) {
1113 ret = -ESPIPE;
1114 if (f.file->f_mode & FMODE_PWRITE)
1115 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1116 fdput(f);
1117 }
1118
1119 if (ret > 0)
1120 add_wchar(current, ret);
1121 inc_syscw(current);
1122 return ret;
1123 }
1124
/* readv: vectored read via do_readv() with no per-call flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const rwf_t no_flags = 0;

	return do_readv(fd, vec, vlen, no_flags);
}
1130
/* writev: vectored write via do_writev() with no per-call flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const rwf_t no_flags = 0;

	return do_writev(fd, vec, vlen, no_flags);
}
1136
/*
 * preadv: vectored read at the offset assembled from @pos_h/@pos_l,
 * with no per-call flags.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1144
/*
 * preadv2: like preadv but with caller-supplied @flags.  An assembled
 * offset of -1 selects do_readv(), i.e. the descriptor's own position.
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t offset = pos_from_hilo(pos_h, pos_l);

	if (offset != -1)
		return do_preadv(fd, vec, vlen, offset, flags);

	return do_readv(fd, vec, vlen, flags);
}
1156
/*
 * pwritev: vectored write at the offset assembled from @pos_h/@pos_l,
 * with no per-call flags.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1164
/*
 * pwritev2: like pwritev but with caller-supplied @flags.  An assembled
 * offset of -1 selects do_writev(), i.e. the descriptor's own position.
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t offset = pos_from_hilo(pos_h, pos_l);

	if (offset != -1)
		return do_pwritev(fd, vec, vlen, offset, flags);

	return do_writev(fd, vec, vlen, flags);
}
1176
1177 #ifdef CONFIG_COMPAT
compat_readv(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)1178 static size_t compat_readv(struct file *file,
1179 const struct compat_iovec __user *vec,
1180 unsigned long vlen, loff_t *pos, rwf_t flags)
1181 {
1182 struct iovec iovstack[UIO_FASTIOV];
1183 struct iovec *iov = iovstack;
1184 struct iov_iter iter;
1185 ssize_t ret;
1186
1187 ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
1188 if (ret >= 0) {
1189 ret = do_iter_read(file, &iter, pos, flags);
1190 kfree(iov);
1191 }
1192 if (ret > 0)
1193 add_rchar(current, ret);
1194 inc_syscr(current);
1195 return ret;
1196 }
1197
do_compat_readv(compat_ulong_t fd,const struct compat_iovec __user * vec,compat_ulong_t vlen,rwf_t flags)1198 static size_t do_compat_readv(compat_ulong_t fd,
1199 const struct compat_iovec __user *vec,
1200 compat_ulong_t vlen, rwf_t flags)
1201 {
1202 struct fd f = fdget_pos(fd);
1203 ssize_t ret;
1204 loff_t pos;
1205
1206 if (!f.file)
1207 return -EBADF;
1208 pos = f.file->f_pos;
1209 ret = compat_readv(f.file, vec, vlen, &pos, flags);
1210 if (ret >= 0)
1211 f.file->f_pos = pos;
1212 fdput_pos(f);
1213 return ret;
1214
1215 }
1216
/* compat readv: vectored read via do_compat_readv() with no per-call flags. */
COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen)
{
	const rwf_t no_flags = 0;

	return do_compat_readv(fd, vec, vlen, no_flags);
}
1223
/*
 * do_compat_preadv64 - compat vectored read at an explicit 64-bit offset
 * @fd:    file descriptor to read from
 * @vec:   user pointer to the compat iovec array
 * @vlen:  number of entries in @vec
 * @pos:   offset to read at; must not be negative
 * @flags: per-call flags handed to compat_readv()
 *
 * Returns -EINVAL for a negative @pos, -EBADF if @fd has no file, -ESPIPE
 * if the file lacks FMODE_PREAD, else the compat_readv() result.
 */
static long do_compat_preadv64(unsigned long fd,
				  const struct compat_iovec __user *vec,
				  unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t nread;
	struct fd f;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PREAD)
		nread = compat_readv(f.file, vec, vlen, &pos, flags);
	else
		nread = -ESPIPE;

	fdput(f);
	return nread;
}
1242
1243 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* compat preadv64: offset arrives as one loff_t; no per-call flags. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_preadv64(fd, vec, vlen, pos, no_flags);
}
1250 #endif
1251
/*
 * compat preadv: the 64-bit offset arrives as two u32 halves, which are
 * joined (high half in the upper 32 bits) before the read; no per-call flags.
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t offset = (loff_t)pos_high << 32;

	offset |= pos_low;
	return do_compat_preadv64(fd, vec, vlen, offset, 0);
}
1260
1261 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * compat preadv64v2: offset arrives as one loff_t.  An offset of -1 selects
 * do_compat_readv(), i.e. the descriptor's own position.
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_preadv64(fd, vec, vlen, pos, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
1271 #endif
1272
/*
 * compat preadv2: the offset arrives as two u32 halves.  A joined offset of
 * -1 selects do_compat_readv(), i.e. the descriptor's own position.
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		rwf_t, flags)
{
	loff_t offset = (loff_t)pos_high << 32;

	offset |= pos_low;
	if (offset != -1)
		return do_compat_preadv64(fd, vec, vlen, offset, flags);

	return do_compat_readv(fd, vec, vlen, flags);
}
1285
compat_writev(struct file * file,const struct compat_iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)1286 static size_t compat_writev(struct file *file,
1287 const struct compat_iovec __user *vec,
1288 unsigned long vlen, loff_t *pos, rwf_t flags)
1289 {
1290 struct iovec iovstack[UIO_FASTIOV];
1291 struct iovec *iov = iovstack;
1292 struct iov_iter iter;
1293 ssize_t ret;
1294
1295 ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
1296 if (ret >= 0) {
1297 file_start_write(file);
1298 ret = do_iter_write(file, &iter, pos, flags);
1299 file_end_write(file);
1300 kfree(iov);
1301 }
1302 if (ret > 0)
1303 add_wchar(current, ret);
1304 inc_syscw(current);
1305 return ret;
1306 }
1307
do_compat_writev(compat_ulong_t fd,const struct compat_iovec __user * vec,compat_ulong_t vlen,rwf_t flags)1308 static size_t do_compat_writev(compat_ulong_t fd,
1309 const struct compat_iovec __user* vec,
1310 compat_ulong_t vlen, rwf_t flags)
1311 {
1312 struct fd f = fdget_pos(fd);
1313 ssize_t ret;
1314 loff_t pos;
1315
1316 if (!f.file)
1317 return -EBADF;
1318 pos = f.file->f_pos;
1319 ret = compat_writev(f.file, vec, vlen, &pos, flags);
1320 if (ret >= 0)
1321 f.file->f_pos = pos;
1322 fdput_pos(f);
1323 return ret;
1324 }
1325
/* compat writev: vectored write via do_compat_writev() with no per-call flags. */
COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
		const struct compat_iovec __user *, vec,
		compat_ulong_t, vlen)
{
	const rwf_t no_flags = 0;

	return do_compat_writev(fd, vec, vlen, no_flags);
}
1332
/*
 * do_compat_pwritev64 - compat vectored write at an explicit 64-bit offset
 * @fd:    file descriptor to write to
 * @vec:   user pointer to the compat iovec array
 * @vlen:  number of entries in @vec
 * @pos:   offset to write at; must not be negative
 * @flags: per-call flags handed to compat_writev()
 *
 * Returns -EINVAL for a negative @pos, -EBADF if @fd has no file, -ESPIPE
 * if the file lacks FMODE_PWRITE, else the compat_writev() result.
 */
static long do_compat_pwritev64(unsigned long fd,
				   const struct compat_iovec __user *vec,
				   unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t nwritten;
	struct fd f;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	if (f.file->f_mode & FMODE_PWRITE)
		nwritten = compat_writev(f.file, vec, vlen, &pos, flags);
	else
		nwritten = -ESPIPE;

	fdput(f);
	return nwritten;
}
1351
1352 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* compat pwritev64: offset arrives as one loff_t; no per-call flags. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_compat_pwritev64(fd, vec, vlen, pos, no_flags);
}
1359 #endif
1360
/*
 * compat pwritev: the 64-bit offset arrives as two u32 halves, which are
 * joined (high half in the upper 32 bits) before the write; no per-call flags.
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t offset = (loff_t)pos_high << 32;

	offset |= pos_low;
	return do_compat_pwritev64(fd, vec, vlen, offset, 0);
}
1369
1370 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * compat pwritev64v2: offset arrives as one loff_t.  An offset of -1 selects
 * do_compat_writev(), i.e. the descriptor's own position.
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos != -1)
		return do_compat_pwritev64(fd, vec, vlen, pos, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
1380 #endif
1381
/*
 * compat pwritev2: the offset arrives as two u32 halves.  A joined offset of
 * -1 selects do_compat_writev(), i.e. the descriptor's own position.
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct compat_iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
	loff_t offset = (loff_t)pos_high << 32;

	offset |= pos_low;
	if (offset != -1)
		return do_compat_pwritev64(fd, vec, vlen, offset, flags);

	return do_compat_writev(fd, vec, vlen, flags);
}
1393
1394 #endif
1395
/*
 * do_sendfile - common worker behind the sendfile system calls
 * @out_fd: descriptor to write to; its file must have FMODE_WRITE
 * @in_fd:  descriptor to read from; its file must have FMODE_READ
 * @ppos:   if non-NULL, the read offset is taken from *@ppos (and the input
 *          file must have FMODE_PREAD); if NULL, the input file's f_pos is used
 * @count:  number of bytes requested; clamped to MAX_RW_COUNT
 * @max:    upper bound on the input offset; 0 means "use the smaller
 *          s_maxbytes of the two files' superblocks"
 *
 * The data is moved with do_splice_direct().  The output always starts at
 * the output file's f_pos.  Positions (*@ppos or the input f_pos, and the
 * output f_pos) are only stored back when at least one byte was transferred.
 *
 * Returns the do_splice_direct() result, or a negative errno:
 * -EBADF for a missing file or wrong open mode, -ESPIPE when @ppos is given
 * but the input lacks FMODE_PREAD, whatever rw_verify_area() rejects, and
 * -EOVERFLOW when the input offset is at/beyond @max on entry or ends up
 * above @max afterwards.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		/* An explicit offset needs a file that supports positional reads. */
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	out_pos = out.file->f_pos;
	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
	if (retval < 0)
		goto fput_out;

	/* No caller-imposed limit: fall back to the tighter filesystem limit. */
	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	/*
	 * Trim the request so the input offset cannot run past max; fail
	 * outright if it is already at or beyond the limit.
	 */
	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	file_start_write(out.file);
	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
	file_end_write(out.file);

	/* Accounting, notifications and position updates only on progress. */
	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	/* Overrides the splice result if the input offset ended up above max. */
	if (pos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}
1494
/*
 * sendfile with an off_t-sized user offset.
 *
 * With a NULL @offset the input file's own position is used and no limit is
 * passed (max == 0).  Otherwise the offset is fetched from user space, the
 * transfer is limited to MAX_NON_LFS, and the resulting offset is stored
 * back to user space whatever do_sendfile() returned.  A failing user
 * access yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t user_off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(user_off, offset)))
		return -EFAULT;

	pos = user_off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return ret;
}
1513
/*
 * sendfile with a loff_t-sized user offset.
 *
 * With a NULL @offset the input file's own position is used.  Otherwise the
 * offset is copied in from user space and the resulting offset is stored
 * back whatever do_sendfile() returned.  No explicit limit is passed in
 * either case (max == 0).  A failing user access yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return ret;
}
1530
1531 #ifdef CONFIG_COMPAT
/*
 * compat sendfile with a compat_off_t-sized user offset.
 *
 * With a NULL @offset the input file's own position is used and no limit is
 * passed (max == 0).  Otherwise the offset is fetched from user space, the
 * transfer is limited to MAX_NON_LFS, and the resulting offset is stored
 * back to user space whatever do_sendfile() returned.  A failing user
 * access yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	off_t user_off;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(user_off, offset)))
		return -EFAULT;

	pos = user_off;
	ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return ret;
}
1551
/*
 * compat sendfile with a compat_loff_t-sized user offset.
 *
 * With a NULL @offset the input file's own position is used.  Otherwise the
 * offset is copied in from user space and the resulting offset is stored
 * back whatever do_sendfile() returned.  No explicit limit is passed in
 * either case (max == 0).  A failing user access yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return ret;
}
1569 #endif
1570
1571 /**
1572 * generic_copy_file_range - copy data between two files
1573 * @file_in: file structure to read from
1574 * @pos_in: file offset to read from
1575 * @file_out: file structure to write data to
1576 * @pos_out: file offset to write data to
1577 * @len: amount of data to copy
1578 * @flags: copy flags
1579 *
1580 * This is a generic filesystem helper to copy data from one file to another.
1581 * It has no constraints on the source or destination file owners - the files
1582 * can belong to different superblocks and different filesystem types. Short
1583 * copies are allowed.
1584 *
1585 * This should be called from the @file_out filesystem, as per the
1586 * ->copy_file_range() method.
1587 *
1588 * Returns the number of bytes copied or a negative error indicating the
1589 * failure.
1590 */
1591
generic_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1592 ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1593 struct file *file_out, loff_t pos_out,
1594 size_t len, unsigned int flags)
1595 {
1596 return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1597 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1598 }
1599 EXPORT_SYMBOL(generic_copy_file_range);
1600
do_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1601 static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1602 struct file *file_out, loff_t pos_out,
1603 size_t len, unsigned int flags)
1604 {
1605 /*
1606 * Although we now allow filesystems to handle cross sb copy, passing
1607 * a file of the wrong filesystem type to filesystem driver can result
1608 * in an attempt to dereference the wrong type of ->private_data, so
1609 * avoid doing that until we really have a good reason. NFS defines
1610 * several different file_system_type structures, but they all end up
1611 * using the same ->copy_file_range() function pointer.
1612 */
1613 if (file_out->f_op->copy_file_range &&
1614 file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
1615 return file_out->f_op->copy_file_range(file_in, pos_in,
1616 file_out, pos_out,
1617 len, flags);
1618
1619 return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1620 flags);
1621 }
1622
1623 /*
1624 * copy_file_range() differs from regular file read and write in that it
1625 * specifically allows return partial success. When it does so is up to
1626 * the copy_file_range method.
1627 */
vfs_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1628 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1629 struct file *file_out, loff_t pos_out,
1630 size_t len, unsigned int flags)
1631 {
1632 ssize_t ret;
1633
1634 if (flags != 0)
1635 return -EINVAL;
1636
1637 ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1638 flags);
1639 if (unlikely(ret))
1640 return ret;
1641
1642 ret = rw_verify_area(READ, file_in, &pos_in, len);
1643 if (unlikely(ret))
1644 return ret;
1645
1646 ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1647 if (unlikely(ret))
1648 return ret;
1649
1650 if (len == 0)
1651 return 0;
1652
1653 file_start_write(file_out);
1654
1655 /*
1656 * Try cloning first, this is supported by more file systems, and
1657 * more efficient if both clone and copy are supported (e.g. NFS).
1658 */
1659 if (file_in->f_op->remap_file_range &&
1660 file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1661 loff_t cloned;
1662
1663 cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1664 file_out, pos_out,
1665 min_t(loff_t, MAX_RW_COUNT, len),
1666 REMAP_FILE_CAN_SHORTEN);
1667 if (cloned > 0) {
1668 ret = cloned;
1669 goto done;
1670 }
1671 }
1672
1673 ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1674 flags);
1675 WARN_ON_ONCE(ret == -EOPNOTSUPP);
1676 done:
1677 if (ret > 0) {
1678 fsnotify_access(file_in);
1679 add_rchar(current, ret);
1680 fsnotify_modify(file_out);
1681 add_wchar(current, ret);
1682 }
1683
1684 inc_syscr(current);
1685 inc_syscw(current);
1686
1687 file_end_write(file_out);
1688
1689 return ret;
1690 }
1691 EXPORT_SYMBOL(vfs_copy_file_range);
1692
/*
 * copy_file_range system call.
 *
 * For each side, a non-NULL user offset pointer supplies the starting
 * position and receives the advanced position afterwards; a NULL pointer
 * means the file's own f_pos is used and advanced instead.  Positions are
 * only advanced when vfs_copy_file_range() copied at least one byte.
 *
 * Returns the vfs_copy_file_range() result, -EBADF if either descriptor has
 * no file, or -EFAULT if a user offset cannot be read or written.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd src = fdget(fd_in);
	struct fd dst;
	loff_t src_pos;
	loff_t dst_pos;
	ssize_t ret;

	if (!src.file)
		return -EBADF;

	dst = fdget(fd_out);
	if (!dst.file) {
		ret = -EBADF;
		goto put_src;
	}

	/* Any failure while fetching the starting offsets is -EFAULT. */
	ret = -EFAULT;
	if (!off_in)
		src_pos = src.file->f_pos;
	else if (copy_from_user(&src_pos, off_in, sizeof(loff_t)))
		goto put_dst;

	if (!off_out)
		dst_pos = dst.file->f_pos;
	else if (copy_from_user(&dst_pos, off_out, sizeof(loff_t)))
		goto put_dst;

	ret = vfs_copy_file_range(src.file, src_pos, dst.file, dst_pos, len,
				  flags);
	if (ret <= 0)
		goto put_dst;

	src_pos += ret;
	dst_pos += ret;

	if (!off_in)
		src.file->f_pos = src_pos;
	else if (copy_to_user(off_in, &src_pos, sizeof(loff_t)))
		ret = -EFAULT;

	if (!off_out)
		dst.file->f_pos = dst_pos;
	else if (copy_to_user(off_out, &dst_pos, sizeof(loff_t)))
		ret = -EFAULT;

put_dst:
	fdput(dst);
put_src:
	fdput(src);
	return ret;
}
1754
/*
 * Validate one side of a remap request.
 *
 * Rejects a negative @pos or @len, or a @pos + @len that wraps negative,
 * with -EINVAL.  If the inode has a lock context and mandatory locking
 * applies, the range is checked with locks_mandatory_area(); a zero @len
 * is treated as "up to OFFSET_MAX".  Finally the security hook decides,
 * with MAY_WRITE or MAY_READ according to @write.
 */
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
			     bool write)
{
	struct inode *inode = file_inode(file);
	int perm_mask = write ? MAY_WRITE : MAY_READ;

	if (unlikely(pos < 0 || len < 0 || (loff_t) (pos + len) < 0))
		return -EINVAL;

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		loff_t last;
		int err;

		if (len)
			last = pos + len - 1;
		else
			last = OFFSET_MAX;

		err = locks_mandatory_area(inode, file, pos, last,
					   write ? F_WRLCK : F_RDLCK);
		if (err < 0)
			return err;
	}

	return security_file_permission(file, perm_mask);
}
1778 /*
1779 * Ensure that we don't remap a partial EOF block in the middle of something
1780 * else. Assume that the offsets have already been checked for block
1781 * alignment.
1782 *
1783 * For deduplication we always scale down to the previous block because we
1784 * can't meaningfully compare post-EOF contents.
1785 *
1786 * For clone we only link a partial EOF block above the destination file's EOF.
1787 *
1788 * Shorten the request if possible.
1789 */
generic_remap_check_len(struct inode * inode_in,struct inode * inode_out,loff_t pos_out,loff_t * len,unsigned int remap_flags)1790 static int generic_remap_check_len(struct inode *inode_in,
1791 struct inode *inode_out,
1792 loff_t pos_out,
1793 loff_t *len,
1794 unsigned int remap_flags)
1795 {
1796 u64 blkmask = i_blocksize(inode_in) - 1;
1797 loff_t new_len = *len;
1798
1799 if ((*len & blkmask) == 0)
1800 return 0;
1801
1802 if ((remap_flags & REMAP_FILE_DEDUP) ||
1803 pos_out + *len < i_size_read(inode_out))
1804 new_len &= ~blkmask;
1805
1806 if (new_len == *len)
1807 return 0;
1808
1809 if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
1810 *len = new_len;
1811 return 0;
1812 }
1813
1814 return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
1815 }
1816
1817 /* Read a page's worth of file data into the page cache. */
vfs_dedupe_get_page(struct inode * inode,loff_t offset)1818 static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1819 {
1820 struct page *page;
1821
1822 page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
1823 if (IS_ERR(page))
1824 return page;
1825 if (!PageUptodate(page)) {
1826 put_page(page);
1827 return ERR_PTR(-EIO);
1828 }
1829 return page;
1830 }
1831
1832 /*
1833 * Lock two pages, ensuring that we lock in offset order if the pages are from
1834 * the same file.
1835 */
vfs_lock_two_pages(struct page * page1,struct page * page2)1836 static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1837 {
1838 /* Always lock in order of increasing index. */
1839 if (page1->index > page2->index)
1840 swap(page1, page2);
1841
1842 lock_page(page1);
1843 if (page1 != page2)
1844 lock_page(page2);
1845 }
1846
1847 /* Unlock two pages, being careful not to unlock the same page twice. */
vfs_unlock_two_pages(struct page * page1,struct page * page2)1848 static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
1849 {
1850 unlock_page(page1);
1851 if (page1 != page2)
1852 unlock_page(page2);
1853 }
1854
/*
 * Compare extents of two files to see if they are the same.
 * Caller must have locked both inodes to prevent write races.
 *
 * @src/@srcoff and @dest/@destoff name the two ranges, @len their common
 * length.  The ranges are compared piecewise; each piece is limited so that
 * it stays within one page on both sides.
 *
 * On success returns 0 and stores the verdict in *@is_same; the comparison
 * stops at the first differing piece.  On failure returns a negative errno
 * and leaves *@is_same untouched: the error from fetching a page, or
 * -EINVAL if a piece length works out to be non-positive.
 */
static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
					 struct inode *dest, loff_t destoff,
					 loff_t len, bool *is_same)
{
	loff_t src_poff;
	loff_t dest_poff;
	void *src_addr;
	void *dest_addr;
	struct page *src_page;
	struct page *dest_page;
	loff_t cmp_len;
	bool same;
	int error;

	error = -EINVAL;
	same = true;
	while (len) {
		/* Offsets within the respective pages. */
		src_poff = srcoff & (PAGE_SIZE - 1);
		dest_poff = destoff & (PAGE_SIZE - 1);
		/* Compare no further than the nearer page end, nor past len. */
		cmp_len = min(PAGE_SIZE - src_poff,
			      PAGE_SIZE - dest_poff);
		cmp_len = min(cmp_len, len);
		if (cmp_len <= 0)
			goto out_error;

		src_page = vfs_dedupe_get_page(src, srcoff);
		if (IS_ERR(src_page)) {
			error = PTR_ERR(src_page);
			goto out_error;
		}
		dest_page = vfs_dedupe_get_page(dest, destoff);
		if (IS_ERR(dest_page)) {
			error = PTR_ERR(dest_page);
			/* Drop the reference already held on the source page. */
			put_page(src_page);
			goto out_error;
		}

		vfs_lock_two_pages(src_page, dest_page);

		/*
		 * Now that we've locked both pages, make sure they're still
		 * mapped to the file data we're interested in.  If not,
		 * someone is invalidating pages on us and we lose.
		 */
		if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
		    src_page->mapping != src->i_mapping ||
		    dest_page->mapping != dest->i_mapping) {
			/* Reported as "not the same", not as an error. */
			same = false;
			goto unlock;
		}

		src_addr = kmap_atomic(src_page);
		dest_addr = kmap_atomic(dest_page);

		flush_dcache_page(src_page);
		flush_dcache_page(dest_page);

		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
			same = false;

		/* Unmap in the reverse order of mapping. */
		kunmap_atomic(dest_addr);
		kunmap_atomic(src_addr);
unlock:
		vfs_unlock_two_pages(src_page, dest_page);
		put_page(dest_page);
		put_page(src_page);

		if (!same)
			break;

		srcoff += cmp_len;
		destoff += cmp_len;
		len -= cmp_len;
	}

	*is_same = same;
	return 0;

out_error:
	return error;
}
1940
1941 /*
1942 * Check that the two inodes are eligible for cloning, the ranges make
1943 * sense, and then flush all dirty data. Caller must ensure that the
1944 * inodes have been locked against any other modifications.
1945 *
1946 * If there's an error, then the usual negative error code is returned.
1947 * Otherwise returns 0 with *len set to the request length.
1948 */
generic_remap_file_range_prep(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,loff_t * len,unsigned int remap_flags)1949 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
1950 struct file *file_out, loff_t pos_out,
1951 loff_t *len, unsigned int remap_flags)
1952 {
1953 struct inode *inode_in = file_inode(file_in);
1954 struct inode *inode_out = file_inode(file_out);
1955 bool same_inode = (inode_in == inode_out);
1956 int ret;
1957
1958 /* Don't touch certain kinds of inodes */
1959 if (IS_IMMUTABLE(inode_out))
1960 return -EPERM;
1961
1962 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1963 return -ETXTBSY;
1964
1965 /* Don't reflink dirs, pipes, sockets... */
1966 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1967 return -EISDIR;
1968 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1969 return -EINVAL;
1970
1971 /* Zero length dedupe exits immediately; reflink goes to EOF. */
1972 if (*len == 0) {
1973 loff_t isize = i_size_read(inode_in);
1974
1975 if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
1976 return 0;
1977 if (pos_in > isize)
1978 return -EINVAL;
1979 *len = isize - pos_in;
1980 if (*len == 0)
1981 return 0;
1982 }
1983
1984 /* Check that we don't violate system file offset limits. */
1985 ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
1986 remap_flags);
1987 if (ret)
1988 return ret;
1989
1990 /* Wait for the completion of any pending IOs on both files */
1991 inode_dio_wait(inode_in);
1992 if (!same_inode)
1993 inode_dio_wait(inode_out);
1994
1995 ret = filemap_write_and_wait_range(inode_in->i_mapping,
1996 pos_in, pos_in + *len - 1);
1997 if (ret)
1998 return ret;
1999
2000 ret = filemap_write_and_wait_range(inode_out->i_mapping,
2001 pos_out, pos_out + *len - 1);
2002 if (ret)
2003 return ret;
2004
2005 /*
2006 * Check that the extents are the same.
2007 */
2008 if (remap_flags & REMAP_FILE_DEDUP) {
2009 bool is_same = false;
2010
2011 ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
2012 inode_out, pos_out, *len, &is_same);
2013 if (ret)
2014 return ret;
2015 if (!is_same)
2016 return -EBADE;
2017 }
2018
2019 ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
2020 remap_flags);
2021 if (ret)
2022 return ret;
2023
2024 /* If can't alter the file contents, we're done. */
2025 if (!(remap_flags & REMAP_FILE_DEDUP))
2026 ret = file_modified(file_out);
2027
2028 return ret;
2029 }
2030 EXPORT_SYMBOL(generic_remap_file_range_prep);
2031
/*
 * do_clone_file_range - clone a range from @file_in into @file_out
 *
 * Requires both files to be on the same superblock (-EXDEV otherwise) and
 * @file_in to provide ->remap_file_range() (-EOPNOTSUPP otherwise).  Both
 * ranges are validated with remap_verify_area() before the method is
 * called.  REMAP_FILE_DEDUP is not expected in @remap_flags and triggers a
 * one-time warning.
 *
 * Returns the ->remap_file_range() result; fsnotify events are raised when
 * that result is not negative.
 */
loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
			   struct file *file_out, loff_t pos_out,
			   loff_t len, unsigned int remap_flags)
{
	loff_t rc;

	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);

	/*
	 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
	 * the same mount. Practically, they only need to be on the same file
	 * system.
	 */
	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
		return -EXDEV;

	rc = generic_file_rw_checks(file_in, file_out);
	if (rc < 0)
		return rc;

	if (!file_in->f_op->remap_file_range)
		return -EOPNOTSUPP;

	rc = remap_verify_area(file_in, pos_in, len, false);
	if (rc)
		return rc;

	rc = remap_verify_area(file_out, pos_out, len, true);
	if (rc)
		return rc;

	rc = file_in->f_op->remap_file_range(file_in, pos_in,
					     file_out, pos_out, len,
					     remap_flags);
	if (rc >= 0) {
		fsnotify_access(file_in);
		fsnotify_modify(file_out);
	}
	return rc;
}
EXPORT_SYMBOL(do_clone_file_range);
2073
/*
 * vfs_clone_file_range - do_clone_file_range() bracketed by
 * file_start_write()/file_end_write() on the destination file.
 *
 * Returns whatever do_clone_file_range() returns.
 */
loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
			    struct file *file_out, loff_t pos_out,
			    loff_t len, unsigned int remap_flags)
{
	loff_t cloned;

	file_start_write(file_out);
	cloned = do_clone_file_range(file_in, pos_in, file_out, pos_out,
				     len, remap_flags);
	file_end_write(file_out);

	return cloned;
}
EXPORT_SYMBOL(vfs_clone_file_range);
2088
2089 /* Check whether we are allowed to dedupe the destination file */
allow_file_dedupe(struct file * file)2090 static bool allow_file_dedupe(struct file *file)
2091 {
2092 if (capable(CAP_SYS_ADMIN))
2093 return true;
2094 if (file->f_mode & FMODE_WRITE)
2095 return true;
2096 if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
2097 return true;
2098 if (!inode_permission(file_inode(file), MAY_WRITE))
2099 return true;
2100 return false;
2101 }
2102
/**
 * vfs_dedupe_file_range_one - dedupe one source range into one destination
 * @src_file:	file providing the data to compare against
 * @src_pos:	starting offset in @src_file
 * @dst_file:	file whose range is to be deduplicated
 * @dst_pos:	starting offset in @dst_file
 * @len:	number of bytes to dedupe
 * @remap_flags: only REMAP_FILE_DEDUP and REMAP_FILE_CAN_SHORTEN are
 *		expected; anything else triggers a one-time warning.
 *		REMAP_FILE_DEDUP is always OR-ed in before calling the
 *		filesystem.
 *
 * Takes mount write access on @dst_file for the duration of the call, then
 * verifies both the source range (for reading) and the destination range
 * (for writing) before any further checks.  Both ranges are verified here
 * because this is an exported helper: callers other than
 * vfs_dedupe_file_range() cannot be assumed to have validated
 * @src_pos/@len themselves.
 *
 * Return: the value returned by @dst_file's ->remap_file_range, 0 if @len
 * is zero, or a negative errno:
 *   the error from mnt_want_write_file() or remap_verify_area();
 *   -EPERM  if allow_file_dedupe() refuses @dst_file;
 *   -EXDEV  if the two files are on different mounts;
 *   -EISDIR if @dst_file is a directory;
 *   -EINVAL if @dst_file has no ->remap_file_range.
 */
loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
				 struct file *dst_file, loff_t dst_pos,
				 loff_t len, unsigned int remap_flags)
{
	loff_t ret;

	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
				     REMAP_FILE_CAN_SHORTEN));

	ret = mnt_want_write_file(dst_file);
	if (ret)
		return ret;

	/*
	 * Redundant when called from vfs_dedupe_file_range(), which has
	 * already verified the source range, but other callers of this
	 * exported helper need it and it is not performance sensitive.
	 */
	ret = remap_verify_area(src_file, src_pos, len, false);
	if (ret < 0)
		goto out_drop_write;

	ret = remap_verify_area(dst_file, dst_pos, len, true);
	if (ret < 0)
		goto out_drop_write;

	ret = -EPERM;
	if (!allow_file_dedupe(dst_file))
		goto out_drop_write;

	ret = -EXDEV;
	if (src_file->f_path.mnt != dst_file->f_path.mnt)
		goto out_drop_write;

	ret = -EISDIR;
	if (S_ISDIR(file_inode(dst_file)->i_mode))
		goto out_drop_write;

	ret = -EINVAL;
	if (!dst_file->f_op->remap_file_range)
		goto out_drop_write;

	/* Nothing to do for an empty range; report success. */
	if (len == 0) {
		ret = 0;
		goto out_drop_write;
	}

	ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
			dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
out_drop_write:
	mnt_drop_write_file(dst_file);

	return ret;
}
EXPORT_SYMBOL(vfs_dedupe_file_range_one);
2149
vfs_dedupe_file_range(struct file * file,struct file_dedupe_range * same)2150 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2151 {
2152 struct file_dedupe_range_info *info;
2153 struct inode *src = file_inode(file);
2154 u64 off;
2155 u64 len;
2156 int i;
2157 int ret;
2158 u16 count = same->dest_count;
2159 loff_t deduped;
2160
2161 if (!(file->f_mode & FMODE_READ))
2162 return -EINVAL;
2163
2164 if (same->reserved1 || same->reserved2)
2165 return -EINVAL;
2166
2167 off = same->src_offset;
2168 len = same->src_length;
2169
2170 if (S_ISDIR(src->i_mode))
2171 return -EISDIR;
2172
2173 if (!S_ISREG(src->i_mode))
2174 return -EINVAL;
2175
2176 if (!file->f_op->remap_file_range)
2177 return -EOPNOTSUPP;
2178
2179 ret = remap_verify_area(file, off, len, false);
2180 if (ret < 0)
2181 return ret;
2182 ret = 0;
2183
2184 if (off + len > i_size_read(src))
2185 return -EINVAL;
2186
2187 /* Arbitrary 1G limit on a single dedupe request, can be raised. */
2188 len = min_t(u64, len, 1 << 30);
2189
2190 /* pre-format output fields to sane values */
2191 for (i = 0; i < count; i++) {
2192 same->info[i].bytes_deduped = 0ULL;
2193 same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2194 }
2195
2196 for (i = 0, info = same->info; i < count; i++, info++) {
2197 struct fd dst_fd = fdget(info->dest_fd);
2198 struct file *dst_file = dst_fd.file;
2199
2200 if (!dst_file) {
2201 info->status = -EBADF;
2202 goto next_loop;
2203 }
2204
2205 if (info->reserved) {
2206 info->status = -EINVAL;
2207 goto next_fdput;
2208 }
2209
2210 deduped = vfs_dedupe_file_range_one(file, off, dst_file,
2211 info->dest_offset, len,
2212 REMAP_FILE_CAN_SHORTEN);
2213 if (deduped == -EBADE)
2214 info->status = FILE_DEDUPE_RANGE_DIFFERS;
2215 else if (deduped < 0)
2216 info->status = deduped;
2217 else
2218 info->bytes_deduped = len;
2219
2220 next_fdput:
2221 fdput(dst_fd);
2222 next_loop:
2223 if (fatal_signal_pending(current))
2224 break;
2225 }
2226 return ret;
2227 }
2228 EXPORT_SYMBOL(vfs_dedupe_file_range);
2229