1 /*
2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4
5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING.
7 */
8
9 #include "fuse_i.h"
10
11 #include <linux/filter.h>
12 #include <linux/pagemap.h>
13 #include <linux/slab.h>
14 #include <linux/kernel.h>
15 #include <linux/sched.h>
16 #include <linux/sched/signal.h>
17 #include <linux/module.h>
18 #include <linux/splice.h>
19 #include <linux/swap.h>
20 #include <linux/falloc.h>
21 #include <linux/uio.h>
22 #include <linux/fs.h>
23 #include <linux/filelock.h>
24 #include <linux/splice.h>
25 #include <linux/task_io_accounting_ops.h>
26
/*
 * Send an open-type request (@opcode) for @nodeid and wait for the reply,
 * which is stored in @outargp.
 *
 * O_CREAT, O_EXCL and O_NOCTTY are never forwarded; O_TRUNC is forwarded
 * only when the connection has atomic_o_trunc set.
 *
 * Returns whatever fuse_simple_request() returns.
 */
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
			  unsigned int open_flags, int opcode,
			  struct fuse_open_out *outargp)
{
	struct fuse_conn *conn = fm->fc;
	unsigned int wire_flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
	struct fuse_open_in inarg;
	FUSE_ARGS(args);

	if (!conn->atomic_o_trunc)
		wire_flags &= ~O_TRUNC;

	memset(&inarg, 0, sizeof(inarg));
	inarg.flags = wire_flags;

	/*
	 * A truncating open by a caller lacking CAP_FSETID requests
	 * FUSE_OPEN_KILL_SUIDGID when handle_killpriv_v2 is enabled.
	 */
	if (conn->handle_killpriv_v2 && (wire_flags & O_TRUNC) &&
	    !capable(CAP_FSETID))
		inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;

	args.opcode = opcode;
	args.nodeid = nodeid;

	args.in_numargs = 1;
	args.in_args[0].value = &inarg;
	args.in_args[0].size = sizeof(inarg);

	args.out_numargs = 1;
	args.out_args[0].value = outargp;
	args.out_args[0].size = sizeof(*outargp);

	return fuse_simple_request(fm, &args);
}
55
/*
 * Allocate and initialise a fuse_file for mount @fm.
 *
 * When @release is true the ff->args buffer is allocated as well; otherwise
 * ff->args stays NULL.  The new object starts with a reference count of one
 * and a fresh kernel handle taken from the connection's khctr counter.
 *
 * Returns the new object, or NULL if an allocation failed.
 */
struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
{
	struct fuse_file *newff = kzalloc(sizeof(*newff), GFP_KERNEL_ACCOUNT);

	if (unlikely(!newff))
		return NULL;

	newff->fm = fm;
	if (release) {
		newff->args = kzalloc(sizeof(*newff->args),
				      GFP_KERNEL_ACCOUNT);
		if (!newff->args)
			goto err_free;
	}

	INIT_LIST_HEAD(&newff->write_entry);
	refcount_set(&newff->count, 1);
	RB_CLEAR_NODE(&newff->polled_node);
	init_waitqueue_head(&newff->poll_wait);

	newff->kh = atomic64_inc_return(&fm->fc->khctr);

	return newff;

err_free:
	kfree(newff);
	return NULL;
}
82
fuse_file_free(struct fuse_file * ff)83 void fuse_file_free(struct fuse_file *ff)
84 {
85 kfree(ff->args);
86 kfree(ff);
87 }
88
fuse_file_get(struct fuse_file * ff)89 static struct fuse_file *fuse_file_get(struct fuse_file *ff)
90 {
91 refcount_inc(&ff->count);
92 return ff;
93 }
94
fuse_release_end(struct fuse_mount * fm,struct fuse_args * args,int error)95 static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
96 int error)
97 {
98 struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
99
100 iput(ra->inode);
101 kfree(ra);
102 }
103
/*
 * Drop one reference on @ff.  When the last reference goes away the release
 * request prepared in ff->args (if any) is dispatched and @ff is freed.
 *
 * @inode: inode handed to the fuse-bpf backing hook (CONFIG_FUSE_BPF only)
 * @ff:    file to put
 * @sync:  send the release synchronously instead of in the background
 *
 * ff->args is NULL when the server does not implement 'open'; in that case
 * there is no release request and nothing for fuse_release_end() to free.
 */
static void fuse_file_put(struct inode *inode, struct fuse_file *ff, bool sync)
{
	struct fuse_release_args *ra;
	struct fuse_args *args;

	if (!refcount_dec_and_test(&ff->count))
		return;

	ra = &ff->args->release_args;
	args = (ra ? &ra->args : NULL);

#ifdef CONFIG_FUSE_BPF
	{
		struct fuse_err_ret fer;

		fer = fuse_bpf_backing(inode, struct fuse_release_in,
				       fuse_release_initialize,
				       fuse_release_backing,
				       fuse_release_finalize,
				       inode, ff);
		if (fer.ret) {
			/*
			 * The backing path handled the release.  Only run the
			 * completion when release args exist: with args ==
			 * NULL, fuse_release_end() would apply container_of()
			 * to NULL and then iput()/kfree() through it.
			 */
			if (args)
				fuse_release_end(ff->fm, args, 0);
			kfree(ff);
			return;
		}
	}
#endif

	if (ra && ra->inode)
		fuse_file_io_release(ff, ra->inode);

	if (!args) {
		/* Do nothing when server does not implement 'open' */
	} else if (sync) {
		fuse_simple_request(ff->fm, args);
		fuse_release_end(ff->fm, args, 0);
	} else {
		args->end = fuse_release_end;
		/* If queueing fails, complete the request ourselves */
		if (fuse_simple_background(ff->fm, args,
					   GFP_KERNEL | __GFP_NOFAIL))
			fuse_release_end(ff->fm, args, -ENOTCONN);
	}
	kfree(ff);
}
140
/*
 * Allocate a fuse_file for @nodeid and, unless the server is already known
 * not to implement it, send FUSE_OPEN (or FUSE_OPENDIR when @isdir).
 *
 * A reply of -ENOSYS is not an error: the connection is marked no_open /
 * no_opendir, the release args are dropped and the default open flags stay
 * in effect.  Any other failure frees the fuse_file.
 *
 * Returns the new fuse_file or an ERR_PTR().
 */
struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
				 unsigned int open_flags, bool isdir)
{
	struct fuse_conn *fc = fm->fc;
	struct fuse_open_out *reply;
	struct fuse_file *ff;
	bool send_open;
	int opcode;
	int err;

	if (isdir) {
		opcode = FUSE_OPENDIR;
		send_open = !fc->no_opendir;
	} else {
		opcode = FUSE_OPEN;
		send_open = !fc->no_open;
	}

	ff = fuse_file_alloc(fm, send_open);
	if (!ff)
		return ERR_PTR(-ENOMEM);

	ff->fh = 0;
	/* Defaults used when no open request is sent */
	ff->open_flags = FOPEN_KEEP_CACHE;
	if (isdir)
		ff->open_flags |= FOPEN_CACHE_DIR;

	if (!send_open)
		goto finish;

	/* The reply is kept in ff->args for fuse_finish_open() */
	reply = &ff->args->open_outarg;
	err = fuse_send_open(fm, nodeid, open_flags, opcode, reply);
	if (err == -ENOSYS) {
		/* Server has no open: no release will be needed either */
		kfree(ff->args);
		ff->args = NULL;
		if (isdir)
			fc->no_opendir = 1;
		else
			fc->no_open = 1;
	} else if (err) {
		fuse_file_free(ff);
		return ERR_PTR(err);
	} else {
		ff->fh = reply->fh;
		ff->open_flags = reply->open_flags;
	}

finish:
	if (isdir)
		ff->open_flags &= ~FOPEN_DIRECT_IO;

	ff->nodeid = nodeid;

	return ff;
}
186
/*
 * Open @nodeid using the flags of @file and, on success, attach the
 * resulting fuse_file to file->private_data.
 *
 * Returns 0 on success or the negative error carried by fuse_file_open().
 */
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
		 bool isdir)
{
	struct fuse_file *opened;

	opened = fuse_file_open(fm, nodeid, file->f_flags, isdir);
	if (IS_ERR(opened))
		return PTR_ERR(opened);

	file->private_data = opened;
	return 0;
}
EXPORT_SYMBOL_GPL(fuse_do_open);
198
fuse_link_write_file(struct file * file)199 static void fuse_link_write_file(struct file *file)
200 {
201 struct inode *inode = file_inode(file);
202 struct fuse_inode *fi = get_fuse_inode(inode);
203 struct fuse_file *ff = file->private_data;
204 /*
205 * file may be written through mmap, so chain it onto the
206 * inodes's write_file list
207 */
208 spin_lock(&fi->lock);
209 if (list_empty(&ff->write_entry))
210 list_add(&ff->write_entry, &fi->write_files);
211 spin_unlock(&fi->lock);
212 }
213
fuse_finish_open(struct inode * inode,struct file * file)214 int fuse_finish_open(struct inode *inode, struct file *file)
215 {
216 struct fuse_file *ff = file->private_data;
217 struct fuse_conn *fc = get_fuse_conn(inode);
218 int err;
219
220 err = fuse_file_io_open(file, inode);
221 if (err)
222 return err;
223
224 if (ff->open_flags & FOPEN_STREAM)
225 stream_open(inode, file);
226 else if (ff->open_flags & FOPEN_NONSEEKABLE)
227 nonseekable_open(inode, file);
228
229 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
230 fuse_link_write_file(file);
231
232 return 0;
233 }
234
fuse_truncate_update_attr(struct inode * inode,struct file * file)235 static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
236 {
237 struct fuse_conn *fc = get_fuse_conn(inode);
238 struct fuse_inode *fi = get_fuse_inode(inode);
239
240 spin_lock(&fi->lock);
241 fi->attr_version = atomic64_inc_return(&fc->attr_version);
242 i_size_write(inode, 0);
243 spin_unlock(&fi->lock);
244 file_update_time(file);
245 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
246 }
247
fuse_open(struct inode * inode,struct file * file)248 static int fuse_open(struct inode *inode, struct file *file)
249 {
250 struct fuse_mount *fm = get_fuse_mount(inode);
251 struct fuse_inode *fi = get_fuse_inode(inode);
252 struct fuse_conn *fc = fm->fc;
253 struct fuse_file *ff;
254 int err;
255 bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
256 bool is_wb_truncate = is_truncate && fc->writeback_cache;
257 bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
258
259 if (fuse_is_bad(inode))
260 return -EIO;
261
262 err = generic_file_open(inode, file);
263 if (err)
264 return err;
265
266 #ifdef CONFIG_FUSE_BPF
267 {
268 struct fuse_err_ret fer;
269
270 fer = fuse_bpf_backing(inode, struct fuse_open_io,
271 fuse_open_initialize,
272 fuse_open_backing,
273 fuse_open_finalize,
274 inode, file, false);
275 if (fer.ret)
276 return PTR_ERR(fer.result);
277 }
278 #endif
279
280 if (is_wb_truncate || dax_truncate)
281 inode_lock(inode);
282
283 if (dax_truncate) {
284 filemap_invalidate_lock(inode->i_mapping);
285 err = fuse_dax_break_layouts(inode, 0, -1);
286 if (err)
287 goto out_inode_unlock;
288 }
289
290 if (is_wb_truncate || dax_truncate)
291 fuse_set_nowrite(inode);
292
293 err = fuse_do_open(fm, get_node_id(inode), file, false);
294 if (!err) {
295 ff = file->private_data;
296 err = fuse_finish_open(inode, file);
297 if (err)
298 fuse_sync_release(fi, ff, file->f_flags);
299 else if (is_truncate)
300 fuse_truncate_update_attr(inode, file);
301 }
302
303 if (is_wb_truncate || dax_truncate)
304 fuse_release_nowrite(inode);
305 if (!err) {
306 if (is_truncate)
307 truncate_pagecache(inode, 0);
308 else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
309 invalidate_inode_pages2(inode->i_mapping);
310 }
311 if (dax_truncate)
312 filemap_invalidate_unlock(inode->i_mapping);
313 out_inode_unlock:
314 if (is_wb_truncate || dax_truncate)
315 inode_unlock(inode);
316
317 return err;
318 }
319
/*
 * Detach @ff from the per-inode and per-connection bookkeeping and fill in
 * the release request stored in ff->args.
 *
 * @fi:     fuse inode of the file; may be NULL (see below)
 * @ff:     file being released
 * @flags:  value placed in the request's flags field
 * @opcode: opcode placed in the request
 * @sync:   true when the caller sends the release synchronously; no inode
 *          reference is taken in that case
 *
 * When ff->args is NULL only the detaching is performed.
 */
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 unsigned int flags, int opcode, bool sync)
{
	struct fuse_release_args *rel = &ff->args->release_args;
	struct fuse_conn *conn = ff->fm->fc;
	struct fuse_args *req;

	if (fuse_file_passthrough(ff))
		fuse_passthrough_release(ff, fuse_inode_backing(fi));

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}

	spin_lock(&conn->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &conn->polled_files);
	spin_unlock(&conn->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	if (!rel)
		return;

	/* ff->args held the open reply until now; start from a clean slate */
	memset(ff->args, 0, sizeof(*ff->args));

	rel->inarg.fh = ff->fh;
	rel->inarg.flags = flags;

	req = &rel->args;
	req->opcode = opcode;
	req->nodeid = ff->nodeid;
	req->in_numargs = 1;
	req->in_args[0].value = &rel->inarg;
	req->in_args[0].size = sizeof(struct fuse_release_in);
	req->force = true;
	req->nocreds = true;

	/*
	 * Hold inode until release is finished.
	 * From fuse_sync_release() the refcount is 1 and everything's
	 * synchronous, so we are fine with not doing igrab() here.
	 */
	if (sync)
		rel->inode = NULL;
	else
		rel->inode = igrab(&fi->inode);
}
364
/*
 * Release @ff, which was opened on @inode.
 *
 * @inode:      inode the file belongs to
 * @ff:         file to release
 * @open_flags: flags forwarded in the release request
 * @id:         lock owner, used when the file holds a flock
 * @isdir:      selects FUSE_RELEASEDIR instead of FUSE_RELEASE
 *
 * ff->args (and therefore the release args) is NULL when the server does
 * not implement 'open'; no release request exists in that case and the
 * release args must not be dereferenced.
 */
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
		       unsigned int open_flags, fl_owner_t id, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_release_args *ra = &ff->args->release_args;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
	struct inode *put_inode;

	fuse_prepare_release(fi, ff, open_flags, opcode, false);

	if (ra && ff->flock) {
		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
	}

	/*
	 * With release args, pass the inode reference taken by
	 * fuse_prepare_release().  Without them, fall back to the caller's
	 * inode instead of reading through a NULL pointer.
	 * NOTE(review): confirm that @inode, rather than NULL, is what the
	 * fuse-bpf release hook should see in the no-open case.
	 */
	put_inode = ra ? ra->inode : inode;

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * Make the release synchronous if this is a fuseblk mount,
	 * synchronous RELEASE is allowed (and desirable) in this case
	 * because the server can be trusted not to screw up.
	 */
	fuse_file_put(put_inode, ff, ff->fm->fc->destroy);
}
390
/*
 * Release the fuse_file attached to @file.  The struct file pointer itself
 * is passed on as the lock owner.
 */
void fuse_release_common(struct file *file, bool isdir)
{
	struct inode *inode = file_inode(file);
	struct fuse_file *ff = file->private_data;
	fl_owner_t owner = (fl_owner_t) file;

	fuse_file_release(inode, ff, file->f_flags, owner, isdir);
}
396
fuse_release(struct inode * inode,struct file * file)397 static int fuse_release(struct inode *inode, struct file *file)
398 {
399 struct fuse_conn *fc = get_fuse_conn(inode);
400
401 /*
402 * Dirty pages might remain despite write_inode_now() call from
403 * fuse_flush() due to writes racing with the close.
404 */
405 if (fc->writeback_cache)
406 write_inode_now(inode, 1);
407
408 fuse_release_common(file, false);
409
410 /* return value is ignored by VFS */
411 return 0;
412 }
413
fuse_sync_release(struct fuse_inode * fi,struct fuse_file * ff,unsigned int flags)414 void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
415 unsigned int flags)
416 {
417 WARN_ON(refcount_read(&ff->count) > 1);
418 fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
419 fuse_file_put(&fi->inode, ff, true);
420 }
421 EXPORT_SYMBOL_GPL(fuse_sync_release);
422
423 /*
424 * Scramble the ID space with XTEA, so that the value of the files_struct
425 * pointer is not exposed to userspace.
426 */
fuse_lock_owner_id(struct fuse_conn * fc,fl_owner_t id)427 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
428 {
429 u32 *k = fc->scramble_key;
430 u64 v = (unsigned long) id;
431 u32 v0 = v;
432 u32 v1 = v >> 32;
433 u32 sum = 0;
434 int i;
435
436 for (i = 0; i < 32; i++) {
437 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
438 sum += 0x9E3779B9;
439 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
440 }
441
442 return (u64) v0 + ((u64) v1 << 32);
443 }
444
/*
 * State of one writeback request.  Field notes are limited to what the
 * users visible in this part of the file establish.
 */
struct fuse_writepage_args {
	/* I/O args; write.in.offset and ap.num_pages give the page range */
	struct fuse_io_args ia;
	/* node in the fuse_inode's writepages tree (see fuse_find_writeback()) */
	struct rb_node writepages_entry;
	/* NOTE(review): list linkage; the list it sits on is not visible here */
	struct list_head queue_entry;
	/* NOTE(review): presumably chains related requests -- confirm */
	struct fuse_writepage_args *next;
	/* inode being written; fuse_find_writeback() warns if it mismatches */
	struct inode *inode;
	/* NOTE(review): sync bucket; its use is outside this view */
	struct fuse_sync_bucket *bucket;
};
453
/*
 * Search fi->writepages for a writeback request whose page range overlaps
 * [@idx_from, @idx_to].  Returns the request, or NULL if none overlaps.
 *
 * The visible caller holds fi->lock around this call.
 */
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
					    pgoff_t idx_from, pgoff_t idx_to)
{
	struct rb_node *node = fi->writepages.rb_node;

	while (node) {
		struct fuse_writepage_args *wpa =
			rb_entry(node, struct fuse_writepage_args,
				 writepages_entry);
		pgoff_t first;

		WARN_ON(get_fuse_inode(wpa->inode) != fi);
		first = wpa->ia.write.in.offset >> PAGE_SHIFT;

		/* Searched range lies entirely after this request */
		if (idx_from >= first + wpa->ia.ap.num_pages) {
			node = node->rb_right;
			continue;
		}
		/* Searched range lies entirely before this request */
		if (idx_to < first) {
			node = node->rb_left;
			continue;
		}
		return wpa;
	}

	return NULL;
}
477
478 /*
479 * Check if any page in a range is under writeback
480 */
fuse_range_is_writeback(struct inode * inode,pgoff_t idx_from,pgoff_t idx_to)481 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
482 pgoff_t idx_to)
483 {
484 struct fuse_inode *fi = get_fuse_inode(inode);
485 bool found;
486
487 if (RB_EMPTY_ROOT(&fi->writepages))
488 return false;
489
490 spin_lock(&fi->lock);
491 found = fuse_find_writeback(fi, idx_from, idx_to);
492 spin_unlock(&fi->lock);
493
494 return found;
495 }
496
fuse_page_is_writeback(struct inode * inode,pgoff_t index)497 static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
498 {
499 return fuse_range_is_writeback(inode, index, index);
500 }
501
502 /*
503 * Wait for page writeback to be completed.
504 *
505 * Since fuse doesn't rely on the VM writeback tracking, this has to
506 * use some other means.
507 */
fuse_wait_on_page_writeback(struct inode * inode,pgoff_t index)508 static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
509 {
510 struct fuse_inode *fi = get_fuse_inode(inode);
511
512 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
513 }
514
/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	/* Enter the no-write state, then leave it again straight away */
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}
529
/*
 * Flush handler: write back dirty data, wait for outstanding writes and
 * send FUSE_FLUSH to the server.
 *
 * @file: file being flushed
 * @id:   lock owner, sent to the server in scrambled form
 *
 * Returns 0 on success or a negative errno.  A server answering -ENOSYS is
 * remembered in fc->no_flush and treated as success.
 */
static int fuse_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	struct fuse_flush_in inarg;
	FUSE_ARGS(args);
	int err;

#ifdef CONFIG_FUSE_BPF
	struct fuse_err_ret fer;

	/* Offer the flush to the backing path first */
	fer = fuse_bpf_backing(file->f_inode, struct fuse_flush_in,
			       fuse_flush_initialize, fuse_flush_backing,
			       fuse_flush_finalize,
			       file, id);
	if (fer.ret)
		return PTR_ERR(fer.result);
#endif

	if (fuse_is_bad(inode))
		return -EIO;

	/* FOPEN_NOFLUSH skips the flush only when writeback cache is off */
	if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
		return 0;

	err = write_inode_now(inode, 1);
	if (err)
		return err;

	/* fuse_sync_writes() has to run under the inode lock */
	inode_lock(inode);
	fuse_sync_writes(inode);
	inode_unlock(inode);

	err = filemap_check_errors(file->f_mapping);
	if (err)
		return err;

	err = 0;
	/* Server known not to implement FLUSH: skip the request */
	if (fm->fc->no_flush)
		goto inval_attr_out;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
	args.opcode = FUSE_FLUSH;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.force = true;

	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		/* Remember the missing implementation; not an error */
		fm->fc->no_flush = 1;
		err = 0;
	}

inval_attr_out:
	/*
	 * In memory i_blocks is not maintained by fuse, if writeback cache is
	 * enabled, i_blocks from cached attr may not be accurate.
	 */
	if (!err && fm->fc->writeback_cache)
		fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
	return err;
}
597
/*
 * Build and send an fsync-type request (@opcode) for @file.
 *
 * @datasync selects FUSE_FSYNC_FDATASYNC in the request flags.  @start and
 * @end are accepted but not used here.
 *
 * Returns the result of fuse_simple_request().
 */
int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
		      int datasync, int opcode)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_fsync_in inarg;
	FUSE_ARGS(args);

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	/* fsync_flags is already zero from the memset */
	if (datasync)
		inarg.fsync_flags = FUSE_FSYNC_FDATASYNC;

	args.opcode = opcode;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].value = &inarg;
	args.in_args[0].size = sizeof(inarg);

	return fuse_simple_request(get_fuse_mount(inode), &args);
}
617
/*
 * fsync handler for regular files.
 *
 * Under the inode lock: write back and wait on the byte range, wait for
 * outstanding fuse writes, pick up writeback errors, sync the inode
 * metadata and finally send FUSE_FSYNC unless the server is known not to
 * implement it.
 *
 * Returns 0 on success or a negative errno.  A server answering -ENOSYS is
 * remembered in fc->no_fsync and treated as success.
 */
static int fuse_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

#ifdef CONFIG_FUSE_BPF
	struct fuse_err_ret fer;

	/* Offer the fsync to the backing path first */
	fer = fuse_bpf_backing(inode, struct fuse_fsync_in,
			       fuse_fsync_initialize, fuse_fsync_backing,
			       fuse_fsync_finalize,
			       file, start, end, datasync);
	if (fer.ret)
		return PTR_ERR(fer.result);
#endif

	if (fuse_is_bad(inode))
		return -EIO;

	inode_lock(inode);

	/*
	 * Start writeback against all dirty pages of the inode, then
	 * wait for all outstanding writes, before sending the FSYNC
	 * request.
	 */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		goto out;

	fuse_sync_writes(inode);

	/*
	 * Due to implementation of fuse writeback
	 * file_write_and_wait_range() does not catch errors.
	 * We have to do this directly after fuse_sync_writes()
	 */
	err = file_check_and_advance_wb_err(file);
	if (err)
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (err)
		goto out;

	/* err is 0 here, so a server without FSYNC reports success */
	if (fc->no_fsync)
		goto out;

	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
	if (err == -ENOSYS) {
		fc->no_fsync = 1;
		err = 0;
	}
out:
	inode_unlock(inode);

	return err;
}
678
/*
 * Fill in @ia for a read-type request (@opcode) of @count bytes at @pos on
 * @file.  The single output argument is marked variable-length with a
 * maximum size of @count.
 */
void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
			 size_t count, int opcode)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_args *req = &ia->ap.args;

	/* Request header */
	req->opcode = opcode;
	req->nodeid = ff->nodeid;

	/* Input: the read descriptor */
	ia->read.in.fh = ff->fh;
	ia->read.in.offset = pos;
	ia->read.in.size = count;
	ia->read.in.flags = file->f_flags;
	req->in_numargs = 1;
	req->in_args[0].value = &ia->read.in;
	req->in_args[0].size = sizeof(ia->read.in);

	/* Output: up to @count bytes of data */
	req->out_argvar = true;
	req->out_numargs = 1;
	req->out_args[0].size = count;
}
698
/*
 * Finish with the pages of @ap after a request: optionally mark each page
 * dirty, unpin it if the pages were pinned, and invalidate the kernel vmap
 * range when the request asked for it and @nres is positive.
 */
static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres,
				    bool should_dirty)
{
	unsigned int idx;

	for (idx = 0; idx < ap->num_pages; idx++) {
		struct page *pg = ap->pages[idx];

		if (should_dirty)
			set_page_dirty_lock(pg);
		if (ap->args.is_pinned)
			unpin_user_page(pg);
	}

	if (nres <= 0 || !ap->args.invalidate_vmap)
		return;

	invalidate_kernel_vmap_range(ap->args.vmap_base, nres);
}
714
fuse_io_release(struct kref * kref)715 static void fuse_io_release(struct kref *kref)
716 {
717 kfree(container_of(kref, struct fuse_io_priv, refcnt));
718 }
719
fuse_get_res_by_io(struct fuse_io_priv * io)720 static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
721 {
722 if (io->err)
723 return io->err;
724
725 if (io->bytes >= 0 && io->write)
726 return -EIO;
727
728 return io->bytes < 0 ? io->size : io->bytes;
729 }
730
/*
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was split into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	/* Keep the first error; otherwise track the smallest short position */
	if (err)
		io->err = io->err ? : err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;

	left = --io->reqs;
	/* Last request of a blocking I/O: signal the completion */
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	/* Last request of a non-blocking I/O: finish the iocb from here */
	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			/* Successful I/O bumps the inode's attribute version */
			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res);
	}

	/* Drop the reference taken when the request was sent */
	kref_put(&io->refcnt, fuse_io_release);
}
780
fuse_io_alloc(struct fuse_io_priv * io,unsigned int npages)781 static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
782 unsigned int npages)
783 {
784 struct fuse_io_args *ia;
785
786 ia = kzalloc(sizeof(*ia), GFP_KERNEL);
787 if (ia) {
788 ia->io = io;
789 ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
790 &ia->ap.descs);
791 if (!ia->ap.pages) {
792 kfree(ia);
793 ia = NULL;
794 }
795 }
796 return ia;
797 }
798
fuse_io_free(struct fuse_io_args * ia)799 static void fuse_io_free(struct fuse_io_args *ia)
800 {
801 kfree(ia->ap.pages);
802 kfree(ia);
803 }
804
fuse_aio_complete_req(struct fuse_mount * fm,struct fuse_args * args,int err)805 static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
806 int err)
807 {
808 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
809 struct fuse_io_priv *io = ia->io;
810 ssize_t pos = -1;
811 size_t nres;
812
813 if (err) {
814 /* Nothing */
815 } else if (io->write) {
816 if (ia->write.out.size > ia->write.in.size) {
817 err = -EIO;
818 } else {
819 nres = ia->write.out.size;
820 if (ia->write.in.size != ia->write.out.size)
821 pos = ia->write.in.offset - io->offset +
822 ia->write.out.size;
823 }
824 } else {
825 u32 outsize = args->out_args[0].size;
826
827 nres = outsize;
828 if (ia->read.in.size != outsize)
829 pos = ia->read.in.offset - io->offset + outsize;
830 }
831
832 fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty);
833
834 fuse_aio_complete(io, err, pos);
835 fuse_io_free(ia);
836 }
837
fuse_async_req_send(struct fuse_mount * fm,struct fuse_io_args * ia,size_t num_bytes)838 static ssize_t fuse_async_req_send(struct fuse_mount *fm,
839 struct fuse_io_args *ia, size_t num_bytes)
840 {
841 ssize_t err;
842 struct fuse_io_priv *io = ia->io;
843
844 spin_lock(&io->lock);
845 kref_get(&io->refcnt);
846 io->size += num_bytes;
847 io->reqs++;
848 spin_unlock(&io->lock);
849
850 ia->ap.args.end = fuse_aio_complete_req;
851 ia->ap.args.may_block = io->should_dirty;
852 err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
853 if (err)
854 fuse_aio_complete_req(fm, &ia->ap.args, err);
855
856 return num_bytes;
857 }
858
fuse_send_read(struct fuse_io_args * ia,loff_t pos,size_t count,fl_owner_t owner)859 static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
860 fl_owner_t owner)
861 {
862 struct file *file = ia->io->iocb->ki_filp;
863 struct fuse_file *ff = file->private_data;
864 struct fuse_mount *fm = ff->fm;
865
866 fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
867 if (owner != NULL) {
868 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
869 ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
870 }
871
872 if (ia->io->async)
873 return fuse_async_req_send(fm, ia, count);
874
875 return fuse_simple_request(fm, &ia->ap.args);
876 }
877
/*
 * Shrink i_size to @size after a short read, provided that:
 *  - @attr_ver is not older than the inode's current attribute version,
 *  - @size is actually smaller than the current i_size, and
 *  - the inode's size is not flagged FUSE_I_SIZE_UNSTABLE.
 * The check and the update both happen under fi->lock.
 */
static void fuse_read_update_size(struct inode *inode, loff_t size,
				  u64 attr_ver)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	bool shrink;

	spin_lock(&fi->lock);
	shrink = attr_ver >= fi->attr_version &&
		 size < inode->i_size &&
		 !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	if (shrink) {
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, size);
	}
	spin_unlock(&fi->lock);
}
892
/*
 * Handle a read that returned only @num_read bytes: unless writeback cache
 * is enabled, treat the end of the returned data as the new file size.
 */
static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
			    struct fuse_args_pages *ap)
{
	loff_t eof;

	/*
	 * If writeback_cache is enabled, a short read means there's a hole in
	 * the file. Some data after the hole is in page cache, but has not
	 * reached the client fs yet. So the hole is not present there.
	 */
	if (get_fuse_conn(inode)->writeback_cache)
		return;

	eof = page_offset(ap->pages[0]) + num_read;
	fuse_read_update_size(inode, eof, attr_ver);
}
908
fuse_do_readpage(struct file * file,struct page * page)909 static int fuse_do_readpage(struct file *file, struct page *page)
910 {
911 struct inode *inode = page->mapping->host;
912 struct fuse_mount *fm = get_fuse_mount(inode);
913 loff_t pos = page_offset(page);
914 struct fuse_page_desc desc = { .length = PAGE_SIZE };
915 struct fuse_io_args ia = {
916 .ap.args.page_zeroing = true,
917 .ap.args.out_pages = true,
918 .ap.num_pages = 1,
919 .ap.pages = &page,
920 .ap.descs = &desc,
921 };
922 ssize_t res;
923 u64 attr_ver;
924
925 /*
926 * Page writeback can extend beyond the lifetime of the
927 * page-cache page, so make sure we read a properly synced
928 * page.
929 */
930 fuse_wait_on_page_writeback(inode, page->index);
931
932 attr_ver = fuse_get_attr_version(fm->fc);
933
934 /* Don't overflow end offset */
935 if (pos + (desc.length - 1) == LLONG_MAX)
936 desc.length--;
937
938 fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
939 res = fuse_simple_request(fm, &ia.ap.args);
940 if (res < 0)
941 return res;
942 /*
943 * Short read means EOF. If file size is larger, truncate it
944 */
945 if (res < desc.length)
946 fuse_short_read(inode, attr_ver, res, &ia.ap);
947
948 SetPageUptodate(page);
949
950 return 0;
951 }
952
fuse_read_folio(struct file * file,struct folio * folio)953 static int fuse_read_folio(struct file *file, struct folio *folio)
954 {
955 struct page *page = &folio->page;
956 struct inode *inode = page->mapping->host;
957 int err;
958
959 err = -EIO;
960 if (fuse_is_bad(inode))
961 goto out;
962
963 err = fuse_do_readpage(file, page);
964 fuse_invalidate_atime(inode);
965 out:
966 unlock_page(page);
967 return err;
968 }
969
fuse_readpages_end(struct fuse_mount * fm,struct fuse_args * args,int err)970 static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
971 int err)
972 {
973 int i;
974 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
975 struct fuse_args_pages *ap = &ia->ap;
976 size_t count = ia->read.in.size;
977 size_t num_read = args->out_args[0].size;
978 struct address_space *mapping = NULL;
979
980 for (i = 0; mapping == NULL && i < ap->num_pages; i++)
981 mapping = ap->pages[i]->mapping;
982
983 if (mapping) {
984 struct inode *inode = mapping->host;
985
986 /*
987 * Short read means EOF. If file size is larger, truncate it
988 */
989 if (!err && num_read < count)
990 fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
991
992 fuse_invalidate_atime(inode);
993 }
994
995 for (i = 0; i < ap->num_pages; i++) {
996 struct folio *folio = page_folio(ap->pages[i]);
997
998 folio_end_read(folio, !err);
999 folio_put(folio);
1000 }
1001 if (ia->ff)
1002 fuse_file_put(mapping ? mapping->host : NULL, ia->ff, false);
1003
1004 fuse_io_free(ia);
1005 }
1006
fuse_send_readpages(struct fuse_io_args * ia,struct file * file)1007 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
1008 {
1009 struct fuse_file *ff = file->private_data;
1010 struct fuse_mount *fm = ff->fm;
1011 struct fuse_args_pages *ap = &ia->ap;
1012 loff_t pos = page_offset(ap->pages[0]);
1013 size_t count = ap->num_pages << PAGE_SHIFT;
1014 ssize_t res;
1015 int err;
1016
1017 ap->args.out_pages = true;
1018 ap->args.page_zeroing = true;
1019 ap->args.page_replace = true;
1020
1021 /* Don't overflow end offset */
1022 if (pos + (count - 1) == LLONG_MAX) {
1023 count--;
1024 ap->descs[ap->num_pages - 1].length--;
1025 }
1026 WARN_ON((loff_t) (pos + count) < 0);
1027
1028 fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
1029 ia->read.attr_ver = fuse_get_attr_version(fm->fc);
1030 if (fm->fc->async_read) {
1031 ia->ff = fuse_file_get(ff);
1032 ap->args.end = fuse_readpages_end;
1033 err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
1034 if (!err)
1035 return;
1036 } else {
1037 res = fuse_simple_request(fm, &ap->args);
1038 err = res < 0 ? res : 0;
1039 }
1040 fuse_readpages_end(fm, &ap->args, err);
1041 }
1042
/*
 * Readahead handler: split the readahead window described by @rac into
 * requests of at most max_pages pages and send each through
 * fuse_send_readpages().
 *
 * Nothing is done for a bad inode.  The loop ends when no pages are left,
 * when the connection is congested and only async pages remain, or when
 * allocating a request fails.
 */
static void fuse_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	unsigned int i, max_pages, nr_pages = 0;

#ifdef CONFIG_FUSE_BPF
	/*
	 * Currently no meaningful readahead is possible with fuse-bpf within
	 * the kernel, so unless the daemon is aware of this file, ignore this
	 * call.
	 */
	if (!get_fuse_inode(inode)->nodeid)
		return;
#endif

	if (fuse_is_bad(inode))
		return;

	/* Per-request limit: the smaller of max_pages and max_read in pages */
	max_pages = min_t(unsigned int, fc->max_pages,
			fc->max_read / PAGE_SIZE);

	for (;;) {
		struct fuse_io_args *ia;
		struct fuse_args_pages *ap;

		if (fc->num_background >= fc->congestion_threshold &&
		    rac->ra->async_size >= readahead_count(rac))
			/*
			 * Congested and only async pages left, so skip the
			 * rest.
			 */
			break;

		/*
		 * nr_pages still holds the size of the previous batch (0 on
		 * the first pass) when it is subtracted here.
		 */
		nr_pages = readahead_count(rac) - nr_pages;
		if (nr_pages > max_pages)
			nr_pages = max_pages;
		if (nr_pages == 0)
			break;
		ia = fuse_io_alloc(NULL, nr_pages);
		if (!ia)
			return;
		ap = &ia->ap;
		/* The batch may come back smaller than requested */
		nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
		for (i = 0; i < nr_pages; i++) {
			/* Do not read a page that is still being written back */
			fuse_wait_on_page_writeback(inode,
						    readahead_index(rac) + i);
			ap->descs[i].length = PAGE_SIZE;
		}
		ap->num_pages = nr_pages;
		fuse_send_readpages(ia, rac->file);
	}
}
1096
fuse_cache_read_iter(struct kiocb * iocb,struct iov_iter * to)1097 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
1098 {
1099 struct inode *inode = iocb->ki_filp->f_mapping->host;
1100 struct fuse_conn *fc = get_fuse_conn(inode);
1101
1102 /*
1103 * In auto invalidate mode, always update attributes on read.
1104 * Otherwise, only update if we attempt to read past EOF (to ensure
1105 * i_size is up to date).
1106 */
1107 if (fc->auto_inval_data ||
1108 (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
1109 int err;
1110 err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
1111 if (err)
1112 return err;
1113 }
1114
1115 return generic_file_read_iter(iocb, to);
1116 }
1117
/*
 * Populate ia->write.in and the generic argument block for a FUSE_WRITE of
 * @count bytes at @pos on the file handle held by @ff.
 *
 * Only the header argument gets a value pointer here; the payload argument
 * (in_args[1]) only has its size set.
 */
static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
				 loff_t pos, size_t count)
{
	struct fuse_write_in *inarg = &ia->write.in;
	struct fuse_args *args = &ia->ap.args;

	inarg->fh = ff->fh;
	inarg->offset = pos;
	inarg->size = count;

	args->opcode = FUSE_WRITE;
	args->nodeid = ff->nodeid;

	/* Input: write header followed by the data */
	args->in_numargs = 2;
	/* Connections with protocol minor < 9 use the compat header size */
	args->in_args[0].size = ff->fm->fc->minor < 9 ?
				FUSE_COMPAT_WRITE_IN_SIZE : sizeof(ia->write.in);
	args->in_args[0].value = inarg;
	args->in_args[1].size = count;

	/* Output: the write reply */
	args->out_numargs = 1;
	args->out_args[0].size = sizeof(ia->write.out);
	args->out_args[0].value = &ia->write.out;
}
1139
fuse_write_flags(struct kiocb * iocb)1140 static unsigned int fuse_write_flags(struct kiocb *iocb)
1141 {
1142 unsigned int flags = iocb->ki_filp->f_flags;
1143
1144 if (iocb_is_dsync(iocb))
1145 flags |= O_DSYNC;
1146 if (iocb->ki_flags & IOCB_SYNC)
1147 flags |= O_SYNC;
1148
1149 return flags;
1150 }
1151
fuse_send_write(struct fuse_io_args * ia,loff_t pos,size_t count,fl_owner_t owner)1152 static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
1153 size_t count, fl_owner_t owner)
1154 {
1155 struct kiocb *iocb = ia->io->iocb;
1156 struct file *file = iocb->ki_filp;
1157 struct fuse_file *ff = file->private_data;
1158 struct fuse_mount *fm = ff->fm;
1159 struct fuse_write_in *inarg = &ia->write.in;
1160 ssize_t err;
1161
1162 fuse_write_args_fill(ia, ff, pos, count);
1163 inarg->flags = fuse_write_flags(iocb);
1164 if (owner != NULL) {
1165 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
1166 inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
1167 }
1168
1169 if (ia->io->async)
1170 return fuse_async_req_send(fm, ia, count);
1171
1172 err = fuse_simple_request(fm, &ia->ap.args);
1173 if (!err && ia->write.out.size > count)
1174 err = -EIO;
1175
1176 return err ?: ia->write.out.size;
1177 }
1178
fuse_write_update_attr(struct inode * inode,loff_t pos,ssize_t written)1179 bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
1180 {
1181 struct fuse_conn *fc = get_fuse_conn(inode);
1182 struct fuse_inode *fi = get_fuse_inode(inode);
1183 bool ret = false;
1184
1185 spin_lock(&fi->lock);
1186 fi->attr_version = atomic64_inc_return(&fc->attr_version);
1187 if (written > 0 && pos > inode->i_size) {
1188 i_size_write(inode, pos);
1189 ret = true;
1190 }
1191 spin_unlock(&fi->lock);
1192
1193 fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
1194
1195 return ret;
1196 }
1197
/*
 * Synchronously send the pages collected in ia->ap as one FUSE_WRITE of
 * @count bytes at @pos, then fix up page state and drop the page references.
 *
 * Returns the result of fuse_simple_request(), or -EIO when the server
 * reports more bytes written than were sent.  The number of bytes the
 * server accepted is left in ia->write.out.size for the caller.
 */
static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
	unsigned int offset, i;
	bool short_write;
	int err;

	/* Every page goes through fuse_wait_on_page_writeback() first */
	for (i = 0; i < ap->num_pages; i++)
		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);
	if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
		ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

	err = fuse_simple_request(fm, &ap->args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	short_write = ia->write.out.size < count;
	offset = ap->descs[0].offset;
	/* From here on, count is the number of bytes the server accepted */
	count = ia->write.out.size;
	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (err) {
			ClearPageUptodate(page);
		} else {
			/*
			 * Consume the accepted byte count page by page.  Once
			 * it no longer reaches the end of a page, that page and
			 * all following ones lose their uptodate flag if the
			 * write was short.
			 */
			if (count >= PAGE_SIZE - offset)
				count -= PAGE_SIZE - offset;
			else {
				if (short_write)
					ClearPageUptodate(page);
				count = 0;
			}
			/* Only the first page can start at a non-zero offset */
			offset = 0;
		}
		/* Only the last page is unlocked here, and only if flagged */
		if (ia->write.page_locked && (i == ap->num_pages - 1))
			unlock_page(page);
		put_page(page);
	}

	return err;
}
1247
/*
 * Copy data from @ii into page cache pages of @mapping, starting at file
 * offset @pos, and record those pages in ia->ap for a later write request.
 *
 * At most @max_pages pages and fc->max_write bytes are collected.
 *
 * Returns the number of bytes copied if that is non-zero; otherwise the
 * error of the step that failed (-EFAULT or -ENOMEM).
 */
static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_pages)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	unsigned offset = pos & (PAGE_SIZE - 1);
	size_t count = 0;
	int err;

	ap->args.in_pages = true;
	ap->descs[0].offset = offset;

	do {
		size_t tmp;
		struct page *page;
		pgoff_t index = pos >> PAGE_SHIFT;
		/*
		 * Bytes for this pass: bounded by the room left in the page,
		 * the data left in the iterator and (below) the remaining
		 * fc->max_write budget.
		 */
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(ii));

		bytes = min_t(size_t, bytes, fc->max_write - count);

again:
		/*
		 * The source buffer is faulted in before the page is grabbed.
		 * The copy below is the _atomic variant and may copy nothing;
		 * in that case the page is released and this step is retried.
		 */
		err = -EFAULT;
		if (fault_in_iov_iter_readable(ii, bytes))
			break;

		err = -ENOMEM;
		page = grab_cache_page_write_begin(mapping, index);
		if (!page)
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
		flush_dcache_page(page);

		if (!tmp) {
			unlock_page(page);
			put_page(page);
			goto again;
		}

		err = 0;
		ap->pages[ap->num_pages] = page;
		ap->descs[ap->num_pages].length = tmp;
		ap->num_pages++;

		count += tmp;
		pos += tmp;
		offset += tmp;
		if (offset == PAGE_SIZE)
			offset = 0;

		/* If we copied full page, mark it uptodate */
		if (tmp == PAGE_SIZE)
			SetPageUptodate(page);

		/*
		 * A page that is not uptodate stays locked and ends the batch;
		 * page_locked tells fuse_send_write_pages() to unlock it.
		 */
		if (PageUptodate(page)) {
			unlock_page(page);
		} else {
			ia->write.page_locked = true;
			break;
		}
		/* Without big_writes a request carries a single page */
		if (!fc->big_writes)
			break;
	/* offset == 0: continue only if the copy ended on a page boundary */
	} while (iov_iter_count(ii) && count < fc->max_write &&
		 ap->num_pages < max_pages && offset == 0);

	return count > 0 ? count : err;
}
1321
/*
 * Number of pages touched by the byte range [@pos, @pos + @len), capped at
 * @max_pages.
 */
static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
					 unsigned int max_pages)
{
	/* Index of last page minus index of first page, plus one */
	unsigned int span = ((pos + len - 1) >> PAGE_SHIFT) -
			    (pos >> PAGE_SHIFT) + 1;

	return span < max_pages ? span : max_pages;
}
1330
/*
 * Write-through path for buffered writes: repeatedly copy a batch of data
 * from @ii into page cache pages and send that batch synchronously.
 *
 * Returns the number of bytes written and advances iocb->ki_pos by that
 * amount; if nothing was written, returns the error (or 0).
 */
static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	loff_t pos = iocb->ki_pos;
	int err = 0;
	ssize_t res = 0;

	/* Flag the size as unstable while a write extends past i_size */
	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		/* Page and descriptor arrays are allocated per batch */
		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->pages) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->pages);
	} while (!err && iov_iter_count(ii));

	fuse_write_update_attr(inode, pos, res);
	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	/* Partial progress wins over a later error */
	if (!res)
		return err;
	iocb->ki_pos += res;
	return res;
}
1385
fuse_io_past_eof(struct kiocb * iocb,struct iov_iter * iter)1386 static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
1387 {
1388 struct inode *inode = file_inode(iocb->ki_filp);
1389
1390 return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
1391 }
1392
1393 /*
1394 * @return true if an exclusive lock for direct IO writes is needed
1395 */
fuse_dio_wr_exclusive_lock(struct kiocb * iocb,struct iov_iter * from)1396 static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
1397 {
1398 struct file *file = iocb->ki_filp;
1399 struct fuse_file *ff = file->private_data;
1400 struct inode *inode = file_inode(iocb->ki_filp);
1401 struct fuse_inode *fi = get_fuse_inode(inode);
1402
1403 /* Server side has to advise that it supports parallel dio writes. */
1404 if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
1405 return true;
1406
1407 /*
1408 * Append will need to know the eventual EOF - always needs an
1409 * exclusive lock.
1410 */
1411 if (iocb->ki_flags & IOCB_APPEND)
1412 return true;
1413
1414 /* shared locks are not allowed with parallel page cache IO */
1415 if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
1416 return true;
1417
1418 /* Parallel dio beyond EOF is not supported, at least for now. */
1419 if (fuse_io_past_eof(iocb, from))
1420 return true;
1421
1422 return false;
1423 }
1424
/*
 * Take the inode lock for a direct I/O write, shared if possible.
 *
 * On return *@exclusive tells which flavour is held; the same value must be
 * handed to fuse_dio_unlock().
 */
static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
			  bool *exclusive)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_inode *fi = get_fuse_inode(inode);

	*exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
	if (*exclusive) {
		inode_lock(inode);
		return;
	}

	inode_lock_shared(inode);

	/*
	 * New parallel dio is allowed only if the inode is not in caching
	 * mode, and it denies new opens in caching mode.  This check must be
	 * performed only after taking the shared inode lock.  The earlier
	 * past-EOF check was done without the inode lock and might have
	 * raced, so it is repeated here.
	 */
	if (!fuse_io_past_eof(iocb, from) &&
	    fuse_inode_uncached_io_start(fi, NULL) == 0)
		return;

	/* Shared mode is not possible after all: upgrade to exclusive */
	inode_unlock_shared(inode);
	inode_lock(inode);
	*exclusive = true;
}
1451
/*
 * Release the inode lock taken by fuse_dio_lock(); @exclusive is the value
 * that fuse_dio_lock() reported.
 */
static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_inode *fi = get_fuse_inode(inode);

	if (exclusive) {
		inode_unlock(inode);
		return;
	}

	/* Allow opens in caching mode after last parallel dio end */
	fuse_inode_uncached_io_end(fi);
	inode_unlock_shared(inode);
}
1465
/*
 * Write entry point for files using the page cache.
 *
 * With fc->writeback_cache the write is handed to generic_file_write_iter(),
 * unless fc->handle_killpriv_v2 is set and setattr_should_drop_suidgid()
 * reports that suid/sgid need dropping.  In that case, and whenever
 * writeback_cache is off, the write-through path below runs under the
 * inode lock.
 *
 * Returns the number of bytes written (after generic_write_sync()), or an
 * error if nothing was written.
 */
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct mnt_idmap *idmap = file_mnt_idmap(file);
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
	struct inode *inode = mapping->host;
	ssize_t err, count;
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (fc->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
		err = fuse_update_attributes(mapping->host, file,
					     STATX_SIZE | STATX_MODE);
		if (err)
			return err;

		if (fc->handle_killpriv_v2 &&
		    setattr_should_drop_suidgid(idmap,
						file_inode(file))) {
			goto writethrough;
		}

		return generic_file_write_iter(iocb, from);
	}

writethrough:
	inode_lock(inode);

	/* <= 0 means an error or nothing to write; written is still 0 */
	err = count = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	task_io_account_write(count);

	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		written = generic_file_direct_write(iocb, from);
		if (written < 0 || !iov_iter_count(from))
			goto out;
		/* Data is left over: write the rest through the page cache */
		written = direct_write_fallback(iocb, from, written,
				fuse_perform_write(iocb, from));
	} else {
		written = fuse_perform_write(iocb, from);
	}
out:
	inode_unlock(inode);
	if (written > 0)
		written = generic_write_sync(iocb, written);

	/* A non-zero written (byte count or error) wins over err */
	return written ? written : err;
}
1525
fuse_get_user_addr(const struct iov_iter * ii)1526 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
1527 {
1528 return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
1529 }
1530
/*
 * Size of the next fragment: what is left in the iterator's current segment,
 * but no more than @max_size.
 */
static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
					size_t max_size)
{
	size_t seg_left = iov_iter_single_seg_count(ii);

	return seg_left < max_size ? seg_left : max_size;
}
1536
/*
 * Attach the next chunk of @ii to the request described by @ap.
 *
 * @nbytesp:   in: maximum number of bytes wanted;
 *             out: number of bytes actually attached
 * @write:     non-zero if the data is request input (a write)
 * @max_pages: upper bound for ap->num_pages
 * @use_pages_for_kvec_io: extract pages even for kvec iterators
 *
 * Returns 0, or the negative value returned by iov_iter_extract_pages().
 * *nbytesp may be non-zero even when an error is returned, if earlier
 * extractions in the same call succeeded.
 */
static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
			       size_t *nbytesp, int write,
			       unsigned int max_pages,
			       bool use_pages_for_kvec_io)
{
	bool flush_or_invalidate = false;
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/* Special case for kernel I/O: can copy directly into the buffer.
	 * However if the implementation of fuse_conn requires pages instead of
	 * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
	 */
	if (iov_iter_is_kvec(ii)) {
		void *user_addr = (void *)fuse_get_user_addr(ii);

		if (!use_pages_for_kvec_io) {
			size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

			/* Point the data argument straight at the buffer */
			if (write)
				ap->args.in_args[1].value = user_addr;
			else
				ap->args.out_args[0].value = user_addr;

			iov_iter_advance(ii, frag_size);
			*nbytesp = frag_size;
			return 0;
		}

		/* Remember vmalloc buffers for the flush/invalidate below */
		if (is_vmalloc_addr(user_addr)) {
			ap->args.vmap_base = user_addr;
			flush_or_invalidate = true;
		}
	}

	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
		unsigned npages;
		size_t start;
		struct page **pt_pages;

		pt_pages = &ap->pages[ap->num_pages];
		ret = iov_iter_extract_pages(ii, &pt_pages,
					     *nbytesp - nbytes,
					     max_pages - ap->num_pages,
					     0, &start);
		if (ret < 0)
			break;

		nbytes += ret;

		/* ret now counts from the beginning of the first page */
		ret += start;
		npages = DIV_ROUND_UP(ret, PAGE_SIZE);

		ap->descs[ap->num_pages].offset = start;
		/*
		 * NOTE(review): presumably initialises the length of each of
		 * the npages descriptors -- helper is not visible here.
		 */
		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);

		ap->num_pages += npages;
		/*
		 * Trim the last descriptor by the distance from ret up to the
		 * next page boundary (0 if ret is page aligned).
		 */
		ap->descs[ap->num_pages - 1].length -=
			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
	}

	if (write && flush_or_invalidate)
		flush_kernel_vmap_range(ap->args.vmap_base, nbytes);

	ap->args.invalidate_vmap = !write && flush_or_invalidate;
	ap->args.is_pinned = iov_iter_extract_will_pin(ii);
	ap->args.user_pages = true;
	if (write)
		ap->args.in_pages = true;
	else
		ap->args.out_pages = true;

	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}
1613
/*
 * Perform direct (uncached) I/O for io->iocb, splitting @iter into requests
 * of at most fc->max_write (write) or fc->max_read (read) bytes each.
 *
 * @flags: FUSE_DIO_WRITE selects a write instead of a read; FUSE_DIO_CUSE
 *         skips the writeback synchronisation step.
 *
 * If any bytes were transferred, *@ppos is advanced and the byte count is
 * returned.  Otherwise the error is returned (or 0).  Failures in the
 * preparation steps before the loop are returned directly.
 */
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fm->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	int err = 0;
	struct fuse_io_args *ia;
	unsigned int max_pages;
	bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO;

	max_pages = iov_iter_npages(iter, fc->max_pages);
	ia = fuse_io_alloc(io, max_pages);
	if (!ia)
		return -ENOMEM;

	/* Write back dirty page cache in the range before going direct */
	if (fopen_direct_io && fc->direct_io_allow_mmap) {
		res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
		if (res) {
			fuse_io_free(ia);
			return res;
		}
	}
	/* Reads take the inode lock around fuse_sync_writes(); writes don't */
	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	/* Drop cached pages in the range a direct write is about to change */
	if (fopen_direct_io && write) {
		res = invalidate_inode_pages2_range(mapping, idx_from, idx_to);
		if (res) {
			fuse_io_free(ia);
			return res;
		}
	}

	io->should_dirty = !write && user_backed_iter(iter);
	while (count) {
		ssize_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);

		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
					  max_pages, fc->use_pages_for_kvec_io);
		/* Carry on with a partial chunk if some bytes were attached */
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID))
				ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;

			nres = fuse_send_write(ia, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(ia, pos, nbytes, owner);
		}

		/*
		 * Sync requests and failed sends are cleaned up here.
		 * NOTE(review): a successfully sent async request is
		 * presumably released by its completion path -- confirm.
		 */
		if (!io->async || nres < 0) {
			fuse_release_user_pages(&ia->ap, nres, io->should_dirty);
			fuse_io_free(ia);
		}
		/* Either way this function no longer uses ia */
		ia = NULL;
		if (nres < 0) {
			/* Give the whole chunk back to the iterator */
			iov_iter_revert(iter, nbytes);
			err = nres;
			break;
		}
		WARN_ON(nres > nbytes);

		count -= nres;
		res += nres;
		pos += nres;
		if (nres != nbytes) {
			/* Short transfer: give back the rest and stop */
			iov_iter_revert(iter, nbytes - nres);
			break;
		}
		if (count) {
			max_pages = iov_iter_npages(iter, fc->max_pages);
			ia = fuse_io_alloc(io, max_pages);
			if (!ia)
				break;
		}
	}
	/* Still set only if the loop ended before ia was sent */
	if (ia)
		fuse_io_free(ia);
	if (res > 0)
		*ppos = pos;

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
1717
__fuse_direct_read(struct fuse_io_priv * io,struct iov_iter * iter,loff_t * ppos)1718 static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
1719 struct iov_iter *iter,
1720 loff_t *ppos)
1721 {
1722 ssize_t res;
1723 struct inode *inode = file_inode(io->iocb->ki_filp);
1724
1725 res = fuse_direct_io(io, iter, ppos, 0);
1726
1727 fuse_invalidate_atime(inode);
1728
1729 return res;
1730 }
1731
1732 static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
1733
fuse_direct_read_iter(struct kiocb * iocb,struct iov_iter * to)1734 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
1735 {
1736 ssize_t res;
1737
1738 if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
1739 res = fuse_direct_IO(iocb, to);
1740 } else {
1741 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
1742
1743 res = __fuse_direct_read(&io, to, &iocb->ki_pos);
1744 }
1745
1746 return res;
1747 }
1748
/*
 * Write entry point for files opened with FOPEN_DIRECT_IO.
 *
 * The inode lock is taken through fuse_dio_lock() (shared or exclusive) for
 * the whole operation.  An asynchronous iocb with IOCB_DIRECT set goes
 * through fuse_direct_IO(); everything else is written synchronously with
 * fuse_direct_io(), followed by fuse_write_update_attr().
 *
 * Returns the result of generic_write_checks() if that is <= 0, otherwise
 * the result of the write.
 */
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
	ssize_t res;
	bool exclusive;

	fuse_dio_lock(iocb, from, &exclusive);
	res = generic_write_checks(iocb, from);
	if (res > 0) {
		/* res is the byte count from generic_write_checks() here */
		task_io_account_write(res);
		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
			res = fuse_direct_IO(iocb, from);
		} else {
			res = fuse_direct_io(&io, from, &iocb->ki_pos,
					     FUSE_DIO_WRITE);
			/* ki_pos has been advanced by fuse_direct_io() */
			fuse_write_update_attr(inode, iocb->ki_pos, res);
		}
	}
	fuse_dio_unlock(iocb, exclusive);

	return res;
}
1772
fuse_file_read_iter(struct kiocb * iocb,struct iov_iter * to)1773 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1774 {
1775 struct file *file = iocb->ki_filp;
1776 struct fuse_file *ff = file->private_data;
1777 struct inode *inode = file_inode(file);
1778
1779 if (fuse_is_bad(inode))
1780 return -EIO;
1781
1782 if (FUSE_IS_DAX(inode))
1783 return fuse_dax_read_iter(iocb, to);
1784
1785 #ifdef CONFIG_FUSE_BPF
1786 {
1787 struct fuse_err_ret fer;
1788
1789 fer = fuse_bpf_backing(inode, struct fuse_file_read_iter_io,
1790 fuse_file_read_iter_initialize,
1791 fuse_file_read_iter_backing,
1792 fuse_file_read_iter_finalize,
1793 iocb, to);
1794 if (fer.ret)
1795 return PTR_ERR(fer.result);
1796 }
1797 #endif
1798
1799 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1800 if (ff->open_flags & FOPEN_DIRECT_IO)
1801 return fuse_direct_read_iter(iocb, to);
1802 else if (fuse_file_passthrough(ff))
1803 return fuse_passthrough_read_iter(iocb, to);
1804 else
1805 return fuse_cache_read_iter(iocb, to);
1806 }
1807
fuse_file_write_iter(struct kiocb * iocb,struct iov_iter * from)1808 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1809 {
1810 struct file *file = iocb->ki_filp;
1811 struct fuse_file *ff = file->private_data;
1812 struct inode *inode = file_inode(file);
1813
1814 if (fuse_is_bad(inode))
1815 return -EIO;
1816
1817 if (FUSE_IS_DAX(inode))
1818 return fuse_dax_write_iter(iocb, from);
1819
1820 #ifdef CONFIG_FUSE_BPF
1821 {
1822 struct fuse_err_ret fer;
1823
1824 fer = fuse_bpf_backing(inode, struct fuse_file_write_iter_io,
1825 fuse_file_write_iter_initialize,
1826 fuse_file_write_iter_backing,
1827 fuse_file_write_iter_finalize,
1828 iocb, from);
1829 if (fer.ret)
1830 return PTR_ERR(fer.result);
1831 }
1832 #endif
1833
1834 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1835 if (ff->open_flags & FOPEN_DIRECT_IO)
1836 return fuse_direct_write_iter(iocb, from);
1837 else if (fuse_file_passthrough(ff))
1838 return fuse_passthrough_write_iter(iocb, from);
1839 else
1840 return fuse_cache_write_iter(iocb, from);
1841 }
1842
fuse_splice_read(struct file * in,loff_t * ppos,struct pipe_inode_info * pipe,size_t len,unsigned int flags)1843 static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
1844 struct pipe_inode_info *pipe, size_t len,
1845 unsigned int flags)
1846 {
1847 struct fuse_file *ff = in->private_data;
1848
1849 #ifdef CONFIG_FUSE_BPF
1850 /* TODO - this is simply passthrough, not a proper BPF filter */
1851 if (ff->backing_file)
1852 return fuse_splice_read_backing(in, ppos, pipe, len, flags);
1853 #endif
1854
1855 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1856 if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1857 return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
1858 else
1859 return filemap_splice_read(in, ppos, pipe, len, flags);
1860 }
1861
fuse_splice_write(struct pipe_inode_info * pipe,struct file * out,loff_t * ppos,size_t len,unsigned int flags)1862 static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
1863 loff_t *ppos, size_t len, unsigned int flags)
1864 {
1865 struct fuse_file *ff = out->private_data;
1866
1867 #ifdef CONFIG_FUSE_BPF
1868 /* TODO - this is simply passthrough, not a proper BPF filter */
1869 if (ff->backing_file)
1870 return fuse_splice_write_backing(pipe, out, ppos, len, flags);
1871 #endif
1872
1873 /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
1874 if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
1875 return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
1876 else
1877 return iter_file_splice_write(pipe, out, ppos, len, flags);
1878 }
1879
fuse_writepage_free(struct fuse_writepage_args * wpa)1880 static void fuse_writepage_free(struct fuse_writepage_args *wpa)
1881 {
1882 struct fuse_args_pages *ap = &wpa->ia.ap;
1883 int i;
1884
1885 if (wpa->bucket)
1886 fuse_sync_bucket_dec(wpa->bucket);
1887
1888 for (i = 0; i < ap->num_pages; i++)
1889 __free_page(ap->pages[i]);
1890
1891 fuse_file_put(wpa->inode, wpa->ia.ff, false);
1892
1893 kfree(ap->pages);
1894 kfree(wpa);
1895 }
1896
fuse_writepage_finish_stat(struct inode * inode,struct page * page)1897 static void fuse_writepage_finish_stat(struct inode *inode, struct page *page)
1898 {
1899 struct backing_dev_info *bdi = inode_to_bdi(inode);
1900
1901 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1902 dec_node_page_state(page, NR_WRITEBACK_TEMP);
1903 wb_writeout_inc(&bdi->wb);
1904 }
1905
fuse_writepage_finish(struct fuse_writepage_args * wpa)1906 static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
1907 {
1908 struct fuse_args_pages *ap = &wpa->ia.ap;
1909 struct inode *inode = wpa->inode;
1910 struct fuse_inode *fi = get_fuse_inode(inode);
1911 int i;
1912
1913 for (i = 0; i < ap->num_pages; i++)
1914 fuse_writepage_finish_stat(inode, ap->pages[i]);
1915
1916 wake_up(&fi->page_waitq);
1917 }
1918
/*
 * Queue one writepage request as a background FUSE request, cropping it to
 * @size (the file size to write against).
 *
 * A request that starts at or beyond @size, or that cannot be queued, is
 * finished and freed here together with its chain of auxiliary requests.
 *
 * Called under fi->lock, may release and reacquire it
 */
static void fuse_send_writepage(struct fuse_mount *fm,
				struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_writepage_args *aux, *next;
	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
	struct fuse_write_in *inarg = &wpa->ia.write.in;
	struct fuse_args *args = &wpa->ia.ap.args;
	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
	int err;

	/* Counted as in flight; undone at out_free if it is never sent */
	fi->writectr++;
	if (inarg->offset + data_size <= size) {
		inarg->size = data_size;
	} else if (inarg->offset < size) {
		/* Straddles the size limit: send only the part below it */
		inarg->size = size - inarg->offset;
	} else {
		/* Got truncated off completely */
		goto out_free;
	}

	args->in_args[1].size = inarg->size;
	args->force = true;
	args->nocreds = true;

	/*
	 * First attempt must not sleep because fi->lock is held.  On -ENOMEM
	 * the lock is dropped for a retry with __GFP_NOFAIL.
	 */
	err = fuse_simple_background(fm, args, GFP_ATOMIC);
	if (err == -ENOMEM) {
		spin_unlock(&fi->lock);
		err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
		spin_lock(&fi->lock);
	}

	/* Fails on broken connection only */
	if (unlikely(err))
		goto out_free;

	return;

out_free:
	fi->writectr--;
	rb_erase(&wpa->writepages_entry, &fi->writepages);
	fuse_writepage_finish(wpa);
	/* The frees below run without fi->lock; it is retaken at the end */
	spin_unlock(&fi->lock);

	/* After rb_erase() aux request list is private */
	for (aux = wpa->next; aux; aux = next) {
		next = aux->next;
		aux->next = NULL;
		/* Only pages[0] of each auxiliary request is accounted here */
		fuse_writepage_finish_stat(aux->inode, aux->ia.ap.pages[0]);
		fuse_writepage_free(aux);
	}

	fuse_writepage_free(wpa);
	spin_lock(&fi->lock);
}
1976
1977 /*
1978 * If fi->writectr is positive (no truncate or fsync going on) send
1979 * all queued writepage requests.
1980 *
1981 * Called with fi->lock
1982 */
fuse_flush_writepages(struct inode * inode)1983 void fuse_flush_writepages(struct inode *inode)
1984 __releases(fi->lock)
1985 __acquires(fi->lock)
1986 {
1987 struct fuse_mount *fm = get_fuse_mount(inode);
1988 struct fuse_inode *fi = get_fuse_inode(inode);
1989 loff_t crop = i_size_read(inode);
1990 struct fuse_writepage_args *wpa;
1991
1992 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1993 wpa = list_entry(fi->queued_writes.next,
1994 struct fuse_writepage_args, queue_entry);
1995 list_del_init(&wpa->queue_entry);
1996 fuse_send_writepage(fm, wpa, crop);
1997 }
1998 }
1999
/*
 * Insert @wpa into the rb-tree @root, which is ordered by the page index
 * range each request covers.
 *
 * Returns NULL if @wpa was inserted, or the existing entry whose page range
 * overlaps @wpa's range (in which case the tree is left unchanged).
 */
static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
						struct fuse_writepage_args *wpa)
{
	/* Inclusive page index range [idx_from, idx_to] covered by wpa */
	pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
	pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
	struct rb_node **p = &root->rb_node;
	struct rb_node  *parent = NULL;

	WARN_ON(!wpa->ia.ap.num_pages);
	while (*p) {
		struct fuse_writepage_args *curr;
		pgoff_t curr_index;

		parent = *p;
		curr = rb_entry(parent, struct fuse_writepage_args,
				writepages_entry);
		WARN_ON(curr->inode != wpa->inode);
		curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;

		/* Entirely after curr: right; entirely before: left */
		if (idx_from >= curr_index + curr->ia.ap.num_pages)
			p = &(*p)->rb_right;
		else if (idx_to < curr_index)
			p = &(*p)->rb_left;
		else
			return curr;
	}

	rb_link_node(&wpa->writepages_entry, parent, p);
	rb_insert_color(&wpa->writepages_entry, root);
	return NULL;
}
2031
/*
 * Insert @wpa into @root where no overlap with an existing entry is
 * expected; an overlap triggers a warning.
 */
static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
{
	struct fuse_writepage_args *clash = fuse_insert_writeback(root, wpa);

	WARN_ON(clash);
}
2036
/*
 * Completion handler for a writepage request.
 *
 * Records @error on the mapping, removes the request from the inode's
 * writepages tree, re-inserts and sends each chained auxiliary request in
 * turn, and finally finishes and frees the primary request.
 */
static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
			       int error)
{
	struct fuse_writepage_args *wpa =
		container_of(args, typeof(*wpa), ia.ap.args);
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);

	mapping_set_error(inode->i_mapping, error);
	/*
	 * A writeback finished and this might have updated mtime/ctime on
	 * server making local mtime/ctime stale.  Hence invalidate attrs.
	 * Do this only if writeback_cache is not enabled.  If writeback_cache
	 * is enabled, we trust local ctime/mtime.
	 */
	if (!fc->writeback_cache)
		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
	spin_lock(&fi->lock);
	rb_erase(&wpa->writepages_entry, &fi->writepages);
	while (wpa->next) {
		/*
		 * NOTE(review): this local shadows the fm parameter;
		 * presumably both refer to the same mount -- confirm.
		 */
		struct fuse_mount *fm = get_fuse_mount(inode);
		struct fuse_write_in *inarg = &wpa->ia.write.in;
		struct fuse_writepage_args *next = wpa->next;

		/* Unlink the head of the aux chain and give it a tree slot */
		wpa->next = next->next;
		next->next = NULL;
		tree_insert(&fi->writepages, next);

		/*
		 * Skip fuse_flush_writepages() to make it easy to crop requests
		 * based on primary request size.
		 *
		 * 1st case (trivial): there are no concurrent activities using
		 * fuse_set/release_nowrite.  Then we're on safe side because
		 * fuse_flush_writepages() would call fuse_send_writepage()
		 * anyway.
		 *
		 * 2nd case: someone called fuse_set_nowrite and it is waiting
		 * now for completion of all in-flight requests.  This happens
		 * rarely and no more than once per page, so this should be
		 * okay.
		 *
		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
		 * that fuse_set_nowrite returned implies that all in-flight
		 * requests were completed along with all of their secondary
		 * requests.  Further primary requests are blocked by negative
		 * writectr.  Hence there cannot be any in-flight requests and
		 * no invocations of fuse_writepage_end() while we're in
		 * fuse_set_nowrite..fuse_release_nowrite section.
		 */
		fuse_send_writepage(fm, next, inarg->offset + inarg->size);
	}
	/* Balances the writectr++ done in fuse_send_writepage() */
	fi->writectr--;
	fuse_writepage_finish(wpa);
	spin_unlock(&fi->lock);
	fuse_writepage_free(wpa);
}
2096
__fuse_write_file_get(struct fuse_inode * fi)2097 static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
2098 {
2099 struct fuse_file *ff;
2100
2101 spin_lock(&fi->lock);
2102 ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
2103 write_entry);
2104 if (ff)
2105 fuse_file_get(ff);
2106 spin_unlock(&fi->lock);
2107
2108 return ff;
2109 }
2110
/*
 * Same as __fuse_write_file_get(), but warns when no file is found.  The
 * (possibly NULL) result is still returned to the caller.
 */
static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
{
	struct fuse_file *writer;

	writer = __fuse_write_file_get(fi);
	WARN_ON(!writer);

	return writer;
}
2117
fuse_write_inode(struct inode * inode,struct writeback_control * wbc)2118 int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
2119 {
2120 struct fuse_inode *fi = get_fuse_inode(inode);
2121 struct fuse_file *ff;
2122 int err;
2123
2124 /**
2125 * TODO - fully understand why this is necessary
2126 *
2127 * With fuse-bpf, fsstress fails if rename is enabled without this
2128 *
2129 * We are getting writes here on directory inodes, which do not have an
2130 * initialized file list so crash.
2131 *
2132 * The question is why we are getting those writes
2133 */
2134 if (!S_ISREG(inode->i_mode))
2135 return 0;
2136
2137 /*
2138 * Inode is always written before the last reference is dropped and
2139 * hence this should not be reached from reclaim.
2140 *
2141 * Writing back the inode from reclaim can deadlock if the request
2142 * processing itself needs an allocation. Allocations triggering
2143 * reclaim while serving a request can't be prevented, because it can
2144 * involve any number of unrelated userspace processes.
2145 */
2146 WARN_ON(wbc->for_reclaim);
2147
2148 ff = __fuse_write_file_get(fi);
2149 err = fuse_flush_times(inode, ff);
2150 if (ff)
2151 fuse_file_put(inode, ff, false);
2152
2153 return err;
2154 }
2155
fuse_writepage_args_alloc(void)2156 static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
2157 {
2158 struct fuse_writepage_args *wpa;
2159 struct fuse_args_pages *ap;
2160
2161 wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
2162 if (wpa) {
2163 ap = &wpa->ia.ap;
2164 ap->num_pages = 0;
2165 ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
2166 if (!ap->pages) {
2167 kfree(wpa);
2168 wpa = NULL;
2169 }
2170 }
2171 return wpa;
2172
2173 }
2174
fuse_writepage_add_to_bucket(struct fuse_conn * fc,struct fuse_writepage_args * wpa)2175 static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
2176 struct fuse_writepage_args *wpa)
2177 {
2178 if (!fc->sync_fs)
2179 return;
2180
2181 rcu_read_lock();
2182 /* Prevent resurrection of dead bucket in unlikely race with syncfs */
2183 do {
2184 wpa->bucket = rcu_dereference(fc->curr_bucket);
2185 } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
2186 rcu_read_unlock();
2187 }
2188
/*
 * Copy @folio into @tmp_folio and record the temporary page in slot
 * @page_index of @wpa's page array, covering PAGE_SIZE bytes from offset 0.
 * Also bumps the WB_WRITEBACK and NR_WRITEBACK_TEMP counters.
 */
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
					  struct folio *tmp_folio, uint32_t page_index)
{
	struct inode *inode = folio->mapping->host;
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct fuse_page_desc *desc = &ap->descs[page_index];
	struct page *tmp_page = &tmp_folio->page;

	folio_copy(tmp_folio, folio);

	ap->pages[page_index] = tmp_page;
	desc->offset = 0;
	desc->length = PAGE_SIZE;

	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
}
2204
fuse_writepage_args_setup(struct folio * folio,struct fuse_file * ff)2205 static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
2206 struct fuse_file *ff)
2207 {
2208 struct inode *inode = folio->mapping->host;
2209 struct fuse_conn *fc = get_fuse_conn(inode);
2210 struct fuse_writepage_args *wpa;
2211 struct fuse_args_pages *ap;
2212
2213 wpa = fuse_writepage_args_alloc();
2214 if (!wpa)
2215 return NULL;
2216
2217 fuse_writepage_add_to_bucket(fc, wpa);
2218 fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
2219 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
2220 wpa->inode = inode;
2221 wpa->ia.ff = ff;
2222
2223 ap = &wpa->ia.ap;
2224 ap->args.in_pages = true;
2225 ap->args.end = fuse_writepage_end;
2226
2227 return wpa;
2228 }
2229
fuse_writepage_locked(struct folio * folio)2230 static int fuse_writepage_locked(struct folio *folio)
2231 {
2232 struct address_space *mapping = folio->mapping;
2233 struct inode *inode = mapping->host;
2234 struct fuse_inode *fi = get_fuse_inode(inode);
2235 struct fuse_writepage_args *wpa;
2236 struct fuse_args_pages *ap;
2237 struct folio *tmp_folio;
2238 struct fuse_file *ff;
2239 int error = -ENOMEM;
2240
2241 tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
2242 if (!tmp_folio)
2243 goto err;
2244
2245 error = -EIO;
2246 ff = fuse_write_file_get(fi);
2247 if (!ff)
2248 goto err_nofile;
2249
2250 wpa = fuse_writepage_args_setup(folio, ff);
2251 error = -ENOMEM;
2252 if (!wpa)
2253 goto err_writepage_args;
2254
2255 ap = &wpa->ia.ap;
2256 ap->num_pages = 1;
2257
2258 folio_start_writeback(folio);
2259 fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0);
2260
2261 spin_lock(&fi->lock);
2262 tree_insert(&fi->writepages, wpa);
2263 list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2264 fuse_flush_writepages(inode);
2265 spin_unlock(&fi->lock);
2266
2267 folio_end_writeback(folio);
2268
2269 return 0;
2270
2271 err_writepage_args:
2272 fuse_file_put(inode, ff, false);
2273 err_nofile:
2274 folio_put(tmp_folio);
2275 err:
2276 mapping_set_error(folio->mapping, error);
2277 return error;
2278 }
2279
/*
 * State carried across fuse_writepages_fill() invocations during one
 * fuse_writepages() call.
 */
struct fuse_fill_wb_data {
	/* Request currently being filled; NULL when none is in progress. */
	struct fuse_writepage_args *wpa;
	/* File obtained lazily with fuse_write_file_get(); NULL until then. */
	struct fuse_file *ff;
	/* Inode whose mapping is being written back. */
	struct inode *inode;
	/*
	 * Original page-cache pages matching the temporary pages in @wpa;
	 * writeback is ended on them in fuse_writepages_send().
	 */
	struct page **orig_pages;
	/*
	 * Capacity of @wpa's page array: set to 1 when a request is created
	 * and raised by fuse_pages_realloc().
	 */
	unsigned int max_pages;
};
2287
fuse_pages_realloc(struct fuse_fill_wb_data * data)2288 static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
2289 {
2290 struct fuse_args_pages *ap = &data->wpa->ia.ap;
2291 struct fuse_conn *fc = get_fuse_conn(data->inode);
2292 struct page **pages;
2293 struct fuse_page_desc *descs;
2294 unsigned int npages = min_t(unsigned int,
2295 max_t(unsigned int, data->max_pages * 2,
2296 FUSE_DEFAULT_MAX_PAGES_PER_REQ),
2297 fc->max_pages);
2298 WARN_ON(npages <= data->max_pages);
2299
2300 pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
2301 if (!pages)
2302 return false;
2303
2304 memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
2305 memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
2306 kfree(ap->pages);
2307 ap->pages = pages;
2308 ap->descs = descs;
2309 data->max_pages = npages;
2310
2311 return true;
2312 }
2313
fuse_writepages_send(struct fuse_fill_wb_data * data)2314 static void fuse_writepages_send(struct fuse_fill_wb_data *data)
2315 {
2316 struct fuse_writepage_args *wpa = data->wpa;
2317 struct inode *inode = data->inode;
2318 struct fuse_inode *fi = get_fuse_inode(inode);
2319 int num_pages = wpa->ia.ap.num_pages;
2320 int i;
2321
2322 spin_lock(&fi->lock);
2323 list_add_tail(&wpa->queue_entry, &fi->queued_writes);
2324 fuse_flush_writepages(inode);
2325 spin_unlock(&fi->lock);
2326
2327 for (i = 0; i < num_pages; i++)
2328 end_page_writeback(data->orig_pages[i]);
2329 }
2330
2331 /*
2332 * Check under fi->lock if the page is under writeback, and insert it onto the
2333 * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
2334 * one already added for a page at this offset. If there's none, then insert
2335 * this new request onto the auxiliary list, otherwise reuse the existing one by
2336 * swapping the new temp page with the old one.
2337 */
fuse_writepage_add(struct fuse_writepage_args * new_wpa,struct page * page)2338 static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
2339 struct page *page)
2340 {
2341 struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
2342 struct fuse_writepage_args *tmp;
2343 struct fuse_writepage_args *old_wpa;
2344 struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
2345
2346 WARN_ON(new_ap->num_pages != 0);
2347 new_ap->num_pages = 1;
2348
2349 spin_lock(&fi->lock);
2350 old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
2351 if (!old_wpa) {
2352 spin_unlock(&fi->lock);
2353 return true;
2354 }
2355
2356 for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
2357 pgoff_t curr_index;
2358
2359 WARN_ON(tmp->inode != new_wpa->inode);
2360 curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
2361 if (curr_index == page->index) {
2362 WARN_ON(tmp->ia.ap.num_pages != 1);
2363 swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
2364 break;
2365 }
2366 }
2367
2368 if (!tmp) {
2369 new_wpa->next = old_wpa->next;
2370 old_wpa->next = new_wpa;
2371 }
2372
2373 spin_unlock(&fi->lock);
2374
2375 if (tmp) {
2376 fuse_writepage_finish_stat(new_wpa->inode, new_ap->pages[0]);
2377 fuse_writepage_free(new_wpa);
2378 }
2379
2380 return false;
2381 }
2382
fuse_writepage_need_send(struct fuse_conn * fc,struct page * page,struct fuse_args_pages * ap,struct fuse_fill_wb_data * data)2383 static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
2384 struct fuse_args_pages *ap,
2385 struct fuse_fill_wb_data *data)
2386 {
2387 WARN_ON(!ap->num_pages);
2388
2389 /*
2390 * Being under writeback is unlikely but possible. For example direct
2391 * read to an mmaped fuse file will set the page dirty twice; once when
2392 * the pages are faulted with get_user_pages(), and then after the read
2393 * completed.
2394 */
2395 if (fuse_page_is_writeback(data->inode, page->index))
2396 return true;
2397
2398 /* Reached max pages */
2399 if (ap->num_pages == fc->max_pages)
2400 return true;
2401
2402 /* Reached max write bytes */
2403 if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
2404 return true;
2405
2406 /* Discontinuity */
2407 if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
2408 return true;
2409
2410 /* Need to grow the pages array? If so, did the expansion fail? */
2411 if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
2412 return true;
2413
2414 return false;
2415 }
2416
/*
 * Per-folio callback passed to write_cache_pages() by fuse_writepages().
 *
 * Adds @folio to the request being built in @_data (a struct
 * fuse_fill_wb_data), first sending that request if
 * fuse_writepage_need_send() reports that the folio cannot be appended.
 *
 * The folio is unlocked on every path.  Returns 0 on success, -EIO if no
 * write file could be obtained, or -ENOMEM on allocation failure.
 */
static int fuse_writepages_fill(struct folio *folio,
		struct writeback_control *wbc, void *_data)
{
	struct fuse_fill_wb_data *data = _data;
	struct fuse_writepage_args *wpa = data->wpa;
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct folio *tmp_folio;
	int err;

	/* Look up the write file once and keep it in @data for later folios. */
	if (!data->ff) {
		err = -EIO;
		data->ff = fuse_write_file_get(fi);
		if (!data->ff)
			goto out_unlock;
	}

	/* Send the request in progress if this folio cannot join it. */
	if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
		fuse_writepages_send(data);
		data->wpa = NULL;
	}

	err = -ENOMEM;
	tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
	if (!tmp_folio)
		goto out_unlock;

	/*
	 * The page must not be redirtied until the writeout is completed
	 * (i.e. userspace has sent a reply to the write request).  Otherwise
	 * there could be more than one temporary page instance for each real
	 * page.
	 *
	 * This is ensured by holding the page lock in page_mkwrite() while
	 * checking fuse_page_is_writeback().  We already hold the page lock
	 * since clear_page_dirty_for_io() and keep it held until we add the
	 * request to the fi->writepages list and increment ap->num_pages.
	 * After this fuse_page_is_writeback() will indicate that the page is
	 * under writeback, so we can release the page lock.
	 */
	if (data->wpa == NULL) {
		err = -ENOMEM;
		wpa = fuse_writepage_args_setup(folio, data->ff);
		if (!wpa) {
			folio_put(tmp_folio);
			goto out_unlock;
		}
		/* The new request gets its own reference on the file. */
		fuse_file_get(wpa->ia.ff);
		data->max_pages = 1;
		ap = &wpa->ia.ap;
	}
	folio_start_writeback(folio);

	fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_pages);
	data->orig_pages[ap->num_pages] = &folio->page;

	err = 0;
	if (data->wpa) {
		/*
		 * Protected by fi->lock against concurrent access by
		 * fuse_page_is_writeback().
		 */
		spin_lock(&fi->lock);
		ap->num_pages++;
		spin_unlock(&fi->lock);
	} else if (fuse_writepage_add(wpa, &folio->page)) {
		/* New request was inserted into the tree; keep filling it. */
		data->wpa = wpa;
	} else {
		/* New request was chained or merged by fuse_writepage_add(). */
		folio_end_writeback(folio);
	}
out_unlock:
	folio_unlock(folio);

	return err;
}
2494
fuse_writepages(struct address_space * mapping,struct writeback_control * wbc)2495 static int fuse_writepages(struct address_space *mapping,
2496 struct writeback_control *wbc)
2497 {
2498 struct inode *inode = mapping->host;
2499 struct fuse_conn *fc = get_fuse_conn(inode);
2500 struct fuse_fill_wb_data data;
2501 int err;
2502
2503 err = -EIO;
2504 if (fuse_is_bad(inode))
2505 goto out;
2506
2507 if (wbc->sync_mode == WB_SYNC_NONE &&
2508 fc->num_background >= fc->congestion_threshold)
2509 return 0;
2510
2511 data.inode = inode;
2512 data.wpa = NULL;
2513 data.ff = NULL;
2514
2515 err = -ENOMEM;
2516 data.orig_pages = kcalloc(fc->max_pages,
2517 sizeof(struct page *),
2518 GFP_NOFS);
2519 if (!data.orig_pages)
2520 goto out;
2521
2522 err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2523 if (data.wpa) {
2524 WARN_ON(!data.wpa->ia.ap.num_pages);
2525 fuse_writepages_send(&data);
2526 }
2527 if (data.ff)
2528 fuse_file_put(inode, data.ff, false);
2529
2530 kfree(data.orig_pages);
2531 out:
2532 return err;
2533 }
2534
2535 /*
2536 * It's worthy to make sure that space is reserved on disk for the write,
2537 * but how to implement it without killing performance need more thinking.
2538 */
fuse_write_begin(struct file * file,struct address_space * mapping,loff_t pos,unsigned len,struct folio ** foliop,void ** fsdata)2539 static int fuse_write_begin(struct file *file, struct address_space *mapping,
2540 loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
2541 {
2542 pgoff_t index = pos >> PAGE_SHIFT;
2543 struct fuse_conn *fc = get_fuse_conn(file_inode(file));
2544 struct folio *folio;
2545 loff_t fsize;
2546 int err = -ENOMEM;
2547
2548 WARN_ON(!fc->writeback_cache);
2549
2550 folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
2551 mapping_gfp_mask(mapping));
2552 if (IS_ERR(folio))
2553 goto error;
2554
2555 fuse_wait_on_page_writeback(mapping->host, folio->index);
2556
2557 if (folio_test_uptodate(folio) || len >= folio_size(folio))
2558 goto success;
2559 /*
2560 * Check if the start of this folio comes after the end of file,
2561 * in which case the readpage can be optimized away.
2562 */
2563 fsize = i_size_read(mapping->host);
2564 if (fsize <= folio_pos(folio)) {
2565 size_t off = offset_in_folio(folio, pos);
2566 if (off)
2567 folio_zero_segment(folio, 0, off);
2568 goto success;
2569 }
2570 err = fuse_do_readpage(file, &folio->page);
2571 if (err)
2572 goto cleanup;
2573 success:
2574 *foliop = folio;
2575 return 0;
2576
2577 cleanup:
2578 folio_unlock(folio);
2579 folio_put(folio);
2580 error:
2581 return err;
2582 }
2583
/*
 * Finish a buffered write of @copied bytes into @folio: zero the unwritten
 * tail of a folio that was not uptodate, extend i_size if the write went
 * past it, and mark the folio dirty.  The folio is unlocked and released in
 * every case.
 *
 * Returns @copied.
 */
static int fuse_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct folio *folio, void *fsdata)
{
	struct inode *inode = folio->mapping->host;

	/* Nothing copied means no zeroing, size extending or dirtying. */
	if (copied) {
		pos += copied;

		if (!folio_test_uptodate(folio)) {
			/* Zero any unwritten bytes at the end of the page */
			size_t endoff = pos & ~PAGE_MASK;

			if (endoff)
				folio_zero_segment(folio, endoff, PAGE_SIZE);
			folio_mark_uptodate(folio);
		}

		if (pos > inode->i_size)
			i_size_write(inode, pos);

		folio_mark_dirty(folio);
	}

	folio_unlock(folio);
	folio_put(folio);

	return copied;
}
2614
fuse_launder_folio(struct folio * folio)2615 static int fuse_launder_folio(struct folio *folio)
2616 {
2617 int err = 0;
2618 if (folio_clear_dirty_for_io(folio)) {
2619 struct inode *inode = folio->mapping->host;
2620
2621 /* Serialize with pending writeback for the same page */
2622 fuse_wait_on_page_writeback(inode, folio->index);
2623 err = fuse_writepage_locked(folio);
2624 if (!err)
2625 fuse_wait_on_page_writeback(inode, folio->index);
2626 }
2627 return err;
2628 }
2629
2630 /*
2631 * Write back dirty data/metadata now (there may not be any suitable
2632 * open files later for data)
2633 */
fuse_vma_close(struct vm_area_struct * vma)2634 static void fuse_vma_close(struct vm_area_struct *vma)
2635 {
2636 int err;
2637
2638 err = write_inode_now(vma->vm_file->f_mapping->host, 1);
2639 mapping_set_error(vma->vm_file->f_mapping, err);
2640 }
2641
2642 /*
2643 * Wait for writeback against this page to complete before allowing it
2644 * to be marked dirty again, and hence written back again, possibly
2645 * before the previous writepage completed.
2646 *
2647 * Block here, instead of in ->writepage(), so that the userspace fs
2648 * can only block processes actually operating on the filesystem.
2649 *
2650 * Otherwise unprivileged userspace fs would be able to block
2651 * unrelated:
2652 *
2653 * - page migration
2654 * - sync(2)
2655 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2656 */
fuse_page_mkwrite(struct vm_fault * vmf)2657 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
2658 {
2659 struct page *page = vmf->page;
2660 struct inode *inode = file_inode(vmf->vma->vm_file);
2661
2662 file_update_time(vmf->vma->vm_file);
2663 lock_page(page);
2664 if (page->mapping != inode->i_mapping) {
2665 unlock_page(page);
2666 return VM_FAULT_NOPAGE;
2667 }
2668
2669 fuse_wait_on_page_writeback(inode, page->index);
2670 return VM_FAULT_LOCKED;
2671 }
2672
/*
 * VM operations installed on mappings by fuse_file_mmap(): the generic
 * filemap fault handlers plus the FUSE close and page_mkwrite hooks.
 */
static const struct vm_operations_struct fuse_file_vm_ops = {
	.close		= fuse_vma_close,
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= fuse_page_mkwrite,
};
2679
/*
 * Set up a memory mapping of @file on @vma.
 *
 * DAX inodes, files with a backing file (CONFIG_FUSE_BPF only) and
 * passthrough files are handed to their own mmap helpers.  For
 * FOPEN_DIRECT_IO files a shared mapping is refused with -ENODEV unless
 * fc->direct_io_allow_mmap is set, and private mappings go through
 * generic_file_mmap().  In all remaining cases fuse_file_vm_ops is
 * installed on the vma.
 *
 * Returns 0 on success or a negative errno.
 */
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fm->fc;
	struct inode *inode = file_inode(file);
	int rc;

	/* DAX mmap is superior to direct_io mmap */
	if (FUSE_IS_DAX(inode))
		return fuse_dax_mmap(file, vma);

#ifdef CONFIG_FUSE_BPF
	/* TODO - this is simply passthrough, not a proper BPF filter */
	if (ff->backing_file)
		return fuse_backing_mmap(file, vma);
#endif

	/*
	 * If inode is in passthrough io mode, because it has some file open
	 * in passthrough mode, either mmap to backing file or fail mmap,
	 * because mixing cached mmap and passthrough io mode is not allowed.
	 */
	if (fuse_file_passthrough(ff))
		return fuse_passthrough_mmap(file, vma);
	/*
	 * Old Android passthrough did not handle this case, but did allow the mmap to continue.
	 * This will not cleanly handle the case of a shared mmap across passthrough and
	 * nonpassthrough at the same time, although shared mmap through cache and file io through
	 * the lower filesystem should work as expected, at a performance penalty.
	 */
	/* The branch below is compiled out; it would chain onto the `if` above. */
#if 0
	else if (fuse_inode_backing(get_fuse_inode(inode)))
		return -ENODEV;
#endif

	/*
	 * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
	 * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
	 */
	if (ff->open_flags & FOPEN_DIRECT_IO) {
		/*
		 * Can't provide the coherency needed for MAP_SHARED
		 * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
		 */
		if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
			return -ENODEV;

		invalidate_inode_pages2(file->f_mapping);

		if (!(vma->vm_flags & VM_MAYSHARE)) {
			/* MAP_PRIVATE */
			return generic_file_mmap(file, vma);
		}

		/*
		 * First mmap of direct_io file enters caching inode io mode.
		 * Also waits for parallel dio writers to go into serial mode
		 * (exclusive instead of shared lock).
		 * After first mmap, the inode stays in caching io mode until
		 * the direct_io file release.
		 */
		rc = fuse_file_cached_io_open(inode, ff);
		if (rc)
			return rc;
	}

	/* Shared mappings that may become writable are linked as write files. */
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		fuse_link_write_file(file);

	file_accessed(file);
	vma->vm_ops = &fuse_file_vm_ops;
	return 0;
}
2753
convert_fuse_file_lock(struct fuse_conn * fc,const struct fuse_file_lock * ffl,struct file_lock * fl)2754 static int convert_fuse_file_lock(struct fuse_conn *fc,
2755 const struct fuse_file_lock *ffl,
2756 struct file_lock *fl)
2757 {
2758 switch (ffl->type) {
2759 case F_UNLCK:
2760 break;
2761
2762 case F_RDLCK:
2763 case F_WRLCK:
2764 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2765 ffl->end < ffl->start)
2766 return -EIO;
2767
2768 fl->fl_start = ffl->start;
2769 fl->fl_end = ffl->end;
2770
2771 /*
2772 * Convert pid into init's pid namespace. The locks API will
2773 * translate it into the caller's pid namespace.
2774 */
2775 rcu_read_lock();
2776 fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
2777 rcu_read_unlock();
2778 break;
2779
2780 default:
2781 return -EIO;
2782 }
2783 fl->c.flc_type = ffl->type;
2784 return 0;
2785 }
2786
/*
 * Fill @inarg from @fl and set up @args as a single-input-argument lock
 * request with the given @opcode.  A non-zero @flock sets FUSE_LK_FLOCK in
 * the request.  Output arguments are left for the caller to set up.
 */
static void fuse_lk_fill(struct fuse_args *args, struct file *file,
			 const struct file_lock *fl, int opcode, pid_t pid,
			 int flock, struct fuse_lk_in *inarg)
{
	struct inode *inode = file_inode(file);
	struct fuse_file *ff = file->private_data;

	memset(inarg, 0, sizeof(*inarg));
	inarg->fh = ff->fh;
	inarg->owner = fuse_lock_owner_id(get_fuse_conn(inode), fl->c.flc_owner);
	inarg->lk.start = fl->fl_start;
	inarg->lk.end = fl->fl_end;
	inarg->lk.type = fl->c.flc_type;
	inarg->lk.pid = pid;
	if (flock)
		inarg->lk_flags |= FUSE_LK_FLOCK;

	args->opcode = opcode;
	args->nodeid = get_node_id(inode);
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(*inarg);
	args->in_args[0].value = inarg;
}
2810
fuse_getlk(struct file * file,struct file_lock * fl)2811 static int fuse_getlk(struct file *file, struct file_lock *fl)
2812 {
2813 struct inode *inode = file_inode(file);
2814 struct fuse_mount *fm = get_fuse_mount(inode);
2815 FUSE_ARGS(args);
2816 struct fuse_lk_in inarg;
2817 struct fuse_lk_out outarg;
2818 int err;
2819
2820 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2821 args.out_numargs = 1;
2822 args.out_args[0].size = sizeof(outarg);
2823 args.out_args[0].value = &outarg;
2824 err = fuse_simple_request(fm, &args);
2825 if (!err)
2826 err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
2827
2828 return err;
2829 }
2830
fuse_setlk(struct file * file,struct file_lock * fl,int flock)2831 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
2832 {
2833 struct inode *inode = file_inode(file);
2834 struct fuse_mount *fm = get_fuse_mount(inode);
2835 FUSE_ARGS(args);
2836 struct fuse_lk_in inarg;
2837 int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
2838 struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL;
2839 pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
2840 int err;
2841
2842 if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
2843 /* NLM needs asynchronous locks, which we don't support yet */
2844 return -ENOLCK;
2845 }
2846
2847 fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
2848 err = fuse_simple_request(fm, &args);
2849
2850 /* locking is restartable */
2851 if (err == -EINTR)
2852 err = -ERESTARTSYS;
2853
2854 return err;
2855 }
2856
fuse_file_lock(struct file * file,int cmd,struct file_lock * fl)2857 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2858 {
2859 struct inode *inode = file_inode(file);
2860 struct fuse_conn *fc = get_fuse_conn(inode);
2861 int err;
2862
2863 if (cmd == F_CANCELLK) {
2864 err = 0;
2865 } else if (cmd == F_GETLK) {
2866 if (fc->no_lock) {
2867 posix_test_lock(file, fl);
2868 err = 0;
2869 } else
2870 err = fuse_getlk(file, fl);
2871 } else {
2872 if (fc->no_lock)
2873 err = posix_lock_file(file, fl, NULL);
2874 else
2875 err = fuse_setlk(file, fl, 0);
2876 }
2877 return err;
2878 }
2879
fuse_file_flock(struct file * file,int cmd,struct file_lock * fl)2880 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2881 {
2882 struct inode *inode = file_inode(file);
2883 struct fuse_conn *fc = get_fuse_conn(inode);
2884 struct fuse_file *ff = file->private_data;
2885 int err;
2886
2887 #ifdef CONFIG_FUSE_BPF
2888 /* TODO - this is simply passthrough, not a proper BPF filter */
2889 if (ff->backing_file)
2890 return fuse_file_flock_backing(file, cmd, fl);
2891 #endif
2892
2893 if (fc->no_flock) {
2894 err = locks_lock_file_wait(file, fl);
2895 } else {
2896
2897 /* emulate flock with POSIX locks */
2898 ff->flock = true;
2899 err = fuse_setlk(file, fl, 1);
2900 }
2901
2902 return err;
2903 }
2904
/*
 * Map logical block @block of the file through a FUSE_BMAP request.
 *
 * Returns the block number from the reply, or 0 when the superblock has no
 * block device, the server is known not to support bmap, or the request
 * fails.  An -ENOSYS reply disables further FUSE_BMAP requests on this
 * connection.
 */
static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_bmap_out outarg;
	struct fuse_bmap_in inarg;
	FUSE_ARGS(args);
	int err;

	if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
		return 0;

	memset(&inarg, 0, sizeof(inarg));
	inarg.block = block;
	inarg.blocksize = inode->i_sb->s_blocksize;

	args.opcode = FUSE_BMAP;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;

	err = fuse_simple_request(fm, &args);
	if (!err)
		return outarg.block;

	if (err == -ENOSYS)
		fm->fc->no_bmap = 1;

	return 0;
}
2934
/*
 * Seek by asking the server with a FUSE_LSEEK request.  If the server does
 * not implement it (-ENOSYS, remembered in fc->no_lseek), fall back to
 * refreshing the size attribute and using generic_file_llseek().
 *
 * Returns the new file position or a negative errno.
 */
static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_mount *fm = get_fuse_mount(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_lseek_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.whence = whence
	};
	struct fuse_lseek_out outarg;
	int err;

	if (!fm->fc->no_lseek) {
		args.opcode = FUSE_LSEEK;
		args.nodeid = ff->nodeid;
		args.in_numargs = 1;
		args.in_args[0].size = sizeof(inarg);
		args.in_args[0].value = &inarg;
		args.out_numargs = 1;
		args.out_args[0].size = sizeof(outarg);
		args.out_args[0].value = &outarg;

		err = fuse_simple_request(fm, &args);
		if (!err)
			return vfs_setpos(file, outarg.offset,
					  inode->i_sb->s_maxbytes);
		if (err != -ENOSYS)
			return err;

		/* Not implemented by the server: don't ask again. */
		fm->fc->no_lseek = 1;
	}

	/* Fallback path */
	err = fuse_update_attributes(inode, file, STATX_SIZE);
	if (err)
		return err;

	return generic_file_llseek(file, offset, whence);
}
2978
/*
 * llseek handler.  SEEK_SET/SEEK_CUR use the generic helper directly,
 * SEEK_END first refreshes the size attribute, and SEEK_HOLE/SEEK_DATA are
 * passed to fuse_lseek(); the latter two cases run under the inode lock.
 * Any other @whence yields -EINVAL.
 *
 * With CONFIG_FUSE_BPF the backing path is tried first and, when it reports
 * that it handled the call, its result is returned.
 */
static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t pos;
#ifdef CONFIG_FUSE_BPF
	struct fuse_err_ret fer;

	fer = fuse_bpf_backing(inode, struct fuse_lseek_io,
			       fuse_lseek_initialize,
			       fuse_lseek_backing,
			       fuse_lseek_finalize,
			       file, offset, whence);
	if (fer.ret)
		return PTR_ERR(fer.result);
#endif

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
		return generic_file_llseek(file, offset, whence);
	case SEEK_END:
		inode_lock(inode);
		pos = fuse_update_attributes(inode, file, STATX_SIZE);
		if (!pos)
			pos = generic_file_llseek(file, offset, whence);
		inode_unlock(inode);
		return pos;
	case SEEK_HOLE:
	case SEEK_DATA:
		inode_lock(inode);
		pos = fuse_lseek(file, offset, whence);
		inode_unlock(inode);
		return pos;
	default:
		return -EINVAL;
	}
}
3020
3021 /*
3022 * All files which have been polled are linked to RB tree
3023 * fuse_conn->polled_files which is indexed by kh. Walk the tree and
3024 * find the matching one.
3025 */
fuse_find_polled_node(struct fuse_conn * fc,u64 kh,struct rb_node ** parent_out)3026 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
3027 struct rb_node **parent_out)
3028 {
3029 struct rb_node **link = &fc->polled_files.rb_node;
3030 struct rb_node *last = NULL;
3031
3032 while (*link) {
3033 struct fuse_file *ff;
3034
3035 last = *link;
3036 ff = rb_entry(last, struct fuse_file, polled_node);
3037
3038 if (kh < ff->kh)
3039 link = &last->rb_left;
3040 else if (kh > ff->kh)
3041 link = &last->rb_right;
3042 else
3043 return link;
3044 }
3045
3046 if (parent_out)
3047 *parent_out = last;
3048 return link;
3049 }
3050
3051 /*
3052 * The file is about to be polled. Make sure it's on the polled_files
3053 * RB tree. Note that files once added to the polled_files tree are
3054 * not removed before the file is released. This is because a file
3055 * polled once is likely to be polled again.
3056 */
fuse_register_polled_file(struct fuse_conn * fc,struct fuse_file * ff)3057 static void fuse_register_polled_file(struct fuse_conn *fc,
3058 struct fuse_file *ff)
3059 {
3060 spin_lock(&fc->lock);
3061 if (RB_EMPTY_NODE(&ff->polled_node)) {
3062 struct rb_node **link, *parent;
3063
3064 link = fuse_find_polled_node(fc, ff->kh, &parent);
3065 BUG_ON(*link);
3066 rb_link_node(&ff->polled_node, parent, link);
3067 rb_insert_color(&ff->polled_node, &fc->polled_files);
3068 }
3069 spin_unlock(&fc->lock);
3070 }
3071
fuse_file_poll(struct file * file,poll_table * wait)3072 __poll_t fuse_file_poll(struct file *file, poll_table *wait)
3073 {
3074 struct fuse_file *ff = file->private_data;
3075 struct fuse_mount *fm = ff->fm;
3076 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
3077 struct fuse_poll_out outarg;
3078 FUSE_ARGS(args);
3079 int err;
3080
3081 if (fm->fc->no_poll)
3082 return DEFAULT_POLLMASK;
3083
3084 poll_wait(file, &ff->poll_wait, wait);
3085 inarg.events = mangle_poll(poll_requested_events(wait));
3086
3087 /*
3088 * Ask for notification iff there's someone waiting for it.
3089 * The client may ignore the flag and always notify.
3090 */
3091 if (waitqueue_active(&ff->poll_wait)) {
3092 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
3093 fuse_register_polled_file(fm->fc, ff);
3094 }
3095
3096 args.opcode = FUSE_POLL;
3097 args.nodeid = ff->nodeid;
3098 args.in_numargs = 1;
3099 args.in_args[0].size = sizeof(inarg);
3100 args.in_args[0].value = &inarg;
3101 args.out_numargs = 1;
3102 args.out_args[0].size = sizeof(outarg);
3103 args.out_args[0].value = &outarg;
3104 err = fuse_simple_request(fm, &args);
3105
3106 if (!err)
3107 return demangle_poll(outarg.revents);
3108 if (err == -ENOSYS) {
3109 fm->fc->no_poll = 1;
3110 return DEFAULT_POLLMASK;
3111 }
3112 return EPOLLERR;
3113 }
3114 EXPORT_SYMBOL_GPL(fuse_file_poll);
3115
3116 /*
3117 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
3118 * wakes up the poll waiters.
3119 */
fuse_notify_poll_wakeup(struct fuse_conn * fc,struct fuse_notify_poll_wakeup_out * outarg)3120 int fuse_notify_poll_wakeup(struct fuse_conn *fc,
3121 struct fuse_notify_poll_wakeup_out *outarg)
3122 {
3123 u64 kh = outarg->kh;
3124 struct rb_node **link;
3125
3126 spin_lock(&fc->lock);
3127
3128 link = fuse_find_polled_node(fc, kh, NULL);
3129 if (*link) {
3130 struct fuse_file *ff;
3131
3132 ff = rb_entry(*link, struct fuse_file, polled_node);
3133 wake_up_interruptible_sync(&ff->poll_wait);
3134 }
3135
3136 spin_unlock(&fc->lock);
3137 return 0;
3138 }
3139
fuse_do_truncate(struct file * file)3140 static void fuse_do_truncate(struct file *file)
3141 {
3142 struct inode *inode = file->f_mapping->host;
3143 struct iattr attr;
3144
3145 attr.ia_valid = ATTR_SIZE;
3146 attr.ia_size = i_size_read(inode);
3147
3148 attr.ia_file = file;
3149 attr.ia_valid |= ATTR_FILE;
3150
3151 fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file);
3152 }
3153
/*
 * Round @off up to a multiple of fc->max_pages << PAGE_SHIFT, i.e. the
 * connection's max_pages expressed in bytes.
 */
static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
{
	return round_up(off, fc->max_pages << PAGE_SHIFT);
}
3158
/*
 * ->direct_IO implementation (installed in fuse_file_aops).
 *
 * Builds a struct fuse_io_priv describing the transfer at iocb->ki_pos and
 * hands it to fuse_direct_io() for writes or __fuse_direct_read() for
 * reads.
 *
 * Returns 0 for a read that starts at or beyond i_size, -ENOMEM if the
 * fuse_io_priv cannot be allocated, -EIOCBQUEUED when the request was
 * submitted asynchronously and is not waited for here, and otherwise the
 * result of the transfer (the helper's return value, or
 * fuse_get_res_by_io() when the async request was waited for).
 */
static ssize_t
fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t ret = 0;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	loff_t pos = 0;
	struct inode *inode;
	loff_t i_size;
	/* shortened: bytes trimmed off the iterator by the short-read path */
	size_t count = iov_iter_count(iter), shortened = 0;
	loff_t offset = iocb->ki_pos;
	struct fuse_io_priv *io;

	pos = offset;
	inode = file->f_mapping->host;
	i_size = i_size_read(inode);

	/* A read starting at or past EOF has nothing to transfer. */
	if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
		return 0;

	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
	if (!io)
		return -ENOMEM;
	spin_lock_init(&io->lock);
	kref_init(&io->refcnt);
	/*
	 * NOTE(review): the initial reqs count of 1 is presumably balanced
	 * by the fuse_aio_complete() call made below after submission --
	 * confirm against fuse_aio_complete().
	 */
	io->reqs = 1;
	io->bytes = -1;
	io->size = 0;
	io->offset = offset;
	io->write = (iov_iter_rw(iter) == WRITE);
	io->err = 0;
	/*
	 * By default, we want to optimize all I/Os with async request
	 * submission to the client filesystem if supported.
	 */
	io->async = ff->fm->fc->async_dio;
	io->iocb = iocb;
	io->blocking = is_sync_kiocb(iocb);

	/*
	 * optimization for short read: trim the iterator to the distance
	 * to EOF (rounded up by fuse_round_up()) and remember how much was
	 * cut so the iterator can be re-expanded after the transfer.
	 */
	if (io->async && !io->write && offset + count > i_size) {
		iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
		shortened = count - iov_iter_count(iter);
		count -= shortened;
	}

	/*
	 * We cannot asynchronously extend the size of a file.
	 * In such case the aio will behave exactly like sync io.
	 */
	if ((offset + count > i_size) && io->write)
		io->blocking = true;

	if (io->async && io->blocking) {
		/*
		 * Additional reference to keep io around after
		 * calling fuse_aio_complete()
		 */
		kref_get(&io->refcnt);
		io->done = &wait;
	}

	if (iov_iter_rw(iter) == WRITE) {
		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
	} else {
		ret = __fuse_direct_read(io, iter, &pos);
	}
	/* Give back the bytes trimmed by the short-read optimization. */
	iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);

	if (io->async) {
		/* Sample the flag before fuse_aio_complete() is called on io. */
		bool blocking = io->blocking;

		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);

		/*
		 * we have a non-extending, async request, so return
		 * (io is not touched again and no reference is dropped here
		 * on this path)
		 */
		if (!blocking)
			return -EIOCBQUEUED;

		wait_for_completion(&wait);
		ret = fuse_get_res_by_io(io);
	}

	kref_put(&io->refcnt, fuse_io_release);

	if (iov_iter_rw(iter) == WRITE) {
		fuse_write_update_attr(inode, pos, ret);
		/* For extending writes we already hold exclusive lock */
		/*
		 * A failed extending write re-sends the current i_size via
		 * fuse_do_truncate(); presumably this trims anything the
		 * server stored beyond it -- confirm.
		 */
		if (ret < 0 && offset + count > i_size)
			fuse_do_truncate(file);
	}

	return ret;
}
3254
/*
 * Write back and wait on dirty page cache of @inode starting at @start,
 * then call fuse_sync_writes() if that succeeded.
 *
 * @end is accepted but not used: the flush always runs from @start up to
 * LLONG_MAX.  NOTE(review): this looks deliberate (data beyond the
 * caller's range is flushed too) -- confirm against the file's history
 * before narrowing it.
 *
 * Returns 0 on success or the error from filemap_write_and_wait_range().
 */
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
	int err;

	err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
	if (err)
		return err;

	fuse_sync_writes(inode);
	return 0;
}
3264
/*
 * ->fallocate implementation: forward the request to the server as
 * FUSE_FALLOCATE.
 *
 * Only FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE and FALLOC_FL_ZERO_RANGE
 * are accepted; any other mode bit, or a connection already marked
 * no_fallocate, yields -EOPNOTSUPP.  A -ENOSYS reply sets
 * fc->no_fallocate and is likewise reported as -EOPNOTSUPP.
 *
 * With CONFIG_FUSE_BPF the call is first offered to fuse_bpf_backing();
 * when that sets fer.ret, PTR_ERR(fer.result) is returned and nothing
 * below runs.
 *
 * Returns err: 0 on success, otherwise the error of the step that failed.
 */
static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
				loff_t length)
{
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_mount *fm = ff->fm;
	FUSE_ARGS(args);
	struct fuse_fallocate_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.length = length,
		.mode = mode
	};
	int err;
	/*
	 * On DAX inodes, any mode that is not a pure KEEP_SIZE allocation
	 * (i.e. may change the size, or punches/zeroes a range) also takes
	 * the mapping's invalidate lock below.
	 */
	bool block_faults = FUSE_IS_DAX(inode) &&
		(!(mode & FALLOC_FL_KEEP_SIZE) ||
		 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));

#ifdef CONFIG_FUSE_BPF
	struct fuse_err_ret fer;

	fer = fuse_bpf_backing(inode, struct fuse_fallocate_in,
			       fuse_file_fallocate_initialize,
			       fuse_file_fallocate_backing,
			       fuse_file_fallocate_finalize,
			       file, mode, offset, length);
	if (fer.ret)
		return PTR_ERR(fer.result);
#endif

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_ZERO_RANGE))
		return -EOPNOTSUPP;

	if (fm->fc->no_fallocate)
		return -EOPNOTSUPP;

	/* Lock order: inode lock first, then (for DAX) the invalidate lock. */
	inode_lock(inode);
	if (block_faults) {
		filemap_invalidate_lock(inode->i_mapping);
		err = fuse_dax_break_layouts(inode, 0, -1);
		if (err)
			goto out;
	}

	/* Flush dirty page cache before the server punches/zeroes the range. */
	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
		loff_t endbyte = offset + length - 1;

		err = fuse_writeback_range(inode, offset, endbyte);
		if (err)
			goto out;
	}

	/* An extending allocation must pass the new-size check first. */
	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    offset + length > i_size_read(inode)) {
		err = inode_newsize_ok(inode, offset + length);
		if (err)
			goto out;
	}

	err = file_modified(file);
	if (err)
		goto out;

	/* Flag the size as unstable for the duration of the request. */
	if (!(mode & FALLOC_FL_KEEP_SIZE))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	args.opcode = FUSE_FALLOCATE;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		/* Server lacks FUSE_FALLOCATE: remember it for later calls. */
		fm->fc->no_fallocate = 1;
		err = -EOPNOTSUPP;
	}
	if (err)
		goto out;

	/* we could have extended the file */
	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
		if (fuse_write_update_attr(inode, offset + length, length))
			file_update_time(file);
	}

	/* Drop cached pages covering the punched/zeroed byte range. */
	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
		truncate_pagecache_range(inode, offset, offset + length - 1);

	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);

out:
	/*
	 * Also reached from the early error exits, where the bit has not
	 * been set yet; the clear_bit() runs there as well.
	 */
	if (!(mode & FALLOC_FL_KEEP_SIZE))
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	if (block_faults)
		filemap_invalidate_unlock(inode->i_mapping);

	inode_unlock(inode);

	/* Called after both locks have been released. */
	fuse_flush_time_update(inode);

	return err;
}
3370
/*
 * Ask the server to copy up to @len bytes from @file_in at @pos_in to
 * @file_out at @pos_out using a single FUSE_COPY_FILE_RANGE request.
 *
 * With CONFIG_FUSE_BPF the call is first offered to fuse_bpf_backing();
 * when that sets fer.ret, PTR_ERR(fer.result) is returned and nothing
 * below runs.
 *
 * Returns the byte count reported by the server (outarg.size), or a
 * negative error:
 *   -EOPNOTSUPP  the connection is marked no_copy_file_range, or the
 *                server answered -ENOSYS (which also sets that flag);
 *   -EXDEV       the two files live on different superblocks;
 *   -EIO         the server reported more bytes copied than @len;
 *   otherwise the error from writeback, file_modified() or the request.
 */
static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
				      struct file *file_out, loff_t pos_out,
				      size_t len, unsigned int flags)
{
	struct fuse_file *ff_in = file_in->private_data;
	struct fuse_file *ff_out = file_out->private_data;
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
	struct fuse_mount *fm = ff_in->fm;
	struct fuse_conn *fc = fm->fc;
	FUSE_ARGS(args);
	/*
	 * The requested length is capped at UINT_MAX & PAGE_MASK, a
	 * page-aligned value that fits in 32 bits.  NOTE(review):
	 * presumably because the reply's size field is 32 bits wide --
	 * confirm against struct fuse_write_out.
	 */
	struct fuse_copy_file_range_in inarg = {
		.fh_in = ff_in->fh,
		.off_in = pos_in,
		.nodeid_out = ff_out->nodeid,
		.fh_out = ff_out->fh,
		.off_out = pos_out,
		.len = min_t(size_t, len, UINT_MAX & PAGE_MASK),
		.flags = flags
	};
	struct fuse_write_out outarg;
	ssize_t err;
	/* mark unstable when write-back is not used, and file_out gets
	 * extended */
	/* Note: evaluated with the caller's @len, not the capped inarg.len. */
	bool is_unstable = (!fc->writeback_cache) &&
			   ((pos_out + len) > inode_out->i_size);

#ifdef CONFIG_FUSE_BPF
	struct fuse_err_ret fer;

	fer = fuse_bpf_backing(file_in->f_inode, struct fuse_copy_file_range_io,
			       fuse_copy_file_range_initialize,
			       fuse_copy_file_range_backing,
			       fuse_copy_file_range_finalize,
			       file_in, pos_in, file_out, pos_out, len, flags);
	if (fer.ret)
		return PTR_ERR(fer.result);
#endif

	if (fc->no_copy_file_range)
		return -EOPNOTSUPP;

	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
		return -EXDEV;

	/*
	 * Flush the source under its own inode lock, which is dropped
	 * again before the destination lock is taken; the two inode locks
	 * are never held at the same time here.
	 */
	inode_lock(inode_in);
	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
	inode_unlock(inode_in);
	if (err)
		return err;

	inode_lock(inode_out);

	err = file_modified(file_out);
	if (err)
		goto out;

	/*
	 * Write out dirty pages in the destination file before sending the COPY
	 * request to userspace.  After the request is completed, truncate off
	 * pages (including partial ones) from the cache that have been copied,
	 * since these contain stale data at that point.
	 *
	 * This should be mostly correct, but if the COPY writes to partial
	 * pages (at the start or end) and the parts not covered by the COPY are
	 * written through a memory map after calling fuse_writeback_range(),
	 * then these partial page modifications will be lost on truncation.
	 *
	 * It is unlikely that someone would rely on such mixed style
	 * modifications.  Yet this does give less guarantees than if the
	 * copying was performed with write(2).
	 *
	 * To fix this a mapping->invalidate_lock could be used to prevent new
	 * faults while the copy is ongoing.
	 */
	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
	if (err)
		goto out;

	if (is_unstable)
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	args.opcode = FUSE_COPY_FILE_RANGE;
	args.nodeid = ff_in->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fm, &args);
	if (err == -ENOSYS) {
		/* Server lacks FUSE_COPY_FILE_RANGE: remember it. */
		fc->no_copy_file_range = 1;
		err = -EOPNOTSUPP;
	}
	/* Reject a reply that claims more bytes than were asked for. */
	if (!err && outarg.size > len)
		err = -EIO;

	if (err)
		goto out;

	/*
	 * Drop destination pages covering the copied bytes, widened outward
	 * to whole pages.
	 */
	truncate_inode_pages_range(inode_out->i_mapping,
				   ALIGN_DOWN(pos_out, PAGE_SIZE),
				   ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);

	file_update_time(file_out);
	fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);

	err = outarg.size;
out:
	/* Also reached from error exits taken before the bit was set. */
	if (is_unstable)
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	inode_unlock(inode_out);
	file_accessed(file_in);

	fuse_flush_time_update(inode_out);

	return err;
}
3492
fuse_copy_file_range(struct file * src_file,loff_t src_off,struct file * dst_file,loff_t dst_off,size_t len,unsigned int flags)3493 static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
3494 struct file *dst_file, loff_t dst_off,
3495 size_t len, unsigned int flags)
3496 {
3497 ssize_t ret;
3498
3499 ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
3500 len, flags);
3501
3502 if (ret == -EOPNOTSUPP || ret == -EXDEV)
3503 ret = splice_copy_file_range(src_file, src_off, dst_file,
3504 dst_off, len);
3505 return ret;
3506 }
3507
3508 static const struct file_operations fuse_file_operations = {
3509 .llseek = fuse_file_llseek,
3510 .read_iter = fuse_file_read_iter,
3511 .write_iter = fuse_file_write_iter,
3512 .mmap = fuse_file_mmap,
3513 .open = fuse_open,
3514 .flush = fuse_flush,
3515 .release = fuse_release,
3516 .fsync = fuse_fsync,
3517 .lock = fuse_file_lock,
3518 .get_unmapped_area = thp_get_unmapped_area,
3519 .flock = fuse_file_flock,
3520 .splice_read = fuse_splice_read,
3521 .splice_write = fuse_splice_write,
3522 .unlocked_ioctl = fuse_file_ioctl,
3523 .compat_ioctl = fuse_file_compat_ioctl,
3524 .poll = fuse_file_poll,
3525 .fallocate = fuse_file_fallocate,
3526 .copy_file_range = fuse_copy_file_range,
3527 };
3528
3529 static const struct address_space_operations fuse_file_aops = {
3530 .read_folio = fuse_read_folio,
3531 .readahead = fuse_readahead,
3532 .writepages = fuse_writepages,
3533 .launder_folio = fuse_launder_folio,
3534 .dirty_folio = filemap_dirty_folio,
3535 .migrate_folio = filemap_migrate_folio,
3536 .bmap = fuse_bmap,
3537 .direct_IO = fuse_direct_IO,
3538 .write_begin = fuse_write_begin,
3539 .write_end = fuse_write_end,
3540 };
3541
fuse_init_file_inode(struct inode * inode,unsigned int flags)3542 void fuse_init_file_inode(struct inode *inode, unsigned int flags)
3543 {
3544 struct fuse_inode *fi = get_fuse_inode(inode);
3545
3546 inode->i_fop = &fuse_file_operations;
3547 inode->i_data.a_ops = &fuse_file_aops;
3548
3549 INIT_LIST_HEAD(&fi->write_files);
3550 INIT_LIST_HEAD(&fi->queued_writes);
3551 fi->writectr = 0;
3552 fi->iocachectr = 0;
3553 fi->iopassctr = 0;
3554 init_waitqueue_head(&fi->page_waitq);
3555 init_waitqueue_head(&fi->direct_io_waitq);
3556 fi->writepages = RB_ROOT;
3557
3558 if (IS_ENABLED(CONFIG_FUSE_DAX))
3559 fuse_dax_inode_init(inode, flags);
3560 }
3561