// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/file_remote.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

#include <linux/backing-dev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include "file_remote.h"

#include "comm/socket_adapter.h"
#include "hmdfs.h"
#include "hmdfs_client.h"
#include "hmdfs_dentryfile.h"
#include "hmdfs_trace.h"
static inline bool hmdfs_remote_write_cache_expired(
	struct hmdfs_inode_info *info)
{
	/* time_after() is wraparound-safe on jiffies */
	return time_after(jiffies, info->writecache_expire);
}

enum expire_reason {
	ALL_GOOD = 0,
	INO_DISMATCH = 1,		/* remote inode number changed */
	SIZE_OR_CTIME_DISMATCH = 2,	/* size/ctime differ from last open */
	TIMER_EXPIRE = 3,		/* write-cache timer has expired */
	TIMER_WORKING = 4,		/* write-cache timer still running */
	STABLE_CTIME_DISMATCH = 5,	/* stable ctime changed (or unset) */
	KEEP_CACHE = 6,			/* caller asked to keep local cache */
};

/*
 * hmdfs_open_final_remote - Do the final steps of opening a remote file:
 * update the local inode cache and decide whether or not to truncate the
 * inode pages.
 *
 * @info: hmdfs inode info
 * @open_ret: values returned from the remote side when opening the file
 * @file: the file being opened
 * @keep_cache: keep local cache & i_size
 */
static int hmdfs_open_final_remote(struct hmdfs_inode_info *info,
				   struct hmdfs_open_ret *open_ret,
				   struct file *file, bool keep_cache)
{
	struct inode *inode = &info->vfs_inode;
	bool truncate = false;
	enum expire_reason reason = ALL_GOOD;
	int ret = 0;

	/*
	 * If the remote inode number changed, the lookup hit stale data;
	 * return -ESTALE so the file is reopened with metadata from a
	 * remote getattr.
	 */
	if (info->remote_ino != open_ret->ino) {
		hmdfs_debug(
			"got stale local inode, ino in local %llu, ino from open %llu",
			info->remote_ino, open_ret->ino);
		hmdfs_send_close(info->conn, &open_ret->fid);
		reason = INO_DISMATCH;
		ret = -ESTALE;
		goto out;
	}

	if (keep_cache) {
		reason = KEEP_CACHE;
		trace_hmdfs_open_final_remote(info, open_ret, file, reason);
		goto set_fid_out;
	}

	/*
	 * Truncate if the remote size does not match the local inode, or
	 * the remote ctime does not match the one recorded the last time
	 * the same file was opened.
	 */
	if (inode->i_size != open_ret->file_size ||
	    hmdfs_time_compare(&info->remote_ctime, &open_ret->remote_ctime)) {
		truncate = true;
		reason = SIZE_OR_CTIME_DISMATCH;
		goto out;
	}

	/*
	 * If 'writecache_expire' is set, check whether it has expired and
	 * skip the stable_ctime check.
	 */
	if (info->writecache_expire) {
		truncate = hmdfs_remote_write_cache_expired(info);
		if (truncate)
			reason = TIMER_EXPIRE;
		else
			reason = TIMER_WORKING;
		goto out;
	}

	/* the first open, or remote ctime is ahead of the remote time */
	if (info->stable_ctime.tv_sec == 0 && info->stable_ctime.tv_nsec == 0) {
		truncate = true;
		reason = STABLE_CTIME_DISMATCH;
		goto out;
	}

	/*
	 * - if last stable_ctime == stable_ctime, we do nothing.
	 *   a. if ctime < stable_ctime, data is ensured to be uptodate,
	 *   b. if ctime == stable_ctime, stale data might be accessed. This is
	 *      acceptable since pagecache will be dropped later.
	 *   c. ctime > stable_ctime is impossible.
	 * - if last stable_ctime < stable_ctime, we clear the cache.
	 *   d. ctime != last stable_ctime is impossible
	 *   e. ctime == last stable_ctime, the file may have been read again
	 *      as in case b, thus we need to drop the cache.
	 * - if last stable_ctime > stable_ctime, we clear the cache.
	 *   stable_ctime must be zero in this case, which is possible
	 *   because the system time might have been changed.
	 */
	if (hmdfs_time_compare(&info->stable_ctime, &open_ret->stable_ctime)) {
		truncate = true;
		reason = STABLE_CTIME_DISMATCH;
		goto out;
	}

out:
	trace_hmdfs_open_final_remote(info, open_ret, file, reason);
	if (ret)
		return ret;

	if (reason == SIZE_OR_CTIME_DISMATCH) {
		inode->i_ctime = open_ret->remote_ctime;
		info->remote_ctime = open_ret->remote_ctime;
	}

	if (truncate) {
		info->writecache_expire = 0;
		truncate_inode_pages(inode->i_mapping, 0);
	}

	atomic64_set(&info->write_counter, 0);
	info->stable_ctime = open_ret->stable_ctime;
	i_size_write(inode, open_ret->file_size);
	info->getattr_isize = HMDFS_STALE_REMOTE_ISIZE;
set_fid_out:
	spin_lock(&info->fid_lock);
	info->fid = open_ret->fid;
	spin_unlock(&info->fid_lock);
	return 0;
}
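
/*
 * Editor's summary of the cache decision above, per expire_reason:
 *
 *   ALL_GOOD               - keep the pagecache
 *   INO_DISMATCH           - return -ESTALE, the caller reopens
 *   KEEP_CACHE             - keep pagecache and i_size, update fid only
 *   SIZE_OR_CTIME_DISMATCH - drop the pagecache, refresh ctime from remote
 *   TIMER_EXPIRE           - drop the pagecache
 *   TIMER_WORKING          - keep the pagecache until the timer expires
 *   STABLE_CTIME_DISMATCH  - drop the pagecache
 */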

int hmdfs_do_open_remote(struct file *file, bool keep_cache)
{
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(file));
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_open_ret open_ret;
	__u8 file_type = hmdfs_d(file->f_path.dentry)->file_type;
	char *send_buf;
	int err = 0;

	send_buf = hmdfs_get_dentry_relative_path(file->f_path.dentry);
	if (!send_buf) {
		err = -ENOMEM;
		goto out_free;
	}
	err = hmdfs_send_open(conn, send_buf, file_type, &open_ret);
	if (err) {
		hmdfs_err("hmdfs_send_open return failed with %d", err);
		goto out_free;
	}

	err = hmdfs_open_final_remote(info, &open_ret, file, keep_cache);

out_free:
	kfree(send_buf);
	return err;
}

static inline bool hmdfs_remote_need_reopen(struct hmdfs_inode_info *info)
{
	return test_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
}

static inline bool hmdfs_remote_is_opening_file(struct hmdfs_inode_info *info)
{
	return test_bit(HMDFS_FID_OPENING, &info->fid_flags);
}

/* Wait until no other task is opening the file; called with fid_lock held */
static int hmdfs_remote_wait_opening_file(struct hmdfs_inode_info *info)
{
	int err;

	if (!hmdfs_remote_is_opening_file(info))
		return 0;

	err = ___wait_event(info->fid_wq, !hmdfs_remote_is_opening_file(info),
			    TASK_INTERRUPTIBLE, 0, 0,
			    spin_unlock(&info->fid_lock);
			    schedule();
			    spin_lock(&info->fid_lock));
	if (err)
		err = -EINTR;

	return err;
}
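
/*
 * For clarity, the ___wait_event() call above expands to roughly the
 * following open-coded loop (editor's sketch, not part of the original):
 *
 *	for (;;) {
 *		prepare_to_wait_event(&info->fid_wq, &wq_entry,
 *				      TASK_INTERRUPTIBLE);
 *		if (!hmdfs_remote_is_opening_file(info))
 *			break;
 *		if (signal_pending(current)) {
 *			err = -ERESTARTSYS;
 *			break;
 *		}
 *		spin_unlock(&info->fid_lock);
 *		schedule();
 *		spin_lock(&info->fid_lock);
 *	}
 *	finish_wait(&info->fid_wq, &wq_entry);
 */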

static int hmdfs_remote_file_reopen(struct hmdfs_inode_info *info,
				    struct file *filp)
{
	int err = 0;
	struct hmdfs_peer *conn = info->conn;
	struct inode *inode = NULL;
	struct hmdfs_fid fid;

	if (conn->status == NODE_STAT_OFFLINE)
		return -EAGAIN;

	spin_lock(&info->fid_lock);
	err = hmdfs_remote_wait_opening_file(info);
	if (err || !hmdfs_remote_need_reopen(info)) {
		spin_unlock(&info->fid_lock);
		goto out;
	}

	set_bit(HMDFS_FID_OPENING, &info->fid_flags);
	fid = info->fid;
	spin_unlock(&info->fid_lock);

	inode = &info->vfs_inode;
	inode_lock(inode);
	/*
	 * Closing the old fid is meaningless in most cases, except for one:
	 *
	 * read process A            read process B
	 * err = -EBADF              err = -EBADF (caused by re-online)
	 * set_need_reopen
	 * do reopen
	 *   fid = new fid_1         [server holds fid_1]
	 *                           set need_reopen
	 *                           do reopen
	 *                             send close (fid_1) // in case of leak
	 *                             fid = new fid_2
	 */
	if (fid.id != HMDFS_INODE_INVALID_FILE_ID)
		hmdfs_send_close(conn, &fid);
	err = hmdfs_do_open_remote(filp, true);
	inode_unlock(inode);

	spin_lock(&info->fid_lock);
	/*
	 * This may lose a bit set by the offline handler, but the server
	 * will tell us whether the newly-opened file id was generated
	 * before it went offline: if it was, operations on that file id
	 * will return -EBADF and the HMDFS_FID_NEED_OPEN bit will be set
	 * again.
	 */
	if (!err)
		clear_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
	clear_bit(HMDFS_FID_OPENING, &info->fid_flags);
	spin_unlock(&info->fid_lock);

	wake_up_interruptible_all(&info->fid_wq);
out:
	return err;
}

static int hmdfs_remote_check_and_reopen(struct hmdfs_inode_info *info,
					 struct file *filp)
{
	if (!hmdfs_remote_need_reopen(info))
		return 0;

	return hmdfs_remote_file_reopen(info, filp);
}

void hmdfs_do_close_remote(struct kref *kref)
{
	struct hmdfs_inode_info *info =
		container_of(kref, struct hmdfs_inode_info, ref);
	struct hmdfs_fid fid;

	hmdfs_remote_fetch_fid(info, &fid);
	/* the close request may complete asynchronously on the remote side */
	hmdfs_send_close(info->conn, &fid);
}

static inline bool hmdfs_remote_need_track_file(const struct hmdfs_sb_info *sbi,
						fmode_t mode)
{
	return (hmdfs_is_stash_enabled(sbi) && (mode & FMODE_WRITE));
}

static void
hmdfs_remote_del_wr_opened_inode_nolock(struct hmdfs_inode_info *info)
{
	WARN_ON(list_empty(&info->wr_opened_node));
	if (atomic_dec_and_test(&info->wr_opened_cnt))
		list_del_init(&info->wr_opened_node);
}

void hmdfs_remote_del_wr_opened_inode(struct hmdfs_peer *conn,
				      struct hmdfs_inode_info *info)
{
	spin_lock(&conn->wr_opened_inode_lock);
	hmdfs_remote_del_wr_opened_inode_nolock(info);
	spin_unlock(&conn->wr_opened_inode_lock);
}

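/*
 * Editor's note: the first writable opener links the inode onto
 * conn->wr_opened_inode_list and later openers only bump wr_opened_cnt;
 * hmdfs_remote_del_wr_opened_inode_nolock() above unlinks it once the
 * count drops back to zero. Tracking is only done when stashing is
 * enabled (see hmdfs_remote_need_track_file()), presumably so the stash
 * code can find every write-opened inode of a peer, e.g. when it goes
 * offline.
 */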
void hmdfs_remote_add_wr_opened_inode_nolock(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	if (list_empty(&info->wr_opened_node)) {
		atomic_set(&info->wr_opened_cnt, 1);
		list_add_tail(&info->wr_opened_node,
			      &conn->wr_opened_inode_list);
	} else {
		atomic_inc(&info->wr_opened_cnt);
	}
}

static void hmdfs_remote_add_wr_opened_inode(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	spin_lock(&conn->wr_opened_inode_lock);
	hmdfs_remote_add_wr_opened_inode_nolock(conn, info);
	spin_unlock(&conn->wr_opened_inode_lock);
}

int hmdfs_file_open_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct kref *ref = &(info->ref);
	int err = 0;

	inode_lock(inode);
	/* the first opener does the remote open; later opens take a ref */
	if (kref_read(ref) == 0) {
		err = hmdfs_do_open_remote(file, false);
		if (err == 0)
			kref_init(ref);
	} else {
		kref_get(ref);
	}
	inode_unlock(inode);

	if (!err && hmdfs_remote_need_track_file(hmdfs_sb(inode->i_sb),
						 file->f_mode))
		hmdfs_remote_add_wr_opened_inode(info->conn, info);

	return err;
}

static void hmdfs_set_writecache_expire(struct hmdfs_inode_info *info,
					unsigned int seconds)
{
	unsigned long new_expire = jiffies + seconds * HZ;

	/*
	 * When the file has been written before closing, arm the pagecache
	 * expiry timer; never push an already-armed expiry further into the
	 * future. This is necessary because ctime might stay the same after
	 * an overwrite.
	 */
	if (info->writecache_expire &&
	    time_after(new_expire, info->writecache_expire))
		return;

	info->writecache_expire = new_expire;
}

static void hmdfs_remote_keep_writecache(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = NULL;
	struct kref *ref = NULL;
	struct hmdfs_getattr_ret *getattr_ret = NULL;
	unsigned int write_cache_timeout =
		hmdfs_sb(inode->i_sb)->write_cache_timeout;
	int err;

	if (!write_cache_timeout)
		return;

	info = hmdfs_i(inode);
	ref = &(info->ref);
	/*
	 * Do nothing if the file is still open elsewhere or has not been
	 * written.
	 */
	if (kref_read(ref) > 0 || !atomic64_read(&info->write_counter))
		return;

	/*
	 * If remote getattr fails and we don't update ctime, the pagecache
	 * will be truncated the next time the file is opened.
	 */
	err = hmdfs_remote_getattr(info->conn, file_dentry(file), 0,
				   &getattr_ret);
	if (err) {
		hmdfs_err("remote getattr failed with err %d", err);
		return;
	}

	if (!(getattr_ret->stat.result_mask & STATX_CTIME)) {
		hmdfs_err("get remote ctime failed with mask 0x%x",
			  getattr_ret->stat.result_mask);
		kfree(getattr_ret);
		return;
	}
	/*
	 * Update ctime from the remote side, so that the pagecache will not
	 * be truncated on the next open.
	 */
	inode->i_ctime = getattr_ret->stat.ctime;
	info->remote_ctime = getattr_ret->stat.ctime;
	hmdfs_set_writecache_expire(info, write_cache_timeout);
	kfree(getattr_ret);
}
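
/*
 * Editor's summary: on the last close of a written file, the steps above
 * refresh the local ctime from a fresh remote getattr and arm
 * writecache_expire, so that a re-open within write_cache_timeout seconds
 * takes the TIMER_WORKING path in hmdfs_open_final_remote() and keeps the
 * local pagecache even though the file changed under our own writes.
 */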

int hmdfs_file_release_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);

	if (hmdfs_remote_need_track_file(hmdfs_sb(inode->i_sb), file->f_mode))
		hmdfs_remote_del_wr_opened_inode(info->conn, info);

	inode_lock(inode);
	kref_put(&info->ref, hmdfs_do_close_remote);
	hmdfs_remote_keep_writecache(inode, file);
	inode_unlock(inode);

	return 0;
}

static int hmdfs_file_flush(struct file *file, fl_owner_t id)
{
	int err = 0;
	struct inode *inode = file_inode(file);

	if (!(file->f_mode & FMODE_WRITE))
		return 0;

	/*
	 * Continue even if the file reopen fails, because there may be no
	 * dirty pages to write back at all.
	 */
	hmdfs_remote_check_and_reopen(hmdfs_i(inode), file);

	/*
	 * Waiting on wpage_sem right away would hurt performance badly, so
	 * kick off writeback first to issue as many writebacks as we can,
	 * expecting the async writebacks to be mostly finished by the time
	 * we take the semaphore and wait.
	 */
	filemap_fdatawrite(inode->i_mapping);
	down_write(&hmdfs_i(inode)->wpage_sem);
	err = filemap_write_and_wait(inode->i_mapping);
	up_write(&hmdfs_i(inode)->wpage_sem);
	return err;
}

static ssize_t hmdfs_file_read_iter_remote(struct kiocb *iocb,
					   struct iov_iter *iter)
{
	struct file *filp = iocb->ki_filp;
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(filp));
	struct file_ra_state *ra = NULL;
	unsigned int rtt;
	int err;
	bool tried = false;

retry:
	err = hmdfs_remote_check_and_reopen(info, filp);
	if (err)
		return err;

	ra = &filp->f_ra;
	/* rtt is measured in units of 10 msecs */
	rtt = hmdfs_tcpi_rtt(info->conn) / 10000;
	switch (rtt) {
	case 0:
		break;
	case 1:
		ra->ra_pages = 256;
		break;
	case 2:
		ra->ra_pages = 512;
		break;
	default:
		ra->ra_pages = 1024;
		break;
	}
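	/*
	 * Editor's note: e.g. a TCP RTT of ~25 ms yields rtt == 2 and a
	 * 512-page readahead window (2 MiB with 4 KiB pages); the higher
	 * the latency, the more readahead is needed to keep the link busy.
	 */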

	err = generic_file_read_iter(iocb, iter);
	if (err < 0 && !tried && hmdfs_remote_need_reopen(info)) {
		/* Read from a stale fid, try read again once. */
		tried = true;
		goto retry;
	}

	return err;
}

static inline bool hmdfs_is_file_unwritable(const struct hmdfs_inode_info *info,
					    bool check_stash)
{
	return (check_stash && hmdfs_inode_is_stashing(info)) ||
	       !hmdfs_is_node_online(info->conn);
}

static ssize_t __hmdfs_file_write_iter_remote(struct kiocb *iocb,
					      struct iov_iter *iter,
					      bool check_stash)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = file_inode(filp);
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	ssize_t ret;

	if (hmdfs_is_file_unwritable(info, check_stash))
		return -EAGAIN;

	ret = hmdfs_remote_check_and_reopen(info, filp);
	if (ret)
		return ret;

	inode_lock(inode);
	/* recheck: the node may have gone offline or started stashing */
	if (hmdfs_is_file_unwritable(info, check_stash)) {
		ret = -EAGAIN;
		goto out;
	}
	ret = generic_write_checks(iocb, iter);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, iter);
out:
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}

ssize_t hmdfs_file_write_iter_remote_nocheck(struct kiocb *iocb,
					     struct iov_iter *iter)
{
	return __hmdfs_file_write_iter_remote(iocb, iter, false);
}

static ssize_t hmdfs_file_write_iter_remote(struct kiocb *iocb,
					    struct iov_iter *iter)
{
	return __hmdfs_file_write_iter_remote(iocb, iter, true);
}

/* hmdfs does not support mmap writes to remote files */
static vm_fault_t hmdfs_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct hmdfs_file_vm_ops = {
	.fault = filemap_fault,
	.map_pages = filemap_map_pages,
	.page_mkwrite = hmdfs_page_mkwrite,
};
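
/*
 * Editor's note: because .page_mkwrite unconditionally returns
 * VM_FAULT_SIGBUS, the first write fault on a shared writable mapping
 * (mmap(..., PROT_WRITE, MAP_SHARED, ...)) delivers SIGBUS to the task,
 * while read faults go through the regular filemap paths above.
 */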

static int hmdfs_file_mmap_remote(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &hmdfs_file_vm_ops;
	file_accessed(file);

	return 0;
}

static int hmdfs_file_fsync_remote(struct file *file, loff_t start, loff_t end,
				   int datasync)
{
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(file));
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_fid fid;
	int err;

	trace_hmdfs_fsync_enter_remote(conn->sbi, conn->device_id,
				       info->remote_ino, datasync);
	/*
	 * Continue even if the file reopen fails, because there may be no
	 * dirty pages to write back at all.
	 */
	hmdfs_remote_check_and_reopen(info, file);

	filemap_fdatawrite(file->f_mapping);
	down_write(&info->wpage_sem);
	err = file_write_and_wait_range(file, start, end);
	up_write(&info->wpage_sem);
	if (err) {
		hmdfs_err("local fsync fail with %d", err);
		goto out;
	}

	hmdfs_remote_fetch_fid(info, &fid);
	err = hmdfs_send_fsync(conn, &fid, start, end, datasync);
	if (err)
		hmdfs_err("send fsync fail with %d", err);

out:
	trace_hmdfs_fsync_exit_remote(conn->sbi, conn->device_id,
				      info->remote_ino,
				      get_cmd_timeout(conn->sbi, F_FSYNC), err);

	/* map the timeout onto a POSIX-compatible return code */
	if (err == -ETIME)
		err = -EIO;

	return err;
}

const struct file_operations hmdfs_dev_file_fops_remote = {
	.owner = THIS_MODULE,
	.llseek = generic_file_llseek,
	.read_iter = hmdfs_file_read_iter_remote,
	.write_iter = hmdfs_file_write_iter_remote,
	.mmap = hmdfs_file_mmap_remote,
	.open = hmdfs_file_open_remote,
	.release = hmdfs_file_release_remote,
	.flush = hmdfs_file_flush,
	.fsync = hmdfs_file_fsync_remote,
};

static void hmdfs_fill_page_zero(struct page *page)
{
	void *addr = NULL;

	addr = kmap(page);
	memset(addr, 0, PAGE_SIZE);
	kunmap(page);
	SetPageUptodate(page);
	unlock_page(page);
}

static int hmdfs_readpage_remote(struct file *file, struct page *page)
{
	struct inode *inode = file_inode(file);
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index = (isize - 1) >> PAGE_SHIFT;
	struct hmdfs_fid fid;

	if (!isize || page->index > end_index) {
		hmdfs_fill_page_zero(page);
		return 0;
	}

	hmdfs_remote_fetch_fid(info, &fid);
	return hmdfs_client_readpage(info->conn, &fid, page);
}

uint32_t hmdfs_get_writecount(struct page *page)
{
	uint32_t count = 0;
	loff_t pos = (loff_t)page->index << HMDFS_PAGE_OFFSET;
	struct inode *inode = page->mapping->host;
	loff_t size = i_size_read(inode);

	/*
	 * If the page offset is beyond i_size, which is possible when
	 * writepage races with truncate, we don't need to do the remote
	 * writepage since the page will be truncated once it is unlocked.
	 */
	if (pos >= size)
		count = 0;
	/*
	 * If the page straddles i_size, only write out the bytes up to
	 * i_size, otherwise the remote file size would be wrong.
	 */
	else if (size < pos + HMDFS_PAGE_SIZE)
		count = size - pos;
	/* It's safe to write the whole page */
	else
		count = HMDFS_PAGE_SIZE;

	return count;
}
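
/*
 * Editor's example (assuming HMDFS_PAGE_SIZE == 4096): for i_size == 5000,
 * page 0 yields count 4096 (whole page), page 1 yields 5000 - 4096 = 904
 * (partial tail), and page 2 yields 0, i.e. nothing is sent remotely.
 */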

static bool allow_cur_thread_wpage(struct hmdfs_inode_info *info,
				   bool *rsem_held, bool sync_all)
{
	WARN_ON(!rsem_held);

	if (sync_all) {
		*rsem_held = false;
		return true;
	}
	*rsem_held = down_read_trylock(&info->wpage_sem);
	return *rsem_held;
}

/**
 * hmdfs_writepage_remote - write back a dirty page to the remote side
 *
 * INFO:
 * When asked to do WB_SYNC_ALL, this function should leave with both the
 * page and the radix tree node clean to achieve close-to-open consistency.
 * Moreover, it shall never return -EIO, to help filemap iterate over all
 * dirty pages.
 *
 * INFO:
 * When asked to do WB_SYNC_NONE, this function should be lenient when
 * faults (OOM or a bad pipe) happen, to allow subsequent r/w & writeback.
 */
static int hmdfs_writepage_remote(struct page *page,
				  struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct hmdfs_sb_info *sbi = hmdfs_sb(inode->i_sb);
	int ret = 0;
	bool rsem_held = false;
	bool sync = wbc->sync_mode == WB_SYNC_ALL;
	struct hmdfs_writepage_context *param = NULL;

	if (!allow_cur_thread_wpage(info, &rsem_held, sync))
		goto out_unlock;

	set_page_writeback(page);

	param = kzalloc(sizeof(*param), GFP_NOFS);
	if (!param) {
		ret = -ENOMEM;
		goto out_endwb;
	}

	if (sync && hmdfs_usr_sig_pending(current)) {
		ClearPageUptodate(page);
		goto out_free;
	}
	param->count = hmdfs_get_writecount(page);
	if (!param->count)
		goto out_free;
	param->rsem_held = rsem_held;
	hmdfs_remote_fetch_fid(info, &param->fid);
	param->sync_all = sync;
	param->caller = current;
	get_task_struct(current);
	param->page = page;
	param->timeout = jiffies + msecs_to_jiffies(sbi->wb_timeout_ms);
	INIT_DELAYED_WORK(&param->retry_dwork, hmdfs_remote_writepage_retry);
	ret = hmdfs_remote_do_writepage(info->conn, param);
	if (likely(!ret))
		return 0;

	put_task_struct(current);
out_free:
	kfree(param);
out_endwb:
	end_page_writeback(page);
	if (rsem_held)
		up_read(&info->wpage_sem);
out_unlock:
	if (sync || !hmdfs_need_redirty_page(info, ret)) {
		SetPageError(page);
		mapping_set_error(page->mapping, ret);
	} else {
		redirty_page_for_writepage(wbc, page);
	}
	unlock_page(page);
	return ret;
}
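
/*
 * Editor's note: when hmdfs_remote_do_writepage() succeeds, ownership of
 * @param transfers to the asynchronous path; the response handler (or
 * hmdfs_remote_writepage_retry() on retry) is then presumably responsible
 * for ending page writeback, releasing wpage_sem when rsem_held is set,
 * dropping the task reference and freeing @param, mirroring the error
 * path unwinding above.
 */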

static void hmdfs_account_dirty_pages(struct address_space *mapping)
{
	struct hmdfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;

	if (!sbi->h_wb->dirty_writeback_control)
		return;

	this_cpu_inc(*sbi->h_wb->bdp_ratelimits);
}

static int hmdfs_write_begin_remote(struct file *file,
				    struct address_space *mapping, loff_t pos,
				    unsigned int len, unsigned int flags,
				    struct page **pagep, void **fsdata)
{
	pgoff_t index = ((unsigned long long)pos) >> PAGE_SHIFT;
	struct inode *inode = file_inode(file);
	struct page *page = NULL;
	int ret = 0;

start:
	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	wait_on_page_writeback(page);

	// If this page will be covered completely.
	if (len == HMDFS_PAGE_SIZE || PageUptodate(page))
		return 0;

	/*
	 * If the existing data in this page will be completely overwritten,
	 * we just need to zero the remainder of the page.
	 */
	if (!((unsigned long long)pos & (HMDFS_PAGE_SIZE - 1)) &&
	    (pos + len) >= i_size_read(inode)) {
		zero_user_segment(page, len, HMDFS_PAGE_SIZE);
		return 0;
	}
	/*
	 * Otherwise we need to read the page in before writing data to it.
	 */
	ret = hmdfs_readpage_remote(file, page);
	if (!ret) {
		if (PageLocked(page)) {
			ret = __lock_page_killable(page);
			if (!ret)
				unlock_page(page);
		}

		if (!ret && PageUptodate(page)) {
			put_page(page);
			goto start;
		}
		if (!ret)
			ret = -EIO;
	}
	put_page(page);
	return ret;
}
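
/*
 * Editor's note on the locking dance above: hmdfs_readpage_remote() is
 * expected to unlock the page when the read completes (see
 * hmdfs_fill_page_zero() and, presumably, the hmdfs_client_readpage()
 * completion path). If the page is still locked we wait for that unlock
 * with a killable lock/unlock pair; a page that then reads back uptodate
 * is re-grabbed via the `start` label, anything else is reported as -EIO.
 */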

static int hmdfs_write_end_remote(struct file *file,
				  struct address_space *mapping, loff_t pos,
				  unsigned int len, unsigned int copied,
				  struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;

	if (!PageUptodate(page)) {
		if (unlikely(copied != len))
			copied = 0;
		else
			SetPageUptodate(page);
	}
	if (!copied)
		goto unlock_out;

	if (!PageDirty(page)) {
		hmdfs_account_dirty_pages(mapping);
		set_page_dirty(page);
	}

	if (pos + copied > i_size_read(inode)) {
		i_size_write(inode, pos + copied);
		hmdfs_i(inode)->getattr_isize = HMDFS_STALE_REMOTE_ISIZE;
	}
unlock_out:
	unlock_page(page);
	put_page(page);

	/* hmdfs private writeback control */
	hmdfs_balance_dirty_pages_ratelimited(mapping);
	return copied;
}

const struct address_space_operations hmdfs_dev_file_aops_remote = {
	.readpage = hmdfs_readpage_remote,
	.write_begin = hmdfs_write_begin_remote,
	.write_end = hmdfs_write_end_remote,
	.writepage = hmdfs_writepage_remote,
	.set_page_dirty = __set_page_dirty_nobuffers,
};

loff_t hmdfs_set_pos(unsigned long dev_id, unsigned long group_id,
		     unsigned long offset)
{
	loff_t pos;

	pos = ((loff_t)dev_id << (POS_BIT_NUM - 1 - DEV_ID_BIT_NUM)) +
	      ((loff_t)group_id << OFFSET_BIT_NUM) + offset;
	if (dev_id)
		pos |= ((loff_t)1 << (POS_BIT_NUM - 1));
	return pos;
}
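
/*
 * Editor's note on the position layout built above, from high to low bits:
 *
 *   [ remote flag : 1 ][ dev_id : DEV_ID_BIT_NUM ]
 *       [ group_id ][ offset : OFFSET_BIT_NUM ]
 *
 * The top bit is set whenever dev_id is non-zero, so remote directory
 * positions can never collide with local ones;
 * analysis_dentry_file_from_con() below recovers dev_id and group_id with
 * the mirrored shift pairs and the offset with OFFSET_BIT_MASK.
 */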

static int analysis_dentry_file_from_con(struct hmdfs_sb_info *sbi,
					 struct file *file,
					 struct file *handler,
					 struct dir_context *ctx)
{
	struct hmdfs_dentry_group *dentry_group = NULL;
	loff_t pos = ctx->pos;
	unsigned long dev_id = (unsigned long)((pos << 1) >>
					       (POS_BIT_NUM - DEV_ID_BIT_NUM));
	unsigned long group_id = (unsigned long)((pos << (1 + DEV_ID_BIT_NUM)) >>
						 (POS_BIT_NUM - GROUP_ID_BIT_NUM));
	loff_t offset = pos & OFFSET_BIT_MASK;
	int group_num = 0;
	char *dentry_name = NULL;
	int iterate_result = 0;
	int i, j;

	dentry_group = kzalloc(sizeof(*dentry_group), GFP_KERNEL);
	if (!dentry_group)
		return -ENOMEM;

	if (IS_ERR_OR_NULL(handler)) {
		kfree(dentry_group);
		return -ENOENT;
	}

	group_num = get_dentry_group_cnt(file_inode(handler));
	dentry_name = kzalloc(DENTRY_NAME_MAX_LEN, GFP_KERNEL);
	if (!dentry_name) {
		kfree(dentry_group);
		return -ENOMEM;
	}

	for (i = group_id; i < group_num; i++) {
		int ret = hmdfs_metainfo_read(sbi, handler, dentry_group,
					      sizeof(struct hmdfs_dentry_group),
					      i);
		if (ret != sizeof(struct hmdfs_dentry_group)) {
			hmdfs_err("read dentry group failed ret:%d", ret);
			goto done;
		}

		for (j = offset; j < DENTRY_PER_GROUP; j++) {
			int len;
			int file_type = DT_UNKNOWN;
			bool is_continue;

			len = le16_to_cpu(dentry_group->nsl[j].namelen);
			if (!test_bit_le(j, dentry_group->bitmap) || len == 0)
				continue;

			memset(dentry_name, 0, DENTRY_NAME_MAX_LEN);
			// TODO: Support more file_type
			if (S_ISDIR(le16_to_cpu(dentry_group->nsl[j].i_mode)))
				file_type = DT_DIR;
			else if (S_ISREG(le16_to_cpu(
					 dentry_group->nsl[j].i_mode)))
				file_type = DT_REG;

			strncat(dentry_name, dentry_group->filename[j], len);
			pos = hmdfs_set_pos(dev_id, i, j);
			is_continue =
				dir_emit(ctx, dentry_name, len,
					 pos + INUNUMBER_START, file_type);
			if (!is_continue) {
				ctx->pos = pos;
				iterate_result = 1;
				goto done;
			}
		}
		offset = 0;
	}

done:
	kfree(dentry_name);
	kfree(dentry_group);
	return iterate_result;
}

int hmdfs_dev_readdir_from_con(struct hmdfs_peer *con, struct file *file,
			       struct dir_context *ctx)
{
	int iterate_result = 0;

	iterate_result = analysis_dentry_file_from_con(
		con->sbi, file, file->private_data, ctx);
	return iterate_result;
}

static int hmdfs_iterate_remote(struct file *file, struct dir_context *ctx)
{
	int err = 0;
	loff_t start_pos = ctx->pos;
	struct hmdfs_peer *con = NULL;
	struct hmdfs_dentry_info *di = hmdfs_d(file->f_path.dentry);
	bool is_local = !((ctx->pos) >> (POS_BIT_NUM - 1));
	uint64_t dev_id = di->device_id;

	if (ctx->pos == -1)
		return 0;
	if (is_local)
		ctx->pos = hmdfs_set_pos(dev_id, 0, 0);

	con = hmdfs_lookup_from_devid(file->f_inode->i_sb->s_fs_info, dev_id);
	if (con) {
		err = con->conn_operations->remote_readdir(con, file, ctx);
		peer_put(con);
		if (err)
			goto done;
	}

done:
	/* mark EOF with pos == -1 so that the next call returns directly */
	if (err <= 0)
		ctx->pos = -1;

	trace_hmdfs_iterate_remote(file->f_path.dentry, start_pos, ctx->pos,
				   err);
	return err;
}

int hmdfs_dir_open_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct clearcache_item *cache_item = NULL;

	if (info->conn && info->conn->version <= USERSPACE_MAX_VER) {
		return 0;
	} else if (info->conn) {
		if (!hmdfs_cache_revalidate(READ_ONCE(info->conn->conn_time),
					    info->conn->device_id,
					    file->f_path.dentry))
			get_remote_dentry_file_sync(file->f_path.dentry,
						    info->conn);
		cache_item = hmdfs_find_cache_item(info->conn->device_id,
						   file->f_path.dentry);
		if (cache_item) {
			file->private_data = cache_item->filp;
			get_file(file->private_data);
			kref_put(&cache_item->ref, release_cache_item);
			return 0;
		}
		return -ENOENT;
	}
	return -ENOENT;
}

static int hmdfs_dir_release_remote(struct inode *inode, struct file *file)
{
	if (file->private_data)
		fput(file->private_data);
	file->private_data = NULL;
	return 0;
}

const struct file_operations hmdfs_dev_dir_ops_remote = {
	.owner = THIS_MODULE,
	.iterate = hmdfs_iterate_remote,
	.open = hmdfs_dir_open_remote,
	.release = hmdfs_dir_release_remote,
	.fsync = __generic_file_fsync,
};