// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/file_remote.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

#include <linux/backing-dev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include "file_remote.h"

#include "comm/socket_adapter.h"
#include "hmdfs.h"
#include "hmdfs_client.h"
#include "hmdfs_dentryfile.h"
#include "hmdfs_trace.h"

static inline bool hmdfs_remote_write_cache_expired(
		struct hmdfs_inode_info *info)
{
	return time_after(jiffies, info->writecache_expire);
}

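/*
 * Why the pagecache was (or was not) dropped when opening a remote file.
 * Recorded mainly for trace_hmdfs_open_final_remote(); only
 * SIZE_OR_CTIME_DISMATCH additionally triggers a ctime refresh below.
 */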
enum expire_reason {
	ALL_GOOD = 0,
	INO_DISMATCH = 1,
	SIZE_OR_CTIME_DISMATCH = 2,
	TIMER_EXPIRE = 3,
	TIMER_WORKING = 4,
	STABLE_CTIME_DISMATCH = 5,
	KEEP_CACHE = 6,
};

/*
 * hmdfs_open_final_remote - Do the final steps of opening a remote file:
 * update the local inode cache and decide whether or not to truncate the
 * inode pages.
 *
 * @info: hmdfs inode info
 * @open_ret: values returned from remote when opening a remote file
 * @file: the file being opened
 * @keep_cache: keep local cache & i_size
 */
static int hmdfs_open_final_remote(struct hmdfs_inode_info *info,
				   struct hmdfs_open_ret *open_ret,
				   struct file *file, bool keep_cache)
{
	struct inode *inode = &info->vfs_inode;
	bool truncate = false;
	enum expire_reason reason = ALL_GOOD;
	int ret = 0;

	/*
	 * If the remote inode number changed, we have looked up stale data;
	 * return -ESTALE and reopen the file with metadata from remote
	 * getattr.
	 */
	if (info->remote_ino != open_ret->ino) {
		hmdfs_debug(
			"got stale local inode, ino in local %llu, ino from open %llu",
			info->remote_ino, open_ret->ino);
		hmdfs_send_close(info->conn, &open_ret->fid);
		reason = INO_DISMATCH;
		ret = -ESTALE;
		goto out;
	}

	if (keep_cache) {
		reason = KEEP_CACHE;
		trace_hmdfs_open_final_remote(info, open_ret, file, reason);
		goto set_fid_out;
	}

	/*
	 * Truncate if the remote size does not match the local inode, or the
	 * remote ctime does not match the one recorded the last time this
	 * file was opened.
	 */
	if (inode->i_size != open_ret->file_size ||
	    hmdfs_time_compare(&info->remote_ctime, &open_ret->remote_ctime)) {
		truncate = true;
		reason = SIZE_OR_CTIME_DISMATCH;
		goto out;
	}

	/*
	 * If 'writecache_expire' is set, check whether it has expired, and
	 * skip the stable_ctime check.
	 */
	if (info->writecache_expire) {
		truncate = hmdfs_remote_write_cache_expired(info);
		if (truncate)
			reason = TIMER_EXPIRE;
		else
			reason = TIMER_WORKING;
		goto out;
	}

	/* the first open, or remote ctime was ahead of remote time */
	if (info->stable_ctime.tv_sec == 0 && info->stable_ctime.tv_nsec == 0) {
		truncate = true;
		reason = STABLE_CTIME_DISMATCH;
		goto out;
	}

	/*
	 * - if last stable_ctime == stable_ctime, we do nothing.
	 *   a. if ctime < stable_ctime, data is ensured to be uptodate,
	 *   b. if ctime == stable_ctime, stale data might be accessed. This is
	 *      acceptable since the pagecache will be dropped later.
	 *   c. ctime > stable_ctime is impossible.
	 * - if last stable_ctime < stable_ctime, we clear the cache.
	 *   d. ctime != last stable_ctime is impossible.
	 *   e. if ctime == last stable_ctime, we might read stale data again
	 *      as in case b, thus we need to drop the cache.
	 * - if last stable_ctime > stable_ctime, we clear the cache.
	 *   stable_ctime must be zero in this case, which is possible because
	 *   the system time might have been changed.
	 */
	if (hmdfs_time_compare(&info->stable_ctime, &open_ret->stable_ctime)) {
		truncate = true;
		reason = STABLE_CTIME_DISMATCH;
		goto out;
	}

out:
	trace_hmdfs_open_final_remote(info, open_ret, file, reason);
	if (ret)
		return ret;

	if (reason == SIZE_OR_CTIME_DISMATCH) {
		inode->i_ctime = open_ret->remote_ctime;
		info->remote_ctime = open_ret->remote_ctime;
	}

	if (truncate) {
		info->writecache_expire = 0;
		truncate_inode_pages(inode->i_mapping, 0);
	}

	atomic64_set(&info->write_counter, 0);
	info->stable_ctime = open_ret->stable_ctime;
	i_size_write(inode, open_ret->file_size);
	info->getattr_isize = HMDFS_STALE_REMOTE_ISIZE;
set_fid_out:
	spin_lock(&info->fid_lock);
	info->fid = open_ret->fid;
	spin_unlock(&info->fid_lock);
	return 0;
}

int hmdfs_do_open_remote(struct file *file, bool keep_cache)
{
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(file));
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_open_ret open_ret;
	__u8 file_type = hmdfs_d(file->f_path.dentry)->file_type;
	char *send_buf;
	int err = 0;

	send_buf = hmdfs_get_dentry_relative_path(file->f_path.dentry);
	if (!send_buf) {
		err = -ENOMEM;
		goto out_free;
	}
	err = hmdfs_send_open(conn, send_buf, file_type, &open_ret);
	if (err) {
		hmdfs_err("hmdfs_send_open return failed with %d", err);
		goto out_free;
	}

	err = hmdfs_open_final_remote(info, &open_ret, file, keep_cache);

out_free:
	kfree(send_buf);
	return err;
}

static inline bool hmdfs_remote_need_reopen(struct hmdfs_inode_info *info)
{
	return test_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
}

static inline bool hmdfs_remote_is_opening_file(struct hmdfs_inode_info *info)
{
	return test_bit(HMDFS_FID_OPENING, &info->fid_flags);
}

static int hmdfs_remote_wait_opening_file(struct hmdfs_inode_info *info)
{
	int err;

	if (!hmdfs_remote_is_opening_file(info))
		return 0;

	/*
	 * fid_lock is dropped around schedule() and re-taken before the
	 * condition is re-checked, so the caller still holds it on return.
	 */
	err = ___wait_event(info->fid_wq, !hmdfs_remote_is_opening_file(info),
			    TASK_INTERRUPTIBLE, 0, 0,
			    spin_unlock(&info->fid_lock);
			    schedule();
			    spin_lock(&info->fid_lock));
	if (err)
		err = -EINTR;

	return err;
}

static int hmdfs_remote_file_reopen(struct hmdfs_inode_info *info,
				    struct file *filp)
{
	int err = 0;
	struct hmdfs_peer *conn = info->conn;
	struct inode *inode = NULL;
	struct hmdfs_fid fid;

	if (conn->status == NODE_STAT_OFFLINE)
		return -EAGAIN;

	spin_lock(&info->fid_lock);
	err = hmdfs_remote_wait_opening_file(info);
	if (err || !hmdfs_remote_need_reopen(info)) {
		spin_unlock(&info->fid_lock);
		goto out;
	}

	set_bit(HMDFS_FID_OPENING, &info->fid_flags);
	fid = info->fid;
	spin_unlock(&info->fid_lock);

	inode = &info->vfs_inode;
	inode_lock(inode);
	/*
	 * Most closing cases are meaningless, except for one:
	 *        read process A         read process B
	 *    err = -EBADF              err = -EBADF       (caused by re-online)
	 *    set_need_reopen
	 *    do reopen
	 *    fid = new fid_1 [server hold fid_1]
	 *                              set need_reopen
	 *                              do reopen
	 *                                send close (fid_1) // In case of leak
	 *                              fid = new fid_2
	 */
	if (fid.id != HMDFS_INODE_INVALID_FILE_ID)
		hmdfs_send_close(conn, &fid);
	err = hmdfs_do_open_remote(filp, true);
	inode_unlock(inode);

	spin_lock(&info->fid_lock);
	/*
	 * This may lose a bit set by the offline handler, but the server
	 * will tell us whether the newly-opened file id was generated before
	 * it went offline: if so, operations on that file id will return
	 * -EBADF and the HMDFS_FID_NEED_OPEN bit will be set again.
	 */
	if (!err)
		clear_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
	clear_bit(HMDFS_FID_OPENING, &info->fid_flags);
	spin_unlock(&info->fid_lock);

	wake_up_interruptible_all(&info->fid_wq);
out:
	return err;
}

static int hmdfs_remote_check_and_reopen(struct hmdfs_inode_info *info,
					 struct file *filp)
{
	if (!hmdfs_remote_need_reopen(info))
		return 0;

	return hmdfs_remote_file_reopen(info, filp);
}

void hmdfs_do_close_remote(struct kref *kref)
{
	struct hmdfs_inode_info *info =
		container_of(kref, struct hmdfs_inode_info, ref);
	struct hmdfs_fid fid;

	hmdfs_remote_fetch_fid(info, &fid);
	/* This function can return asynchronously */
	hmdfs_send_close(info->conn, &fid);
}

static inline bool hmdfs_remote_need_track_file(const struct hmdfs_sb_info *sbi,
						fmode_t mode)
{
	return (hmdfs_is_stash_enabled(sbi) && (mode & FMODE_WRITE));
}

static void
hmdfs_remote_del_wr_opened_inode_nolock(struct hmdfs_inode_info *info)
{
	WARN_ON(list_empty(&info->wr_opened_node));
	if (atomic_dec_and_test(&info->wr_opened_cnt))
		list_del_init(&info->wr_opened_node);
}

void hmdfs_remote_del_wr_opened_inode(struct hmdfs_peer *conn,
				      struct hmdfs_inode_info *info)
{
	spin_lock(&conn->wr_opened_inode_lock);
	hmdfs_remote_del_wr_opened_inode_nolock(info);
	spin_unlock(&conn->wr_opened_inode_lock);
}

void hmdfs_remote_add_wr_opened_inode_nolock(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	if (list_empty(&info->wr_opened_node)) {
		atomic_set(&info->wr_opened_cnt, 1);
		list_add_tail(&info->wr_opened_node,
			      &conn->wr_opened_inode_list);
	} else {
		atomic_inc(&info->wr_opened_cnt);
	}
}

static void hmdfs_remote_add_wr_opened_inode(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	spin_lock(&conn->wr_opened_inode_lock);
	hmdfs_remote_add_wr_opened_inode_nolock(conn, info);
	spin_unlock(&conn->wr_opened_inode_lock);
}

int hmdfs_file_open_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct kref *ref = &(info->ref);
	int err = 0;

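	/*
	 * The first opener performs the real remote open and initializes the
	 * inode's kref; subsequent openers just take another reference.
	 */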
	inode_lock(inode);
	if (kref_read(ref) == 0) {
		err = hmdfs_do_open_remote(file, false);
		if (err == 0)
			kref_init(ref);
	} else {
		kref_get(ref);
	}
	inode_unlock(inode);

	if (!err && hmdfs_remote_need_track_file(hmdfs_sb(inode->i_sb),
						 file->f_mode))
		hmdfs_remote_add_wr_opened_inode(info->conn, info);

	return err;
}

static void hmdfs_set_writecache_expire(struct hmdfs_inode_info *info,
					unsigned int seconds)
{
	unsigned long new_expire = jiffies + (unsigned long)seconds * HZ;

	/*
	 * When the file has been written before closing, set the pagecache
	 * expire time if it has not been set yet. This is necessary because
	 * ctime might stay the same after an overwrite.
	 */
	if (info->writecache_expire &&
	    time_after(new_expire, info->writecache_expire))
		return;

	info->writecache_expire = new_expire;
}

static void hmdfs_remote_keep_writecache(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = NULL;
	struct kref *ref = NULL;
	struct hmdfs_getattr_ret *getattr_ret = NULL;
	unsigned int write_cache_timeout =
		hmdfs_sb(inode->i_sb)->write_cache_timeout;
	int err;

	if (!write_cache_timeout)
		return;

	info = hmdfs_i(inode);
	ref = &(info->ref);
	/*
	 * Don't do anything if the file is still open or hasn't been
	 * written.
	 */
	if (kref_read(ref) > 0 || !atomic64_read(&info->write_counter))
		return;

	/*
	 * If remote getattr fails, we don't update ctime, so the pagecache
	 * will be truncated the next time the file is opened.
	 */
	err = hmdfs_remote_getattr(info->conn, file_dentry(file), 0,
				   &getattr_ret);
	if (err) {
		hmdfs_err("remote getattr failed with err %d", err);
		return;
	}

	if (!(getattr_ret->stat.result_mask & STATX_CTIME)) {
		hmdfs_err("get remote ctime failed with mask 0x%x",
			  getattr_ret->stat.result_mask);
		kfree(getattr_ret);
		return;
	}
	/*
	 * Update ctime from remote so that the pagecache won't be truncated
	 * on the next open.
	 */
	inode->i_ctime = getattr_ret->stat.ctime;
	info->remote_ctime = getattr_ret->stat.ctime;
	hmdfs_set_writecache_expire(info, write_cache_timeout);
	kfree(getattr_ret);
}

int hmdfs_file_release_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);

	if (hmdfs_remote_need_track_file(hmdfs_sb(inode->i_sb), file->f_mode))
		hmdfs_remote_del_wr_opened_inode(info->conn, info);

	inode_lock(inode);
	kref_put(&info->ref, hmdfs_do_close_remote);
	hmdfs_remote_keep_writecache(inode, file);
	inode_unlock(inode);

	return 0;
}

static int hmdfs_file_flush(struct file *file, fl_owner_t id)
{
	int err = 0;
	struct inode *inode = file_inode(file);

	if (!(file->f_mode & FMODE_WRITE))
		return 0;

	/*
	 * Continue regardless of whether the file reopen fails or not,
	 * because there may be no dirty pages.
	 */
	hmdfs_remote_check_and_reopen(hmdfs_i(inode), file);

	/*
	 * Waiting on wsem here would hurt performance greatly, so we overlap
	 * the wait by issuing as many writebacks as we can first, expecting
	 * that the async writebacks are largely finished afterwards.
	 */
	filemap_fdatawrite(inode->i_mapping);
	down_write(&hmdfs_i(inode)->wpage_sem);
	err = filemap_write_and_wait(inode->i_mapping);
	up_write(&hmdfs_i(inode)->wpage_sem);
	return err;
}

static ssize_t hmdfs_file_read_iter_remote(struct kiocb *iocb,
					   struct iov_iter *iter)
{
	struct file *filp = iocb->ki_filp;
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(filp));
	struct file_ra_state *ra = NULL;
	unsigned int rtt;
	int err;
	bool tried = false;

retry:
	err = hmdfs_remote_check_and_reopen(info, filp);
	if (err)
		return err;

	ra = &filp->f_ra;
	/* rtt is measured in 10 msecs */
	rtt = hmdfs_tcpi_rtt(info->conn) / 10000;
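	/*
	 * Scale the readahead window with link latency: below 10 ms keep the
	 * default, 10-20 ms use 256 pages (1 MiB with 4 KiB pages), 20-30 ms
	 * use 512, anything slower 1024, so high-latency links issue fewer,
	 * larger read requests.
	 */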
	switch (rtt) {
	case 0:
		break;
	case 1:
		ra->ra_pages = 256;
		break;
	case 2:
		ra->ra_pages = 512;
		break;
	default:
		ra->ra_pages = 1024;
		break;
	}

	err = generic_file_read_iter(iocb, iter);
	if (err < 0 && !tried && hmdfs_remote_need_reopen(info)) {
		/* The read hit a stale fid; retry once. */
		tried = true;
		goto retry;
	}

	return err;
}

static inline bool hmdfs_is_file_unwritable(const struct hmdfs_inode_info *info,
					    bool check_stash)
{
	return (check_stash && hmdfs_inode_is_stashing(info)) ||
	       !hmdfs_is_node_online(info->conn);
}

static ssize_t __hmdfs_file_write_iter_remote(struct kiocb *iocb,
					      struct iov_iter *iter,
					      bool check_stash)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = file_inode(filp);
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	ssize_t ret;

	if (hmdfs_is_file_unwritable(info, check_stash))
		return -EAGAIN;

	ret = hmdfs_remote_check_and_reopen(info, filp);
	if (ret)
		return ret;

	inode_lock(inode);
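	/*
	 * Re-check writability under inode_lock: the peer may have gone
	 * offline, or the inode may have started stashing, since the check
	 * above.
	 */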
	if (hmdfs_is_file_unwritable(info, check_stash)) {
		ret = -EAGAIN;
		goto out;
	}
	ret = generic_write_checks(iocb, iter);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, iter);
out:
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}

ssize_t hmdfs_file_write_iter_remote_nocheck(struct kiocb *iocb,
					     struct iov_iter *iter)
{
	return __hmdfs_file_write_iter_remote(iocb, iter, false);
}

static ssize_t hmdfs_file_write_iter_remote(struct kiocb *iocb,
					    struct iov_iter *iter)
{
	return __hmdfs_file_write_iter_remote(iocb, iter, true);
}

/* hmdfs does not support mmap writes to remote files */
static vm_fault_t hmdfs_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct hmdfs_file_vm_ops = {
	.fault = filemap_fault,
	.map_pages = filemap_map_pages,
	.page_mkwrite = hmdfs_page_mkwrite,
};

static int hmdfs_file_mmap_remote(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &hmdfs_file_vm_ops;
	file_accessed(file);

	return 0;
}

static int hmdfs_file_fsync_remote(struct file *file, loff_t start, loff_t end,
				   int datasync)
{
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(file));
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_fid fid;
	int err;

	trace_hmdfs_fsync_enter_remote(conn->sbi, conn->device_id,
				       info->remote_ino, datasync);
	/*
	 * Continue regardless of whether the file reopen fails or not,
	 * because there may be no dirty pages.
	 */
	hmdfs_remote_check_and_reopen(info, file);

	filemap_fdatawrite(file->f_mapping);
	down_write(&info->wpage_sem);
	err = file_write_and_wait_range(file, start, end);
	up_write(&info->wpage_sem);
	if (err) {
		hmdfs_err("local fsync fail with %d", err);
		goto out;
	}

	hmdfs_remote_fetch_fid(info, &fid);
	err = hmdfs_send_fsync(conn, &fid, start, end, datasync);
	if (err)
		hmdfs_err("send fsync fail with %d", err);

out:
	trace_hmdfs_fsync_exit_remote(conn->sbi, conn->device_id,
				      info->remote_ino,
				      get_cmd_timeout(conn->sbi, F_FSYNC), err);

	/* Compatible with POSIX retcode */
	if (err == -ETIME)
		err = -EIO;

	return err;
}

const struct file_operations hmdfs_dev_file_fops_remote = {
	.owner = THIS_MODULE,
	.llseek = generic_file_llseek,
	.read_iter = hmdfs_file_read_iter_remote,
	.write_iter = hmdfs_file_write_iter_remote,
	.mmap = hmdfs_file_mmap_remote,
	.open = hmdfs_file_open_remote,
	.release = hmdfs_file_release_remote,
	.flush = hmdfs_file_flush,
	.fsync = hmdfs_file_fsync_remote,
	.splice_read = generic_file_splice_read,
	.splice_write = iter_file_splice_write,
};

static void hmdfs_fill_page_zero(struct page *page)
{
	void *addr = NULL;

	addr = kmap(page);
	memset(addr, 0, PAGE_SIZE);
	kunmap(page);
	SetPageUptodate(page);
	unlock_page(page);
}

static int hmdfs_readpage_remote(struct file *file, struct page *page)
{
	struct inode *inode = file_inode(file);
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index = (isize - 1) >> PAGE_SHIFT;
	struct hmdfs_fid fid;

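	/* A page wholly at or beyond EOF can be satisfied locally with zeroes */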
	if (!isize || page->index > end_index) {
		hmdfs_fill_page_zero(page);
		return 0;
	}
	hmdfs_remote_fetch_fid(info, &fid);
	return hmdfs_client_readpage(info->conn, &fid, page);
}

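/*
 * Decide how many bytes of @page may be written back to the remote end:
 * zero if the page starts at or beyond i_size, a partial count if EOF
 * falls inside the page, otherwise a whole page.
 */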
uint32_t hmdfs_get_writecount(struct page *page)
{
	uint32_t count = 0;
	loff_t pos = (loff_t)page->index << HMDFS_PAGE_OFFSET;
	struct inode *inode = page->mapping->host;
	loff_t size = i_size_read(inode);
	/*
	 * If the page offset is at or beyond i_size, which can happen when
	 * writepage races with truncate, we don't need to do a remote
	 * writepage since the page will be truncated after it is unlocked.
	 */
	if (pos >= size)
		count = 0;
	/*
	 * If the page extends beyond i_size, only write back the bytes up to
	 * i_size, otherwise the remote file size would be wrong.
	 */
	else if (size < pos + HMDFS_PAGE_SIZE)
		count = size - pos;
	/* It's safe to write the whole page */
	else
		count = HMDFS_PAGE_SIZE;

	return count;
}

static bool allow_cur_thread_wpage(struct hmdfs_inode_info *info,
				   bool *rsem_held, bool sync_all)
{
	WARN_ON(!rsem_held);

	if (sync_all) {
		*rsem_held = false;
		return true;
	}
	*rsem_held = down_read_trylock(&info->wpage_sem);
	return *rsem_held;
}

/**
 * hmdfs_writepage_remote - writeback a dirty page to remote
 *
 * INFO:
 * When asked to WB_SYNC_ALL, this function should leave with both the page and
 * the radix tree node clean to achieve close-to-open consistency. Moreover,
 * it shall never return -EIO, so that filemap can iterate over all dirty
 * pages.
 *
 * INFO:
 * When asked to WB_SYNC_NONE, this function should be merciful if faults (OOM
 * or a bad pipe) happened, to enable subsequent r/w & wb.
 */
static int hmdfs_writepage_remote(struct page *page,
				  struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct hmdfs_sb_info *sbi = hmdfs_sb(inode->i_sb);
	int ret = 0;
	bool rsem_held = false;
	bool sync = wbc->sync_mode == WB_SYNC_ALL;
	struct hmdfs_writepage_context *param = NULL;

	if (!allow_cur_thread_wpage(info, &rsem_held, sync))
		goto out_unlock;

	set_page_writeback(page);

	param = kzalloc(sizeof(*param), GFP_NOFS);
	if (!param) {
		ret = -ENOMEM;
		goto out_endwb;
	}

	if (sync && hmdfs_usr_sig_pending(current)) {
		ClearPageUptodate(page);
		goto out_free;
	}
	param->count = hmdfs_get_writecount(page);
	if (!param->count)
		goto out_free;
	param->rsem_held = rsem_held;
	hmdfs_remote_fetch_fid(info, &param->fid);
	param->sync_all = sync;
	param->caller = current;
	get_task_struct(current);
	param->page = page;
	param->timeout = jiffies + msecs_to_jiffies(sbi->wb_timeout_ms);
	INIT_DELAYED_WORK(&param->retry_dwork, hmdfs_remote_writepage_retry);
	ret = hmdfs_remote_do_writepage(info->conn, param);
	if (likely(!ret))
		return 0;

	put_task_struct(current);
out_free:
	kfree(param);
out_endwb:
	end_page_writeback(page);
	if (rsem_held)
		up_read(&info->wpage_sem);
out_unlock:
	if (sync || !hmdfs_need_redirty_page(info, ret)) {
		SetPageError(page);
		mapping_set_error(page->mapping, ret);
	} else {
		redirty_page_for_writepage(wbc, page);
	}
	unlock_page(page);
	return ret;
}

static void hmdfs_account_dirty_pages(struct address_space *mapping)
{
	struct hmdfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;

	if (!sbi->h_wb->dirty_writeback_control)
		return;

	this_cpu_inc(*sbi->h_wb->bdp_ratelimits);
}

static int hmdfs_write_begin_remote(struct file *file,
				    struct address_space *mapping, loff_t pos,
				    unsigned int len, unsigned int flags,
				    struct page **pagep, void **fsdata)
{
	pgoff_t index = ((unsigned long long)pos) >> PAGE_SHIFT;
	struct inode *inode = file_inode(file);
	struct page *page = NULL;
	int ret = 0;

start:
	page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	wait_on_page_writeback(page);

	// The write covers the whole page, or the page is already uptodate.
	if (len == HMDFS_PAGE_SIZE || PageUptodate(page))
		return 0;

	/*
	 * If all existing data in this page will be overwritten (the write
	 * starts at the page boundary and reaches at least EOF), we just
	 * need to zero the tail of the page.
	 */
	if (!((unsigned long long)pos & (HMDFS_PAGE_SIZE - 1)) &&
	    (pos + len) >= i_size_read(inode)) {
		zero_user_segment(page, len, HMDFS_PAGE_SIZE);
		return 0;
	}
	/*
	 * We need to read the page in before writing data to it.
	 */
	ret = hmdfs_readpage_remote(file, page);
	if (!ret) {
		if (PageLocked(page)) {
			ret = __lock_page_killable(page);
			if (!ret)
				unlock_page(page);
		}

		if (!ret && PageUptodate(page)) {
			put_page(page);
			goto start;
		}
		if (!ret)
			ret = -EIO;
	}
	put_page(page);
	return ret;
}

static int hmdfs_write_end_remote(struct file *file,
				  struct address_space *mapping, loff_t pos,
				  unsigned int len, unsigned int copied,
				  struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;

	if (!PageUptodate(page)) {
		if (unlikely(copied != len))
			copied = 0;
		else
			SetPageUptodate(page);
	}
	if (!copied)
		goto unlock_out;

	if (!PageDirty(page)) {
		hmdfs_account_dirty_pages(mapping);
		set_page_dirty(page);
	}

	if (pos + copied > i_size_read(inode)) {
		i_size_write(inode, pos + copied);
		hmdfs_i(inode)->getattr_isize = HMDFS_STALE_REMOTE_ISIZE;
	}
unlock_out:
	unlock_page(page);
	put_page(page);

	/* hmdfs private writeback control */
	hmdfs_balance_dirty_pages_ratelimited(mapping);
	return copied;
}

const struct address_space_operations hmdfs_dev_file_aops_remote = {
	.readpage = hmdfs_readpage_remote,
	.write_begin = hmdfs_write_begin_remote,
	.write_end = hmdfs_write_end_remote,
	.writepage = hmdfs_writepage_remote,
	.set_page_dirty = __set_page_dirty_nobuffers,
};

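/*
 * Pack a readdir position. Judging by the shifts below, the layout, from
 * high to low, is [1-bit remote flag][dev_id][group_id][offset]; the top
 * bit is set for any non-zero dev_id so that remote positions never
 * collide with local ones.
 */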
loff_t hmdfs_set_pos(unsigned long dev_id, unsigned long group_id,
			    unsigned long offset)
{
	loff_t pos;

	pos = ((loff_t)dev_id << (POS_BIT_NUM - 1 - DEV_ID_BIT_NUM)) +
	      ((loff_t)group_id << OFFSET_BIT_NUM) + offset;
	if (dev_id)
		pos |= ((loff_t)1 << (POS_BIT_NUM - 1));
	return pos;
}

static int analysis_dentry_file_from_con(struct hmdfs_sb_info *sbi,
					 struct file *file,
					 struct file *handler,
					 struct dir_context *ctx)
{
	struct hmdfs_dentry_group *dentry_group = NULL;
	loff_t pos = ctx->pos;
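	/*
	 * Undo hmdfs_set_pos(): shift out the top flag bit, then extract the
	 * dev_id and group_id fields; the low bits are the in-group offset.
	 */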
	unsigned long dev_id = (unsigned long)((pos << 1) >> (POS_BIT_NUM - DEV_ID_BIT_NUM));
	unsigned long group_id = (unsigned long)((pos << (1 + DEV_ID_BIT_NUM)) >>
				 (POS_BIT_NUM - GROUP_ID_BIT_NUM));
	loff_t offset = pos & OFFSET_BIT_MASK;
	int group_num = 0;
	char *dentry_name = NULL;
	int iterate_result = 0;
	int i, j;

	dentry_group = kzalloc(sizeof(*dentry_group), GFP_KERNEL);
	if (!dentry_group)
		return -ENOMEM;

	if (IS_ERR_OR_NULL(handler)) {
		kfree(dentry_group);
		return -ENOENT;
	}

	group_num = get_dentry_group_cnt(file_inode(handler));
	dentry_name = kzalloc(DENTRY_NAME_MAX_LEN, GFP_KERNEL);
	if (!dentry_name) {
		kfree(dentry_group);
		return -ENOMEM;
	}

	for (i = group_id; i < group_num; i++) {
		int ret = hmdfs_metainfo_read(sbi, handler, dentry_group,
					      sizeof(struct hmdfs_dentry_group),
					      i);
		if (ret != sizeof(struct hmdfs_dentry_group)) {
			hmdfs_err("read dentry group failed ret:%d", ret);
			goto done;
		}

		for (j = offset; j < DENTRY_PER_GROUP; j++) {
			int len;
			int file_type = DT_UNKNOWN;
			bool is_continue;

			len = le16_to_cpu(dentry_group->nsl[j].namelen);
			if (!test_bit_le(j, dentry_group->bitmap) || len == 0)
				continue;

			memset(dentry_name, 0, DENTRY_NAME_MAX_LEN);
			// TODO: Support more file_type
			if (S_ISDIR(le16_to_cpu(dentry_group->nsl[j].i_mode)))
				file_type = DT_DIR;
			else if (S_ISREG(le16_to_cpu(
					 dentry_group->nsl[j].i_mode)))
				file_type = DT_REG;

			strncat(dentry_name, dentry_group->filename[j], len);
			pos = hmdfs_set_pos(dev_id, i, j);
			is_continue =
				dir_emit(ctx, dentry_name, len,
					 pos + INUNUMBER_START, file_type);
			if (!is_continue) {
				ctx->pos = pos;
				iterate_result = 1;
				goto done;
			}
		}
		offset = 0;
	}

done:
	kfree(dentry_name);
	kfree(dentry_group);
	return iterate_result;
}

int hmdfs_dev_readdir_from_con(struct hmdfs_peer *con, struct file *file,
			       struct dir_context *ctx)
{
	int iterate_result = 0;

	iterate_result = analysis_dentry_file_from_con(
		con->sbi, file, file->private_data, ctx);
	return iterate_result;
}

static int hmdfs_iterate_remote(struct file *file, struct dir_context *ctx)
{
	int err = 0;
	loff_t start_pos = ctx->pos;
	struct hmdfs_peer *con = NULL;
	struct hmdfs_dentry_info *di = hmdfs_d(file->f_path.dentry);
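	/* The top bit of ctx->pos marks a remote position, see hmdfs_set_pos() */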
	bool is_local = !((ctx->pos) >> (POS_BIT_NUM - 1));
	uint64_t dev_id = di->device_id;

	if (ctx->pos == -1)
		return 0;
	if (is_local)
		ctx->pos = hmdfs_set_pos(dev_id, 0, 0);

	con = hmdfs_lookup_from_devid(file->f_inode->i_sb->s_fs_info, dev_id);
	if (con) {
		err = con->conn_operations->remote_readdir(con, file, ctx);
		peer_put(con);
		if (err)
			goto done;
	}

done:
	if (err <= 0)
		ctx->pos = -1;

	trace_hmdfs_iterate_remote(file->f_path.dentry, start_pos, ctx->pos,
				   err);
	return err;
}

int hmdfs_dir_open_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct clearcache_item *cache_item = NULL;

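	/*
	 * Serve directory entries from the cached dentry file: revalidate
	 * (or refetch) it if needed, then keep a reference to it in
	 * ->private_data for hmdfs_iterate_remote().
	 */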
	if (info->conn && info->conn->version <= USERSPACE_MAX_VER) {
		return 0;
	} else if (info->conn) {
		if (!hmdfs_cache_revalidate(READ_ONCE(info->conn->conn_time),
					    info->conn->device_id,
					    file->f_path.dentry))
			get_remote_dentry_file_sync(file->f_path.dentry,
						    info->conn);
		cache_item = hmdfs_find_cache_item(info->conn->device_id,
						   file->f_path.dentry);
		if (cache_item) {
			file->private_data = cache_item->filp;
			get_file(file->private_data);
			kref_put(&cache_item->ref, release_cache_item);
			return 0;
		}
		return -ENOENT;
	}
	return -ENOENT;
}

static int hmdfs_dir_release_remote(struct inode *inode, struct file *file)
{
	if (file->private_data)
		fput(file->private_data);
	file->private_data = NULL;
	return 0;
}

const struct file_operations hmdfs_dev_dir_ops_remote = {
	.owner = THIS_MODULE,
	.iterate = hmdfs_iterate_remote,
	.open = hmdfs_dir_open_remote,
	.release = hmdfs_dir_release_remote,
	.fsync = __generic_file_fsync,
};
1057