• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * fs/hmdfs/stash.c
4  *
5  * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/fs.h>
10 #include <linux/file.h>
11 #include <linux/dcache.h>
12 #include <linux/namei.h>
13 #include <linux/mount.h>
14 #include <linux/slab.h>
15 #include <linux/list.h>
16 #include <linux/pagemap.h>
17 #include <linux/sched/mm.h>
18 #include <linux/sched/task.h>
19 #include <linux/errseq.h>
20 #include <linux/crc32.h>
21 
22 #include "stash.h"
23 #include "comm/node_cb.h"
24 #include "comm/protocol.h"
25 #include "comm/connection.h"
26 #include "file_remote.h"
27 #include "hmdfs_dentryfile.h"
28 #include "authority/authentication.h"
29 
30 /* Head magic used to identify a stash file */
31 #define HMDFS_STASH_FILE_HEAD_MAGIC 0xF7AB06C3
32 /* Head and path in stash file are aligned with HMDFS_STASH_BLK_SIZE */
33 #define HMDFS_STASH_BLK_SIZE 4096
34 #define HMDFS_STASH_BLK_SHIFT 12
35 #define HMDFS_STASH_PAGE_TO_SECTOR_SHIFT 3
36 #define HMDFS_STASH_DIR_NAME "stash"
37 #define HMDFS_STASH_FMT_DIR_NAME "v1"
38 #define HMDFS_STASH_WORK_DIR_NAME \
39 	(HMDFS_STASH_DIR_NAME "/" HMDFS_STASH_FMT_DIR_NAME)
40 
41 #define HMDFS_STASH_FILE_NAME_LEN 20
42 
43 #define HMDFS_STASH_FLUSH_CNT 2
44 
45 #define HMDFS_STASH_PATH_LEN (HMDFS_CID_SIZE + HMDFS_STASH_FILE_NAME_LEN + 1)
46 
/*
 * On-disk header of a stash file; all fields are little-endian.
 * Written at offset 0 and CRC-protected over bytes [0, crc_offset).
 */
struct hmdfs_cache_file_head {
	__le32 magic;		/* HMDFS_STASH_FILE_HEAD_MAGIC */
	__le32 crc_offset;	/* byte offset of @crc32 within this head */
	__le64 ino;		/* remote inode number */
	__le64 size;		/* stashed data size in bytes */
	__le64 blocks;		/* written pages converted to 512B sectors */
	__le64 last_write_pos;
	__le64 ctime;
	__le32 ctime_nsec;
	__le32 change_detect_cap;
	__le64 ichange_count;
	__le32 path_offs;	/* path location, in HMDFS_STASH_BLK_SIZE blocks */
	__le32 path_len;	/* path length including the trailing NUL */
	__le32 path_cnt;
	__le32 data_offs;	/* data start, in HMDFS_STASH_BLK_SIZE blocks */
	/* Attention: expand new fields in here to compatible with old ver */
	__le32 crc32;		/* CRC32 of the head up to @crc_offset */
} __packed;
65 
/* On-stack job handed to the workqueue to init stash caches for @list. */
struct hmdfs_stash_work {
	struct hmdfs_peer *conn;	/* peer owning the files */
	struct list_head *list;		/* inode infos awaiting cache init */
	struct work_struct work;
	struct completion done;		/* signalled when the work finishes */
};
72 
73 struct hmdfs_inode_tbl {
74 	unsigned int cnt;
75 	unsigned int max;
76 	uint64_t inodes[0];
77 };
78 
/* dir_context wrapper used while iterating a peer's stash directory. */
struct hmdfs_stash_dir_context {
	struct dir_context dctx;
	char name[NAME_MAX + 1];	/* NUL-terminated copy of the entry name */
	struct hmdfs_inode_tbl *tbl;	/* collected inode numbers */
};
84 
/* Per-round counters for restoring stashed files back to the peer. */
struct hmdfs_restore_stats {
	unsigned int succeed;
	unsigned int fail;
	unsigned int keep;		/* stash files kept for a later retry */
	unsigned long long ok_pages;
	unsigned long long fail_pages;
};
92 
/* Per-round counters for stashing dirty remote files locally. */
struct hmdfs_stash_stats {
	unsigned int succeed;
	unsigned int donothing;		/* nothing dirty, no stash needed */
	unsigned int fail;
	unsigned long long ok_pages;
	unsigned long long fail_pages;
};
100 
/* State threaded through the restore of one stashed file. */
struct hmdfs_file_restore_ctx {
	struct hmdfs_peer *conn;
	struct path src_dir_path;	/* stash dir the file lives in */
	struct path dst_root_path;	/* root to resolve @dst against */
	char *dst;			/* destination path buffer */
	char *page;			/* scratch page for data copy */
	struct file *src_filp;		/* opened stash file */
	uint64_t inum;			/* remote inode number */
	uint64_t pages;
	unsigned int seq;		/* connection sequence for this round */
	unsigned int data_offs;		/* data start in stash blocks */
	/* output */
	bool keep;			/* keep stash file for another retry */
};
115 
/* Immutable parameters for copying stash data between two files. */
struct hmdfs_copy_args {
	struct file *src;
	struct file *dst;
	void *buf;		/* bounce buffer */
	size_t buf_len;
	unsigned int seq;	/* connection sequence */
	unsigned int data_offs;	/* data start in stash blocks */
	uint64_t inum;		/* remote inode number (for logging) */
};
125 
/* Mutable cursor for one copy step; @copied/@eof are outputs. */
struct hmdfs_copy_ctx {
	struct hmdfs_copy_args args;
	loff_t src_pos;
	loff_t dst_pos;
	/* output */
	size_t copied;		/* bytes transferred in this step */
	bool eof;		/* source exhausted */
};
134 
/* Counters for rebuilding the stashed-inode list from disk. */
struct hmdfs_rebuild_stats {
	unsigned int succeed;
	unsigned int total;
	unsigned int fail;
	unsigned int invalid;	/* entries rejected as malformed */
};
141 
/* On-stack job used to run a stash check for @conn on a workqueue. */
struct hmdfs_check_work {
	struct hmdfs_peer *conn;
	struct work_struct work;
	struct completion done;	/* signalled when the check finishes */
};
147 
/*
 * Operation applied to a batch of stash files of one peer:
 * (conn, seq, stash dir path, inode table, private cookie) -> 0 or -errno.
 */
typedef int (*stash_operation_func)(struct hmdfs_peer *,
				    unsigned int,
				    struct path *,
				    const struct hmdfs_inode_tbl *,
				    void *);
153 
/*
 * Create (or reuse) directory @name under @parent.
 *
 * Returns a referenced dentry for the directory, or ERR_PTR():
 *  - an existing directory is reused as-is,
 *  - an existing non-directory yields -EINVAL,
 *  - vfs_mkdir() failures are propagated.
 * The parent inode lock makes lookup + mkdir one atomic step.
 */
static struct dentry *hmdfs_do_vfs_mkdir(struct dentry *parent,
					 const char *name, int namelen,
					 umode_t mode)
{
	struct inode *dir = d_inode(parent);
	struct dentry *child = NULL;
	int err;

	inode_lock_nested(dir, I_MUTEX_PARENT);

	child = lookup_one_len(name, parent, namelen);
	if (IS_ERR(child))
		goto out;

	if (d_is_positive(child)) {
		/* Entry exists: accept only a directory */
		if (d_can_lookup(child))
			goto out;

		dput(child);
		child = ERR_PTR(-EINVAL);
		goto out;
	}

	err = vfs_mkdir(dir, child, mode);
	if (err) {
		dput(child);
		child = ERR_PTR(err);
		goto out;
	}

out:
	inode_unlock(dir);
	return child;
}
188 
hmdfs_stash_new_work_dir(struct dentry * parent)189 struct dentry *hmdfs_stash_new_work_dir(struct dentry *parent)
190 {
191 	struct dentry *base = NULL;
192 	struct dentry *work = NULL;
193 
194 	base = hmdfs_do_vfs_mkdir(parent, HMDFS_STASH_DIR_NAME,
195 				   strlen(HMDFS_STASH_DIR_NAME), 0700);
196 	if (IS_ERR(base))
197 		return base;
198 
199 	work = hmdfs_do_vfs_mkdir(base, HMDFS_STASH_FMT_DIR_NAME,
200 				  strlen(HMDFS_STASH_FMT_DIR_NAME), 0700);
201 	dput(base);
202 
203 	return work;
204 }
205 
/*
 * Create an anonymous (tmpfile) stash file for peer @cid under @d_path.
 *
 * The per-peer directory named @cid is created on demand; the file is
 * opened write-only and stays unlinked until hmdfs_enable_stash_file()
 * links it into place. Returns the opened file or ERR_PTR().
 */
static struct file *hmdfs_new_stash_file(struct path *d_path, const char *cid)
{
	struct dentry *parent = NULL;
	struct dentry *child = NULL;
	struct file *filp = NULL;
	struct path stash;
	int err;

	parent = hmdfs_do_vfs_mkdir(d_path->dentry, cid, strlen(cid), 0700);
	if (IS_ERR(parent)) {
		err = PTR_ERR(parent);
		hmdfs_err("mkdir error %d", err);
		goto mkdir_err;
	}

	child = vfs_tmpfile(parent, S_IFREG | 0600, 0);
	if (IS_ERR(child)) {
		err = PTR_ERR(child);
		hmdfs_err("new stash file error %d", err);
		goto tmpfile_err;
	}

	stash.mnt = d_path->mnt;
	stash.dentry = child;
	filp = dentry_open(&stash, O_LARGEFILE | O_WRONLY, current_cred());
	if (IS_ERR(filp)) {
		err = PTR_ERR(filp);
		hmdfs_err("open stash file error %d", err);
		goto open_err;
	}

	/* The open file holds its own references; drop ours */
	dput(child);
	dput(parent);

	return filp;

open_err:
	dput(child);
tmpfile_err:
	dput(parent);
mkdir_err:
	return ERR_PTR(err);
}
249 
hmdfs_is_dir(struct dentry * child)250 static inline bool hmdfs_is_dir(struct dentry *child)
251 {
252 	return d_is_positive(child) && d_can_lookup(child);
253 }
254 
hmdfs_is_reg(struct dentry * child)255 static inline bool hmdfs_is_reg(struct dentry *child)
256 {
257 	return d_is_positive(child) && d_is_reg(child);
258 }
259 
/*
 * Fill @head from @cache for remote inode @ino.
 * The CRC is computed last, over all bytes preceding the crc32 field,
 * so every other field must already be in its final value.
 */
static void hmdfs_set_stash_file_head(const struct hmdfs_cache_info *cache,
				      uint64_t ino,
				      struct hmdfs_cache_file_head *head)
{
	long long blocks;
	unsigned int crc_offset;

	memset(head, 0, sizeof(*head));
	head->magic = cpu_to_le32(HMDFS_STASH_FILE_HEAD_MAGIC);
	head->ino = cpu_to_le64(ino);
	head->size = cpu_to_le64(i_size_read(file_inode(cache->cache_file)));
	/* pages -> 512B sectors (1 page == 8 sectors) */
	blocks = atomic64_read(&cache->written_pgs) <<
			       HMDFS_STASH_PAGE_TO_SECTOR_SHIFT;
	head->blocks = cpu_to_le64(blocks);
	head->path_offs = cpu_to_le32(cache->path_offs);
	head->path_len = cpu_to_le32(cache->path_len);
	head->path_cnt = cpu_to_le32(cache->path_cnt);
	head->data_offs = cpu_to_le32(cache->data_offs);
	crc_offset = offsetof(struct hmdfs_cache_file_head, crc32);
	head->crc_offset = cpu_to_le32(crc_offset);
	head->crc32 = cpu_to_le32(crc32(0, head, crc_offset));
}
282 
/*
 * Write the stash file head (at offset 0) and the source path (at its
 * aligned block offset) into the cache file.
 *
 * Returns 0 on success or when there is nothing to stash, -EINVAL when
 * there is no cache info or no usable path, -EIO on short writes.
 */
static int hmdfs_flush_stash_file_metadata(struct hmdfs_inode_info *info)
{
	struct hmdfs_cache_info *cache = NULL;
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_cache_file_head cache_head;
	size_t written;
	loff_t pos;
	unsigned int head_size;

	/* No metadata if no cache file info */
	cache = info->cache;
	if (!cache)
		return -EINVAL;

	if (strlen(cache->path) == 0) {
		long long to_write_pgs = atomic64_read(&cache->to_write_pgs);

		/* Nothing to stash. No need to flush meta data. */
		if (to_write_pgs == 0)
			return 0;

		/* Pages were written but we cannot name the file: data lost */
		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx lost %lld pages due to no path",
			  conn->owner, conn->device_id,
			  info->remote_ino, to_write_pgs);
		return -EINVAL;
	}

	hmdfs_set_stash_file_head(cache, info->remote_ino, &cache_head);

	/* Write head */
	pos = 0;
	head_size = sizeof(cache_head);
	written = kernel_write(cache->cache_file, &cache_head, head_size, &pos);
	if (written != head_size) {
		hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx write head len %u err %zd",
			   conn->owner, conn->device_id, info->remote_ino,
			   head_size, written);
		return -EIO;
	}
	/* Write path */
	pos = (loff_t)cache->path_offs << HMDFS_STASH_BLK_SHIFT;
	written = kernel_write(cache->cache_file, cache->path, cache->path_len,
			       &pos);
	if (written != cache->path_len) {
		hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx write path len %u err %zd",
			   conn->owner, conn->device_id, info->remote_ino,
			   cache->path_len, written);
		return -EIO;
	}

	return 0;
}
335 
/* Mainly from inode_wait_for_writeback() */
/*
 * If the inode is currently under writeback (I_SYNC set), block until
 * that single in-flight writeback pass completes. Does not wait for
 * writeback started after this check.
 */
static void hmdfs_wait_remote_writeback_once(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	struct inode *inode = &info->vfs_inode;
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wq_head = NULL;
	bool in_sync = false;

	spin_lock(&inode->i_lock);
	in_sync = inode->i_state & I_SYNC;
	spin_unlock(&inode->i_lock);

	if (!in_sync)
		return;

	hmdfs_info("peer 0x%x:0x%llx ino 0x%llx wait for wb once",
		   conn->owner, conn->device_id, info->remote_ino);

	wq_head = bit_waitqueue(&inode->i_state, __I_SYNC);
	__wait_on_bit(wq_head, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
}
358 
/*
 * Consume (and log) any pending writeback error on the inode's mapping
 * so pre-stash failures are not misattributed to the stash itself.
 * Clears both the AS_EIO/AS_ENOSPC flags and the errseq-based error.
 */
static void hmdfs_reset_remote_write_err(struct hmdfs_peer *conn,
					 struct hmdfs_inode_info *info)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	int flags_err;
	errseq_t old;
	int wb_err;

	flags_err = filemap_check_errors(mapping);

	/* Sample then advance, marking the current error as seen */
	old = errseq_sample(&mapping->wb_err);
	wb_err = errseq_check_and_advance(&mapping->wb_err, &old);
	if (flags_err || wb_err)
		hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx wb error %d %d before stash",
			      conn->owner, conn->device_id, info->remote_ino,
			      flags_err, wb_err);
}
376 
/*
 * Atomically check that @mapping has neither dirty nor writeback pages.
 * The mapping lock makes the two tag checks one consistent snapshot.
 */
static bool hmdfs_is_mapping_clean(struct address_space *mapping)
{
	bool clean = false;

	/* b93b016313b3b ("page cache: use xa_lock") introduces i_pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
	xa_lock_irq(&mapping->i_pages);
#else
	spin_lock_irq(&mapping->tree_lock);
#endif
	clean = !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
		!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
	xa_unlock_irq(&mapping->i_pages);
#else
	spin_unlock_irq(&mapping->tree_lock);
#endif
	return clean;
}
396 
/*
 * Push all of the inode's dirty page cache into the stash file.
 *
 * Ordering matters: wait for in-flight write(2) (via inode lock),
 * drain one racing writeback pass, clear stale errors, then run
 * filemap_write_and_wait() twice to catch writeback->redirty pages.
 * Returns 0 on success or the first writeback error.
 */
static int hmdfs_flush_stash_file_data(struct hmdfs_peer *conn,
				       struct hmdfs_inode_info *info)
{
	struct inode *inode = &info->vfs_inode;
	struct address_space *mapping = inode->i_mapping;
	bool all_clean = true;
	int err = 0;
	int i;

	/* Wait for the completion of write syscall */
	inode_lock(inode);
	inode_unlock(inode);

	all_clean = hmdfs_is_mapping_clean(mapping);
	if (all_clean) {
		hmdfs_reset_remote_write_err(conn, info);
		return 0;
	}

	/*
	 * No-sync_all writeback during offline may have not seen
	 * the setting of stash_status as HMDFS_REMOTE_INODE_STASHING
	 * and will call mapping_set_error() after we just reset
	 * the previous error. So waiting for these writeback once,
	 * and the following writeback will do local write.
	 */
	hmdfs_wait_remote_writeback_once(conn, info);

	/* Need to clear previous error ? */
	hmdfs_reset_remote_write_err(conn, info);

	/*
	 * 1. dirty page: do write back
	 * 2. writeback page: wait for its completion
	 * 3. writeback -> redirty page: do filemap_write_and_wait()
	 *    twice, so 2th writeback should not allow
	 *    writeback -> redirty transition
	 */
	for (i = 0; i < HMDFS_STASH_FLUSH_CNT; i++) {
		err = filemap_write_and_wait(mapping);
		if (err) {
			hmdfs_err("peer 0x%x:0x%llx inode 0x%llx #%d stash flush error %d",
				  conn->owner, conn->device_id,
				  info->remote_ino, i, err);
			return err;
		}
	}

	/* Best effort only: log if pages are somehow still outstanding */
	if (!hmdfs_is_mapping_clean(mapping))
		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx is still dirty dt %d wb %d",
			  conn->owner, conn->device_id, info->remote_ino,
			  !!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY),
			  !!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK));

	return 0;
}
453 
hmdfs_flush_stash_file(struct hmdfs_inode_info * info)454 static int hmdfs_flush_stash_file(struct hmdfs_inode_info *info)
455 {
456 	int err;
457 
458 	err = hmdfs_flush_stash_file_data(info->conn, info);
459 	if (!err)
460 		err = hmdfs_flush_stash_file_metadata(info);
461 
462 	return err;
463 }
464 
/*
 * Make the anonymous stash tmpfile visible by hard-linking it to
 * "0x<remote_ino>" in its directory.
 *
 * A stale entry with the same name is unlinked and the lookup retried
 * once; a second collision yields -EEXIST. The parent inode lock held
 * via lock_parent() covers lookup, unlink, and link.
 */
static int hmdfs_enable_stash_file(struct hmdfs_inode_info *info,
				   struct dentry *stash)
{
	char name[HMDFS_STASH_FILE_NAME_LEN];
	struct dentry *parent = NULL;
	struct inode *dir = NULL;
	struct dentry *child = NULL;
	int err = 0;
	bool retried = false;

	/* "0x" + 16 hex digits fits HMDFS_STASH_FILE_NAME_LEN (20) */
	snprintf(name, sizeof(name), "0x%llx", info->remote_ino);

	parent = lock_parent(stash);
	dir = d_inode(parent);

lookup_again:
	child = lookup_one_len(name, parent, strlen(name));
	if (IS_ERR(child)) {
		err = PTR_ERR(child);
		child = NULL;
		hmdfs_err("lookup %s err %d", name, err);
		goto out;
	}

	if (d_is_positive(child)) {
		hmdfs_warning("%s exists (mode 0%o)",
			      name, d_inode(child)->i_mode);

		err = vfs_unlink(dir, child, NULL);
		if (err) {
			hmdfs_err("unlink %s err %d", name, err);
			goto out;
		}
		/* Give up after one unlink+retry cycle */
		if (retried) {
			err = -EEXIST;
			goto out;
		}

		retried = true;
		dput(child);
		goto lookup_again;
	}

	err = vfs_link(stash, dir, child, NULL);
	if (err) {
		hmdfs_err("link stash file to %s err %d", name, err);
		goto out;
	}

out:
	unlock_dir(parent);
	if (child)
		dput(child);

	return err;
}
521 
/* Return 1 if stash is done, 0 if nothing is stashed */
/*
 * Finalize the stash of one inode: if any pages were queued for write,
 * fsync the cache file and link it into the stash directory.
 * Negative return is an error from fsync or the link step.
 */
static int hmdfs_close_stash_file(struct hmdfs_peer *conn,
				  struct hmdfs_inode_info *info)
{
	struct file *cache_file = info->cache->cache_file;
	struct dentry *c_dentry = file_dentry(cache_file);
	struct inode *c_inode = d_inode(c_dentry);
	long long to_write_pgs = atomic64_read(&info->cache->to_write_pgs);
	int err;

	hmdfs_info("peer 0x%x:0x%llx inode 0x%llx stashed bytes %lld pages %lld",
		   conn->owner, conn->device_id, info->remote_ino,
		   i_size_read(c_inode), to_write_pgs);

	/* Nothing was written: leave the tmpfile unlinked and report 0 */
	if (to_write_pgs == 0)
		return 0;

	err = vfs_fsync(cache_file, 0);
	if (!err)
		err = hmdfs_enable_stash_file(info, c_dentry);
	else
		hmdfs_err("fsync stash file err %d", err);

	return err < 0 ? err : 1;
}
547 
hmdfs_del_file_cache(struct hmdfs_cache_info * cache)548 static void hmdfs_del_file_cache(struct hmdfs_cache_info *cache)
549 {
550 	if (!cache)
551 		return;
552 
553 	fput(cache->cache_file);
554 	kfree(cache->path_buf);
555 	kfree(cache);
556 }
557 
/*
 * Allocate and initialize the per-inode stash cache descriptor:
 * resolve the inode's device-view path (empty if no dentry remains),
 * compute the block-aligned head/path/data layout, and create the
 * backing tmpfile. Returns the descriptor or ERR_PTR().
 */
static struct hmdfs_cache_info *
hmdfs_new_file_cache(struct hmdfs_peer *conn, struct hmdfs_inode_info *info)
{
	struct hmdfs_cache_info *cache = NULL;
	struct dentry *stash_dentry = NULL;
	int err;

	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	atomic64_set(&cache->to_write_pgs, 0);
	atomic64_set(&cache->written_pgs, 0);
	cache->path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!cache->path_buf) {
		err = -ENOMEM;
		goto free_cache;
	}

	/* Need to handle "hardlink" ? */
	stash_dentry = d_find_any_alias(&info->vfs_inode);
	if (stash_dentry) {
		/* Needs full path in hmdfs, will be a device-view path */
		cache->path = dentry_path_raw(stash_dentry, cache->path_buf,
					      PATH_MAX);
		dput(stash_dentry);
		if (IS_ERR(cache->path)) {
			err = PTR_ERR(cache->path);
			hmdfs_err("peer 0x%x:0x%llx inode 0x%llx gen path err %d",
				  conn->owner, conn->device_id,
				  info->remote_ino, err);
			goto free_path;
		}
	} else {
		/* Write-opened file was closed before finding dentry */
		hmdfs_info("peer 0x%x:0x%llx inode 0x%llx no dentry found",
			   conn->owner, conn->device_id, info->remote_ino);
		cache->path_buf[0] = '\0';
		cache->path = cache->path_buf;
	}

	/* Layout: head block(s), then path block(s), then data */
	cache->path_cnt = 1;
	cache->path_len = strlen(cache->path) + 1;
	cache->path_offs = DIV_ROUND_UP(sizeof(struct hmdfs_cache_file_head),
					HMDFS_STASH_BLK_SIZE);
	cache->data_offs = cache->path_offs + DIV_ROUND_UP(cache->path_len,
					HMDFS_STASH_BLK_SIZE);
	cache->cache_file = hmdfs_new_stash_file(&conn->sbi->stash_work_dir,
						 conn->cid);
	if (IS_ERR(cache->cache_file)) {
		err = PTR_ERR(cache->cache_file);
		goto free_path;
	}

	return cache;

free_path:
	kfree(cache->path_buf);
free_cache:
	kfree(cache);
	return ERR_PTR(err);
}
620 
/*
 * Attach a (possibly NULL) stash cache to @info and flip the inode into
 * HMDFS_REMOTE_INODE_STASHING under stash_lock, which releases any
 * write() path waiting on the status.
 */
static void hmdfs_init_stash_file_cache(struct hmdfs_peer *conn,
					struct hmdfs_inode_info *info)
{
	struct hmdfs_cache_info *cache = NULL;

	cache = hmdfs_new_file_cache(conn, info);
	if (IS_ERR(cache))
		/*
		 * Continue even creating stash info failed.
		 * We need to ensure there is no dirty pages
		 * after stash completes
		 */
		cache = NULL;

	/* Make write() returns */
	spin_lock(&info->stash_lock);
	info->cache = cache;
	info->stash_status = HMDFS_REMOTE_INODE_STASHING;
	spin_unlock(&info->stash_lock);
}
641 
hmdfs_update_stash_stats(struct hmdfs_stash_stats * stats,const struct hmdfs_cache_info * cache,int err)642 static void hmdfs_update_stash_stats(struct hmdfs_stash_stats *stats,
643 				     const struct hmdfs_cache_info *cache,
644 				     int err)
645 {
646 	unsigned long long ok_pages, fail_pages;
647 
648 	if (cache) {
649 		ok_pages = err > 0 ? atomic64_read(&cache->written_pgs) : 0;
650 		fail_pages = atomic64_read(&cache->to_write_pgs) - ok_pages;
651 		stats->ok_pages += ok_pages;
652 		stats->fail_pages += fail_pages;
653 	}
654 
655 	if (err > 0)
656 		stats->succeed++;
657 	else if (!err)
658 		stats->donothing++;
659 	else
660 		stats->fail++;
661 }
662 
/* Return 1 if stash is done, 0 if nothing is stashed */
/*
 * Stash one remote inode: flush its pages and metadata, publish the
 * stash file, then move stash_status to RESTORING (stashed) or NONE.
 * On <= 0 the fid must be re-opened after reconnect, so set
 * HMDFS_FID_NEED_OPEN before the status is released.
 */
static int hmdfs_stash_remote_inode(struct hmdfs_inode_info *info,
				    struct hmdfs_stash_stats *stats)
{
	struct hmdfs_cache_info *cache = info->cache;
	struct hmdfs_peer *conn = info->conn;
	unsigned int status;
	int err = 0;

	hmdfs_info("stash peer 0x%x:0x%llx ino 0x%llx",
		   conn->owner, conn->device_id, info->remote_ino);

	err = hmdfs_flush_stash_file(info);
	if (!err)
		err = hmdfs_close_stash_file(conn, info);

	if (err <= 0)
		set_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
	status = err > 0 ? HMDFS_REMOTE_INODE_RESTORING :
			   HMDFS_REMOTE_INODE_NONE;
	spin_lock(&info->stash_lock);
	info->cache = NULL;
	/*
	 * Use smp_store_release() to ensure order between HMDFS_FID_NEED_OPEN
	 * and HMDFS_REMOTE_INODE_NONE.
	 */
	smp_store_release(&info->stash_status, status);
	spin_unlock(&info->stash_lock);

	hmdfs_update_stash_stats(stats, cache, err);
	hmdfs_del_file_cache(cache);

	return err;
}
697 
/*
 * Initialize the stash cache for every inode on @list, running with the
 * superblock's credentials so files can be created under stash_work_dir.
 */
static void hmdfs_init_cache_for_stash_files(struct hmdfs_peer *conn,
					     struct list_head *list)
{
	const struct cred *old_cred = NULL;
	struct hmdfs_inode_info *info = NULL;

	/* For file creation under stash_work_dir */
	old_cred = hmdfs_override_creds(conn->sbi->cred);
	list_for_each_entry(info, list, stash_node)
		hmdfs_init_stash_file_cache(conn, info);
	hmdfs_revert_creds(old_cred);
}
710 
hmdfs_init_stash_cache_work_fn(struct work_struct * base)711 static void hmdfs_init_stash_cache_work_fn(struct work_struct *base)
712 {
713 	struct hmdfs_stash_work *work =
714 		container_of(base, struct hmdfs_stash_work, work);
715 
716 	hmdfs_init_cache_for_stash_files(work->conn, work->list);
717 	complete(&work->done);
718 }
719 
/*
 * Run hmdfs_init_cache_for_stash_files() on the system workqueue and
 * wait for it. The work item and completion live on this stack frame,
 * which is safe because we block until the work signals @done.
 */
static void hmdfs_init_cache_for_stash_files_by_work(struct hmdfs_peer *conn,
						     struct list_head *list)
{
	struct hmdfs_stash_work work = {
		.conn = conn,
		.list = list,
		.done = COMPLETION_INITIALIZER_ONSTACK(work.done),
	};

	INIT_WORK_ONSTACK(&work.work, hmdfs_init_stash_cache_work_fn);
	schedule_work(&work.work);
	wait_for_completion(&work.done);
}
733 
/*
 * Collect write-opened inodes that are ready to stash (status NONE)
 * onto @list. Each collected inode gets an extra wr-opened count and an
 * inode reference so close()/eviction cannot race with the stash.
 * With @check, warn about inodes already unexpectedly in STASHING.
 */
static void hmdfs_stash_fetch_ready_files(struct hmdfs_peer *conn,
					  bool check, struct list_head *list)
{
	struct hmdfs_inode_info *info = NULL;

	spin_lock(&conn->wr_opened_inode_lock);
	list_for_each_entry(info, &conn->wr_opened_inode_list, wr_opened_node) {
		int status;

		/* Paired with *_release() in hmdfs_reset_stashed_inode() */
		status = smp_load_acquire(&info->stash_status);
		if (status == HMDFS_REMOTE_INODE_NONE) {
			list_add_tail(&info->stash_node, list);
			/*
			 * Prevent close() removing the inode from
			 * writeable-opened inode list
			 */
			hmdfs_remote_add_wr_opened_inode_nolock(conn, info);
			/* Prevent the inode from eviction */
			ihold(&info->vfs_inode);
		} else if (check && status == HMDFS_REMOTE_INODE_STASHING) {
			hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx unexpected stash status %d",
				      conn->owner, conn->device_id,
				      info->remote_ino, status);
		}
	}
	spin_unlock(&conn->wr_opened_inode_lock);
}
762 
hmdfs_stash_offline_prepare(struct hmdfs_peer * conn,int evt,unsigned int seq)763 static void hmdfs_stash_offline_prepare(struct hmdfs_peer *conn, int evt,
764 					unsigned int seq)
765 {
766 	LIST_HEAD(preparing);
767 
768 	if (!hmdfs_is_stash_enabled(conn->sbi))
769 		return;
770 
771 	mutex_lock(&conn->offline_cb_lock);
772 
773 	hmdfs_stash_fetch_ready_files(conn, true, &preparing);
774 
775 	if (list_empty(&preparing))
776 		goto out;
777 
778 	hmdfs_init_cache_for_stash_files_by_work(conn, &preparing);
779 out:
780 	mutex_unlock(&conn->offline_cb_lock);
781 }
782 
/*
 * Add @info to the peer's stashed-inode list under stashed_inode_lock
 * ("locked" refers to the locking done here, not a caller requirement).
 */
static void hmdfs_track_inode_locked(struct hmdfs_peer *conn,
				     struct hmdfs_inode_info *info)
{
	spin_lock(&conn->stashed_inode_lock);
	list_add_tail(&info->stash_node, &conn->stashed_inode_list);
	conn->stashed_inode_nr++;
	spin_unlock(&conn->stashed_inode_lock);
}
791 
792 static void
hmdfs_update_peer_stash_stats(struct hmdfs_stash_statistics * stash_stats,const struct hmdfs_stash_stats * stats)793 hmdfs_update_peer_stash_stats(struct hmdfs_stash_statistics *stash_stats,
794 			      const struct hmdfs_stash_stats *stats)
795 {
796 	stash_stats->cur_ok = stats->succeed;
797 	stash_stats->cur_nothing = stats->donothing;
798 	stash_stats->cur_fail = stats->fail;
799 	stash_stats->total_ok += stats->succeed;
800 	stash_stats->total_nothing += stats->donothing;
801 	stash_stats->total_fail += stats->fail;
802 	stash_stats->ok_pages += stats->ok_pages;
803 	stash_stats->fail_pages += stats->fail_pages;
804 }
805 
/*
 * Stash every inode on @list. Successful stashes move the inode onto
 * the peer's stashed-inode list (keeping its inode reference); failures
 * drop the reference taken in hmdfs_stash_fetch_ready_files(). Either
 * way the extra wr-opened count is released.
 */
static void hmdfs_stash_remote_inodes(struct hmdfs_peer *conn,
				      struct list_head *list)
{
	const struct cred *old_cred = NULL;
	struct hmdfs_inode_info *info = NULL;
	struct hmdfs_inode_info *next = NULL;
	struct hmdfs_stash_stats stats;

	/* For file creation, write and relink under stash_work_dir */
	old_cred = hmdfs_override_creds(conn->sbi->cred);

	memset(&stats, 0, sizeof(stats));
	list_for_each_entry_safe(info, next, list, stash_node) {
		int err;

		list_del_init(&info->stash_node);

		err = hmdfs_stash_remote_inode(info, &stats);
		if (err > 0)
			hmdfs_track_inode_locked(conn, info);

		hmdfs_remote_del_wr_opened_inode(conn, info);
		if (err <= 0)
			iput(&info->vfs_inode);
	}
	hmdfs_revert_creds(old_cred);

	hmdfs_update_peer_stash_stats(&conn->stats.stash, &stats);
	hmdfs_info("peer 0x%x:0x%llx total stashed %u cur ok %u none %u fail %u",
		   conn->owner, conn->device_id, conn->stashed_inode_nr,
		   stats.succeed, stats.donothing, stats.fail);
}
838 
/*
 * Offline callback that performs the actual stash. Called with
 * seq_lock held; we temporarily trade it for offline_cb_lock so
 * non-offline sync callbacks are not blocked while we stash, and
 * re-take seq_lock before returning as the caller expects.
 */
static void hmdfs_stash_offline_do_stash(struct hmdfs_peer *conn, int evt,
					 unsigned int seq)
{
	struct hmdfs_inode_info *info = NULL;
	LIST_HEAD(preparing);
	LIST_HEAD(stashing);

	if (!hmdfs_is_stash_enabled(conn->sbi))
		return;

	/* release seq_lock to prevent blocking no-offline sync cb */
	mutex_unlock(&conn->seq_lock);
	/* acquire offline_cb_lock to serialized with offline sync cb */
	mutex_lock(&conn->offline_cb_lock);

	/* Pick up files the prepare phase may have missed */
	hmdfs_stash_fetch_ready_files(conn, false, &preparing);
	if (!list_empty(&preparing))
		hmdfs_init_cache_for_stash_files(conn, &preparing);

	spin_lock(&conn->wr_opened_inode_lock);
	list_for_each_entry(info, &conn->wr_opened_inode_list, wr_opened_node) {
		int status = READ_ONCE(info->stash_status);

		if (status == HMDFS_REMOTE_INODE_STASHING)
			list_add_tail(&info->stash_node, &stashing);
	}
	spin_unlock(&conn->wr_opened_inode_lock);

	if (list_empty(&stashing))
		goto unlock;

	hmdfs_stash_remote_inodes(conn, &stashing);

unlock:
	mutex_unlock(&conn->offline_cb_lock);
	mutex_lock(&conn->seq_lock);
}
876 
877 static struct hmdfs_inode_info *
hmdfs_lookup_stash_inode(struct hmdfs_peer * conn,uint64_t inum)878 hmdfs_lookup_stash_inode(struct hmdfs_peer *conn, uint64_t inum)
879 {
880 	struct hmdfs_inode_info *info = NULL;
881 
882 	list_for_each_entry(info, &conn->stashed_inode_list, stash_node) {
883 		if (info->remote_ino == inum)
884 			return info;
885 	}
886 
887 	return NULL;
888 }
889 
/*
 * Remove @info from the stashed-inode list and drop the inode
 * reference the stash held. Caller must hold the relevant locking
 * for the list (see callers).
 */
static void hmdfs_untrack_stashed_inode(struct hmdfs_peer *conn,
					struct hmdfs_inode_info *info)
{
	list_del_init(&info->stash_node);
	iput(&info->vfs_inode);

	conn->stashed_inode_nr--;
}
898 
/*
 * Untrack @info and return its stash_status to NONE. A temporary
 * ihold() keeps the inode alive across the iput() inside
 * hmdfs_untrack_stashed_inode() so the status store never touches a
 * freed inode; smp_store_release() orders the store after list removal.
 */
static void hmdfs_reset_stashed_inode(struct hmdfs_peer *conn,
				      struct hmdfs_inode_info *info)
{
	struct inode *ino = &info->vfs_inode;

	/*
	 * For updating stash_status after iput()
	 * in hmdfs_untrack_stashed_inode()
	 */
	ihold(ino);
	hmdfs_untrack_stashed_inode(conn, info);
	/*
	 * Ensure the order of stash_node and stash_status:
	 * only update stash_status to NONE after removal of
	 * stash_node is completed.
	 */
	smp_store_release(&info->stash_status,
			  HMDFS_REMOTE_INODE_NONE);
	iput(ino);
}
919 
/*
 * Give up on every inode still on the stashed list (restore failed or
 * was impossible): log each one and reset it back to the NONE state.
 */
static void hmdfs_drop_stashed_inodes(struct hmdfs_peer *conn)
{
	struct hmdfs_inode_info *info = NULL;
	struct hmdfs_inode_info *next = NULL;

	if (list_empty(&conn->stashed_inode_list))
		return;

	hmdfs_warning("peer 0x%x:0x%llx drop unrestorable file %u",
		      conn->owner, conn->device_id, conn->stashed_inode_nr);

	list_for_each_entry_safe(info, next,
				 &conn->stashed_inode_list, stash_node) {
		hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx unrestorable status %u",
			      conn->owner, conn->device_id, info->remote_ino,
			      READ_ONCE(info->stash_status));

		hmdfs_reset_stashed_inode(conn, info);
	}
}
940 
/*
 * Open the per-peer stash directory named @cid under @d_path read-only.
 * Returns the opened file or ERR_PTR(): -ENOENT when absent, -EINVAL
 * when the entry exists but is not a directory.
 */
static struct file *hmdfs_open_stash_dir(struct path *d_path, const char *cid)
{
	int err = 0;
	struct dentry *parent = d_path->dentry;
	struct inode *dir = d_inode(parent);
	struct dentry *child = NULL;
	struct path peer_path;
	struct file *filp = NULL;

	inode_lock_nested(dir, I_MUTEX_PARENT);
	child = lookup_one_len(cid, parent, strlen(cid));
	if (!IS_ERR(child)) {
		if (!hmdfs_is_dir(child)) {
			if (d_is_positive(child)) {
				hmdfs_err("invalid stash dir mode 0%o", d_inode(child)->i_mode);
				err = -EINVAL;
			} else {
				err = -ENOENT;
			}
			/* error paths drop the ref; success keeps it for open */
			dput(child);
		}
	} else {
		err = PTR_ERR(child);
		hmdfs_err("lookup stash dir err %d", err);
	}
	inode_unlock(dir);

	if (err)
		return ERR_PTR(err);

	peer_path.mnt = d_path->mnt;
	peer_path.dentry = child;
	filp = dentry_open(&peer_path, O_RDONLY | O_DIRECTORY, current_cred());
	if (IS_ERR(filp))
		hmdfs_err("open err %d", (int)PTR_ERR(filp));

	/* the open file (if any) holds its own reference */
	dput(child);

	return filp;
}
981 
hmdfs_new_inode_tbl(struct hmdfs_inode_tbl ** tbl)982 static int hmdfs_new_inode_tbl(struct hmdfs_inode_tbl **tbl)
983 {
984 	struct hmdfs_inode_tbl *new = NULL;
985 
986 	new = kmalloc(PAGE_SIZE, GFP_KERNEL);
987 	if (!new)
988 		return -ENOMEM;
989 
990 	new->cnt = 0;
991 	new->max = (PAGE_SIZE - offsetof(struct hmdfs_inode_tbl, inodes)) /
992 		   sizeof(new->inodes[0]);
993 	*tbl = new;
994 
995 	return 0;
996 }
997 
/*
 * Parse a stash directory entry name as a hex inode number.
 * Returns 1 with *stash_inum set for a valid stash file entry,
 * 0 when the entry should be skipped (wrong type, too long, or
 * not a hex number).
 */
static int hmdfs_parse_stash_file_name(struct dir_context *dctx,
					const char *name,
					int namelen,
					unsigned int d_type,
					uint64_t *stash_inum)
{
	struct hmdfs_stash_dir_context *ctx =
		container_of(dctx, struct hmdfs_stash_dir_context, dctx);
	int ret;

	if (d_type != DT_UNKNOWN && d_type != DT_REG)
		return 0;
	if (namelen > NAME_MAX)
		return 0;

	/* make a NUL-terminated copy for kstrtoull() */
	memcpy(ctx->name, name, namelen);
	ctx->name[namelen] = '\0';
	ret = kstrtoull(ctx->name, 16, stash_inum);
	if (!ret)
		return 1;

	hmdfs_err("unexpected stash file err %d", ret);
	return 0;
}
1022 
/*
 * Dir iteration actor: stop at the first entry that parses as a stash
 * file, counting it in ctx->tbl->cnt (returning 1 stops iterate_dir()).
 */
static int hmdfs_has_stash_file(struct dir_context *dctx, const char *name,
				int namelen, loff_t offset,
				u64 inum, unsigned int d_type)
{
	struct hmdfs_stash_dir_context *ctx =
		container_of(dctx, struct hmdfs_stash_dir_context, dctx);
	uint64_t stash_inum;

	if (!hmdfs_parse_stash_file_name(dctx, name, namelen,
					 d_type, &stash_inum))
		return 0;

	ctx->tbl->cnt++;
	return 1;
}
1040 
/*
 * Dir iteration actor: collect parsed stash inode numbers into the
 * table. Returns 1 (stop) when the table is full so the caller can
 * process the current batch before resuming.
 */
static int hmdfs_fill_stash_file(struct dir_context *dctx, const char *name,
				 int namelen, loff_t offset,
				 u64 inum, unsigned int d_type)
{
	struct hmdfs_stash_dir_context *ctx =
		container_of(dctx, struct hmdfs_stash_dir_context, dctx);
	uint64_t stash_inum;

	if (!hmdfs_parse_stash_file_name(dctx, name, namelen,
					 d_type, &stash_inum))
		return 0;
	if (ctx->tbl->cnt >= ctx->tbl->max)
		return 1;

	ctx->tbl->inodes[ctx->tbl->cnt++] = stash_inum;

	return 0;
}
1061 
/*
 * Unlink stash file @child from directory @parent.
 * Returns 0 on success or the vfs_unlink() error code.
 */
static int hmdfs_del_stash_file(struct dentry *parent, struct dentry *child)
{
	struct inode *dir = d_inode(parent);
	int err = 0;

	/* Prevent d_delete() from calling dentry_unlink_inode() */
	dget(child);

	/* vfs_unlink() requires the parent dir inode locked */
	inode_lock_nested(dir, I_MUTEX_PARENT);
	err = vfs_unlink(dir, child, NULL);
	if (err)
		hmdfs_err("remove stash file err %d", err);
	inode_unlock(dir);

	dput(child);

	return err;
}
1080 
/*
 * Check whether the peer's event sequence changed since @seq was
 * sampled, i.e. the node went offline (and possibly online again)
 * in the meantime.
 */
static inline bool hmdfs_is_node_offlined(const struct hmdfs_peer *conn,
					  unsigned int seq)
{
	/*
	 * open()/fsync() may fail due to "status = NODE_STAT_OFFLINE"
	 * in hmdfs_disconnect_node().
	 * Pair with smp_mb() in hmdfs_disconnect_node() to ensure
	 * getting the newest event sequence.
	 */
	smp_mb__before_atomic();
	return hmdfs_node_evt_seq(conn) != seq;
}
1093 
/*
 * Validate a fully-read stash file head against the stash file itself.
 *
 * Checks in order: magic, CRC32 over the head bytes up to crc_offset,
 * stored inode number vs @ctx->inum, path offset (inside the file),
 * data offset (after the path, inside the file), recorded size vs the
 * stash file's i_size, and a non-zero path count.
 *
 * Returns 0 when consistent, -EUCLEAN on any mismatch.
 *
 * NOTE: crc_offset was already bounds-checked by the caller
 * (hmdfs_get_restore_file_metadata) before the full head was read,
 * so dereferencing the CRC field at that offset here is safe.
 */
static int hmdfs_verify_restore_file_head(struct hmdfs_file_restore_ctx *ctx,
				    const struct hmdfs_cache_file_head *head)
{
	struct inode *inode = file_inode(ctx->src_filp);
	struct hmdfs_peer *conn = ctx->conn;
	unsigned int crc, read_crc, crc_offset;
	loff_t path_offs, data_offs, isize;
	int err = 0;

	if (le32_to_cpu(head->magic) != HMDFS_STASH_FILE_HEAD_MAGIC) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid magic: got 0x%x, exp 0x%x",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->magic),
			  HMDFS_STASH_FILE_HEAD_MAGIC);
		goto out;
	}

	/* CRC covers all head bytes preceding the stored crc32 field */
	crc_offset = le32_to_cpu(head->crc_offset);
	read_crc = le32_to_cpu(*((__le32 *)((char *)head + crc_offset)));
	crc = crc32(0, head, crc_offset);
	if (read_crc != crc) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid crc: got 0x%x, exp 0x%x",
			  conn->owner, conn->device_id, ctx->inum,
			  read_crc, crc);
		goto out;
	}

	if (le64_to_cpu(head->ino) != ctx->inum) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid ino: got %llu, exp %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le64_to_cpu(head->ino), ctx->inum);
		goto out;
	}

	/* offsets are stored in units of HMDFS_STASH_BLK_SIZE blocks */
	path_offs = (loff_t)le32_to_cpu(head->path_offs) <<
		    HMDFS_STASH_BLK_SHIFT;
	if (path_offs <= 0 || path_offs >= i_size_read(inode)) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid path_offs %d, stash file size %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->path_offs), i_size_read(inode));
		goto out;
	}

	data_offs = (loff_t)le32_to_cpu(head->data_offs) <<
		    HMDFS_STASH_BLK_SHIFT;
	/* layout must be head < path < data */
	if (path_offs >= data_offs) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid data_offs %d, path_offs %d",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->data_offs),
			  le32_to_cpu(head->path_offs));
		goto out;
	}
	if (data_offs <= 0 || data_offs >= i_size_read(inode)) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid data_offs %d, stash file size %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->data_offs), i_size_read(inode));
		goto out;
	}

	isize = le64_to_cpu(head->size);
	if (isize != i_size_read(inode)) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid isize: got %llu, exp %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le64_to_cpu(head->size), i_size_read(inode));
		goto out;
	}

	if (le32_to_cpu(head->path_cnt) < 1) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid path_cnt %d",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->path_cnt));
		goto out;
	}

out:
	return err;
}
1179 
/*
 * Read and validate the metadata of a stash file.
 *
 * Performs a two-phase read: first only up to the crc_offset field
 * (enough to learn the real head length), then the full head covered
 * by the CRC. This keeps compatibility with older heads that may be
 * shorter than the current struct. After verification the restore
 * path string is read into ctx->dst.
 *
 * Fills ctx->pages, ctx->data_offs and ctx->dst on success.
 * Returns 0 on success, -ENODATA on short read, -EUCLEAN on corrupt
 * metadata, or a negative read error.
 */
static int hmdfs_get_restore_file_metadata(struct hmdfs_file_restore_ctx *ctx)
{
	struct hmdfs_cache_file_head head;
	struct hmdfs_peer *conn = ctx->conn;
	unsigned int head_size, read_size, head_crc_offset;
	loff_t pos;
	ssize_t rd;
	int err = 0;

	head_size = sizeof(struct hmdfs_cache_file_head);
	memset(&head, 0, head_size);
	/* Read part head */
	pos = 0;
	read_size = offsetof(struct hmdfs_cache_file_head, crc_offset) +
		    sizeof(head.crc_offset);
	rd = kernel_read(ctx->src_filp, &head, read_size, &pos);
	if (rd != read_size) {
		err = rd < 0 ? rd : -ENODATA;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read part head err %d",
			  conn->owner, conn->device_id, ctx->inum, err);
		goto out;
	}
	head_crc_offset = le32_to_cpu(head.crc_offset);
	/* first clause guards against unsigned wrap-around of the sum */
	if (head_crc_offset + sizeof(head.crc32) < head_crc_offset ||
	    head_crc_offset + sizeof(head.crc32) > head_size) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx got bad head: Too long crc_offset %u which exceeds head size %u",
			  conn->owner, conn->device_id, ctx->inum,
			  head_crc_offset, head_size);
		goto out;
	}

	/* Read full head */
	pos = 0;
	read_size = le32_to_cpu(head.crc_offset) + sizeof(head.crc32);
	rd = kernel_read(ctx->src_filp, &head, read_size, &pos);
	if (rd != read_size) {
		err = rd < 0 ? rd : -ENODATA;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read full head err %d",
			  conn->owner, conn->device_id, ctx->inum, err);
		goto out;
	}

	err = hmdfs_verify_restore_file_head(ctx, &head);
	if (err)
		goto out;

	/* blocks are 512-byte sectors; convert to page count */
	ctx->pages = le64_to_cpu(head.blocks) >>
		     HMDFS_STASH_PAGE_TO_SECTOR_SHIFT;
	ctx->data_offs = le32_to_cpu(head.data_offs);
	/* Read path */
	read_size = min_t(unsigned int, le32_to_cpu(head.path_len), PATH_MAX);
	pos = (loff_t)le32_to_cpu(head.path_offs) << HMDFS_STASH_BLK_SHIFT;
	rd = kernel_read(ctx->src_filp, ctx->dst, read_size, &pos);
	if (rd != read_size) {
		err = rd < 0 ? rd : -ENODATA;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read path err %d",
			  conn->owner, conn->device_id, ctx->inum, err);
		goto out;
	}
	/* the stored path must be NUL-terminated within read_size bytes */
	if (strnlen(ctx->dst, read_size) >= read_size) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read path not end with \\0",
			  conn->owner, conn->device_id, ctx->inum);
		goto out;
	}
	/* TODO: Pick a valid path from all paths */

out:
	return err;
}
1251 
hmdfs_open_restore_dst_file(struct hmdfs_file_restore_ctx * ctx,unsigned int rw_flag,struct file ** filp)1252 static int hmdfs_open_restore_dst_file(struct hmdfs_file_restore_ctx *ctx,
1253 				       unsigned int rw_flag, struct file **filp)
1254 {
1255 	struct hmdfs_peer *conn = ctx->conn;
1256 	struct file *dst = NULL;
1257 	int err = 0;
1258 
1259 	err = hmdfs_get_restore_file_metadata(ctx);
1260 	if (err)
1261 		goto out;
1262 
1263 	/* Error comes from connection or server ? */
1264 	dst = file_open_root(&ctx->dst_root_path,
1265 			     ctx->dst, O_LARGEFILE | rw_flag, 0);
1266 	if (IS_ERR(dst)) {
1267 		err = PTR_ERR(dst);
1268 		hmdfs_err("open remote file ino 0x%llx err %d", ctx->inum, err);
1269 		if (hmdfs_is_node_offlined(conn, ctx->seq))
1270 			err = -ESHUTDOWN;
1271 		goto out;
1272 	}
1273 
1274 	*filp = dst;
1275 out:
1276 	return err;
1277 }
1278 
hmdfs_need_abort_restore(struct hmdfs_file_restore_ctx * ctx,struct hmdfs_inode_info * pinned,struct file * opened_file)1279 static bool hmdfs_need_abort_restore(struct hmdfs_file_restore_ctx *ctx,
1280 				     struct hmdfs_inode_info *pinned,
1281 				     struct file *opened_file)
1282 {
1283 	struct hmdfs_inode_info *opened = hmdfs_i(file_inode(opened_file));
1284 
1285 	if (opened->inode_type != HMDFS_LAYER_OTHER_REMOTE)
1286 		goto abort;
1287 
1288 	if (opened == pinned)
1289 		return false;
1290 
1291 abort:
1292 	hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx invalid remote file",
1293 		      ctx->conn->owner, ctx->conn->device_id, ctx->inum);
1294 	hmdfs_warning("got: peer 0x%x:0x%llx inode 0x%llx type %d status %d",
1295 		      opened->conn ? opened->conn->owner : 0,
1296 		      opened->conn ? opened->conn->device_id : 0,
1297 		      opened->remote_ino, opened->inode_type,
1298 		      opened->stash_status);
1299 	hmdfs_warning("pinned: peer 0x%x:0x%llx inode 0x%llx type %d status %d",
1300 		      pinned->conn->owner, pinned->conn->device_id,
1301 		      pinned->remote_ino, pinned->inode_type,
1302 		      pinned->stash_status);
1303 	return true;
1304 }
1305 
/* Populate the copy arguments from the restore context and dst file. */
static void hmdfs_init_copy_args(const struct hmdfs_file_restore_ctx *ctx,
				 struct file *dst, struct hmdfs_copy_args *args)
{
	args->src = ctx->src_filp;
	args->dst = dst;
	/* ctx->page is a one-page bounce buffer for read/write */
	args->buf = ctx->page;
	args->buf_len = PAGE_SIZE;
	args->seq = ctx->seq;
	args->data_offs = ctx->data_offs;
	args->inum = ctx->inum;
}
1317 
/*
 * Write @len bytes from the kernel buffer @buf to the remote file at
 * @pos through the hmdfs remote write-iter path.
 *
 * force_uaccess_begin()/end() lets the write path accept a
 * kernel-space buffer via the iov machinery for this window.
 *
 * Returns 0 on full write, the negative write error, or -EFAULT on a
 * short write.
 */
static ssize_t hmdfs_write_dst(struct hmdfs_peer *conn, struct file *filp,
			       void *buf, size_t len, loff_t pos)
{
	mm_segment_t old_fs;
	struct kiocb kiocb;
	struct iovec iov;
	struct iov_iter iter;
	ssize_t wr;
	int err = 0;

	/* take write access on the superblock, paired with file_end_write() */
	file_start_write(filp);

	old_fs = force_uaccess_begin();

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = pos;

	iov.iov_base = buf;
	iov.iov_len = len;
	iov_iter_init(&iter, WRITE, &iov, 1, len);

	/* nocheck variant: skip the remote-file validity re-check */
	wr = hmdfs_file_write_iter_remote_nocheck(&kiocb, &iter);

	force_uaccess_end(old_fs);

	file_end_write(filp);

	if (wr != len) {
		struct hmdfs_inode_info *info = hmdfs_i(file_inode(filp));

		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx short write ret %zd exp %zu",
			  conn->owner, conn->device_id, info->remote_ino,
			  wr, len);
		err = wr < 0 ? (int)wr : -EFAULT;
	}

	return err;
}
1356 
hmdfs_rd_src_wr_dst(struct hmdfs_peer * conn,struct hmdfs_copy_ctx * ctx)1357 static int hmdfs_rd_src_wr_dst(struct hmdfs_peer *conn,
1358 			       struct hmdfs_copy_ctx *ctx)
1359 {
1360 	const struct hmdfs_copy_args *args = NULL;
1361 	int err = 0;
1362 	loff_t rd_pos;
1363 	ssize_t rd;
1364 
1365 	ctx->eof = false;
1366 	ctx->copied = 0;
1367 
1368 	args = &ctx->args;
1369 	rd_pos = ctx->src_pos;
1370 	rd = kernel_read(args->src, args->buf, args->buf_len, &rd_pos);
1371 	if (rd < 0) {
1372 		err = (int)rd;
1373 		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx short read err %d",
1374 			  conn->owner, conn->device_id, args->inum, err);
1375 		goto out;
1376 	} else if (rd == 0) {
1377 		ctx->eof = true;
1378 		goto out;
1379 	}
1380 
1381 	err = hmdfs_write_dst(conn, args->dst, args->buf, rd, ctx->dst_pos);
1382 	if (!err)
1383 		ctx->copied = rd;
1384 	else if (hmdfs_is_node_offlined(conn, args->seq))
1385 		err = -ESHUTDOWN;
1386 out:
1387 	return err;
1388 }
1389 
/*
 * Copy the stashed data region into the remote file, skipping holes.
 *
 * Uses vfs_llseek(SEEK_DATA) to jump from one data extent to the
 * next: the stash file is sparse, so only the pages that were written
 * while offline need to be copied. Destination offsets equal source
 * offsets minus the start of the data region (data_offs blocks).
 *
 * Returns 0 on success (including clean -ENXIO end-of-data), or a
 * negative error from seek/read/write (-ESHUTDOWN on re-offline).
 */
static int hmdfs_copy_src_to_dst(struct hmdfs_peer *conn,
				 const struct hmdfs_copy_args *args)
{
	int err = 0;
	struct file *src = NULL;
	struct hmdfs_copy_ctx ctx;
	loff_t seek_pos, data_init_pos;
	loff_t src_size;

	ctx.args = *args;

	src = ctx.args.src;
	data_init_pos = (loff_t)ctx.args.data_offs << HMDFS_STASH_BLK_SHIFT;
	seek_pos = data_init_pos;
	src_size = i_size_read(file_inode(src));
	while (true) {
		loff_t data_pos;

		data_pos = vfs_llseek(src, seek_pos, SEEK_DATA);
		if (data_pos > seek_pos) {
			/* skipped over a hole; retry from the data start */
			seek_pos = data_pos;
			continue;
		} else if (data_pos < 0) {
			if (data_pos == -ENXIO) {
				/* no more data extents: normal termination */
				loff_t src_blks = file_inode(src)->i_blocks;

				hmdfs_info("peer 0x%x:0x%llx ino 0x%llx end at 0x%llx (sz 0x%llx blk 0x%llx)",
					   conn->owner, conn->device_id,
					   args->inum, seek_pos,
					   src_size, src_blks);
			} else {
				err = (int)data_pos;
				hmdfs_err("peer 0x%x:0x%llx ino 0x%llx seek pos 0x%llx err %d",
					  conn->owner, conn->device_id,
					  args->inum, seek_pos, err);
			}
			break;
		}

		hmdfs_debug("peer 0x%x:0x%llx ino 0x%llx seek to 0x%llx",
			    conn->owner, conn->device_id, args->inum, data_pos);

		ctx.src_pos = data_pos;
		ctx.dst_pos = data_pos - data_init_pos;
		err = hmdfs_rd_src_wr_dst(conn, &ctx);
		if (err || ctx.eof)
			break;

		seek_pos += ctx.copied;
		if (seek_pos >= src_size)
			break;
	}

	return err;
}
1445 
hmdfs_restore_src_to_dst(struct hmdfs_file_restore_ctx * ctx,struct file * dst)1446 static int hmdfs_restore_src_to_dst(struct hmdfs_file_restore_ctx *ctx,
1447 				    struct file *dst)
1448 {
1449 	struct file *src = ctx->src_filp;
1450 	struct hmdfs_copy_args args;
1451 	int err;
1452 
1453 	hmdfs_init_copy_args(ctx, dst, &args);
1454 	err = hmdfs_copy_src_to_dst(ctx->conn, &args);
1455 	if (err)
1456 		goto out;
1457 
1458 	err = vfs_fsync(dst, 0);
1459 	if (err) {
1460 		hmdfs_err("fsync remote file ino 0x%llx err %d", ctx->inum, err);
1461 		if (hmdfs_is_node_offlined(ctx->conn, ctx->seq))
1462 			err = -ESHUTDOWN;
1463 	}
1464 
1465 out:
1466 	if (err)
1467 		truncate_inode_pages(file_inode(dst)->i_mapping, 0);
1468 
1469 	/* Remove the unnecessary cache */
1470 	invalidate_mapping_pages(file_inode(src)->i_mapping, 0, -1);
1471 
1472 	return err;
1473 }
1474 
1475 
/*
 * Restore one stashed file to its remote counterpart.
 *
 * The inode must have been pinned (looked up into the stash list) with
 * status HMDFS_REMOTE_INODE_RESTORING by the rebuild phase. On
 * -ESHUTDOWN (peer went offline mid-restore) ctx->keep is set so the
 * caller preserves the stash file and the pinned inode for the next
 * online cycle; otherwise the pinned state is reset.
 *
 * Returns 0 on success or a negative error.
 */
static int hmdfs_restore_file(struct hmdfs_file_restore_ctx *ctx)
{
	struct hmdfs_peer *conn = ctx->conn;
	uint64_t inum = ctx->inum;
	struct hmdfs_inode_info *pinned_info = NULL;
	struct file *dst_filp = NULL;
	int err = 0;
	bool keep = false;

	hmdfs_info("peer 0x%x:0x%llx ino 0x%llx do restore",
		   conn->owner, conn->device_id, inum);

	pinned_info = hmdfs_lookup_stash_inode(conn, inum);
	if (pinned_info) {
		unsigned int status = READ_ONCE(pinned_info->stash_status);

		if (status != HMDFS_REMOTE_INODE_RESTORING) {
			hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid status %u",
				  conn->owner, conn->device_id, inum, status);
			err = -EINVAL;
			goto clean;
		}
	} else {
		hmdfs_warning("peer 0x%x:0x%llx ino 0x%llx doesn't being pinned",
			      conn->owner, conn->device_id, inum);
		err = -EINVAL;
		goto clean;
	}

	/* force a fresh remote open before writing restored data */
	set_bit(HMDFS_FID_NEED_OPEN, &pinned_info->fid_flags);
	err = hmdfs_open_restore_dst_file(ctx, O_RDWR, &dst_filp);
	if (err) {
		if (err == -ESHUTDOWN)
			keep = true;
		goto clean;
	}

	if (hmdfs_need_abort_restore(ctx, pinned_info, dst_filp))
		goto abort;

	err = hmdfs_restore_src_to_dst(ctx, dst_filp);
	if (err == -ESHUTDOWN)
		keep = true;
abort:
	fput(dst_filp);
clean:
	/* keep == true preserves pin + stash file for the next online */
	if (pinned_info && !keep)
		hmdfs_reset_stashed_inode(conn, pinned_info);
	ctx->keep = keep;

	hmdfs_info("peer 0x%x:0x%llx ino 0x%llx restore err %d keep %d",
		   conn->owner, conn->device_id, inum, err, ctx->keep);

	return err;
}
1531 
hmdfs_init_file_restore_ctx(struct hmdfs_peer * conn,unsigned int seq,struct path * src_dir,struct hmdfs_file_restore_ctx * ctx)1532 static int hmdfs_init_file_restore_ctx(struct hmdfs_peer *conn,
1533 				       unsigned int seq, struct path *src_dir,
1534 				       struct hmdfs_file_restore_ctx *ctx)
1535 {
1536 	struct hmdfs_sb_info *sbi = conn->sbi;
1537 	struct path dst_root;
1538 	char *dst = NULL;
1539 	char *page = NULL;
1540 	int err = 0;
1541 
1542 	err = hmdfs_get_path_in_sb(sbi->sb, sbi->real_dst, LOOKUP_DIRECTORY,
1543 				   &dst_root);
1544 	if (err)
1545 		return err;
1546 
1547 	dst = kmalloc(PATH_MAX, GFP_KERNEL);
1548 	if (!dst) {
1549 		err = -ENOMEM;
1550 		goto put_path;
1551 	}
1552 
1553 	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
1554 	if (!page) {
1555 		err = -ENOMEM;
1556 		goto free_dst;
1557 	}
1558 
1559 	ctx->conn = conn;
1560 	ctx->src_dir_path = *src_dir;
1561 	ctx->dst_root_path = dst_root;
1562 	ctx->dst = dst;
1563 	ctx->page = page;
1564 	ctx->seq = seq;
1565 
1566 	return 0;
1567 free_dst:
1568 	kfree(dst);
1569 put_path:
1570 	path_put(&dst_root);
1571 	return err;
1572 }
1573 
hmdfs_exit_file_restore_ctx(struct hmdfs_file_restore_ctx * ctx)1574 static void hmdfs_exit_file_restore_ctx(struct hmdfs_file_restore_ctx *ctx)
1575 {
1576 	path_put(&ctx->dst_root_path);
1577 	kfree(ctx->dst);
1578 	kfree(ctx->page);
1579 }
1580 
/*
 * Open the stash file @name inside directory @p_path read-only.
 *
 * Returns the opened file, or ERR_PTR (-ENOENT if missing, -EINVAL if
 * the name exists but is not a regular file, or a lookup/open error).
 */
static struct file *hmdfs_open_stash_file(struct path *p_path, char *name)
{
	struct dentry *parent = NULL;
	struct inode *dir = NULL;
	struct dentry *child = NULL;
	struct file *filp = NULL;
	struct path c_path;
	int err = 0;

	parent = p_path->dentry;
	dir = d_inode(parent);
	/* lookup_one_len() requires the parent inode held locked */
	inode_lock_nested(dir, I_MUTEX_PARENT);
	child = lookup_one_len(name, parent, strlen(name));
	if (!IS_ERR(child) && !hmdfs_is_reg(child)) {
		if (d_is_positive(child)) {
			hmdfs_err("invalid stash file (mode 0%o)",
				  d_inode(child)->i_mode);
			err = -EINVAL;
		} else {
			hmdfs_err("missing stash file");
			err = -ENOENT;
		}
		dput(child);
	} else if (IS_ERR(child)) {
		err = PTR_ERR(child);
		hmdfs_err("lookup stash file err %d", err);
	}
	inode_unlock(dir);

	if (err)
		return ERR_PTR(err);

	c_path.mnt = p_path->mnt;
	c_path.dentry = child;
	filp = dentry_open(&c_path, O_RDONLY | O_LARGEFILE, current_cred());
	if (IS_ERR(filp))
		hmdfs_err("open stash file err %d", (int)PTR_ERR(filp));

	/* dentry_open() takes its own reference; drop the lookup one */
	dput(child);

	return filp;
}
1623 
/* Account one finished restore attempt in @stats. */
static void hmdfs_update_restore_stats(struct hmdfs_restore_stats *stats,
				       bool keep, uint64_t pages, int err)
{
	if (err) {
		if (keep) {
			/* stash kept for the next online cycle */
			stats->keep++;
		} else {
			stats->fail++;
			stats->fail_pages += pages;
		}
	} else {
		stats->succeed++;
		stats->ok_pages += pages;
	}
}
1637 
/*
 * stash_operation_func: restore every stash file listed in @tbl.
 *
 * Failures on individual files are counted and skipped so one bad
 * file cannot block the rest; only -ESHUTDOWN (peer offline again)
 * stops the batch. Stash files that don't need keeping are unlinked
 * after the attempt. @priv points to hmdfs_restore_stats.
 */
static int hmdfs_restore_files(struct hmdfs_peer *conn,
			       unsigned int seq, struct path *dir,
			       const struct hmdfs_inode_tbl *tbl,
			       void *priv)
{
	unsigned int i;
	struct hmdfs_file_restore_ctx ctx;
	int err = 0;
	struct hmdfs_restore_stats *stats = priv;

	err = hmdfs_init_file_restore_ctx(conn, seq, dir, &ctx);
	if (err)
		return err;

	for (i = 0; i < tbl->cnt; i++) {
		char name[HMDFS_STASH_FILE_NAME_LEN];
		struct file *filp = NULL;

		snprintf(name, sizeof(name), "0x%llx", tbl->inodes[i]);
		filp = hmdfs_open_stash_file(dir, name);
		/* Continue to restore if any error */
		if (IS_ERR(filp)) {
			stats->fail++;
			continue;
		}

		ctx.inum = tbl->inodes[i];
		ctx.src_filp = filp;
		ctx.keep = false;
		ctx.pages = 0;
		err = hmdfs_restore_file(&ctx);
		hmdfs_update_restore_stats(stats, ctx.keep, ctx.pages, err);

		/* successful or unrecoverable: the stash file is spent */
		if (!ctx.keep)
			hmdfs_del_stash_file(dir->dentry,
					     file_dentry(ctx.src_filp));
		fput(ctx.src_filp);

		/* Continue to restore */
		if (err == -ESHUTDOWN)
			break;
		err = 0;
	}

	hmdfs_exit_file_restore_ctx(&ctx);

	return err;
}
1686 
hmdfs_is_valid_stash_status(struct hmdfs_inode_info * inode_info,uint64_t ino)1687 static bool hmdfs_is_valid_stash_status(struct hmdfs_inode_info *inode_info,
1688 					uint64_t ino)
1689 {
1690 	return (inode_info->inode_type == HMDFS_LAYER_OTHER_REMOTE &&
1691 		inode_info->stash_status == HMDFS_REMOTE_INODE_RESTORING &&
1692 		inode_info->remote_ino == ino);
1693 }
1694 
/*
 * stash_operation_func: re-pin every stashed inode listed in @tbl.
 *
 * For each stash file, read its metadata and open the remote file
 * read-only; the open path is what (re)creates and pins the inode in
 * RESTORING state. Entries that fail or resolve to an unexpected
 * inode are only counted. -ESHUTDOWN stops the batch; other per-file
 * errors are swallowed so the scan continues. @priv points to
 * hmdfs_rebuild_stats.
 */
static int hmdfs_rebuild_stash_list(struct hmdfs_peer *conn,
				    unsigned int seq,
				    struct path *dir,
				    const struct hmdfs_inode_tbl *tbl,
				    void *priv)
{
	struct hmdfs_file_restore_ctx ctx;
	unsigned int i;
	int err;
	struct hmdfs_rebuild_stats *stats = priv;

	err = hmdfs_init_file_restore_ctx(conn, seq, dir, &ctx);
	if (err)
		return err;

	stats->total += tbl->cnt;

	for (i = 0; i < tbl->cnt; i++) {
		char name[HMDFS_STASH_FILE_NAME_LEN];
		struct file *src_filp = NULL;
		struct file *dst_filp = NULL;
		struct hmdfs_inode_info *inode_info = NULL;
		bool is_valid = true;

		snprintf(name, sizeof(name), "0x%llx", tbl->inodes[i]);
		src_filp = hmdfs_open_stash_file(dir, name);
		if (IS_ERR(src_filp)) {
			stats->fail++;
			continue;
		}
		ctx.inum = tbl->inodes[i];
		ctx.src_filp = src_filp;

		/* No need to track the open which only needs meta info */
		err = hmdfs_open_restore_dst_file(&ctx, O_RDONLY, &dst_filp);
		if (err) {
			fput(src_filp);
			if (err == -ESHUTDOWN)
				break;
			stats->fail++;
			err = 0;
			continue;
		}

		inode_info = hmdfs_i(file_inode(dst_filp));
		is_valid = hmdfs_is_valid_stash_status(inode_info,
						       ctx.inum);
		if (is_valid) {
			stats->succeed++;
		} else {
			hmdfs_err("peer 0x%x:0x%llx inode 0x%llx invalid state: type: %d, status: %u, inode: %llu",
				  conn->owner, conn->device_id, ctx.inum,
				  inode_info->inode_type,
				  READ_ONCE(inode_info->stash_status),
				  inode_info->remote_ino);
			stats->invalid++;
		}

		fput(ctx.src_filp);
		fput(dst_filp);
	}

	hmdfs_exit_file_restore_ctx(&ctx);
	return err;
}
1760 
/*
 * Iterate the stash directory in batches and apply @op to each batch.
 *
 * A one-page inode table is filled via the hmdfs_fill_stash_file
 * actor; when it fills up, iterate_dir() stops, @op processes the
 * batch, and iteration resumes from the saved dctx.pos. Terminates
 * when the directory is exhausted or @op/iterate_dir() fails.
 */
static int hmdfs_iter_stash_file(struct hmdfs_peer *conn,
				 unsigned int seq,
				 struct file *filp,
				 stash_operation_func op,
				 void *priv)
{
	int err = 0;
	struct hmdfs_stash_dir_context ctx = {
		.dctx.actor = hmdfs_fill_stash_file,
	};
	struct hmdfs_inode_tbl *tbl = NULL;
	struct path dir;

	err = hmdfs_new_inode_tbl(&tbl);
	if (err)
		goto out;

	dir.mnt = filp->f_path.mnt;
	dir.dentry = file_dentry(filp);

	ctx.tbl = tbl;
	ctx.dctx.pos = 0;
	do {
		tbl->cnt = 0;
		err = iterate_dir(filp, &ctx.dctx);
		if (err || !tbl->cnt) {
			if (err)
				hmdfs_err("iterate stash dir err %d", err);
			break;
		}
		err = op(conn, seq, &dir, tbl, priv);
	} while (!err);

out:
	/* kfree(NULL) is safe when the allocation failed */
	kfree(tbl);
	return err;
}
1798 
/*
 * Workqueue function: check whether the peer's stash dir holds any
 * stash file; if so, flag the peer so the online path rebuilds its
 * stash inode list. Signals work->done when finished.
 */
static void hmdfs_rebuild_check_work_fn(struct work_struct *base)
{
	struct hmdfs_check_work *work =
		container_of(base, struct hmdfs_check_work, work);
	struct hmdfs_peer *conn = work->conn;
	struct hmdfs_sb_info *sbi = conn->sbi;
	struct file *filp = NULL;
	const struct cred *old_cred = NULL;
	struct hmdfs_stash_dir_context ctx = {
		.dctx.actor = hmdfs_has_stash_file,
	};
	struct hmdfs_inode_tbl tbl;
	int err;

	/* dir lookup/iteration needs the fs credentials */
	old_cred = hmdfs_override_creds(sbi->cred);
	filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid);
	if (IS_ERR(filp))
		goto out;

	memset(&tbl, 0, sizeof(tbl));
	ctx.tbl = &tbl;
	/* the actor stops at the first stash file found */
	err = iterate_dir(filp, &ctx.dctx);
	if (!err && ctx.tbl->cnt > 0)
		conn->need_rebuild_stash_list = true;

	fput(filp);
out:
	hmdfs_revert_creds(old_cred);
	hmdfs_info("peer 0x%x:0x%llx %sneed to rebuild stash list",
		   conn->owner, conn->device_id,
		   conn->need_rebuild_stash_list ? "" : "don't ");
	complete(&work->done);
}
1832 
hmdfs_stash_add_do_check(struct hmdfs_peer * conn,int evt,unsigned int seq)1833 static void hmdfs_stash_add_do_check(struct hmdfs_peer *conn, int evt,
1834 				     unsigned int seq)
1835 {
1836 	struct hmdfs_sb_info *sbi = conn->sbi;
1837 	struct hmdfs_check_work work = {
1838 		.conn = conn,
1839 		.done = COMPLETION_INITIALIZER_ONSTACK(work.done),
1840 	};
1841 
1842 	if (!hmdfs_is_stash_enabled(sbi))
1843 		return;
1844 
1845 	INIT_WORK_ONSTACK(&work.work, hmdfs_rebuild_check_work_fn);
1846 	schedule_work(&work.work);
1847 	wait_for_completion(&work.done);
1848 }
1849 
/* Fold one rebuild round's stats into the peer's rebuild statistics. */
static void
hmdfs_update_peer_rebuild_stats(struct hmdfs_rebuild_statistics *rebuild_stats,
				const struct hmdfs_rebuild_stats *stats)
{
	/* snapshot of the latest round */
	rebuild_stats->cur_ok = stats->succeed;
	rebuild_stats->cur_fail = stats->fail;
	rebuild_stats->cur_invalid = stats->invalid;
	/* running totals across all rounds */
	rebuild_stats->total_ok += stats->succeed;
	rebuild_stats->total_fail += stats->fail;
	rebuild_stats->total_invalid += stats->invalid;
}
1861 
/*
 * Online callback: rebuild the stash inode list for a peer that was
 * flagged by the add-time check.
 *
 * Called with conn->seq_lock held; the lock is dropped for the whole
 * scan (to avoid blocking non-online sync callbacks) and reacquired
 * before returning. On -ESHUTDOWN the rebuild-needed flag stays set
 * so the next online retries.
 */
static void hmdfs_stash_online_prepare(struct hmdfs_peer *conn, int evt,
				       unsigned int seq)
{
	struct hmdfs_sb_info *sbi = conn->sbi;
	struct file *filp = NULL;
	const struct cred *old_cred = NULL;
	int err;
	struct hmdfs_rebuild_stats stats;

	if (!hmdfs_is_stash_enabled(sbi) ||
	    !conn->need_rebuild_stash_list)
		return;

	/* release seq_lock to prevent blocking no-online sync cb */
	mutex_unlock(&conn->seq_lock);
	old_cred = hmdfs_override_creds(sbi->cred);
	filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid);
	if (IS_ERR(filp))
		goto out;

	memset(&stats, 0, sizeof(stats));
	err = hmdfs_iter_stash_file(conn, seq, filp,
				    hmdfs_rebuild_stash_list, &stats);
	if (err == -ESHUTDOWN) {
		hmdfs_info("peer 0x%x:0x%llx offline again during rebuild",
			   conn->owner, conn->device_id);
	} else {
		WRITE_ONCE(conn->need_rebuild_stash_list, false);
		if (err)
			hmdfs_warning("partial rebuild fail err %d", err);
	}

	hmdfs_update_peer_rebuild_stats(&conn->stats.rebuild, &stats);
	hmdfs_info("peer 0x%x:0x%llx rebuild stashed-file total %u succeed %u fail %u invalid %u",
		   conn->owner, conn->device_id, stats.total, stats.succeed,
		   stats.fail, stats.invalid);
	fput(filp);
out:
	conn->stats.rebuild.time++;
	hmdfs_revert_creds(old_cred);
	if (!READ_ONCE(conn->need_rebuild_stash_list)) {
		/*
		 * Use smp_mb__before_atomic() to ensure order between
		 * writing @conn->need_rebuild_stash_list and
		 * reading conn->rebuild_inode_status_nr.
		 */
		smp_mb__before_atomic();
		/*
		 * Wait until all inodes finish rebuilding stash status before
		 * accessing @conn->stashed_inode_list in restoring.
		 */
		wait_event(conn->rebuild_inode_status_wq,
			   !atomic_read(&conn->rebuild_inode_status_nr));
	}
	mutex_lock(&conn->seq_lock);
}
1919 
/* Fold one restore round's stats into the peer's restore statistics. */
static void
hmdfs_update_peer_restore_stats(struct hmdfs_restore_statistics *restore_stats,
				const struct hmdfs_restore_stats *stats)
{
	/* snapshot of the latest round */
	restore_stats->cur_ok = stats->succeed;
	restore_stats->cur_fail = stats->fail;
	restore_stats->cur_keep = stats->keep;
	/* running totals across all rounds */
	restore_stats->total_ok += stats->succeed;
	restore_stats->total_fail += stats->fail;
	restore_stats->total_keep += stats->keep;
	restore_stats->ok_pages += stats->ok_pages;
	restore_stats->fail_pages += stats->fail_pages;
}
1933 
/*
 * Online callback: restore all stashed files of the peer back to the
 * remote side.
 *
 * Skipped if the stash list still needs rebuilding (the prepare step
 * did not finish). Called with conn->seq_lock held; the lock is
 * dropped around the whole restore and reacquired at the end. Unless
 * the peer went offline again (-ESHUTDOWN), any still-pinned stashed
 * inodes are dropped afterwards.
 */
static void hmdfs_stash_online_do_restore(struct hmdfs_peer *conn, int evt,
					  unsigned int seq)
{
	struct hmdfs_sb_info *sbi = conn->sbi;
	struct file *filp = NULL;
	const struct cred *old_cred = NULL;
	struct hmdfs_restore_stats stats;
	int err = 0;

	if (!hmdfs_is_stash_enabled(sbi) || conn->need_rebuild_stash_list) {
		if (conn->need_rebuild_stash_list)
			hmdfs_info("peer 0x%x:0x%llx skip restoring due to rebuild-need",
				   conn->owner, conn->device_id);
		return;
	}

	/* release seq_lock to prevent blocking no-online sync cb */
	mutex_unlock(&conn->seq_lock);
	/* For dir iteration, file read and unlink */
	old_cred = hmdfs_override_creds(conn->sbi->cred);

	memset(&stats, 0, sizeof(stats));
	filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid);
	if (IS_ERR(filp)) {
		err = PTR_ERR(filp);
		goto out;
	}

	err = hmdfs_iter_stash_file(conn, seq, filp,
				    hmdfs_restore_files, &stats);

	fput(filp);
out:
	hmdfs_revert_creds(old_cred);

	/* offline again ? */
	if (err != -ESHUTDOWN)
		hmdfs_drop_stashed_inodes(conn);

	hmdfs_update_peer_restore_stats(&conn->stats.restore, &stats);
	hmdfs_info("peer 0x%x:0x%llx restore stashed-file ok %u fail %u keep %u",
		   conn->owner, conn->device_id,
		   stats.succeed, stats.fail, stats.keep);

	mutex_lock(&conn->seq_lock);
}
1980 
/*
 * Sync NODE_EVT_DEL callback: release all stash state attached to @conn.
 *
 * Two groups of inodes are cleaned up:
 *  1. write-opened inodes still in HMDFS_REMOTE_INODE_STASHING — their
 *     cache info is detached under stash_lock and freed, and the inode
 *     reference held for stashing is dropped;
 *  2. inodes pinned on @conn->stashed_inode_list — each is untracked.
 */
static void hmdfs_stash_del_do_cleanup(struct hmdfs_peer *conn, int evt,
				       unsigned int seq)
{
	struct hmdfs_inode_info *info = NULL;
	struct hmdfs_inode_info *next = NULL;
	unsigned int preparing;

	if (!hmdfs_is_stash_enabled(conn->sbi))
		return;

	/* Async cb is cancelled */
	preparing = 0;
	list_for_each_entry_safe(info, next, &conn->wr_opened_inode_list,
				 wr_opened_node) {
		int status = READ_ONCE(info->stash_status);

		if (status == HMDFS_REMOTE_INODE_STASHING) {
			struct hmdfs_cache_info *cache = NULL;

			/* Detach cache under stash_lock before freeing it */
			spin_lock(&info->stash_lock);
			cache = info->cache;
			info->cache = NULL;
			info->stash_status = HMDFS_REMOTE_INODE_NONE;
			spin_unlock(&info->stash_lock);

			hmdfs_remote_del_wr_opened_inode(conn, info);
			hmdfs_del_file_cache(cache);
			/* put inode after all access are completed */
			iput(&info->vfs_inode);
			preparing++;
		}
	}
	hmdfs_info("release %u preparing inodes", preparing);

	hmdfs_info("release %u pinned inodes", conn->stashed_inode_nr);
	if (list_empty(&conn->stashed_inode_list))
		return;

	list_for_each_entry_safe(info, next,
				 &conn->stashed_inode_list, stash_node)
		hmdfs_untrack_stashed_inode(conn, info);
}
2023 
hmdfs_exit_stash(struct hmdfs_sb_info * sbi)2024 void hmdfs_exit_stash(struct hmdfs_sb_info *sbi)
2025 {
2026 	if (!sbi->s_offline_stash)
2027 		return;
2028 
2029 	if (sbi->stash_work_dir.dentry) {
2030 		path_put(&sbi->stash_work_dir);
2031 		sbi->stash_work_dir.dentry = NULL;
2032 	}
2033 }
2034 
hmdfs_init_stash(struct hmdfs_sb_info * sbi)2035 int hmdfs_init_stash(struct hmdfs_sb_info *sbi)
2036 {
2037 	int err = 0;
2038 	struct path parent;
2039 	struct dentry *child = NULL;
2040 
2041 	if (!sbi->s_offline_stash)
2042 		return 0;
2043 
2044 	err = kern_path(sbi->cache_dir, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
2045 			&parent);
2046 	if (err) {
2047 		hmdfs_err("invalid cache dir err %d", err);
2048 		goto out;
2049 	}
2050 
2051 	child = hmdfs_stash_new_work_dir(parent.dentry);
2052 	if (!IS_ERR(child)) {
2053 		sbi->stash_work_dir.mnt = mntget(parent.mnt);
2054 		sbi->stash_work_dir.dentry = child;
2055 	} else {
2056 		err = PTR_ERR(child);
2057 		hmdfs_err("create stash work dir err %d", err);
2058 	}
2059 
2060 	path_put(&parent);
2061 out:
2062 	return err;
2063 }
2064 
/*
 * Write one dirty page of remote inode @info into the peer's local stash
 * file.
 *
 * The page offset is translated into the stash file layout by skipping
 * the header/path area — @cache->data_offs blocks of HMDFS_STASH_BLK_SIZE.
 * The write runs in NOFS allocation context (to avoid fs reentrancy from
 * reclaim) and under the superblock's credentials.
 *
 * Returns 0 on success, -EIO on a short or failed write.
 */
static int hmdfs_stash_write_local_file(struct hmdfs_peer *conn,
					struct hmdfs_inode_info *info,
					struct hmdfs_writepage_context *ctx,
					struct hmdfs_cache_info *cache)
{
	struct page *page = ctx->page;
	const struct cred *old_cred = NULL;
	void *buf = NULL;
	loff_t pos;
	unsigned int flags;
	ssize_t written;
	int err = 0;

	buf = kmap(page);
	pos = (loff_t)page->index << PAGE_SHIFT;
	/* enable NOFS for memory allocation */
	flags = memalloc_nofs_save();
	old_cred = hmdfs_override_creds(conn->sbi->cred);
	/* Data area starts data_offs blocks into the stash file */
	pos += cache->data_offs << HMDFS_STASH_BLK_SHIFT;
	written = kernel_write(cache->cache_file, buf, ctx->count, &pos);
	hmdfs_revert_creds(old_cred);
	memalloc_nofs_restore(flags);
	kunmap(page);

	if (written != ctx->count) {
		hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx page 0x%lx data_offs 0x%x len %u err %zd",
			  conn->owner, conn->device_id, info->remote_ino,
			  page->index, cache->data_offs, ctx->count, written);
		err = -EIO;
	}

	return err;
}
2098 
/*
 * Stash the page described by @ctx into the peer's local stash file
 * instead of sending it to the remote.
 *
 * On success the writepage context is completed and freed here (the
 * caller must not touch @ctx afterwards); on failure @ctx remains owned
 * by the caller.  Returns 0 on success, -EIO when no stash file exists
 * or the local write failed.
 */
int hmdfs_stash_writepage(struct hmdfs_peer *conn,
			  struct hmdfs_writepage_context *ctx)
{
	struct inode *inode = ctx->page->mapping->host;
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct hmdfs_cache_info *cache = NULL;
	int err;

	/* e.g. fail to create stash file */
	cache = info->cache;
	if (!cache)
		return -EIO;

	err = hmdfs_stash_write_local_file(conn, info, ctx, cache);
	if (!err) {
		hmdfs_client_writepage_done(info, ctx);
		atomic64_inc(&cache->written_pgs);
		put_task_struct(ctx->caller);
		kfree(ctx);
	}
	/* Count every attempt, successful or not */
	atomic64_inc(&cache->to_write_pgs);

	return err;
}
2123 
hmdfs_stash_rebuild_status(struct hmdfs_peer * conn,struct inode * inode)2124 static void hmdfs_stash_rebuild_status(struct hmdfs_peer *conn,
2125 				       struct inode *inode)
2126 {
2127 	char *path_str = NULL;
2128 	struct hmdfs_inode_info *info = NULL;
2129 	const struct cred *old_cred = NULL;
2130 	struct path path;
2131 	struct path *stash_path = NULL;
2132 	int err = 0;
2133 
2134 	path_str = kmalloc(HMDFS_STASH_PATH_LEN, GFP_KERNEL);
2135 	if (!path_str) {
2136 		err = -ENOMEM;
2137 		return;
2138 	}
2139 
2140 	info = hmdfs_i(inode);
2141 	err = snprintf(path_str, HMDFS_STASH_PATH_LEN, "%s/0x%llx",
2142 		       conn->cid, info->remote_ino);
2143 	if (err >= HMDFS_STASH_PATH_LEN) {
2144 		kfree(path_str);
2145 		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx too long name len",
2146 			  conn->owner, conn->device_id, info->remote_ino);
2147 		return;
2148 	}
2149 	old_cred = hmdfs_override_creds(conn->sbi->cred);
2150 	stash_path = &conn->sbi->stash_work_dir;
2151 	err = vfs_path_lookup(stash_path->dentry, stash_path->mnt,
2152 			      path_str, 0, &path);
2153 	hmdfs_revert_creds(old_cred);
2154 	if (!err) {
2155 		if (hmdfs_is_reg(path.dentry)) {
2156 			WRITE_ONCE(info->stash_status,
2157 				   HMDFS_REMOTE_INODE_RESTORING);
2158 			ihold(&info->vfs_inode);
2159 			hmdfs_track_inode_locked(conn, info);
2160 		} else {
2161 			hmdfs_info("peer 0x%x:0x%llx inode 0x%llx unexpected stashed file mode 0%o",
2162 				    conn->owner, conn->device_id,
2163 				    info->remote_ino,
2164 				    d_inode(path.dentry)->i_mode);
2165 		}
2166 
2167 		path_put(&path);
2168 	} else if (err && err != -ENOENT) {
2169 		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx find %s err %d",
2170 			   conn->owner, conn->device_id, info->remote_ino,
2171 			   path_str, err);
2172 	}
2173 
2174 	kfree(path_str);
2175 }
2176 
2177 static inline bool
hmdfs_need_rebuild_inode_stash_status(struct hmdfs_peer * conn,umode_t mode)2178 hmdfs_need_rebuild_inode_stash_status(struct hmdfs_peer *conn, umode_t mode)
2179 {
2180 	return hmdfs_is_stash_enabled(conn->sbi) &&
2181 	       READ_ONCE(conn->need_rebuild_stash_list) &&
2182 	       S_ISREG(mode);
2183 }
2184 
/*
 * Rebuild the stash status for a newly-instantiated remote inode.
 *
 * @conn->rebuild_inode_status_nr counts in-flight rebuilds; waiters on
 * @conn->rebuild_inode_status_wq (the online-restore path) block until it
 * drops to zero.  The smp_mb__after_atomic() below orders the counter
 * increment against the re-read of @conn->need_rebuild_stash_list,
 * pairing with the waiter's smp_mb__before_atomic(), so a rebuild that
 * sees the flag set is always visible to the waiter.
 */
void hmdfs_remote_init_stash_status(struct hmdfs_peer *conn,
				    struct inode *inode, umode_t mode)
{
	if (!hmdfs_need_rebuild_inode_stash_status(conn, mode))
		return;

	atomic_inc(&conn->rebuild_inode_status_nr);
	/*
	 * Use smp_mb__after_atomic() to ensure order between writing
	 * @conn->rebuild_inode_status_nr and reading
	 * @conn->need_rebuild_stash_list.
	 */
	smp_mb__after_atomic();
	if (READ_ONCE(conn->need_rebuild_stash_list))
		hmdfs_stash_rebuild_status(conn, inode);
	if (atomic_dec_and_test(&conn->rebuild_inode_status_nr))
		wake_up(&conn->rebuild_inode_status_wq);
}
2203 
/* Node-event callbacks that keep stash state in sync with peer lifecycle */
static struct hmdfs_node_cb_desc stash_cb[] = {
	{
		.evt = NODE_EVT_OFFLINE,
		.sync = true,
		.min_version = DFS_2_0,
		.fn = hmdfs_stash_offline_prepare,
	},
	{
		.evt = NODE_EVT_OFFLINE,
		.sync = false,
		.min_version = DFS_2_0,
		.fn = hmdfs_stash_offline_do_stash,
	},
	/* Don't know the peer version yet, so min_version is 0 */
	{
		.evt = NODE_EVT_ADD,
		.sync = true,
		.fn = hmdfs_stash_add_do_check,
	},
	{
		.evt = NODE_EVT_ONLINE,
		.sync = false,
		.min_version = DFS_2_0,
		.fn = hmdfs_stash_online_prepare,
	},
	{
		.evt = NODE_EVT_ONLINE,
		.sync = false,
		.min_version = DFS_2_0,
		.fn = hmdfs_stash_online_do_restore,
	},
	{
		.evt = NODE_EVT_DEL,
		.sync = true,
		.min_version = DFS_2_0,
		.fn = hmdfs_stash_del_do_cleanup,
	},
};
2242 
/* Register the stash node-event callbacks; runs once at module init. */
void __init hmdfs_stash_add_node_evt_cb(void)
{
	hmdfs_node_add_evt_cb(stash_cb, ARRAY_SIZE(stash_cb));
}
2247 
2248