1  /*
2   *  pNFS functions to call and manage layout drivers.
3   *
4   *  Copyright (c) 2002 [year of first publication]
5   *  The Regents of the University of Michigan
6   *  All Rights Reserved
7   *
8   *  Dean Hildebrand <dhildebz@umich.edu>
9   *
10   *  Permission is granted to use, copy, create derivative works, and
11   *  redistribute this software and such derivative works for any purpose,
12   *  so long as the name of the University of Michigan is not used in
13   *  any advertising or publicity pertaining to the use or distribution
14   *  of this software without specific, written prior authorization. If
15   *  the above copyright notice or any other identification of the
16   *  University of Michigan is included in any copy of any portion of
17   *  this software, then the disclaimer below must also be included.
18   *
19   *  This software is provided as is, without representation or warranty
20   *  of any kind either express or implied, including without limitation
21   *  the implied warranties of merchantability, fitness for a particular
22   *  purpose, or noninfringement.  The Regents of the University of
23   *  Michigan shall not be liable for any damages, including special,
24   *  indirect, incidental, or consequential damages, with respect to any
25   *  claim arising out of or in connection with the use of the software,
26   *  even if it has been or is hereafter advised of the possibility of
27   *  such damages.
28   */
29  
30  #include <linux/nfs_fs.h>
31  #include <linux/nfs_page.h>
32  #include <linux/module.h>
33  #include <linux/sort.h>
34  #include "internal.h"
35  #include "pnfs.h"
36  #include "iostat.h"
37  #include "nfs4trace.h"
38  #include "delegation.h"
39  #include "nfs42.h"
40  
41  #define NFSDBG_FACILITY		NFSDBG_PNFS
42  #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
43  
44  /* Locking:
45   *
46   * pnfs_spinlock:
47   *      protects pnfs_modules_tbl.
48   */
49  static DEFINE_SPINLOCK(pnfs_spinlock);
50  
51  /*
52   * pnfs_modules_tbl holds all pnfs modules
53   */
54  static LIST_HEAD(pnfs_modules_tbl);
55  
56  static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
57  
58  /* Return the registered pnfs layout driver module matching given id */
59  static struct pnfs_layoutdriver_type *
60  find_pnfs_driver_locked(u32 id)
61  {
62  	struct pnfs_layoutdriver_type *local;
63  
64  	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
65  		if (local->id == id)
66  			goto out;
67  	local = NULL;
68  out:
69  	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
70  	return local;
71  }
72  
73  static struct pnfs_layoutdriver_type *
74  find_pnfs_driver(u32 id)
75  {
76  	struct pnfs_layoutdriver_type *local;
77  
78  	spin_lock(&pnfs_spinlock);
79  	local = find_pnfs_driver_locked(id);
80  	if (local != NULL && !try_module_get(local->owner)) {
81  		dprintk("%s: Could not grab reference on module\n", __func__);
82  		local = NULL;
83  	}
84  	spin_unlock(&pnfs_spinlock);
85  	return local;
86  }
87  
88  void
89  unset_pnfs_layoutdriver(struct nfs_server *nfss)
90  {
91  	if (nfss->pnfs_curr_ld) {
92  		if (nfss->pnfs_curr_ld->clear_layoutdriver)
93  			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
94  		/* Decrement the MDS count. Purge the deviceid cache if zero */
95  		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
96  			nfs4_deviceid_purge_client(nfss->nfs_client);
97  		module_put(nfss->pnfs_curr_ld->owner);
98  	}
99  	nfss->pnfs_curr_ld = NULL;
100  }
101  
102  /*
103   * When the server sends a list of layout types, we choose one in the order
104   * given in the list below.
105   *
106   * FIXME: should this list be configurable in some fashion? module param?
107   * 	  mount option? something else?
108   */
109  static const u32 ld_prefs[] = {
110  	LAYOUT_SCSI,
111  	LAYOUT_BLOCK_VOLUME,
112  	LAYOUT_OSD2_OBJECTS,
113  	LAYOUT_FLEX_FILES,
114  	LAYOUT_NFSV4_1_FILES,
115  	0
116  };
117  
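/*
 * sort() comparator: rank layout types by their position in ld_prefs[],
 * so more preferred types move to the front of fsinfo->layouttype[].
 * Types not listed in ld_prefs[] compare equal.
 */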
118  static int
119  ld_cmp(const void *e1, const void *e2)
120  {
121  	u32 ld1 = *((u32 *)e1);
122  	u32 ld2 = *((u32 *)e2);
123  	int i;
124  
125  	for (i = 0; ld_prefs[i] != 0; i++) {
126  		if (ld1 == ld_prefs[i])
127  			return -1;
128  
129  		if (ld2 == ld_prefs[i])
130  			return 1;
131  	}
132  	return 0;
133  }
134  
135  /*
136   * Try to set the server's pnfs module to the pnfs layout type specified by id.
137   * Currently only one pNFS layout driver per filesystem is supported.
138   *
139   * @ids array of layout types supported by MDS.
140   */
141  void
142  set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
143  		      struct nfs_fsinfo *fsinfo)
144  {
145  	struct pnfs_layoutdriver_type *ld_type = NULL;
146  	u32 id;
147  	int i;
148  
149  	if (fsinfo->nlayouttypes == 0)
150  		goto out_no_driver;
151  	if (!(server->nfs_client->cl_exchange_flags &
152  		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
153  		printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
154  			__func__, server->nfs_client->cl_exchange_flags);
155  		goto out_no_driver;
156  	}
157  
158  	sort(fsinfo->layouttype, fsinfo->nlayouttypes,
159  		sizeof(*fsinfo->layouttype), ld_cmp, NULL);
160  
161  	for (i = 0; i < fsinfo->nlayouttypes; i++) {
162  		id = fsinfo->layouttype[i];
163  		ld_type = find_pnfs_driver(id);
164  		if (!ld_type) {
165  			request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
166  					id);
167  			ld_type = find_pnfs_driver(id);
168  		}
169  		if (ld_type)
170  			break;
171  	}
172  
173  	if (!ld_type) {
174  		dprintk("%s: No pNFS module found!\n", __func__);
175  		goto out_no_driver;
176  	}
177  
178  	server->pnfs_curr_ld = ld_type;
179  	if (ld_type->set_layoutdriver
180  	    && ld_type->set_layoutdriver(server, mntfh)) {
181  		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
182  			"driver %u.\n", __func__, id);
183  		module_put(ld_type->owner);
184  		goto out_no_driver;
185  	}
186  	/* Bump the MDS count */
187  	atomic_inc(&server->nfs_client->cl_mds_count);
188  
189  	dprintk("%s: pNFS module for %u set\n", __func__, id);
190  	return;
191  
192  out_no_driver:
193  	dprintk("%s: Using NFSv4 I/O\n", __func__);
194  	server->pnfs_curr_ld = NULL;
195  }
196  
197  int
198  pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
199  {
200  	int status = -EINVAL;
201  	struct pnfs_layoutdriver_type *tmp;
202  
203  	if (ld_type->id == 0) {
204  		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
205  		return status;
206  	}
207  	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
208  		printk(KERN_ERR "NFS: %s Layout driver must provide "
209  		       "alloc_lseg and free_lseg.\n", __func__);
210  		return status;
211  	}
212  
213  	spin_lock(&pnfs_spinlock);
214  	tmp = find_pnfs_driver_locked(ld_type->id);
215  	if (!tmp) {
216  		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
217  		status = 0;
218  		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
219  			ld_type->name);
220  	} else {
221  		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
222  			__func__, ld_type->id);
223  	}
224  	spin_unlock(&pnfs_spinlock);
225  
226  	return status;
227  }
228  EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
229  
230  void
231  pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
232  {
233  	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
234  	spin_lock(&pnfs_spinlock);
235  	list_del(&ld_type->pnfs_tblid);
236  	spin_unlock(&pnfs_spinlock);
237  }
238  EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
239  
240  /*
241   * pNFS client layout cache
242   */
243  
244  /* Need to hold i_lock if caller does not already hold reference */
245  void
246  pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
247  {
248  	atomic_inc(&lo->plh_refcount);
249  }
250  
251  static struct pnfs_layout_hdr *
252  pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
253  {
254  	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
255  	return ld->alloc_layout_hdr(ino, gfp_flags);
256  }
257  
258  static void
259  pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
260  {
261  	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
262  	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
263  
264  	if (!list_empty(&lo->plh_layouts)) {
265  		struct nfs_client *clp = server->nfs_client;
266  
267  		spin_lock(&clp->cl_lock);
268  		list_del_init(&lo->plh_layouts);
269  		spin_unlock(&clp->cl_lock);
270  	}
271  	put_rpccred(lo->plh_lc_cred);
272  	return ld->free_layout_hdr(lo);
273  }
274  
275  static void
276  pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
277  {
278  	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
279  	dprintk("%s: freeing layout cache %p\n", __func__, lo);
280  	nfsi->layout = NULL;
281  	/* Reset MDS Threshold I/O counters */
282  	nfsi->write_io = 0;
283  	nfsi->read_io = 0;
284  }
285  
286  void
287  pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
288  {
289  	struct inode *inode = lo->plh_inode;
290  
291  	pnfs_layoutreturn_before_put_layout_hdr(lo);
292  
293  	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
294  		if (!list_empty(&lo->plh_segs))
295  			WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
296  		pnfs_detach_layout_hdr(lo);
297  		spin_unlock(&inode->i_lock);
298  		pnfs_free_layout_hdr(lo);
299  	}
300  }
301  
302  static void
303  pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
304  {
305  	lo->plh_return_iomode = 0;
306  	lo->plh_return_seq = 0;
307  	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
308  }
309  
310  /*
311   * Mark a pnfs_layout_hdr and all associated layout segments as invalid
312   *
313   * In order to continue using the pnfs_layout_hdr, a full recovery
314   * is required.
315   * Note that caller must hold inode->i_lock.
316   */
317  int
318  pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
319  		struct list_head *lseg_list)
320  {
321  	struct pnfs_layout_range range = {
322  		.iomode = IOMODE_ANY,
323  		.offset = 0,
324  		.length = NFS4_MAX_UINT64,
325  	};
326  
327  	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
328  	pnfs_clear_layoutreturn_info(lo);
329  	return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
330  }
331  
332  static int
333  pnfs_iomode_to_fail_bit(u32 iomode)
334  {
335  	return iomode == IOMODE_RW ?
336  		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
337  }
338  
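/*
 * Setting a fail bit pins the layout header with an extra reference;
 * pnfs_layout_clear_fail_bit() drops that reference when the bit is
 * cleared again.
 */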
339  static void
340  pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
341  {
342  	lo->plh_retry_timestamp = jiffies;
343  	if (!test_and_set_bit(fail_bit, &lo->plh_flags))
344  		atomic_inc(&lo->plh_refcount);
345  }
346  
347  static void
348  pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
349  {
350  	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
351  		atomic_dec(&lo->plh_refcount);
352  }
353  
354  static void
355  pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
356  {
357  	struct inode *inode = lo->plh_inode;
358  	struct pnfs_layout_range range = {
359  		.iomode = iomode,
360  		.offset = 0,
361  		.length = NFS4_MAX_UINT64,
362  	};
363  	LIST_HEAD(head);
364  
365  	spin_lock(&inode->i_lock);
366  	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
367  	pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
368  	spin_unlock(&inode->i_lock);
369  	pnfs_free_lseg_list(&head);
370  	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
371  			iomode == IOMODE_RW ?  "RW" : "READ");
372  }
373  
374  static bool
375  pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
376  {
377  	unsigned long start, end;
378  	int fail_bit = pnfs_iomode_to_fail_bit(iomode);
379  
380  	if (test_bit(fail_bit, &lo->plh_flags) == 0)
381  		return false;
382  	end = jiffies;
383  	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
384  	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
385  		/* It is time to retry the failed layoutgets */
386  		pnfs_layout_clear_fail_bit(lo, fail_bit);
387  		return false;
388  	}
389  	return true;
390  }
391  
392  static void
393  pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
394  		const struct pnfs_layout_range *range,
395  		const nfs4_stateid *stateid)
396  {
397  	INIT_LIST_HEAD(&lseg->pls_list);
398  	INIT_LIST_HEAD(&lseg->pls_lc_list);
399  	atomic_set(&lseg->pls_refcount, 1);
400  	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
401  	lseg->pls_layout = lo;
402  	lseg->pls_range = *range;
403  	lseg->pls_seq = be32_to_cpu(stateid->seqid);
404  }
405  
406  static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
407  {
408  	struct inode *ino = lseg->pls_layout->plh_inode;
409  
410  	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
411  }
412  
413  static void
414  pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
415  		struct pnfs_layout_segment *lseg)
416  {
417  	struct inode *inode = lo->plh_inode;
418  
419  	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
420  	list_del_init(&lseg->pls_list);
421  	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
422  	atomic_dec(&lo->plh_refcount);
423  	if (list_empty(&lo->plh_segs) &&
424  	    !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
425  	    !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
426  		if (atomic_read(&lo->plh_outstanding) == 0)
427  			set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
428  		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
429  	}
430  	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
431  }
432  
433  void
434  pnfs_put_lseg(struct pnfs_layout_segment *lseg)
435  {
436  	struct pnfs_layout_hdr *lo;
437  	struct inode *inode;
438  
439  	if (!lseg)
440  		return;
441  
442  	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
443  		atomic_read(&lseg->pls_refcount),
444  		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
445  
446  	lo = lseg->pls_layout;
447  	inode = lo->plh_inode;
448  
449  	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
450  		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
451  			spin_unlock(&inode->i_lock);
452  			return;
453  		}
454  		pnfs_get_layout_hdr(lo);
455  		pnfs_layout_remove_lseg(lo, lseg);
456  		spin_unlock(&inode->i_lock);
457  		pnfs_free_lseg(lseg);
458  		pnfs_put_layout_hdr(lo);
459  	}
460  }
461  EXPORT_SYMBOL_GPL(pnfs_put_lseg);
462  
463  static void pnfs_free_lseg_async_work(struct work_struct *work)
464  {
465  	struct pnfs_layout_segment *lseg;
466  	struct pnfs_layout_hdr *lo;
467  
468  	lseg = container_of(work, struct pnfs_layout_segment, pls_work);
469  	lo = lseg->pls_layout;
470  
471  	pnfs_free_lseg(lseg);
472  	pnfs_put_layout_hdr(lo);
473  }
474  
475  static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
476  {
477  	INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
478  	schedule_work(&lseg->pls_work);
479  }
480  
481  void
482  pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
483  {
484  	if (!lseg)
485  		return;
486  
487  	assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
488  
489  	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
490  		atomic_read(&lseg->pls_refcount),
491  		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
492  	if (atomic_dec_and_test(&lseg->pls_refcount)) {
493  		struct pnfs_layout_hdr *lo = lseg->pls_layout;
494  		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
495  			return;
496  		pnfs_get_layout_hdr(lo);
497  		pnfs_layout_remove_lseg(lo, lseg);
498  		pnfs_free_lseg_async(lseg);
499  	}
500  }
501  EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
502  
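/*
 * Compute the end of a byte range, clamping to NFS4_MAX_UINT64 when
 * start + len overflows (e.g. for "length = all ones" layouts).
 */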
503  static u64
504  end_offset(u64 start, u64 len)
505  {
506  	u64 end;
507  
508  	end = start + len;
509  	return end >= start ? end : NFS4_MAX_UINT64;
510  }
511  
512  /*
513   * is l2 fully contained in l1?
514   *   start1                             end1
515   *   [----------------------------------)
516   *           start2           end2
517   *           [----------------)
518   */
519  static bool
520  pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
521  		 const struct pnfs_layout_range *l2)
522  {
523  	u64 start1 = l1->offset;
524  	u64 end1 = end_offset(start1, l1->length);
525  	u64 start2 = l2->offset;
526  	u64 end2 = end_offset(start2, l2->length);
527  
528  	return (start1 <= start2) && (end1 >= end2);
529  }
530  
531  /*
532   * are l1 and l2 intersecting?
533   *   start1                             end1
534   *   [----------------------------------)
535   *                              start2           end2
536   *                              [----------------)
537   */
538  static bool
539  pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
540  		    const struct pnfs_layout_range *l2)
541  {
542  	u64 start1 = l1->offset;
543  	u64 end1 = end_offset(start1, l1->length);
544  	u64 start2 = l2->offset;
545  	u64 end2 = end_offset(start2, l2->length);
546  
547  	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
548  	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
549  }
550  
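/*
 * Drop one reference to lseg; if that was the last reference, unlink it
 * from its layout and queue it on tmp_list for the caller to free.
 */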
551  static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
552  		struct list_head *tmp_list)
553  {
554  	if (!atomic_dec_and_test(&lseg->pls_refcount))
555  		return false;
556  	pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
557  	list_add(&lseg->pls_list, tmp_list);
558  	return true;
559  }
560  
561  /* Returns 1 if lseg is removed from list, 0 otherwise */
562  static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
563  			     struct list_head *tmp_list)
564  {
565  	int rv = 0;
566  
567  	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
568  		/* Remove the reference keeping the lseg in the
569  		 * list.  It will now be removed when all
570  		 * outstanding io is finished.
571  		 */
572  		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
573  			atomic_read(&lseg->pls_refcount));
574  		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
575  			rv = 1;
576  	}
577  	return rv;
578  }
579  
580  /*
581   * Compare 2 layout stateid sequence ids, to see which is newer,
582   * taking into account wraparound issues.
583   */
584  static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
585  {
586  	return (s32)(s1 - s2) > 0;
587  }
588  
589  static bool
590  pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
591  		 const struct pnfs_layout_range *recall_range)
592  {
593  	return (recall_range->iomode == IOMODE_ANY ||
594  		lseg_range->iomode == recall_range->iomode) &&
595  	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
596  }
597  
598  static bool
599  pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
600  		const struct pnfs_layout_range *recall_range,
601  		u32 seq)
602  {
603  	if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
604  		return false;
605  	if (recall_range == NULL)
606  		return true;
607  	return pnfs_should_free_range(&lseg->pls_range, recall_range);
608  }
609  
610  /**
611   * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
612   * @lo: layout header containing the lsegs
613   * @tmp_list: list head where doomed lsegs should go
614   * @recall_range: optional recall range argument to match (may be NULL)
615   * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
616   *
617   * Walk the list of lsegs in the layout header, and tear down any that should
618   * be destroyed. If "recall_range" is specified then the segment must match
619   * that range. If "seq" is non-zero, then only match segments that were handed
620   * out at or before that sequence.
621   *
622   * Returns number of matching invalid lsegs remaining in list after scanning
623   * it and purging them.
624   */
625  int
626  pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
627  			    struct list_head *tmp_list,
628  			    const struct pnfs_layout_range *recall_range,
629  			    u32 seq)
630  {
631  	struct pnfs_layout_segment *lseg, *next;
632  	int remaining = 0;
633  
634  	dprintk("%s:Begin lo %p\n", __func__, lo);
635  
636  	if (list_empty(&lo->plh_segs))
637  		return 0;
638  	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
639  		if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
640  			dprintk("%s: freeing lseg %p iomode %d seq %u "
641  				"offset %llu length %llu\n", __func__,
642  				lseg, lseg->pls_range.iomode, lseg->pls_seq,
643  				lseg->pls_range.offset, lseg->pls_range.length);
644  			if (!mark_lseg_invalid(lseg, tmp_list))
645  				remaining++;
646  		}
647  	dprintk("%s:Return %i\n", __func__, remaining);
648  	return remaining;
649  }
650  
651  /* note free_me must contain lsegs from a single layout_hdr */
652  void
653  pnfs_free_lseg_list(struct list_head *free_me)
654  {
655  	struct pnfs_layout_segment *lseg, *tmp;
656  
657  	if (list_empty(free_me))
658  		return;
659  
660  	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
661  		list_del(&lseg->pls_list);
662  		pnfs_free_lseg(lseg);
663  	}
664  }
665  
666  void
667  pnfs_destroy_layout(struct nfs_inode *nfsi)
668  {
669  	struct pnfs_layout_hdr *lo;
670  	LIST_HEAD(tmp_list);
671  
672  	spin_lock(&nfsi->vfs_inode.i_lock);
673  	lo = nfsi->layout;
674  	if (lo) {
675  		pnfs_get_layout_hdr(lo);
676  		pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
677  		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
678  		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
679  		spin_unlock(&nfsi->vfs_inode.i_lock);
680  		pnfs_free_lseg_list(&tmp_list);
681  		pnfs_put_layout_hdr(lo);
682  	} else
683  		spin_unlock(&nfsi->vfs_inode.i_lock);
684  }
685  EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
686  
687  static bool
688  pnfs_layout_add_bulk_destroy_list(struct inode *inode,
689  		struct list_head *layout_list)
690  {
691  	struct pnfs_layout_hdr *lo;
692  	bool ret = false;
693  
694  	spin_lock(&inode->i_lock);
695  	lo = NFS_I(inode)->layout;
696  	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
697  		pnfs_get_layout_hdr(lo);
698  		list_add(&lo->plh_bulk_destroy, layout_list);
699  		ret = true;
700  	}
701  	spin_unlock(&inode->i_lock);
702  	return ret;
703  }
704  
705  /* Caller must hold rcu_read_lock and clp->cl_lock */
706  static int
707  pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
708  		struct nfs_server *server,
709  		struct list_head *layout_list)
710  {
711  	struct pnfs_layout_hdr *lo, *next;
712  	struct inode *inode;
713  
714  	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
715  		inode = igrab(lo->plh_inode);
716  		if (inode == NULL)
717  			continue;
718  		list_del_init(&lo->plh_layouts);
719  		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
720  			continue;
721  		rcu_read_unlock();
722  		spin_unlock(&clp->cl_lock);
723  		iput(inode);
724  		spin_lock(&clp->cl_lock);
725  		rcu_read_lock();
726  		return -EAGAIN;
727  	}
728  	return 0;
729  }
730  
731  static int
732  pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
733  		bool is_bulk_recall)
734  {
735  	struct pnfs_layout_hdr *lo;
736  	struct inode *inode;
737  	LIST_HEAD(lseg_list);
738  	int ret = 0;
739  
740  	while (!list_empty(layout_list)) {
741  		lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
742  				plh_bulk_destroy);
743  		dprintk("%s freeing layout for inode %lu\n", __func__,
744  			lo->plh_inode->i_ino);
745  		inode = lo->plh_inode;
746  
747  		pnfs_layoutcommit_inode(inode, false);
748  
749  		spin_lock(&inode->i_lock);
750  		list_del_init(&lo->plh_bulk_destroy);
751  		if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
752  			if (is_bulk_recall)
753  				set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
754  			ret = -EAGAIN;
755  		}
756  		spin_unlock(&inode->i_lock);
757  		pnfs_free_lseg_list(&lseg_list);
758  		/* Free all lsegs that are attached to commit buckets */
759  		nfs_commit_inode(inode, 0);
760  		pnfs_put_layout_hdr(lo);
761  		iput(inode);
762  	}
763  	return ret;
764  }
765  
766  int
767  pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
768  		struct nfs_fsid *fsid,
769  		bool is_recall)
770  {
771  	struct nfs_server *server;
772  	LIST_HEAD(layout_list);
773  
774  	spin_lock(&clp->cl_lock);
775  	rcu_read_lock();
776  restart:
777  	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
778  		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
779  			continue;
780  		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
781  				server,
782  				&layout_list) != 0)
783  			goto restart;
784  	}
785  	rcu_read_unlock();
786  	spin_unlock(&clp->cl_lock);
787  
788  	if (list_empty(&layout_list))
789  		return 0;
790  	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
791  }
792  
793  int
794  pnfs_destroy_layouts_byclid(struct nfs_client *clp,
795  		bool is_recall)
796  {
797  	struct nfs_server *server;
798  	LIST_HEAD(layout_list);
799  
800  	spin_lock(&clp->cl_lock);
801  	rcu_read_lock();
802  restart:
803  	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
804  		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
805  					server,
806  					&layout_list) != 0)
807  			goto restart;
808  	}
809  	rcu_read_unlock();
810  	spin_unlock(&clp->cl_lock);
811  
812  	if (list_empty(&layout_list))
813  		return 0;
814  	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
815  }
816  
817  /*
818   * Called by the state manager to remove all layouts established under an
819   * expired lease.
820   */
821  void
822  pnfs_destroy_all_layouts(struct nfs_client *clp)
823  {
824  	nfs4_deviceid_mark_client_invalid(clp);
825  	nfs4_deviceid_purge_client(clp);
826  
827  	pnfs_destroy_layouts_byclid(clp, false);
828  }
829  
830  /* update lo->plh_stateid with new if is more recent */
831  void
832  pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
833  			bool update_barrier)
834  {
835  	u32 oldseq, newseq, new_barrier = 0;
836  
837  	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
838  	newseq = be32_to_cpu(new->seqid);
839  
840  	if (!pnfs_layout_is_valid(lo)) {
841  		nfs4_stateid_copy(&lo->plh_stateid, new);
842  		lo->plh_barrier = newseq;
843  		pnfs_clear_layoutreturn_info(lo);
844  		clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
845  		return;
846  	}
847  	if (pnfs_seqid_is_newer(newseq, oldseq)) {
848  		nfs4_stateid_copy(&lo->plh_stateid, new);
849  		/*
850  		 * Because of wraparound, we want to keep the barrier
851  		 * "close" to the current seqids.
852  		 */
853  		new_barrier = newseq - atomic_read(&lo->plh_outstanding);
854  	}
855  	if (update_barrier)
856  		new_barrier = be32_to_cpu(new->seqid);
857  	else if (new_barrier == 0)
858  		return;
859  	if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
860  		lo->plh_barrier = new_barrier;
861  }
862  
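/*
 * Return true if stateid's seqid does not advance past the current
 * layout barrier, i.e. it is still blocked by lo->plh_barrier.
 */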
863  static bool
864  pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
865  		const nfs4_stateid *stateid)
866  {
867  	u32 seqid = be32_to_cpu(stateid->seqid);
868  
869  	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
870  }
871  
872  /* Return true if new layoutgets are currently blocked for this layout */
873  static bool
874  pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
875  {
876  	return lo->plh_block_lgets ||
877  		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
878  }
879  
880  /*
881   * Get layout from server.
882   *    for now, assume that whole file layouts are requested.
883   *    arg->offset: 0
884   *    arg->length: all ones
885   */
886  static struct pnfs_layout_segment *
887  send_layoutget(struct pnfs_layout_hdr *lo,
888  	   struct nfs_open_context *ctx,
889  	   nfs4_stateid *stateid,
890  	   const struct pnfs_layout_range *range,
891  	   long *timeout, gfp_t gfp_flags)
892  {
893  	struct inode *ino = lo->plh_inode;
894  	struct nfs_server *server = NFS_SERVER(ino);
895  	struct nfs4_layoutget *lgp;
896  	loff_t i_size;
897  
898  	dprintk("--> %s\n", __func__);
899  
900  	/*
901  	 * Synchronously retrieve layout information from server and
902  	 * store in lseg. If we race with a concurrent seqid morphing
903  	 * op, then re-send the LAYOUTGET.
904  	 */
905  	lgp = kzalloc(sizeof(*lgp), gfp_flags);
906  	if (lgp == NULL)
907  		return ERR_PTR(-ENOMEM);
908  
909  	i_size = i_size_read(ino);
910  
911  	lgp->args.minlength = PAGE_SIZE;
912  	if (lgp->args.minlength > range->length)
913  		lgp->args.minlength = range->length;
914  	if (range->iomode == IOMODE_READ) {
915  		if (range->offset >= i_size)
916  			lgp->args.minlength = 0;
917  		else if (i_size - range->offset < lgp->args.minlength)
918  			lgp->args.minlength = i_size - range->offset;
919  	}
920  	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
921  	pnfs_copy_range(&lgp->args.range, range);
922  	lgp->args.type = server->pnfs_curr_ld->id;
923  	lgp->args.inode = ino;
924  	lgp->args.ctx = get_nfs_open_context(ctx);
925  	nfs4_stateid_copy(&lgp->args.stateid, stateid);
926  	lgp->gfp_flags = gfp_flags;
927  	lgp->cred = lo->plh_lc_cred;
928  
929  	return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
930  }
931  
932  static void pnfs_clear_layoutcommit(struct inode *inode,
933  		struct list_head *head)
934  {
935  	struct nfs_inode *nfsi = NFS_I(inode);
936  	struct pnfs_layout_segment *lseg, *tmp;
937  
938  	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
939  		return;
940  	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
941  		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
942  			continue;
943  		pnfs_lseg_dec_and_remove_zero(lseg, head);
944  	}
945  }
946  
947  void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
948  {
949  	clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
950  	clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
951  	smp_mb__after_atomic();
952  	wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
953  	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
954  }
955  
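/*
 * Try to claim the right to send a LAYOUTRETURN for this layout.
 * Fails if layoutgets are outstanding or another return is already in
 * flight.  On success, takes a reference on the header and copies out
 * the stateid and iomode to be used for the return.
 */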
956  static bool
957  pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
958  		nfs4_stateid *stateid,
959  		enum pnfs_iomode *iomode)
960  {
961  	/* Serialise LAYOUTGET/LAYOUTRETURN */
962  	if (atomic_read(&lo->plh_outstanding) != 0)
963  		return false;
964  	if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
965  		return false;
966  	set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
967  	pnfs_get_layout_hdr(lo);
968  	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
969  		if (stateid != NULL) {
970  			nfs4_stateid_copy(stateid, &lo->plh_stateid);
971  			if (lo->plh_return_seq != 0)
972  				stateid->seqid = cpu_to_be32(lo->plh_return_seq);
973  		}
974  		if (iomode != NULL)
975  			*iomode = lo->plh_return_iomode;
976  		pnfs_clear_layoutreturn_info(lo);
977  		return true;
978  	}
979  	if (stateid != NULL)
980  		nfs4_stateid_copy(stateid, &lo->plh_stateid);
981  	if (iomode != NULL)
982  		*iomode = IOMODE_ANY;
983  	return true;
984  }
985  
986  static int
987  pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
988  		       enum pnfs_iomode iomode, bool sync)
989  {
990  	struct inode *ino = lo->plh_inode;
991  	struct nfs4_layoutreturn *lrp;
992  	int status = 0;
993  
994  	lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
995  	if (unlikely(lrp == NULL)) {
996  		status = -ENOMEM;
997  		spin_lock(&ino->i_lock);
998  		pnfs_clear_layoutreturn_waitbit(lo);
999  		spin_unlock(&ino->i_lock);
1000  		pnfs_put_layout_hdr(lo);
1001  		goto out;
1002  	}
1003  
1004  	nfs4_stateid_copy(&lrp->args.stateid, stateid);
1005  	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
1006  	lrp->args.inode = ino;
1007  	lrp->args.range.iomode = iomode;
1008  	lrp->args.range.offset = 0;
1009  	lrp->args.range.length = NFS4_MAX_UINT64;
1010  	lrp->args.layout = lo;
1011  	lrp->clp = NFS_SERVER(ino)->nfs_client;
1012  	lrp->cred = lo->plh_lc_cred;
1013  
1014  	status = nfs4_proc_layoutreturn(lrp, sync);
1015  out:
1016  	dprintk("<-- %s status: %d\n", __func__, status);
1017  	return status;
1018  }
1019  
1020  /* Return true if layoutreturn is needed */
1021  static bool
1022  pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
1023  {
1024  	struct pnfs_layout_segment *s;
1025  
1026  	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
1027  		return false;
1028  
1029  	/* Defer layoutreturn until all lsegs are done */
1030  	list_for_each_entry(s, &lo->plh_segs, pls_list) {
1031  		if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
1032  			return false;
1033  	}
1034  
1035  	return true;
1036  }
1037  
1038  static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
1039  {
1040  	struct inode *inode = lo->plh_inode;
1041  
1042  	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
1043  		return;
1044  	spin_lock(&inode->i_lock);
1045  	if (pnfs_layout_need_return(lo)) {
1046  		nfs4_stateid stateid;
1047  		enum pnfs_iomode iomode;
1048  		bool send;
1049  
1050  		send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
1051  		spin_unlock(&inode->i_lock);
1052  		if (send) {
1053  			/* Send an async layoutreturn so we don't deadlock */
1054  			pnfs_send_layoutreturn(lo, &stateid, iomode, false);
1055  		}
1056  	} else
1057  		spin_unlock(&inode->i_lock);
1058  }
1059  
1060  /*
1061   * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
1062   * when the layout segment list is empty.
1063   *
1064   * Note that a pnfs_layout_hdr can exist with an empty layout segment
1065   * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
1066   * deviceid is marked invalid.
1067   */
1068  int
1069  _pnfs_return_layout(struct inode *ino)
1070  {
1071  	struct pnfs_layout_hdr *lo = NULL;
1072  	struct nfs_inode *nfsi = NFS_I(ino);
1073  	LIST_HEAD(tmp_list);
1074  	nfs4_stateid stateid;
1075  	int status = 0, empty;
1076  	bool send;
1077  
1078  	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
1079  
1080  	spin_lock(&ino->i_lock);
1081  	lo = nfsi->layout;
1082  	if (!lo) {
1083  		spin_unlock(&ino->i_lock);
1084  		dprintk("NFS: %s no layout to return\n", __func__);
1085  		goto out;
1086  	}
1087  	/* Reference matched in nfs4_layoutreturn_release */
1088  	pnfs_get_layout_hdr(lo);
1089  	empty = list_empty(&lo->plh_segs);
1090  	pnfs_clear_layoutcommit(ino, &tmp_list);
1091  	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
1092  
1093  	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
1094  		struct pnfs_layout_range range = {
1095  			.iomode		= IOMODE_ANY,
1096  			.offset		= 0,
1097  			.length		= NFS4_MAX_UINT64,
1098  		};
1099  		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
1100  	}
1101  
1102  	/* Don't send a LAYOUTRETURN if list was initially empty */
1103  	if (empty) {
1104  		spin_unlock(&ino->i_lock);
1105  		dprintk("NFS: %s no layout segments to return\n", __func__);
1106  		goto out_put_layout_hdr;
1107  	}
1108  
1109  	send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
1110  	spin_unlock(&ino->i_lock);
1111  	pnfs_free_lseg_list(&tmp_list);
1112  	if (send)
1113  		status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
1114  out_put_layout_hdr:
1115  	pnfs_put_layout_hdr(lo);
1116  out:
1117  	dprintk("<-- %s status: %d\n", __func__, status);
1118  	return status;
1119  }
1120  EXPORT_SYMBOL_GPL(_pnfs_return_layout);
1121  
1122  int
1123  pnfs_commit_and_return_layout(struct inode *inode)
1124  {
1125  	struct pnfs_layout_hdr *lo;
1126  	int ret;
1127  
1128  	spin_lock(&inode->i_lock);
1129  	lo = NFS_I(inode)->layout;
1130  	if (lo == NULL) {
1131  		spin_unlock(&inode->i_lock);
1132  		return 0;
1133  	}
1134  	pnfs_get_layout_hdr(lo);
1135  	/* Block new layoutgets and read/write to ds */
1136  	lo->plh_block_lgets++;
1137  	spin_unlock(&inode->i_lock);
1138  	filemap_fdatawait(inode->i_mapping);
1139  	ret = pnfs_layoutcommit_inode(inode, true);
1140  	if (ret == 0)
1141  		ret = _pnfs_return_layout(inode);
1142  	spin_lock(&inode->i_lock);
1143  	lo->plh_block_lgets--;
1144  	spin_unlock(&inode->i_lock);
1145  	pnfs_put_layout_hdr(lo);
1146  	return ret;
1147  }
1148  
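/*
 * Decide whether the layout can be returned on close (ROC).  Returns
 * true only if there are ROC layout segments and no layoutreturn is
 * being sent, in which case the caller holds a layout header reference
 * that pnfs_roc_release() will drop.
 */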
1149  bool pnfs_roc(struct inode *ino)
1150  {
1151  	struct nfs_inode *nfsi = NFS_I(ino);
1152  	struct nfs_open_context *ctx;
1153  	struct nfs4_state *state;
1154  	struct pnfs_layout_hdr *lo;
1155  	struct pnfs_layout_segment *lseg, *tmp;
1156  	nfs4_stateid stateid;
1157  	LIST_HEAD(tmp_list);
1158  	bool found = false, layoutreturn = false, roc = false;
1159  
1160  	spin_lock(&ino->i_lock);
1161  	lo = nfsi->layout;
1162  	if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
1163  		goto out_noroc;
1164  
1165  	/* no roc if we hold a delegation */
1166  	if (nfs4_check_delegation(ino, FMODE_READ))
1167  		goto out_noroc;
1168  
1169  	list_for_each_entry(ctx, &nfsi->open_files, list) {
1170  		state = ctx->state;
1171  		/* Don't return layout if there is open file state */
1172  		if (state != NULL && state->state != 0)
1173  			goto out_noroc;
1174  	}
1175  
1176  	/* always send layoutreturn if being marked so */
1177  	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
1178  		layoutreturn = pnfs_prepare_layoutreturn(lo,
1179  				&stateid, NULL);
1180  
1181  	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
1182  		/* If we are sending layoutreturn, invalidate all valid lsegs */
1183  		if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
1184  			mark_lseg_invalid(lseg, &tmp_list);
1185  			found = true;
1186  		}
1187  	/* ROC in two conditions:
1188  	 * 1. there are ROC lsegs
1189  	 * 2. we don't send layoutreturn
1190  	 */
1191  	if (found && !layoutreturn) {
1192  		/* lo ref dropped in pnfs_roc_release() */
1193  		pnfs_get_layout_hdr(lo);
1194  		roc = true;
1195  	}
1196  
1197  out_noroc:
1198  	spin_unlock(&ino->i_lock);
1199  	pnfs_free_lseg_list(&tmp_list);
1200  	pnfs_layoutcommit_inode(ino, true);
1201  	if (layoutreturn)
1202  		pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
1203  	return roc;
1204  }
1205  
1206  void pnfs_roc_release(struct inode *ino)
1207  {
1208  	struct pnfs_layout_hdr *lo;
1209  
1210  	spin_lock(&ino->i_lock);
1211  	lo = NFS_I(ino)->layout;
1212  	pnfs_clear_layoutreturn_waitbit(lo);
1213  	if (atomic_dec_and_test(&lo->plh_refcount)) {
1214  		pnfs_detach_layout_hdr(lo);
1215  		spin_unlock(&ino->i_lock);
1216  		pnfs_free_layout_hdr(lo);
1217  	} else
1218  		spin_unlock(&ino->i_lock);
1219  }
1220  
1221  void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
1222  {
1223  	struct pnfs_layout_hdr *lo;
1224  
1225  	spin_lock(&ino->i_lock);
1226  	lo = NFS_I(ino)->layout;
1227  	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
1228  		lo->plh_barrier = barrier;
1229  	spin_unlock(&ino->i_lock);
1230  	trace_nfs4_layoutreturn_on_close(ino, 0);
1231  }
1232  
1233  void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
1234  {
1235  	struct nfs_inode *nfsi = NFS_I(ino);
1236  	struct pnfs_layout_hdr *lo;
1237  	u32 current_seqid;
1238  
1239  	spin_lock(&ino->i_lock);
1240  	lo = nfsi->layout;
1241  	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
1242  
1243  	/* Since close does not return a layout stateid for use as
1244  	 * a barrier, we choose the worst-case barrier.
1245  	 */
1246  	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1247  	spin_unlock(&ino->i_lock);
1248  }
1249  
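/*
 * If a layoutreturn is outstanding for this inode, put the task to
 * sleep on the roc_rpcwaitq and return true so the caller defers its
 * RPC until the return completes.
 */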
1250  bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
1251  {
1252  	struct nfs_inode *nfsi = NFS_I(ino);
1253  	struct pnfs_layout_hdr *lo;
1254  	bool sleep = false;
1255  
1256  	/* we might not have grabbed lo reference. so need to check under
1257  	 * i_lock */
1258  	spin_lock(&ino->i_lock);
1259  	lo = nfsi->layout;
1260  	if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1261  		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1262  		sleep = true;
1263  	}
1264  	spin_unlock(&ino->i_lock);
1265  	return sleep;
1266  }
1267  
1268  /*
1269   * Compare two layout segments for sorting into layout cache.
1270   * We want to preferentially return RW over RO layouts, so ensure those
1271   * are seen first.
1272   */
1273  static s64
1274  pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
1275  	   const struct pnfs_layout_range *l2)
1276  {
1277  	s64 d;
1278  
1279  	/* high offset > low offset */
1280  	d = l1->offset - l2->offset;
1281  	if (d)
1282  		return d;
1283  
1284  	/* short length > long length */
1285  	d = l2->length - l1->length;
1286  	if (d)
1287  		return d;
1288  
1289  	/* read > read/write */
1290  	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
1291  }
1292  
1293  static bool
1294  pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
1295  		const struct pnfs_layout_range *l2)
1296  {
1297  	return pnfs_lseg_range_cmp(l1, l2) > 0;
1298  }
1299  
1300  static bool
1301  pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
1302  		struct pnfs_layout_segment *old)
1303  {
1304  	return false;
1305  }
1306  
1307  void
1308  pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1309  		   struct pnfs_layout_segment *lseg,
1310  		   bool (*is_after)(const struct pnfs_layout_range *,
1311  			   const struct pnfs_layout_range *),
1312  		   bool (*do_merge)(struct pnfs_layout_segment *,
1313  			   struct pnfs_layout_segment *),
1314  		   struct list_head *free_me)
1315  {
1316  	struct pnfs_layout_segment *lp, *tmp;
1317  
1318  	dprintk("%s:Begin\n", __func__);
1319  
1320  	list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
1321  		if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
1322  			continue;
1323  		if (do_merge(lseg, lp)) {
1324  			mark_lseg_invalid(lp, free_me);
1325  			continue;
1326  		}
1327  		if (is_after(&lseg->pls_range, &lp->pls_range))
1328  			continue;
1329  		list_add_tail(&lseg->pls_list, &lp->pls_list);
1330  		dprintk("%s: inserted lseg %p "
1331  			"iomode %d offset %llu length %llu before "
1332  			"lp %p iomode %d offset %llu length %llu\n",
1333  			__func__, lseg, lseg->pls_range.iomode,
1334  			lseg->pls_range.offset, lseg->pls_range.length,
1335  			lp, lp->pls_range.iomode, lp->pls_range.offset,
1336  			lp->pls_range.length);
1337  		goto out;
1338  	}
1339  	list_add_tail(&lseg->pls_list, &lo->plh_segs);
1340  	dprintk("%s: inserted lseg %p "
1341  		"iomode %d offset %llu length %llu at tail\n",
1342  		__func__, lseg, lseg->pls_range.iomode,
1343  		lseg->pls_range.offset, lseg->pls_range.length);
1344  out:
1345  	pnfs_get_layout_hdr(lo);
1346  
1347  	dprintk("%s:Return\n", __func__);
1348  }
1349  EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
1350  
1351  static void
1352  pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1353  		   struct pnfs_layout_segment *lseg,
1354  		   struct list_head *free_me)
1355  {
1356  	struct inode *inode = lo->plh_inode;
1357  	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1358  
1359  	if (ld->add_lseg != NULL)
1360  		ld->add_lseg(lo, lseg, free_me);
1361  	else
1362  		pnfs_generic_layout_insert_lseg(lo, lseg,
1363  				pnfs_lseg_range_is_after,
1364  				pnfs_lseg_no_merge,
1365  				free_me);
1366  }
1367  
1368  static struct pnfs_layout_hdr *
1369  alloc_init_layout_hdr(struct inode *ino,
1370  		      struct nfs_open_context *ctx,
1371  		      gfp_t gfp_flags)
1372  {
1373  	struct pnfs_layout_hdr *lo;
1374  
1375  	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
1376  	if (!lo)
1377  		return NULL;
1378  	atomic_set(&lo->plh_refcount, 1);
1379  	INIT_LIST_HEAD(&lo->plh_layouts);
1380  	INIT_LIST_HEAD(&lo->plh_segs);
1381  	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
1382  	lo->plh_inode = ino;
1383  	lo->plh_lc_cred = get_rpccred(ctx->cred);
1384  	lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
1385  	return lo;
1386  }
1387  
1388  static struct pnfs_layout_hdr *
1389  pnfs_find_alloc_layout(struct inode *ino,
1390  		       struct nfs_open_context *ctx,
1391  		       gfp_t gfp_flags)
1392  	__releases(&ino->i_lock)
1393  	__acquires(&ino->i_lock)
1394  {
1395  	struct nfs_inode *nfsi = NFS_I(ino);
1396  	struct pnfs_layout_hdr *new = NULL;
1397  
1398  	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
1399  
1400  	if (nfsi->layout != NULL)
1401  		goto out_existing;
1402  	spin_unlock(&ino->i_lock);
1403  	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
1404  	spin_lock(&ino->i_lock);
1405  
1406  	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
1407  		nfsi->layout = new;
1408  		return new;
1409  	} else if (new != NULL)
1410  		pnfs_free_layout_hdr(new);
1411  out_existing:
1412  	pnfs_get_layout_hdr(nfsi->layout);
1413  	return nfsi->layout;
1414  }
1415  
1416  /*
1417   * iomode matching rules:
1418   * iomode	lseg	strict_iomode	match
1419   * (requested)	(cached)	(argument)
1420   * -----	-----	-------------	-----
1421   * ANY		READ	N/A    true
1422   * ANY		RW	N/A    true
1423   * RW		READ	N/A    false
1424   * RW		RW	N/A    true
1425   * READ		READ	N/A    true
1426   * READ		RW	true   false
1427   * READ		RW	false  true
1428   */
1429  static bool
1430  pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1431  		 const struct pnfs_layout_range *range,
1432  		 bool strict_iomode)
1433  {
1434  	struct pnfs_layout_range range1;
1435  
1436  	if ((range->iomode == IOMODE_RW &&
1437  	     ls_range->iomode != IOMODE_RW) ||
1438  	    (range->iomode != ls_range->iomode &&
1439  	     strict_iomode == true) ||
1440  	    !pnfs_lseg_range_intersecting(ls_range, range))
1441  		return 0;
1442  
1443  	/* range1 covers only the first byte in the range */
1444  	range1 = *range;
1445  	range1.length = 1;
1446  	return pnfs_lseg_range_contained(ls_range, &range1);
1447  }
1448  
1449  /*
1450   * lookup range in layout
1451   */
1452  static struct pnfs_layout_segment *
1453  pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1454  		struct pnfs_layout_range *range,
1455  		bool strict_iomode)
1456  {
1457  	struct pnfs_layout_segment *lseg, *ret = NULL;
1458  
1459  	dprintk("%s:Begin\n", __func__);
1460  
1461  	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1462  		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1463  		    !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1464  		    pnfs_lseg_range_match(&lseg->pls_range, range,
1465  					  strict_iomode)) {
1466  			ret = pnfs_get_lseg(lseg);
1467  			break;
1468  		}
1469  	}
1470  
1471  	dprintk("%s:Return lseg %p ref %d\n",
1472  		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
1473  	return ret;
1474  }
1475  
1476  /*
1477   * Use mdsthreshold hints set at each OPEN to determine if I/O should go
1478   * to the MDS or over pNFS
1479   *
1480   * The nfs_inode read_io and write_io fields are cumulative counters reset
1481   * when there are no layout segments. Note that in pnfs_update_layout iomode
1482   * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
1483   * WRITE request.
1484   *
1485   * A return of true means use MDS I/O.
1486   *
1487   * From rfc 5661:
1488   * If a file's size is smaller than the file size threshold, data accesses
1489   * SHOULD be sent to the metadata server.  If an I/O request has a length that
1490   * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
1491   * server.  If both file size and I/O size are provided, the client SHOULD
1492   * reach or exceed  both thresholds before sending its read or write
1493   * requests to the data server.
1494   */
1495  static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1496  				     struct inode *ino, int iomode)
1497  {
1498  	struct nfs4_threshold *t = ctx->mdsthreshold;
1499  	struct nfs_inode *nfsi = NFS_I(ino);
1500  	loff_t fsize = i_size_read(ino);
1501  	bool size = false, size_set = false, io = false, io_set = false, ret = false;
1502  
1503  	if (t == NULL)
1504  		return ret;
1505  
1506  	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
1507  		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
1508  
1509  	switch (iomode) {
1510  	case IOMODE_READ:
1511  		if (t->bm & THRESHOLD_RD) {
1512  			dprintk("%s fsize %llu\n", __func__, fsize);
1513  			size_set = true;
1514  			if (fsize < t->rd_sz)
1515  				size = true;
1516  		}
1517  		if (t->bm & THRESHOLD_RD_IO) {
1518  			dprintk("%s nfsi->read_io %llu\n", __func__,
1519  				nfsi->read_io);
1520  			io_set = true;
1521  			if (nfsi->read_io < t->rd_io_sz)
1522  				io = true;
1523  		}
1524  		break;
1525  	case IOMODE_RW:
1526  		if (t->bm & THRESHOLD_WR) {
1527  			dprintk("%s fsize %llu\n", __func__, fsize);
1528  			size_set = true;
1529  			if (fsize < t->wr_sz)
1530  				size = true;
1531  		}
1532  		if (t->bm & THRESHOLD_WR_IO) {
1533  			dprintk("%s nfsi->write_io %llu\n", __func__,
1534  				nfsi->write_io);
1535  			io_set = true;
1536  			if (nfsi->write_io < t->wr_io_sz)
1537  				io = true;
1538  		}
1539  		break;
1540  	}
1541  	if (size_set && io_set) {
1542  		if (size && io)
1543  			ret = true;
1544  	} else if (size || io)
1545  		ret = true;
1546  
1547  	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
1548  	return ret;
1549  }
1550  
1551  static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1552  {
1553  	/*
1554  	 * send layoutcommit as it can hold up layoutreturn due to lseg
1555  	 * reference
1556  	 */
1557  	pnfs_layoutcommit_inode(lo->plh_inode, false);
1558  	return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1559  				   nfs_wait_bit_killable,
1560  				   TASK_UNINTERRUPTIBLE);
1561  }
1562  
1563  static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1564  {
1565  	unsigned long *bitlock = &lo->plh_flags;
1566  
1567  	clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1568  	smp_mb__after_atomic();
1569  	wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1570  }
1571  
1572  /*
1573   * Layout segment is retrieved from the server if not cached.
1574   * The appropriate layout segment is referenced and returned to the caller.
1575   */
1576  struct pnfs_layout_segment *
1577  pnfs_update_layout(struct inode *ino,
1578  		   struct nfs_open_context *ctx,
1579  		   loff_t pos,
1580  		   u64 count,
1581  		   enum pnfs_iomode iomode,
1582  		   bool strict_iomode,
1583  		   gfp_t gfp_flags)
1584  {
1585  	struct pnfs_layout_range arg = {
1586  		.iomode = iomode,
1587  		.offset = pos,
1588  		.length = count,
1589  	};
1590  	unsigned pg_offset, seq;
1591  	struct nfs_server *server = NFS_SERVER(ino);
1592  	struct nfs_client *clp = server->nfs_client;
1593  	struct pnfs_layout_hdr *lo = NULL;
1594  	struct pnfs_layout_segment *lseg = NULL;
1595  	nfs4_stateid stateid;
1596  	long timeout = 0;
1597  	unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
1598  	bool first;
1599  
1600  	if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1601  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1602  				 PNFS_UPDATE_LAYOUT_NO_PNFS);
1603  		goto out;
1604  	}
1605  
1606  	if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
1607  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1608  				 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
1609  		goto out;
1610  	}
1611  
1612  	if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1613  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1614  				 PNFS_UPDATE_LAYOUT_MDSTHRESH);
1615  		goto out;
1616  	}
1617  
1618  lookup_again:
1619  	nfs4_client_recover_expired_lease(clp);
1620  	first = false;
1621  	spin_lock(&ino->i_lock);
1622  	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1623  	if (lo == NULL) {
1624  		spin_unlock(&ino->i_lock);
1625  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1626  				 PNFS_UPDATE_LAYOUT_NOMEM);
1627  		goto out;
1628  	}
1629  
1630  	/* Do we even need to bother with this? */
1631  	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1632  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1633  				 PNFS_UPDATE_LAYOUT_BULK_RECALL);
1634  		dprintk("%s matches recall, use MDS\n", __func__);
1635  		goto out_unlock;
1636  	}
1637  
1638  	/* if LAYOUTGET already failed once we don't try again */
1639  	if (pnfs_layout_io_test_failed(lo, iomode)) {
1640  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1641  				 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1642  		goto out_unlock;
1643  	}
1644  
1645  	lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
1646  	if (lseg) {
1647  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1648  				PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1649  		goto out_unlock;
1650  	}
1651  
1652  	if (!nfs4_valid_open_stateid(ctx->state)) {
1653  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1654  				PNFS_UPDATE_LAYOUT_INVALID_OPEN);
1655  		goto out_unlock;
1656  	}
1657  
1658  	/*
1659  	 * Choose a stateid for the LAYOUTGET. If we don't have a layout
1660  	 * stateid, or it has been invalidated, then we must use the open
1661  	 * stateid.
1662  	 */
1663  	if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
1664  
1665  		/*
1666  		 * The first layoutget for the file. Need to serialize per
1667  		 * RFC 5661 Errata 3208.
1668  		 */
1669  		if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1670  				     &lo->plh_flags)) {
1671  			spin_unlock(&ino->i_lock);
1672  			wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1673  				    TASK_UNINTERRUPTIBLE);
1674  			pnfs_put_layout_hdr(lo);
1675  			dprintk("%s retrying\n", __func__);
1676  			goto lookup_again;
1677  		}
1678  
1679  		first = true;
1680  		do {
1681  			seq = read_seqbegin(&ctx->state->seqlock);
1682  			nfs4_stateid_copy(&stateid, &ctx->state->stateid);
1683  		} while (read_seqretry(&ctx->state->seqlock, seq));
1684  	} else {
1685  		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
1686  	}
1687  
1688  	/*
1689  	 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1690  	 * for LAYOUTRETURN even if first is true.
1691  	 */
1692  	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1693  		spin_unlock(&ino->i_lock);
1694  		dprintk("%s wait for layoutreturn\n", __func__);
1695  		if (pnfs_prepare_to_retry_layoutget(lo)) {
1696  			if (first)
1697  				pnfs_clear_first_layoutget(lo);
1698  			pnfs_put_layout_hdr(lo);
1699  			dprintk("%s retrying\n", __func__);
1700  			trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1701  					lseg, PNFS_UPDATE_LAYOUT_RETRY);
1702  			goto lookup_again;
1703  		}
1704  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1705  				PNFS_UPDATE_LAYOUT_RETURN);
1706  		goto out_put_layout_hdr;
1707  	}
1708  
1709  	if (pnfs_layoutgets_blocked(lo)) {
1710  		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1711  				PNFS_UPDATE_LAYOUT_BLOCKED);
1712  		goto out_unlock;
1713  	}
1714  	atomic_inc(&lo->plh_outstanding);
1715  	spin_unlock(&ino->i_lock);
1716  
1717  	if (list_empty(&lo->plh_layouts)) {
1718  		/* The lo must be on the clp list if there is any
1719  		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1720  		 */
1721  		spin_lock(&clp->cl_lock);
1722  		if (list_empty(&lo->plh_layouts))
1723  			list_add_tail(&lo->plh_layouts, &server->layouts);
1724  		spin_unlock(&clp->cl_lock);
1725  	}
1726  
1727  	pg_offset = arg.offset & ~PAGE_MASK;
1728  	if (pg_offset) {
1729  		arg.offset -= pg_offset;
1730  		arg.length += pg_offset;
1731  	}
1732  	if (arg.length != NFS4_MAX_UINT64)
1733  		arg.length = PAGE_ALIGN(arg.length);
1734  
1735  	lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
1736  	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1737  				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1738  	atomic_dec(&lo->plh_outstanding);
1739  	if (IS_ERR(lseg)) {
1740  		switch(PTR_ERR(lseg)) {
1741  		case -EBUSY:
1742  			if (time_after(jiffies, giveup))
1743  				lseg = NULL;
1744  			break;
1745  		case -ERECALLCONFLICT:
1746  			/* Huh? We hold no layouts, how is there a recall? */
1747  			if (first) {
1748  				lseg = NULL;
1749  				break;
1750  			}
1751  			/* Destroy the existing layout and start over */
1752  			if (time_after(jiffies, giveup))
1753  				pnfs_destroy_layout(NFS_I(ino));
1754  			/* Fallthrough */
1755  		case -EAGAIN:
1756  			break;
1757  		default:
1758  			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
1759  				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1760  				lseg = NULL;
1761  			}
1762  			goto out_put_layout_hdr;
1763  		}
1764  		if (lseg) {
1765  			if (first)
1766  				pnfs_clear_first_layoutget(lo);
1767  			trace_pnfs_update_layout(ino, pos, count,
1768  				iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
1769  			pnfs_put_layout_hdr(lo);
1770  			goto lookup_again;
1771  		}
1772  	} else {
1773  		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1774  	}
1775  
1776  out_put_layout_hdr:
1777  	if (first)
1778  		pnfs_clear_first_layoutget(lo);
1779  	pnfs_put_layout_hdr(lo);
1780  out:
1781  	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
1782  			"(%s, offset: %llu, length: %llu)\n",
1783  			__func__, ino->i_sb->s_id,
1784  			(unsigned long long)NFS_FILEID(ino),
1785  			IS_ERR_OR_NULL(lseg) ? "not found" : "found",
1786  			iomode==IOMODE_RW ?  "read/write" : "read-only",
1787  			(unsigned long long)pos,
1788  			(unsigned long long)count);
1789  	return lseg;
1790  out_unlock:
1791  	spin_unlock(&ino->i_lock);
1792  	goto out_put_layout_hdr;
1793  }
1794  EXPORT_SYMBOL_GPL(pnfs_update_layout);
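
/*
 * Illustrative caller pattern (compare pnfs_generic_pg_init_read() below):
 * a NULL return means no usable layout segment, so I/O should go through
 * the MDS; an ERR_PTR() return is propagated as an error.
 *
 *	lseg = pnfs_update_layout(inode, ctx, offset, count,
 *				  IOMODE_READ, false, GFP_KERNEL);
 *	if (IS_ERR(lseg))
 *		return PTR_ERR(lseg);
 *	if (lseg == NULL)
 *		nfs_pageio_reset_read_mds(pgio);
 */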
1795  
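/*
 * Reject layout ranges with an unknown iomode, an all-ones offset, a zero
 * length, or an offset/length pair that would overflow NFS4_MAX_UINT64.
 */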
1796  static bool
1797  pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
1798  {
1799  	switch (range->iomode) {
1800  	case IOMODE_READ:
1801  	case IOMODE_RW:
1802  		break;
1803  	default:
1804  		return false;
1805  	}
1806  	if (range->offset == NFS4_MAX_UINT64)
1807  		return false;
1808  	if (range->length == 0)
1809  		return false;
1810  	if (range->length != NFS4_MAX_UINT64 &&
1811  	    range->length > NFS4_MAX_UINT64 - range->offset)
1812  		return false;
1813  	return true;
1814  }
1815  
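/*
 * Process a LAYOUTGET reply: let the layout driver decode the opaque
 * layout into an lseg, then, under i_lock, validate the reply's stateid
 * against the cached layout stateid (invalidating all old segments if an
 * entirely new stateid arrived) and insert the new lseg.  Replies that
 * race with blocked layoutgets or carry a stale stateid sequence are
 * forgotten and -EAGAIN is returned so the caller can retry.
 */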
1816  struct pnfs_layout_segment *
1817  pnfs_layout_process(struct nfs4_layoutget *lgp)
1818  {
1819  	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
1820  	struct nfs4_layoutget_res *res = &lgp->res;
1821  	struct pnfs_layout_segment *lseg;
1822  	struct inode *ino = lo->plh_inode;
1823  	LIST_HEAD(free_me);
1824  
1825  	if (!pnfs_sanity_check_layout_range(&res->range))
1826  		return ERR_PTR(-EINVAL);
1827  
1828  	/* Inject layout blob into I/O device driver */
1829  	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
1830  	if (IS_ERR_OR_NULL(lseg)) {
1831  		if (!lseg)
1832  			lseg = ERR_PTR(-ENOMEM);
1833  
1834  		dprintk("%s: Could not allocate layout: error %ld\n",
1835  		       __func__, PTR_ERR(lseg));
1836  		return lseg;
1837  	}
1838  
1839  	pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
1840  
1841  	spin_lock(&ino->i_lock);
1842  	if (pnfs_layoutgets_blocked(lo)) {
1843  		dprintk("%s forget reply due to state\n", __func__);
1844  		goto out_forget;
1845  	}
1846  
1847  	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1848  		/* existing state ID, make sure the sequence number matches. */
1849  		if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1850  			dprintk("%s forget reply due to sequence\n", __func__);
1851  			goto out_forget;
1852  		}
1853  		pnfs_set_layout_stateid(lo, &res->stateid, false);
1854  	} else {
1855  		/*
1856  		 * We got an entirely new state ID.  Mark all segments for the
1857  		 * inode invalid, and don't bother validating the stateid
1858  		 * sequence number.
1859  		 */
1860  		pnfs_mark_layout_stateid_invalid(lo, &free_me);
1861  
1862  		pnfs_set_layout_stateid(lo, &res->stateid, true);
1863  	}
1864  
1865  	pnfs_get_lseg(lseg);
1866  	pnfs_layout_insert_lseg(lo, lseg, &free_me);
1867  
1868  
1869  	if (res->return_on_close)
1870  		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
1871  
1872  	spin_unlock(&ino->i_lock);
1873  	pnfs_free_lseg_list(&free_me);
1874  	return lseg;
1875  
1876  out_forget:
1877  	spin_unlock(&ino->i_lock);
1878  	lseg->pls_layout = lo;
1879  	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
1880  	return ERR_PTR(-EAGAIN);
1881  }
1882  
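/*
 * Record that a layoutreturn is needed: remember the iomode to return
 * (degrading to IOMODE_ANY when different iomodes are mixed) and the
 * stateid sequence number, and set NFS_LAYOUT_RETURN_REQUESTED.
 */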
1883  static void
1884  pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
1885  			 u32 seq)
1886  {
1887  	if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
1888  		iomode = IOMODE_ANY;
1889  	lo->plh_return_iomode = iomode;
1890  	set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
1891  	if (seq != 0) {
1892  		WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
1893  		lo->plh_return_seq = seq;
1894  	}
1895  }
1896  
1897  /**
1898   * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
1899   * @lo: pointer to layout header
1900   * @tmp_list: list header to be used with pnfs_free_lseg_list()
1901   * @return_range: describe layout segment ranges to be returned
1902   *
1903   * This function is mainly intended for use by layoutrecall. It attempts
1904   * to free the layout segment immediately, or else to mark it for return
1905   * as soon as its reference count drops to zero.
1906   */
1907  int
1908  pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1909  				struct list_head *tmp_list,
1910  				const struct pnfs_layout_range *return_range,
1911  				u32 seq)
1912  {
1913  	struct pnfs_layout_segment *lseg, *next;
1914  	int remaining = 0;
1915  
1916  	dprintk("%s:Begin lo %p\n", __func__, lo);
1917  
1918  	if (list_empty(&lo->plh_segs))
1919  		return 0;
1920  
1921  	assert_spin_locked(&lo->plh_inode->i_lock);
1922  
1923  	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1924  		if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
1925  			dprintk("%s: marking lseg %p iomode %d "
1926  				"offset %llu length %llu\n", __func__,
1927  				lseg, lseg->pls_range.iomode,
1928  				lseg->pls_range.offset,
1929  				lseg->pls_range.length);
1930  			if (mark_lseg_invalid(lseg, tmp_list))
1931  				continue;
1932  			remaining++;
1933  			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1934  		}
1935  
1936  	if (remaining)
1937  		pnfs_set_plh_return_info(lo, return_range->iomode, seq);
1938  
1939  	return remaining;
1940  }
1941  
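/**
 * pnfs_error_mark_layout_for_return - mark a failed layout for return
 * @inode: inode owning the layout
 * @lseg: layout segment that hit the error
 *
 * Mark every segment with the same iomode as @lseg for return.  If none
 * of them are still in use, send the LAYOUTRETURN immediately; otherwise
 * kick off a commit so the remaining references are dropped and the
 * return is sent from pnfs_put_lseg().
 */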
1942  void pnfs_error_mark_layout_for_return(struct inode *inode,
1943  				       struct pnfs_layout_segment *lseg)
1944  {
1945  	struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1946  	struct pnfs_layout_range range = {
1947  		.iomode = lseg->pls_range.iomode,
1948  		.offset = 0,
1949  		.length = NFS4_MAX_UINT64,
1950  	};
1951  	LIST_HEAD(free_me);
1952  	bool return_now = false;
1953  
1954  	spin_lock(&inode->i_lock);
1955  	pnfs_set_plh_return_info(lo, range.iomode, 0);
1956  	/*
1957  	 * mark all matching lsegs so that we are sure to have no live
1958  	 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1959  	 * for how it works.
1960  	 */
1961  	if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
1962  		nfs4_stateid stateid;
1963  		enum pnfs_iomode iomode;
1964  
1965  		return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
1966  		spin_unlock(&inode->i_lock);
1967  		if (return_now)
1968  			pnfs_send_layoutreturn(lo, &stateid, iomode, false);
1969  	} else {
1970  		spin_unlock(&inode->i_lock);
1971  		nfs_commit_inode(inode, 0);
1972  	}
1973  	pnfs_free_lseg_list(&free_me);
1974  }
1975  EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1976  
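/*
 * Set up pgio->pg_lseg for a read: ask pnfs_update_layout() for a read
 * segment covering the request (the rest of the file, or what is left of
 * a direct I/O request).  Errors are stored in pgio->pg_error; with no
 * layout segment the read falls back to the MDS.
 */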
1977  void
1978  pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1979  {
1980  	u64 rd_size = req->wb_bytes;
1981  
1982  	if (pgio->pg_lseg == NULL) {
1983  		if (pgio->pg_dreq == NULL)
1984  			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1985  		else
1986  			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1987  
1988  		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1989  						   req->wb_context,
1990  						   req_offset(req),
1991  						   rd_size,
1992  						   IOMODE_READ,
1993  						   false,
1994  						   GFP_KERNEL);
1995  		if (IS_ERR(pgio->pg_lseg)) {
1996  			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
1997  			pgio->pg_lseg = NULL;
1998  			return;
1999  		}
2000  	}
2001  	/* If no lseg, fall back to read through mds */
2002  	if (pgio->pg_lseg == NULL)
2003  		nfs_pageio_reset_read_mds(pgio);
2004  
2005  }
2006  EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
2007  
2008  void
2009  pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
2010  			   struct nfs_page *req, u64 wb_size)
2011  {
2012  	if (pgio->pg_lseg == NULL) {
2013  		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2014  						   req->wb_context,
2015  						   req_offset(req),
2016  						   wb_size,
2017  						   IOMODE_RW,
2018  						   false,
2019  						   GFP_NOFS);
2020  		if (IS_ERR(pgio->pg_lseg)) {
2021  			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2022  			pgio->pg_lseg = NULL;
2023  			return;
2024  		}
2025  	}
2026  	/* If no lseg, fall back to write through mds */
2027  	if (pgio->pg_lseg == NULL)
2028  		nfs_pageio_reset_write_mds(pgio);
2029  }
2030  EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
2031  
2032  void
2033  pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
2034  {
2035  	if (desc->pg_lseg) {
2036  		pnfs_put_lseg(desc->pg_lseg);
2037  		desc->pg_lseg = NULL;
2038  	}
2039  }
2040  EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
2041  
2042  /*
2043   * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
2044   * of bytes (maximum @req->wb_bytes) that can be coalesced.
2045   */
2046  size_t
2047  pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
2048  		     struct nfs_page *prev, struct nfs_page *req)
2049  {
2050  	unsigned int size;
2051  	u64 seg_end, req_start, seg_left;
2052  
2053  	size = nfs_generic_pg_test(pgio, prev, req);
2054  	if (!size)
2055  		return 0;
2056  
2057  	/*
2058  	 * 'size' contains the number of bytes left in the current page (up
2059  	 * to the original size asked for in @req->wb_bytes).
2060  	 *
2061  	 * Calculate how many bytes are left in the layout segment
2062  	 * and if there are fewer bytes than 'size', return that instead.
2063  	 *
2064  	 * Please also note that 'end_offset' is actually the offset of the
2065  	 * first byte that lies outside the pnfs_layout_range. FIXME?
2066  	 *
2067  	 */
2068  	if (pgio->pg_lseg) {
2069  		seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
2070  				     pgio->pg_lseg->pls_range.length);
2071  		req_start = req_offset(req);
2072  		WARN_ON_ONCE(req_start >= seg_end);
2073  		/* start of request is past the last byte of this segment */
2074  		if (req_start >= seg_end) {
2075  			/* reference the new lseg */
2076  			if (pgio->pg_ops->pg_cleanup)
2077  				pgio->pg_ops->pg_cleanup(pgio);
2078  			if (pgio->pg_ops->pg_init)
2079  				pgio->pg_ops->pg_init(pgio, req);
2080  			return 0;
2081  		}
2082  
2083  		/* adjust 'size' iff there are fewer bytes left in the
2084  		 * segment than what nfs_generic_pg_test returned */
2085  		seg_left = seg_end - req_start;
2086  		if (seg_left < size)
2087  			size = (unsigned int)seg_left;
2088  	}
2089  
2090  	return size;
2091  }
2092  EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
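
/*
 * Worked example for pnfs_generic_pg_test() above, with illustrative
 * numbers: for a segment covering offset 0, length 1048576 and a request
 * starting at offset 1048064, seg_left = 1048576 - 1048064 = 512, so at
 * most 512 bytes are coalesced even if nfs_generic_pg_test() allowed more.
 */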
2093  
2094  int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
2095  {
2096  	struct nfs_pageio_descriptor pgio;
2097  
2098  	/* Resend all requests through the MDS */
2099  	nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
2100  			      hdr->completion_ops);
2101  	set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
2102  	return nfs_pageio_resend(&pgio, hdr);
2103  }
2104  EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
2105  
2106  static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
2107  {
2108  
2109  	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
2110  	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2111  	    PNFS_LAYOUTRET_ON_ERROR) {
2112  		pnfs_return_layout(hdr->inode);
2113  	}
2114  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2115  		hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
2116  }
2117  
2118  /*
2119   * Called by non rpc-based layout drivers
2120   */
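/*
 * A minimal completion-path sketch for such a driver (hypothetical local
 * variables, only the pnfs call is real):
 *
 *	hdr->res.count = bytes_written;
 *	hdr->pnfs_error = status;	(a negative errno, or 0 on success)
 *	pnfs_ld_write_done(hdr);
 */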
2121  void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
2122  {
2123  	if (likely(!hdr->pnfs_error)) {
2124  		pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
2125  				hdr->mds_offset + hdr->res.count);
2126  		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2127  	}
2128  	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
2129  	if (unlikely(hdr->pnfs_error))
2130  		pnfs_ld_handle_write_error(hdr);
2131  	hdr->mds_ops->rpc_release(hdr);
2132  }
2133  EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
2134  
2135  static void
2136  pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
2137  		struct nfs_pgio_header *hdr)
2138  {
2139  	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2140  
2141  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2142  		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2143  		nfs_pageio_reset_write_mds(desc);
2144  		mirror->pg_recoalesce = 1;
2145  	}
2146  	hdr->completion_ops->completion(hdr);
2147  }
2148  
2149  static enum pnfs_try_status
2150  pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
2151  			const struct rpc_call_ops *call_ops,
2152  			struct pnfs_layout_segment *lseg,
2153  			int how)
2154  {
2155  	struct inode *inode = hdr->inode;
2156  	enum pnfs_try_status trypnfs;
2157  	struct nfs_server *nfss = NFS_SERVER(inode);
2158  
2159  	hdr->mds_ops = call_ops;
2160  
2161  	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
2162  		inode->i_ino, hdr->args.count, hdr->args.offset, how);
2163  	trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
2164  	if (trypnfs != PNFS_NOT_ATTEMPTED)
2165  		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
2166  	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2167  	return trypnfs;
2168  }
2169  
2170  static void
2171  pnfs_do_write(struct nfs_pageio_descriptor *desc,
2172  	      struct nfs_pgio_header *hdr, int how)
2173  {
2174  	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2175  	struct pnfs_layout_segment *lseg = desc->pg_lseg;
2176  	enum pnfs_try_status trypnfs;
2177  
2178  	trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
2179  	if (trypnfs == PNFS_NOT_ATTEMPTED)
2180  		pnfs_write_through_mds(desc, hdr);
2181  }
2182  
2183  static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
2184  {
2185  	pnfs_put_lseg(hdr->lseg);
2186  	nfs_pgio_header_free(hdr);
2187  }
2188  
2189  int
2190  pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
2191  {
2192  	struct nfs_pgio_header *hdr;
2193  	int ret;
2194  
2195  	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2196  	if (!hdr) {
2197  		desc->pg_error = -ENOMEM;
2198  		return desc->pg_error;
2199  	}
2200  	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
2201  
2202  	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2203  	ret = nfs_generic_pgio(desc, hdr);
2204  	if (!ret)
2205  		pnfs_do_write(desc, hdr, desc->pg_ioflags);
2206  
2207  	return ret;
2208  }
2209  EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2210  
2211  int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
2212  {
2213  	struct nfs_pageio_descriptor pgio;
2214  
2215  	/* Resend all requests through the MDS */
2216  	nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
2217  	return nfs_pageio_resend(&pgio, hdr);
2218  }
2219  EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2220  
2221  static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
2222  {
2223  	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
2224  	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2225  	    PNFS_LAYOUTRET_ON_ERROR) {
2226  		pnfs_return_layout(hdr->inode);
2227  	}
2228  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2229  		hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
2230  }
2231  
2232  /*
2233   * Called by non rpc-based layout drivers
2234   */
2235  void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
2236  {
2237  	if (likely(!hdr->pnfs_error))
2238  		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2239  	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
2240  	if (unlikely(hdr->pnfs_error))
2241  		pnfs_ld_handle_read_error(hdr);
2242  	hdr->mds_ops->rpc_release(hdr);
2243  }
2244  EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
2245  
2246  static void
2247  pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
2248  		struct nfs_pgio_header *hdr)
2249  {
2250  	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2251  
2252  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2253  		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2254  		nfs_pageio_reset_read_mds(desc);
2255  		mirror->pg_recoalesce = 1;
2256  	}
2257  	hdr->completion_ops->completion(hdr);
2258  }
2259  
2260  /*
2261   * Call the appropriate parallel I/O subsystem read function.
2262   */
2263  static enum pnfs_try_status
2264  pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
2265  		       const struct rpc_call_ops *call_ops,
2266  		       struct pnfs_layout_segment *lseg)
2267  {
2268  	struct inode *inode = hdr->inode;
2269  	struct nfs_server *nfss = NFS_SERVER(inode);
2270  	enum pnfs_try_status trypnfs;
2271  
2272  	hdr->mds_ops = call_ops;
2273  
2274  	dprintk("%s: Reading ino:%lu %u@%llu\n",
2275  		__func__, inode->i_ino, hdr->args.count, hdr->args.offset);
2276  
2277  	trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
2278  	if (trypnfs != PNFS_NOT_ATTEMPTED)
2279  		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
2280  	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2281  	return trypnfs;
2282  }
2283  
2284  /* Resend all requests through pnfs. */
2285  void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2286  {
2287  	struct nfs_pageio_descriptor pgio;
2288  
2289  	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2290  		/* Prevent deadlocks with layoutreturn! */
2291  		pnfs_put_lseg(hdr->lseg);
2292  		hdr->lseg = NULL;
2293  
2294  		nfs_pageio_init_read(&pgio, hdr->inode, false,
2295  					hdr->completion_ops);
2296  		hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
2297  	}
2298  }
2299  EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2300  
2301  static void
2302  pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
2303  {
2304  	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2305  	struct pnfs_layout_segment *lseg = desc->pg_lseg;
2306  	enum pnfs_try_status trypnfs;
2307  
2308  	trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
2309  	switch (trypnfs) {
2310  	case PNFS_NOT_ATTEMPTED:
2311  		pnfs_read_through_mds(desc, hdr);	/* Fallthrough */
2312  	case PNFS_ATTEMPTED:
2313  		break;
2314  	case PNFS_TRY_AGAIN:
2315  		/* cleanup hdr and prepare to redo pnfs */
2316  		if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2317  			struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2318  			list_splice_init(&hdr->pages, &mirror->pg_list);
2319  			mirror->pg_recoalesce = 1;
2320  		}
2321  		hdr->mds_ops->rpc_release(hdr);
2322  	}
2323  }
2324  
2325  static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
2326  {
2327  	pnfs_put_lseg(hdr->lseg);
2328  	nfs_pgio_header_free(hdr);
2329  }
2330  
2331  int
2332  pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
2333  {
2334  	struct nfs_pgio_header *hdr;
2335  	int ret;
2336  
2337  	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2338  	if (!hdr) {
2339  		desc->pg_error = -ENOMEM;
2340  		return desc->pg_error;
2341  	}
2342  	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
2343  	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2344  	ret = nfs_generic_pgio(desc, hdr);
2345  	if (!ret)
2346  		pnfs_do_read(desc, hdr);
2347  	return ret;
2348  }
2349  EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
2350  
2351  static void pnfs_clear_layoutcommitting(struct inode *inode)
2352  {
2353  	unsigned long *bitlock = &NFS_I(inode)->flags;
2354  
2355  	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
2356  	smp_mb__after_atomic();
2357  	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
2358  }
2359  
2360  /*
2361   * There can be multiple RW segments.
2362   */
2363  static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
2364  {
2365  	struct pnfs_layout_segment *lseg;
2366  
2367  	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
2368  		if (lseg->pls_range.iomode == IOMODE_RW &&
2369  		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
2370  			list_add(&lseg->pls_lc_list, listp);
2371  	}
2372  }
2373  
2374  static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
2375  {
2376  	struct pnfs_layout_segment *lseg, *tmp;
2377  
2378  	/* Matched by references in pnfs_set_layoutcommit */
2379  	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
2380  		list_del_init(&lseg->pls_lc_list);
2381  		pnfs_put_lseg(lseg);
2382  	}
2383  
2384  	pnfs_clear_layoutcommitting(inode);
2385  }
2386  
2387  void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
2388  {
2389  	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
2390  }
2391  EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
2392  
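/*
 * Record that a LAYOUTCOMMIT is needed for @inode: track the last byte
 * written (plh_lwb), take a reference on @lseg that is dropped in
 * nfs4_layoutcommit_release(), and mark the inode dirty so the commit is
 * sent on the next write-back.
 */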
2393  void
2394  pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
2395  		loff_t end_pos)
2396  {
2397  	struct nfs_inode *nfsi = NFS_I(inode);
2398  	bool mark_as_dirty = false;
2399  
2400  	spin_lock(&inode->i_lock);
2401  	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
2402  		nfsi->layout->plh_lwb = end_pos;
2403  		mark_as_dirty = true;
2404  		dprintk("%s: Set layoutcommit for inode %lu ",
2405  			__func__, inode->i_ino);
2406  	} else if (end_pos > nfsi->layout->plh_lwb)
2407  		nfsi->layout->plh_lwb = end_pos;
2408  	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
2409  		/* references matched in nfs4_layoutcommit_release */
2410  		pnfs_get_lseg(lseg);
2411  	}
2412  	spin_unlock(&inode->i_lock);
2413  	dprintk("%s: lseg %p end_pos %llu\n",
2414  		__func__, lseg, nfsi->layout->plh_lwb);
2415  
2416  	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
2417  	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
2418  	if (mark_as_dirty)
2419  		mark_inode_dirty_sync(inode);
2420  }
2421  EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
2422  
2423  void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
2424  {
2425  	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
2426  
2427  	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
2428  		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
2429  	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
2430  }
2431  
2432  /*
2433   * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
2434   * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
2435   * data to disk to allow the server to recover the data if it crashes.
2436   * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
2437   * is off, and a COMMIT is sent to a data server, or
2438   * if WRITEs to a data server return NFS_DATA_SYNC.
2439   */
2440  int
2441  pnfs_layoutcommit_inode(struct inode *inode, bool sync)
2442  {
2443  	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2444  	struct nfs4_layoutcommit_data *data;
2445  	struct nfs_inode *nfsi = NFS_I(inode);
2446  	loff_t end_pos;
2447  	int status;
2448  
2449  	if (!pnfs_layoutcommit_outstanding(inode))
2450  		return 0;
2451  
2452  	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
2453  
2454  	status = -EAGAIN;
2455  	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
2456  		if (!sync)
2457  			goto out;
2458  		status = wait_on_bit_lock_action(&nfsi->flags,
2459  				NFS_INO_LAYOUTCOMMITTING,
2460  				nfs_wait_bit_killable,
2461  				TASK_KILLABLE);
2462  		if (status)
2463  			goto out;
2464  	}
2465  
2466  	status = -ENOMEM;
2467  	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
2468  	data = kzalloc(sizeof(*data), GFP_NOFS);
2469  	if (!data)
2470  		goto clear_layoutcommitting;
2471  
2472  	status = 0;
2473  	spin_lock(&inode->i_lock);
2474  	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
2475  		goto out_unlock;
2476  
2477  	INIT_LIST_HEAD(&data->lseg_list);
2478  	pnfs_list_write_lseg(inode, &data->lseg_list);
2479  
2480  	end_pos = nfsi->layout->plh_lwb;
2481  
2482  	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
2483  	spin_unlock(&inode->i_lock);
2484  
2485  	data->args.inode = inode;
2486  	data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
2487  	nfs_fattr_init(&data->fattr);
2488  	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
2489  	data->res.fattr = &data->fattr;
2490  	if (end_pos != 0)
2491  		data->args.lastbytewritten = end_pos - 1;
2492  	else
2493  		data->args.lastbytewritten = U64_MAX;
2494  	data->res.server = NFS_SERVER(inode);
2495  
2496  	if (ld->prepare_layoutcommit) {
2497  		status = ld->prepare_layoutcommit(&data->args);
2498  		if (status) {
2499  			put_rpccred(data->cred);
2500  			spin_lock(&inode->i_lock);
2501  			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
2502  			if (end_pos > nfsi->layout->plh_lwb)
2503  				nfsi->layout->plh_lwb = end_pos;
2504  			goto out_unlock;
2505  		}
2506  	}
2507  
2508  
2509  	status = nfs4_proc_layoutcommit(data, sync);
2510  out:
2511  	if (status)
2512  		mark_inode_dirty_sync(inode);
2513  	dprintk("<-- %s status %d\n", __func__, status);
2514  	return status;
2515  out_unlock:
2516  	spin_unlock(&inode->i_lock);
2517  	kfree(data);
2518  clear_layoutcommitting:
2519  	pnfs_clear_layoutcommitting(inode);
2520  	goto out;
2521  }
2522  EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
2523  
2524  int
2525  pnfs_generic_sync(struct inode *inode, bool datasync)
2526  {
2527  	return pnfs_layoutcommit_inode(inode, true);
2528  }
2529  EXPORT_SYMBOL_GPL(pnfs_generic_sync);
2530  
2531  struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
2532  {
2533  	struct nfs4_threshold *thp;
2534  
2535  	thp = kzalloc(sizeof(*thp), GFP_NOFS);
2536  	if (!thp) {
2537  		dprintk("%s mdsthreshold allocation failed\n", __func__);
2538  		return NULL;
2539  	}
2540  	return thp;
2541  }
2542  
2543  #if IS_ENABLED(CONFIG_NFS_V4_2)
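/*
 * Send a LAYOUTSTATS call for @inode, provided the layout driver and the
 * server support it and no other layoutstats call is already in flight
 * (the NFS_INO_LAYOUTSTATS bit serializes them).
 */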
2544  int
2545  pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2546  {
2547  	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2548  	struct nfs_server *server = NFS_SERVER(inode);
2549  	struct nfs_inode *nfsi = NFS_I(inode);
2550  	struct nfs42_layoutstat_data *data;
2551  	struct pnfs_layout_hdr *hdr;
2552  	int status = 0;
2553  
2554  	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
2555  		goto out;
2556  
2557  	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
2558  		goto out;
2559  
2560  	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
2561  		goto out;
2562  
2563  	spin_lock(&inode->i_lock);
2564  	if (!NFS_I(inode)->layout) {
2565  		spin_unlock(&inode->i_lock);
2566  		goto out_clear_layoutstats;
2567  	}
2568  	hdr = NFS_I(inode)->layout;
2569  	pnfs_get_layout_hdr(hdr);
2570  	spin_unlock(&inode->i_lock);
2571  
2572  	data = kzalloc(sizeof(*data), gfp_flags);
2573  	if (!data) {
2574  		status = -ENOMEM;
2575  		goto out_put;
2576  	}
2577  
2578  	data->args.fh = NFS_FH(inode);
2579  	data->args.inode = inode;
2580  	status = ld->prepare_layoutstats(&data->args);
2581  	if (status)
2582  		goto out_free;
2583  
2584  	status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
2585  
2586  out:
2587  	dprintk("%s returns %d\n", __func__, status);
2588  	return status;
2589  
2590  out_free:
2591  	kfree(data);
2592  out_put:
2593  	pnfs_put_layout_hdr(hdr);
2594  out_clear_layoutstats:
2595  	smp_mb__before_atomic();
2596  	clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
2597  	smp_mb__after_atomic();
2598  	goto out;
2599  }
2600  EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
2601  #endif
2602  
2603  unsigned int layoutstats_timer;
2604  module_param(layoutstats_timer, uint, 0644);
2605  EXPORT_SYMBOL_GPL(layoutstats_timer);
2606