• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  linux/fs/nfs/blocklayout/blocklayout.c
3  *
4  *  Module for the NFSv4.1 pNFS block layout driver.
5  *
6  *  Copyright (c) 2006 The Regents of the University of Michigan.
7  *  All rights reserved.
8  *
9  *  Andy Adamson <andros@citi.umich.edu>
10  *  Fred Isaman <iisaman@umich.edu>
11  *
12  * permission is granted to use, copy, create derivative works and
13  * redistribute this software and such derivative works for any purpose,
14  * so long as the name of the university of michigan is not used in
15  * any advertising or publicity pertaining to the use or distribution
16  * of this software without specific, written prior authorization.  if
17  * the above copyright notice or any other identification of the
18  * university of michigan is included in any copy of any portion of
19  * this software, then the disclaimer below must also be included.
20  *
21  * this software is provided as is, without representation from the
22  * university of michigan as to its fitness for any purpose, and without
23  * warranty by the university of michigan of any kind, either express
24  * or implied, including without limitation the implied warranties of
25  * merchantability and fitness for a particular purpose.  the regents
26  * of the university of michigan shall not be liable for any damages,
27  * including special, indirect, incidental, or consequential damages,
28  * with respect to any claim arising out or in connection with the use
29  * of the software, even if it has been or is hereafter advised of the
30  * possibility of such damages.
31  */
32 
33 #include <linux/module.h>
34 #include <linux/init.h>
35 #include <linux/mount.h>
36 #include <linux/namei.h>
37 #include <linux/bio.h>		/* struct bio */
38 #include <linux/buffer_head.h>	/* various write calls */
39 #include <linux/prefetch.h>
40 
41 #include "../pnfs.h"
42 #include "../internal.h"
43 #include "blocklayout.h"
44 
45 #define NFSDBG_FACILITY	NFSDBG_PNFS_LD
46 
47 MODULE_LICENSE("GPL");
48 MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
49 MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
50 
/* Debug helper: log the pointer and the individual state flags of @page. */
static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);
	dprintk("	PagePrivate %d\n", PagePrivate(page));
	dprintk("	PageUptodate %d\n", PageUptodate(page));
	dprintk("	PageError %d\n", PageError(page));
	dprintk("	PageDirty %d\n", PageDirty(page));
	dprintk("	PageReferenced %d\n", PageReferenced(page));
	dprintk("	PageLocked %d\n", PageLocked(page));
	dprintk("	PageWriteback %d\n", PageWriteback(page));
	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page));
	dprintk("\n");
}
64 
65 /* Given the be associated with isect, determine if page data needs to be
66  * initialized.
67  */
is_hole(struct pnfs_block_extent * be,sector_t isect)68 static int is_hole(struct pnfs_block_extent *be, sector_t isect)
69 {
70 	if (be->be_state == PNFS_BLOCK_NONE_DATA)
71 		return 1;
72 	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
73 		return 0;
74 	else
75 		return !bl_is_sector_init(be->be_inval, isect);
76 }
77 
78 /* Given the be associated with isect, determine if page data can be
79  * written to disk.
80  */
is_writable(struct pnfs_block_extent * be,sector_t isect)81 static int is_writable(struct pnfs_block_extent *be, sector_t isect)
82 {
83 	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
84 		be->be_state == PNFS_BLOCK_INVALID_DATA);
85 }
86 
87 /* The data we are handed might be spread across several bios.  We need
88  * to track when the last one is finished.
89  */
90 struct parallel_io {
91 	struct kref refcnt;
92 	void (*pnfs_callback) (void *data, int num_se);
93 	void *data;
94 	int bse_count;
95 };
96 
alloc_parallel(void * data)97 static inline struct parallel_io *alloc_parallel(void *data)
98 {
99 	struct parallel_io *rv;
100 
101 	rv  = kmalloc(sizeof(*rv), GFP_NOFS);
102 	if (rv) {
103 		rv->data = data;
104 		kref_init(&rv->refcnt);
105 		rv->bse_count = 0;
106 	}
107 	return rv;
108 }
109 
get_parallel(struct parallel_io * p)110 static inline void get_parallel(struct parallel_io *p)
111 {
112 	kref_get(&p->refcnt);
113 }
114 
destroy_parallel(struct kref * kref)115 static void destroy_parallel(struct kref *kref)
116 {
117 	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
118 
119 	dprintk("%s enter\n", __func__);
120 	p->pnfs_callback(p->data, p->bse_count);
121 	kfree(p);
122 }
123 
put_parallel(struct parallel_io * p)124 static inline void put_parallel(struct parallel_io *p)
125 {
126 	kref_put(&p->refcnt, destroy_parallel);
127 }
128 
129 static struct bio *
bl_submit_bio(int rw,struct bio * bio)130 bl_submit_bio(int rw, struct bio *bio)
131 {
132 	if (bio) {
133 		get_parallel(bio->bi_private);
134 		dprintk("%s submitting %s bio %u@%llu\n", __func__,
135 			rw == READ ? "read" : "write",
136 			bio->bi_size, (unsigned long long)bio->bi_sector);
137 		submit_bio(rw, bio);
138 	}
139 	return NULL;
140 }
141 
/*
 * Allocate a bio with room for up to @npg pages (capped at
 * BIO_MAX_PAGES) and point it at the device sector that file sector
 * @isect maps to inside extent @be.
 *
 * If the first allocation fails while the task has PF_MEMALLOC set, the
 * request is retried with the page count halved each time until it
 * succeeds or the count reaches zero.
 *
 * Returns the initialised bio, or NULL when no bio could be allocated.
 */
static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
				     struct pnfs_block_extent *be,
				     void (*end_io)(struct bio *, int err),
				     struct parallel_io *par)
{
	struct bio *bio;

	npg = min(npg, BIO_MAX_PAGES);
	bio = bio_alloc(GFP_NOIO, npg);
	if (!bio && (current->flags & PF_MEMALLOC)) {
		for (npg /= 2; npg && !bio; npg /= 2)
			bio = bio_alloc(GFP_NOIO, npg);
	}
	if (!bio)
		return NULL;

	/* translate the file sector into a sector on the backing device */
	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
	bio->bi_bdev = be->be_mdev;
	bio->bi_end_io = end_io;
	bio->bi_private = par;
	return bio;
}
164 
/*
 * Append @len bytes of @page starting at @offset to @bio, allocating a
 * bio first if @bio is NULL.
 *
 * When the current bio cannot take the whole segment it is submitted and
 * the add is retried on a freshly allocated one.
 *
 * Returns the bio that now holds the page, or ERR_PTR(-ENOMEM) if a bio
 * could not be allocated.
 */
static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
				      sector_t isect, struct page *page,
				      struct pnfs_block_extent *be,
				      void (*end_io)(struct bio *, int err),
				      struct parallel_io *par,
				      unsigned int offset, int len)
{
	/* start sector of the segment, accounting for the in-page offset */
	isect = isect + (offset >> SECTOR_SHIFT);
	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
		npg, rw, (unsigned long long)isect, offset, len);

	for (;;) {
		if (!bio) {
			bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
			if (!bio)
				return ERR_PTR(-ENOMEM);
		}
		if (bio_add_page(bio, page, len, offset) >= len)
			return bio;

		/* bio is full: send it off and start a new one */
		bio = bl_submit_bio(rw, bio);
	}
}
187 
/*
 * Whole-page convenience wrapper around do_add_page_to_bio(): adds
 * PAGE_CACHE_SIZE bytes of @page starting at offset 0.
 */
static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
				      sector_t isect, struct page *page,
				      struct pnfs_block_extent *be,
				      void (*end_io)(struct bio *, int err),
				      struct parallel_io *par)
{
	const unsigned int pg_start = 0;
	const int pg_bytes = PAGE_CACHE_SIZE;

	return do_add_page_to_bio(bio, npg, rw, isect, page, be,
				  end_io, par, pg_start, pg_bytes);
}
197 
198 /* This is basically copied from mpage_end_io_read */
bl_end_io_read(struct bio * bio,int err)199 static void bl_end_io_read(struct bio *bio, int err)
200 {
201 	struct parallel_io *par = bio->bi_private;
202 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
203 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
204 	struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;
205 
206 	do {
207 		struct page *page = bvec->bv_page;
208 
209 		if (--bvec >= bio->bi_io_vec)
210 			prefetchw(&bvec->bv_page->flags);
211 		if (uptodate)
212 			SetPageUptodate(page);
213 	} while (bvec >= bio->bi_io_vec);
214 	if (!uptodate) {
215 		if (!rdata->pnfs_error)
216 			rdata->pnfs_error = -EIO;
217 		pnfs_set_lo_fail(rdata->lseg);
218 	}
219 	bio_put(bio);
220 	put_parallel(par);
221 }
222 
bl_read_cleanup(struct work_struct * work)223 static void bl_read_cleanup(struct work_struct *work)
224 {
225 	struct rpc_task *task;
226 	struct nfs_read_data *rdata;
227 	dprintk("%s enter\n", __func__);
228 	task = container_of(work, struct rpc_task, u.tk_work);
229 	rdata = container_of(task, struct nfs_read_data, task);
230 	pnfs_ld_read_done(rdata);
231 }
232 
233 static void
bl_end_par_io_read(void * data,int unused)234 bl_end_par_io_read(void *data, int unused)
235 {
236 	struct nfs_read_data *rdata = data;
237 
238 	rdata->task.tk_status = rdata->pnfs_error;
239 	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
240 	schedule_work(&rdata->task.u.tk_work);
241 }
242 
243 static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data * rdata)244 bl_read_pagelist(struct nfs_read_data *rdata)
245 {
246 	int i, hole;
247 	struct bio *bio = NULL;
248 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
249 	sector_t isect, extent_length = 0;
250 	struct parallel_io *par;
251 	loff_t f_offset = rdata->args.offset;
252 	struct page **pages = rdata->args.pages;
253 	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
254 
255 	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
256 	       rdata->npages, f_offset, (unsigned int)rdata->args.count);
257 
258 	par = alloc_parallel(rdata);
259 	if (!par)
260 		goto use_mds;
261 	par->pnfs_callback = bl_end_par_io_read;
262 	/* At this point, we can no longer jump to use_mds */
263 
264 	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
265 	/* Code assumes extents are page-aligned */
266 	for (i = pg_index; i < rdata->npages; i++) {
267 		if (!extent_length) {
268 			/* We've used up the previous extent */
269 			bl_put_extent(be);
270 			bl_put_extent(cow_read);
271 			bio = bl_submit_bio(READ, bio);
272 			/* Get the next one */
273 			be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
274 					     isect, &cow_read);
275 			if (!be) {
276 				rdata->pnfs_error = -EIO;
277 				goto out;
278 			}
279 			extent_length = be->be_length -
280 				(isect - be->be_f_offset);
281 			if (cow_read) {
282 				sector_t cow_length = cow_read->be_length -
283 					(isect - cow_read->be_f_offset);
284 				extent_length = min(extent_length, cow_length);
285 			}
286 		}
287 		hole = is_hole(be, isect);
288 		if (hole && !cow_read) {
289 			bio = bl_submit_bio(READ, bio);
290 			/* Fill hole w/ zeroes w/o accessing device */
291 			dprintk("%s Zeroing page for hole\n", __func__);
292 			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
293 			print_page(pages[i]);
294 			SetPageUptodate(pages[i]);
295 		} else {
296 			struct pnfs_block_extent *be_read;
297 
298 			be_read = (hole && cow_read) ? cow_read : be;
299 			bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
300 						 isect, pages[i], be_read,
301 						 bl_end_io_read, par);
302 			if (IS_ERR(bio)) {
303 				rdata->pnfs_error = PTR_ERR(bio);
304 				bio = NULL;
305 				goto out;
306 			}
307 		}
308 		isect += PAGE_CACHE_SECTORS;
309 		extent_length -= PAGE_CACHE_SECTORS;
310 	}
311 	if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
312 		rdata->res.eof = 1;
313 		rdata->res.count = rdata->inode->i_size - f_offset;
314 	} else {
315 		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
316 	}
317 out:
318 	bl_put_extent(be);
319 	bl_put_extent(cow_read);
320 	bl_submit_bio(READ, bio);
321 	put_parallel(par);
322 	return PNFS_ATTEMPTED;
323 
324  use_mds:
325 	dprintk("Giving up and using normal NFS\n");
326 	return PNFS_NOT_ATTEMPTED;
327 }
328 
/*
 * Walk the extents covering the byte range [@offset, @offset + @count),
 * widened to page boundaries, and mark every INVALID_DATA portion for
 * commit, consuming one short extent per marked piece.
 *
 * A missing extent or an empty short-extent pool triggers BUG_ON().
 */
static void mark_extents_written(struct pnfs_block_layout *bl,
				 __u64 offset, __u32 count)
{
	sector_t cur, end;

	dprintk("%s(%llu, %u)\n", __func__, offset, count);
	if (count == 0)
		return;

	/* round the start down and the end up to whole pages, in sectors */
	cur = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
	end >>= SECTOR_SHIFT;

	while (cur < end) {
		struct pnfs_block_extent *be;
		sector_t len;

		be = bl_find_get_extent(bl, cur, NULL);
		BUG_ON(!be); /* FIXME */
		/* stay inside both the requested range and this extent */
		len = min(end, be->be_f_offset + be->be_length) - cur;
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			struct pnfs_block_short_extent *se;

			se = bl_pop_one_short_extent(be->be_inval);
			BUG_ON(!se);
			bl_mark_for_commit(be, cur, len, se);
		}
		cur += len;
		bl_put_extent(be);
	}
}
356 
bl_end_io_write_zero(struct bio * bio,int err)357 static void bl_end_io_write_zero(struct bio *bio, int err)
358 {
359 	struct parallel_io *par = bio->bi_private;
360 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
361 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
362 	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
363 
364 	do {
365 		struct page *page = bvec->bv_page;
366 
367 		if (--bvec >= bio->bi_io_vec)
368 			prefetchw(&bvec->bv_page->flags);
369 		/* This is the zeroing page we added */
370 		end_page_writeback(page);
371 		page_cache_release(page);
372 	} while (bvec >= bio->bi_io_vec);
373 
374 	if (unlikely(!uptodate)) {
375 		if (!wdata->pnfs_error)
376 			wdata->pnfs_error = -EIO;
377 		pnfs_set_lo_fail(wdata->lseg);
378 	}
379 	bio_put(bio);
380 	put_parallel(par);
381 }
382 
bl_end_io_write(struct bio * bio,int err)383 static void bl_end_io_write(struct bio *bio, int err)
384 {
385 	struct parallel_io *par = bio->bi_private;
386 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
387 	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;
388 
389 	if (!uptodate) {
390 		if (!wdata->pnfs_error)
391 			wdata->pnfs_error = -EIO;
392 		pnfs_set_lo_fail(wdata->lseg);
393 	}
394 	bio_put(bio);
395 	put_parallel(par);
396 }
397 
398 /* Function scheduled for call during bl_end_par_io_write,
399  * it marks sectors as written and extends the commitlist.
400  */
bl_write_cleanup(struct work_struct * work)401 static void bl_write_cleanup(struct work_struct *work)
402 {
403 	struct rpc_task *task;
404 	struct nfs_write_data *wdata;
405 	dprintk("%s enter\n", __func__);
406 	task = container_of(work, struct rpc_task, u.tk_work);
407 	wdata = container_of(task, struct nfs_write_data, task);
408 	if (likely(!wdata->pnfs_error)) {
409 		/* Marks for LAYOUTCOMMIT */
410 		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
411 				     wdata->args.offset, wdata->args.count);
412 	}
413 	pnfs_ld_write_done(wdata);
414 }
415 
416 /* Called when last of bios associated with a bl_write_pagelist call finishes */
bl_end_par_io_write(void * data,int num_se)417 static void bl_end_par_io_write(void *data, int num_se)
418 {
419 	struct nfs_write_data *wdata = data;
420 
421 	if (unlikely(wdata->pnfs_error)) {
422 		bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
423 					num_se);
424 	}
425 
426 	wdata->task.tk_status = wdata->pnfs_error;
427 	wdata->verf.committed = NFS_FILE_SYNC;
428 	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
429 	schedule_work(&wdata->task.u.tk_work);
430 }
431 
/*
 * FIXME STUB - should mark the intersection of layout and page as bad so
 * that it is not used again.  Currently does nothing.
 */
static void mark_bad_read(void)
{
}
439 
440 /*
441  * map_block:  map a requested I/0 block (isect) into an offset in the LVM
442  * block_device
443  */
444 static void
map_block(struct buffer_head * bh,sector_t isect,struct pnfs_block_extent * be)445 map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
446 {
447 	dprintk("%s enter be=%p\n", __func__, be);
448 
449 	set_buffer_mapped(bh);
450 	bh->b_bdev = be->be_mdev;
451 	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
452 	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
453 
454 	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
455 		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
456 		bh->b_size);
457 	return;
458 }
459 
460 static void
bl_read_single_end_io(struct bio * bio,int error)461 bl_read_single_end_io(struct bio *bio, int error)
462 {
463 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
464 	struct page *page = bvec->bv_page;
465 
466 	/* Only one page in bvec */
467 	unlock_page(page);
468 }
469 
470 static int
bl_do_readpage_sync(struct page * page,struct pnfs_block_extent * be,unsigned int offset,unsigned int len)471 bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
472 		    unsigned int offset, unsigned int len)
473 {
474 	struct bio *bio;
475 	struct page *shadow_page;
476 	sector_t isect;
477 	char *kaddr, *kshadow_addr;
478 	int ret = 0;
479 
480 	dprintk("%s: offset %u len %u\n", __func__, offset, len);
481 
482 	shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
483 	if (shadow_page == NULL)
484 		return -ENOMEM;
485 
486 	bio = bio_alloc(GFP_NOIO, 1);
487 	if (bio == NULL)
488 		return -ENOMEM;
489 
490 	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
491 		(offset / SECTOR_SIZE);
492 
493 	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
494 	bio->bi_bdev = be->be_mdev;
495 	bio->bi_end_io = bl_read_single_end_io;
496 
497 	lock_page(shadow_page);
498 	if (bio_add_page(bio, shadow_page,
499 			 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
500 		unlock_page(shadow_page);
501 		bio_put(bio);
502 		return -EIO;
503 	}
504 
505 	submit_bio(READ, bio);
506 	wait_on_page_locked(shadow_page);
507 	if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
508 		ret = -EIO;
509 	} else {
510 		kaddr = kmap_atomic(page);
511 		kshadow_addr = kmap_atomic(shadow_page);
512 		memcpy(kaddr + offset, kshadow_addr + offset, len);
513 		kunmap_atomic(kshadow_addr);
514 		kunmap_atomic(kaddr);
515 	}
516 	__free_page(shadow_page);
517 	bio_put(bio);
518 
519 	return ret;
520 }
521 
522 static int
bl_read_partial_page_sync(struct page * page,struct pnfs_block_extent * be,unsigned int dirty_offset,unsigned int dirty_len,bool full_page)523 bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
524 			  unsigned int dirty_offset, unsigned int dirty_len,
525 			  bool full_page)
526 {
527 	int ret = 0;
528 	unsigned int start, end;
529 
530 	if (full_page) {
531 		start = 0;
532 		end = PAGE_CACHE_SIZE;
533 	} else {
534 		start = round_down(dirty_offset, SECTOR_SIZE);
535 		end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
536 	}
537 
538 	dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
539 	if (!be) {
540 		zero_user_segments(page, start, dirty_offset,
541 				   dirty_offset + dirty_len, end);
542 		if (start == 0 && end == PAGE_CACHE_SIZE &&
543 		    trylock_page(page)) {
544 			SetPageUptodate(page);
545 			unlock_page(page);
546 		}
547 		return ret;
548 	}
549 
550 	if (start != dirty_offset)
551 		ret = bl_do_readpage_sync(page, be, start,
552 					  dirty_offset - start);
553 
554 	if (!ret && (dirty_offset + dirty_len < end))
555 		ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
556 					  end - dirty_offset - dirty_len);
557 
558 	return ret;
559 }
560 
561 /* Given an unmapped page, zero it or read in page for COW, page is locked
562  * by caller.
563  */
564 static int
init_page_for_write(struct page * page,struct pnfs_block_extent * cow_read)565 init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
566 {
567 	struct buffer_head *bh = NULL;
568 	int ret = 0;
569 	sector_t isect;
570 
571 	dprintk("%s enter, %p\n", __func__, page);
572 	BUG_ON(PageUptodate(page));
573 	if (!cow_read) {
574 		zero_user_segment(page, 0, PAGE_SIZE);
575 		SetPageUptodate(page);
576 		goto cleanup;
577 	}
578 
579 	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
580 	if (!bh) {
581 		ret = -ENOMEM;
582 		goto cleanup;
583 	}
584 
585 	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
586 	map_block(bh, isect, cow_read);
587 	if (!bh_uptodate_or_lock(bh))
588 		ret = bh_submit_read(bh);
589 	if (ret)
590 		goto cleanup;
591 	SetPageUptodate(page);
592 
593 cleanup:
594 	if (bh)
595 		free_buffer_head(bh);
596 	if (ret) {
597 		/* Need to mark layout with bad read...should now
598 		 * just use nfs4 for reads and writes.
599 		 */
600 		mark_bad_read();
601 	}
602 	return ret;
603 }
604 
605 /* Find or create a zeroing page marked being writeback.
606  * Return ERR_PTR on error, NULL to indicate skip this page and page itself
607  * to indicate write out.
608  */
609 static struct page *
bl_find_get_zeroing_page(struct inode * inode,pgoff_t index,struct pnfs_block_extent * cow_read)610 bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
611 			struct pnfs_block_extent *cow_read)
612 {
613 	struct page *page;
614 	int locked = 0;
615 	page = find_get_page(inode->i_mapping, index);
616 	if (page)
617 		goto check_page;
618 
619 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
620 	if (unlikely(!page)) {
621 		dprintk("%s oom\n", __func__);
622 		return ERR_PTR(-ENOMEM);
623 	}
624 	locked = 1;
625 
626 check_page:
627 	/* PageDirty: Other will write this out
628 	 * PageWriteback: Other is writing this out
629 	 * PageUptodate: It was read before
630 	 */
631 	if (PageDirty(page) || PageWriteback(page)) {
632 		print_page(page);
633 		if (locked)
634 			unlock_page(page);
635 		page_cache_release(page);
636 		return NULL;
637 	}
638 
639 	if (!locked) {
640 		lock_page(page);
641 		locked = 1;
642 		goto check_page;
643 	}
644 	if (!PageUptodate(page)) {
645 		/* New page, readin or zero it */
646 		init_page_for_write(page, cow_read);
647 	}
648 	set_page_writeback(page);
649 	unlock_page(page);
650 
651 	return page;
652 }
653 
654 static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data * wdata,int sync)655 bl_write_pagelist(struct nfs_write_data *wdata, int sync)
656 {
657 	int i, ret, npg_zero, pg_index, last = 0;
658 	struct bio *bio = NULL;
659 	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
660 	sector_t isect, last_isect = 0, extent_length = 0;
661 	struct parallel_io *par;
662 	loff_t offset = wdata->args.offset;
663 	size_t count = wdata->args.count;
664 	unsigned int pg_offset, pg_len, saved_len;
665 	struct page **pages = wdata->args.pages;
666 	struct page *page;
667 	pgoff_t index;
668 	u64 temp;
669 	int npg_per_block =
670 	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
671 
672 	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
673 	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
674 	 * We want to write each, and if there is an error set pnfs_error
675 	 * to have it redone using nfs.
676 	 */
677 	par = alloc_parallel(wdata);
678 	if (!par)
679 		goto out_mds;
680 	par->pnfs_callback = bl_end_par_io_write;
681 	/* At this point, have to be more careful with error handling */
682 
683 	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
684 	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
685 	if (!be || !is_writable(be, isect)) {
686 		dprintk("%s no matching extents!\n", __func__);
687 		goto out_mds;
688 	}
689 
690 	/* First page inside INVALID extent */
691 	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
692 		if (likely(!bl_push_one_short_extent(be->be_inval)))
693 			par->bse_count++;
694 		else
695 			goto out_mds;
696 		temp = offset >> PAGE_CACHE_SHIFT;
697 		npg_zero = do_div(temp, npg_per_block);
698 		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
699 				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
700 		extent_length = be->be_length - (isect - be->be_f_offset);
701 
702 fill_invalid_ext:
703 		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
704 		for (;npg_zero > 0; npg_zero--) {
705 			if (bl_is_sector_init(be->be_inval, isect)) {
706 				dprintk("isect %llu already init\n",
707 					(unsigned long long)isect);
708 				goto next_page;
709 			}
710 			/* page ref released in bl_end_io_write_zero */
711 			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
712 			dprintk("%s zero %dth page: index %lu isect %llu\n",
713 				__func__, npg_zero, index,
714 				(unsigned long long)isect);
715 			page = bl_find_get_zeroing_page(wdata->inode, index,
716 							cow_read);
717 			if (unlikely(IS_ERR(page))) {
718 				wdata->pnfs_error = PTR_ERR(page);
719 				goto out;
720 			} else if (page == NULL)
721 				goto next_page;
722 
723 			ret = bl_mark_sectors_init(be->be_inval, isect,
724 						       PAGE_CACHE_SECTORS);
725 			if (unlikely(ret)) {
726 				dprintk("%s bl_mark_sectors_init fail %d\n",
727 					__func__, ret);
728 				end_page_writeback(page);
729 				page_cache_release(page);
730 				wdata->pnfs_error = ret;
731 				goto out;
732 			}
733 			if (likely(!bl_push_one_short_extent(be->be_inval)))
734 				par->bse_count++;
735 			else {
736 				end_page_writeback(page);
737 				page_cache_release(page);
738 				wdata->pnfs_error = -ENOMEM;
739 				goto out;
740 			}
741 			/* FIXME: This should be done in bi_end_io */
742 			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
743 					     page->index << PAGE_CACHE_SHIFT,
744 					     PAGE_CACHE_SIZE);
745 
746 			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
747 						 isect, page, be,
748 						 bl_end_io_write_zero, par);
749 			if (IS_ERR(bio)) {
750 				wdata->pnfs_error = PTR_ERR(bio);
751 				bio = NULL;
752 				goto out;
753 			}
754 next_page:
755 			isect += PAGE_CACHE_SECTORS;
756 			extent_length -= PAGE_CACHE_SECTORS;
757 		}
758 		if (last)
759 			goto write_done;
760 	}
761 	bio = bl_submit_bio(WRITE, bio);
762 
763 	/* Middle pages */
764 	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
765 	for (i = pg_index; i < wdata->npages; i++) {
766 		if (!extent_length) {
767 			/* We've used up the previous extent */
768 			bl_put_extent(be);
769 			bl_put_extent(cow_read);
770 			bio = bl_submit_bio(WRITE, bio);
771 			/* Get the next one */
772 			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
773 						isect, &cow_read);
774 			if (!be || !is_writable(be, isect)) {
775 				wdata->pnfs_error = -EINVAL;
776 				goto out;
777 			}
778 			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
779 				if (likely(!bl_push_one_short_extent(
780 								be->be_inval)))
781 					par->bse_count++;
782 				else {
783 					wdata->pnfs_error = -ENOMEM;
784 					goto out;
785 				}
786 			}
787 			extent_length = be->be_length -
788 			    (isect - be->be_f_offset);
789 		}
790 
791 		dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
792 		pg_offset = offset & ~PAGE_CACHE_MASK;
793 		if (pg_offset + count > PAGE_CACHE_SIZE)
794 			pg_len = PAGE_CACHE_SIZE - pg_offset;
795 		else
796 			pg_len = count;
797 
798 		saved_len = pg_len;
799 		if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
800 		    !bl_is_sector_init(be->be_inval, isect)) {
801 			ret = bl_read_partial_page_sync(pages[i], cow_read,
802 						pg_offset, pg_len, true);
803 			if (ret) {
804 				dprintk("%s bl_read_partial_page_sync fail %d\n",
805 					__func__, ret);
806 				wdata->pnfs_error = ret;
807 				goto out;
808 			}
809 
810 			ret = bl_mark_sectors_init(be->be_inval, isect,
811 						       PAGE_CACHE_SECTORS);
812 			if (unlikely(ret)) {
813 				dprintk("%s bl_mark_sectors_init fail %d\n",
814 					__func__, ret);
815 				wdata->pnfs_error = ret;
816 				goto out;
817 			}
818 
819 			/* Expand to full page write */
820 			pg_offset = 0;
821 			pg_len = PAGE_CACHE_SIZE;
822 		} else if ((pg_offset & (SECTOR_SIZE - 1)) ||
823 			    (pg_len & (SECTOR_SIZE - 1))) {
824 			/* ahh, nasty case. We have to do sync full sector
825 			 * read-modify-write cycles.
826 			 */
827 			unsigned int saved_offset = pg_offset;
828 			ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
829 							pg_len, false);
830 			pg_offset = round_down(pg_offset, SECTOR_SIZE);
831 			pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
832 				 - pg_offset;
833 		}
834 		bio = do_add_page_to_bio(bio, wdata->npages - i, WRITE,
835 					 isect, pages[i], be,
836 					 bl_end_io_write, par,
837 					 pg_offset, pg_len);
838 		if (IS_ERR(bio)) {
839 			wdata->pnfs_error = PTR_ERR(bio);
840 			bio = NULL;
841 			goto out;
842 		}
843 		offset += saved_len;
844 		count -= saved_len;
845 		isect += PAGE_CACHE_SECTORS;
846 		last_isect = isect;
847 		extent_length -= PAGE_CACHE_SECTORS;
848 	}
849 
850 	/* Last page inside INVALID extent */
851 	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
852 		bio = bl_submit_bio(WRITE, bio);
853 		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
854 		npg_zero = npg_per_block - do_div(temp, npg_per_block);
855 		if (npg_zero < npg_per_block) {
856 			last = 1;
857 			goto fill_invalid_ext;
858 		}
859 	}
860 
861 write_done:
862 	wdata->res.count = wdata->args.count;
863 out:
864 	bl_put_extent(be);
865 	bl_put_extent(cow_read);
866 	bl_submit_bio(WRITE, bio);
867 	put_parallel(par);
868 	return PNFS_ATTEMPTED;
869 out_mds:
870 	bl_put_extent(be);
871 	bl_put_extent(cow_read);
872 	kfree(par);
873 	return PNFS_NOT_ATTEMPTED;
874 }
875 
876 /* FIXME - range ignored */
877 static void
release_extents(struct pnfs_block_layout * bl,struct pnfs_layout_range * range)878 release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
879 {
880 	int i;
881 	struct pnfs_block_extent *be;
882 
883 	spin_lock(&bl->bl_ext_lock);
884 	for (i = 0; i < EXTENT_LISTS; i++) {
885 		while (!list_empty(&bl->bl_extents[i])) {
886 			be = list_first_entry(&bl->bl_extents[i],
887 					      struct pnfs_block_extent,
888 					      be_node);
889 			list_del(&be->be_node);
890 			bl_put_extent(be);
891 		}
892 	}
893 	spin_unlock(&bl->bl_ext_lock);
894 }
895 
896 static void
release_inval_marks(struct pnfs_inval_markings * marks)897 release_inval_marks(struct pnfs_inval_markings *marks)
898 {
899 	struct pnfs_inval_tracking *pos, *temp;
900 	struct pnfs_block_short_extent *se, *stemp;
901 
902 	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
903 		list_del(&pos->it_link);
904 		kfree(pos);
905 	}
906 
907 	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
908 		list_del(&se->bse_node);
909 		kfree(se);
910 	}
911 	return;
912 }
913 
bl_free_layout_hdr(struct pnfs_layout_hdr * lo)914 static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
915 {
916 	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
917 
918 	dprintk("%s enter\n", __func__);
919 	release_extents(bl, NULL);
920 	release_inval_marks(&bl->bl_inval);
921 	kfree(bl);
922 }
923 
/*
 * Allocate and initialise a block layout header for @inode.
 *
 * The extent lists, commit lists, lock and invalid-range markings are
 * set up; bl_blocksize is the server's pNFS block size expressed in
 * sectors.
 *
 * Returns the embedded generic layout header, or NULL if the allocation
 * with @gfp_flags fails.
 */
static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
						   gfp_t gfp_flags)
{
	struct pnfs_block_layout *bl;
	int i;

	dprintk("%s enter\n", __func__);
	bl = kzalloc(sizeof(*bl), gfp_flags);
	if (!bl)
		return NULL;
	spin_lock_init(&bl->bl_ext_lock);
	/* initialise every list that release_extents() will later walk */
	for (i = 0; i < EXTENT_LISTS; i++)
		INIT_LIST_HEAD(&bl->bl_extents[i]);
	INIT_LIST_HEAD(&bl->bl_commit);
	INIT_LIST_HEAD(&bl->bl_committing);
	bl->bl_count = 0;
	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
	return &bl->bl_layout;
}
943 
/* Free a layout segment allocated by bl_alloc_lseg(). */
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);

	kfree(lseg);
}
949 
950 /* We pretty much ignore lseg, and store all data layout wide, so we
951  * can correctly merge.
952  */
bl_alloc_lseg(struct pnfs_layout_hdr * lo,struct nfs4_layoutget_res * lgr,gfp_t gfp_flags)953 static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
954 						 struct nfs4_layoutget_res *lgr,
955 						 gfp_t gfp_flags)
956 {
957 	struct pnfs_layout_segment *lseg;
958 	int status;
959 
960 	dprintk("%s enter\n", __func__);
961 	lseg = kzalloc(sizeof(*lseg), gfp_flags);
962 	if (!lseg)
963 		return ERR_PTR(-ENOMEM);
964 	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
965 	if (status) {
966 		/* We don't want to call the full-blown bl_free_lseg,
967 		 * since on error extents were not touched.
968 		 */
969 		kfree(lseg);
970 		return ERR_PTR(status);
971 	}
972 	return lseg;
973 }
974 
/*
 * encode_layoutcommit hook: forward to the block layout update encoder,
 * passing the block layout that contains @lo.
 */
static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *blk_lo = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(blk_lo, xdr, arg);
}
982 
983 static void
bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data * lcdata)984 bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
985 {
986 	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;
987 
988 	dprintk("%s enter\n", __func__);
989 	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
990 }
991 
free_blk_mountid(struct block_mount_id * mid)992 static void free_blk_mountid(struct block_mount_id *mid)
993 {
994 	if (mid) {
995 		struct pnfs_block_dev *dev, *tmp;
996 
997 		/* No need to take bm_lock as we are last user freeing bm_devlist */
998 		list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
999 			list_del(&dev->bm_node);
1000 			bl_free_block_dev(dev);
1001 		}
1002 		kfree(mid);
1003 	}
1004 }
1005 
1006 /* This is mostly copied from the filelayout's get_device_info function.
1007  * It seems much of this should be at the generic pnfs level.
1008  */
1009 static struct pnfs_block_dev *
nfs4_blk_get_deviceinfo(struct nfs_server * server,const struct nfs_fh * fh,struct nfs4_deviceid * d_id)1010 nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1011 			struct nfs4_deviceid *d_id)
1012 {
1013 	struct pnfs_device *dev;
1014 	struct pnfs_block_dev *rv;
1015 	u32 max_resp_sz;
1016 	int max_pages;
1017 	struct page **pages = NULL;
1018 	int i, rc;
1019 
1020 	/*
1021 	 * Use the session max response size as the basis for setting
1022 	 * GETDEVICEINFO's maxcount
1023 	 */
1024 	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
1025 	max_pages = nfs_page_array_len(0, max_resp_sz);
1026 	dprintk("%s max_resp_sz %u max_pages %d\n",
1027 		__func__, max_resp_sz, max_pages);
1028 
1029 	dev = kmalloc(sizeof(*dev), GFP_NOFS);
1030 	if (!dev) {
1031 		dprintk("%s kmalloc failed\n", __func__);
1032 		return ERR_PTR(-ENOMEM);
1033 	}
1034 
1035 	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
1036 	if (pages == NULL) {
1037 		kfree(dev);
1038 		return ERR_PTR(-ENOMEM);
1039 	}
1040 	for (i = 0; i < max_pages; i++) {
1041 		pages[i] = alloc_page(GFP_NOFS);
1042 		if (!pages[i]) {
1043 			rv = ERR_PTR(-ENOMEM);
1044 			goto out_free;
1045 		}
1046 	}
1047 
1048 	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
1049 	dev->layout_type = LAYOUT_BLOCK_VOLUME;
1050 	dev->pages = pages;
1051 	dev->pgbase = 0;
1052 	dev->pglen = PAGE_SIZE * max_pages;
1053 	dev->mincount = 0;
1054 
1055 	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
1056 	rc = nfs4_proc_getdeviceinfo(server, dev);
1057 	dprintk("%s getdevice info returns %d\n", __func__, rc);
1058 	if (rc) {
1059 		rv = ERR_PTR(rc);
1060 		goto out_free;
1061 	}
1062 
1063 	rv = nfs4_blk_decode_device(server, dev);
1064  out_free:
1065 	for (i = 0; i < max_pages; i++)
1066 		__free_page(pages[i]);
1067 	kfree(pages);
1068 	kfree(dev);
1069 	return rv;
1070 }
1071 
1072 static int
bl_set_layoutdriver(struct nfs_server * server,const struct nfs_fh * fh)1073 bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
1074 {
1075 	struct block_mount_id *b_mt_id = NULL;
1076 	struct pnfs_devicelist *dlist = NULL;
1077 	struct pnfs_block_dev *bdev;
1078 	LIST_HEAD(block_disklist);
1079 	int status, i;
1080 
1081 	dprintk("%s enter\n", __func__);
1082 
1083 	if (server->pnfs_blksize == 0) {
1084 		dprintk("%s Server did not return blksize\n", __func__);
1085 		return -EINVAL;
1086 	}
1087 	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
1088 	if (!b_mt_id) {
1089 		status = -ENOMEM;
1090 		goto out_error;
1091 	}
1092 	/* Initialize nfs4 block layout mount id */
1093 	spin_lock_init(&b_mt_id->bm_lock);
1094 	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
1095 
1096 	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
1097 	if (!dlist) {
1098 		status = -ENOMEM;
1099 		goto out_error;
1100 	}
1101 	dlist->eof = 0;
1102 	while (!dlist->eof) {
1103 		status = nfs4_proc_getdevicelist(server, fh, dlist);
1104 		if (status)
1105 			goto out_error;
1106 		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
1107 			__func__, dlist->num_devs, dlist->eof);
1108 		for (i = 0; i < dlist->num_devs; i++) {
1109 			bdev = nfs4_blk_get_deviceinfo(server, fh,
1110 						       &dlist->dev_id[i]);
1111 			if (IS_ERR(bdev)) {
1112 				status = PTR_ERR(bdev);
1113 				goto out_error;
1114 			}
1115 			spin_lock(&b_mt_id->bm_lock);
1116 			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
1117 			spin_unlock(&b_mt_id->bm_lock);
1118 		}
1119 	}
1120 	dprintk("%s SUCCESS\n", __func__);
1121 	server->pnfs_ld_data = b_mt_id;
1122 
1123  out_return:
1124 	kfree(dlist);
1125 	return status;
1126 
1127  out_error:
1128 	free_blk_mountid(b_mt_id);
1129 	goto out_return;
1130 }
1131 
1132 static int
bl_clear_layoutdriver(struct nfs_server * server)1133 bl_clear_layoutdriver(struct nfs_server *server)
1134 {
1135 	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
1136 
1137 	dprintk("%s enter\n", __func__);
1138 	free_blk_mountid(b_mt_id);
1139 	dprintk("%s RETURNS\n", __func__);
1140 	return 0;
1141 }
1142 
1143 static const struct nfs_pageio_ops bl_pg_read_ops = {
1144 	.pg_init = pnfs_generic_pg_init_read,
1145 	.pg_test = pnfs_generic_pg_test,
1146 	.pg_doio = pnfs_generic_pg_readpages,
1147 };
1148 
1149 static const struct nfs_pageio_ops bl_pg_write_ops = {
1150 	.pg_init = pnfs_generic_pg_init_write,
1151 	.pg_test = pnfs_generic_pg_test,
1152 	.pg_doio = pnfs_generic_pg_writepages,
1153 };
1154 
1155 static struct pnfs_layoutdriver_type blocklayout_type = {
1156 	.id				= LAYOUT_BLOCK_VOLUME,
1157 	.name				= "LAYOUT_BLOCK_VOLUME",
1158 	.owner				= THIS_MODULE,
1159 	.read_pagelist			= bl_read_pagelist,
1160 	.write_pagelist			= bl_write_pagelist,
1161 	.alloc_layout_hdr		= bl_alloc_layout_hdr,
1162 	.free_layout_hdr		= bl_free_layout_hdr,
1163 	.alloc_lseg			= bl_alloc_lseg,
1164 	.free_lseg			= bl_free_lseg,
1165 	.encode_layoutcommit		= bl_encode_layoutcommit,
1166 	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
1167 	.set_layoutdriver		= bl_set_layoutdriver,
1168 	.clear_layoutdriver		= bl_clear_layoutdriver,
1169 	.pg_read_ops			= &bl_pg_read_ops,
1170 	.pg_write_ops			= &bl_pg_write_ops,
1171 };
1172 
1173 static const struct rpc_pipe_ops bl_upcall_ops = {
1174 	.upcall		= rpc_pipe_generic_upcall,
1175 	.downcall	= bl_pipe_downcall,
1176 	.destroy_msg	= bl_pipe_destroy_msg,
1177 };
1178 
nfs4blocklayout_register_sb(struct super_block * sb,struct rpc_pipe * pipe)1179 static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1180 					    struct rpc_pipe *pipe)
1181 {
1182 	struct dentry *dir, *dentry;
1183 
1184 	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1185 	if (dir == NULL)
1186 		return ERR_PTR(-ENOENT);
1187 	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1188 	dput(dir);
1189 	return dentry;
1190 }
1191 
nfs4blocklayout_unregister_sb(struct super_block * sb,struct rpc_pipe * pipe)1192 static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1193 					  struct rpc_pipe *pipe)
1194 {
1195 	if (pipe->dentry)
1196 		rpc_unlink(pipe->dentry);
1197 }
1198 
rpc_pipefs_event(struct notifier_block * nb,unsigned long event,void * ptr)1199 static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1200 			   void *ptr)
1201 {
1202 	struct super_block *sb = ptr;
1203 	struct net *net = sb->s_fs_info;
1204 	struct nfs_net *nn = net_generic(net, nfs_net_id);
1205 	struct dentry *dentry;
1206 	int ret = 0;
1207 
1208 	if (!try_module_get(THIS_MODULE))
1209 		return 0;
1210 
1211 	if (nn->bl_device_pipe == NULL) {
1212 		module_put(THIS_MODULE);
1213 		return 0;
1214 	}
1215 
1216 	switch (event) {
1217 	case RPC_PIPEFS_MOUNT:
1218 		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1219 		if (IS_ERR(dentry)) {
1220 			ret = PTR_ERR(dentry);
1221 			break;
1222 		}
1223 		nn->bl_device_pipe->dentry = dentry;
1224 		break;
1225 	case RPC_PIPEFS_UMOUNT:
1226 		if (nn->bl_device_pipe->dentry)
1227 			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1228 		break;
1229 	default:
1230 		ret = -ENOTSUPP;
1231 		break;
1232 	}
1233 	module_put(THIS_MODULE);
1234 	return ret;
1235 }
1236 
1237 static struct notifier_block nfs4blocklayout_block = {
1238 	.notifier_call = rpc_pipefs_event,
1239 };
1240 
nfs4blocklayout_register_net(struct net * net,struct rpc_pipe * pipe)1241 static struct dentry *nfs4blocklayout_register_net(struct net *net,
1242 						   struct rpc_pipe *pipe)
1243 {
1244 	struct super_block *pipefs_sb;
1245 	struct dentry *dentry;
1246 
1247 	pipefs_sb = rpc_get_sb_net(net);
1248 	if (!pipefs_sb)
1249 		return NULL;
1250 	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1251 	rpc_put_sb_net(net);
1252 	return dentry;
1253 }
1254 
/*
 * Unregister the blocklayout pipe from the rpc_pipefs superblock of @net.
 * Does nothing when rpc_get_sb_net() yields no superblock.
 */
static void nfs4blocklayout_unregister_net(struct net *net,
					   struct rpc_pipe *pipe)
{
	struct super_block *pipefs_sb = rpc_get_sb_net(net);

	if (!pipefs_sb)
		return;

	nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
	rpc_put_sb_net(net);
}
1266 
nfs4blocklayout_net_init(struct net * net)1267 static int nfs4blocklayout_net_init(struct net *net)
1268 {
1269 	struct nfs_net *nn = net_generic(net, nfs_net_id);
1270 	struct dentry *dentry;
1271 
1272 	init_waitqueue_head(&nn->bl_wq);
1273 	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1274 	if (IS_ERR(nn->bl_device_pipe))
1275 		return PTR_ERR(nn->bl_device_pipe);
1276 	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1277 	if (IS_ERR(dentry)) {
1278 		rpc_destroy_pipe_data(nn->bl_device_pipe);
1279 		return PTR_ERR(dentry);
1280 	}
1281 	nn->bl_device_pipe->dentry = dentry;
1282 	return 0;
1283 }
1284 
nfs4blocklayout_net_exit(struct net * net)1285 static void nfs4blocklayout_net_exit(struct net *net)
1286 {
1287 	struct nfs_net *nn = net_generic(net, nfs_net_id);
1288 
1289 	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1290 	rpc_destroy_pipe_data(nn->bl_device_pipe);
1291 	nn->bl_device_pipe = NULL;
1292 }
1293 
1294 static struct pernet_operations nfs4blocklayout_net_ops = {
1295 	.init = nfs4blocklayout_net_init,
1296 	.exit = nfs4blocklayout_net_exit,
1297 };
1298 
nfs4blocklayout_init(void)1299 static int __init nfs4blocklayout_init(void)
1300 {
1301 	int ret;
1302 
1303 	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
1304 
1305 	ret = pnfs_register_layoutdriver(&blocklayout_type);
1306 	if (ret)
1307 		goto out;
1308 
1309 	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1310 	if (ret)
1311 		goto out_remove;
1312 	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
1313 	if (ret)
1314 		goto out_notifier;
1315 out:
1316 	return ret;
1317 
1318 out_notifier:
1319 	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1320 out_remove:
1321 	pnfs_unregister_layoutdriver(&blocklayout_type);
1322 	return ret;
1323 }
1324 
nfs4blocklayout_exit(void)1325 static void __exit nfs4blocklayout_exit(void)
1326 {
1327 	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1328 	       __func__);
1329 
1330 	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1331 	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1332 	pnfs_unregister_layoutdriver(&blocklayout_type);
1333 }
1334 
1335 MODULE_ALIAS("nfs-layouttype4-3");
1336 
1337 module_init(nfs4blocklayout_init);
1338 module_exit(nfs4blocklayout_exit);
1339