• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_shared.h"
20 #include "xfs_format.h"
21 #include "xfs_log_format.h"
22 #include "xfs_trans_resv.h"
23 #include "xfs_mount.h"
24 #include "xfs_inode.h"
25 #include "xfs_trans.h"
26 #include "xfs_inode_item.h"
27 #include "xfs_alloc.h"
28 #include "xfs_error.h"
29 #include "xfs_iomap.h"
30 #include "xfs_trace.h"
31 #include "xfs_bmap.h"
32 #include "xfs_bmap_util.h"
33 #include "xfs_bmap_btree.h"
34 #include <linux/gfp.h>
35 #include <linux/mpage.h>
36 #include <linux/pagevec.h>
37 #include <linux/writeback.h>
38 
39 void
xfs_count_page_state(struct page * page,int * delalloc,int * unwritten)40 xfs_count_page_state(
41 	struct page		*page,
42 	int			*delalloc,
43 	int			*unwritten)
44 {
45 	struct buffer_head	*bh, *head;
46 
47 	*delalloc = *unwritten = 0;
48 
49 	bh = head = page_buffers(page);
50 	do {
51 		if (buffer_unwritten(bh))
52 			(*unwritten) = 1;
53 		else if (buffer_delay(bh))
54 			(*delalloc) = 1;
55 	} while ((bh = bh->b_this_page) != head);
56 }
57 
58 STATIC struct block_device *
xfs_find_bdev_for_inode(struct inode * inode)59 xfs_find_bdev_for_inode(
60 	struct inode		*inode)
61 {
62 	struct xfs_inode	*ip = XFS_I(inode);
63 	struct xfs_mount	*mp = ip->i_mount;
64 
65 	if (XFS_IS_REALTIME_INODE(ip))
66 		return mp->m_rtdev_targp->bt_bdev;
67 	else
68 		return mp->m_ddev_targp->bt_bdev;
69 }
70 
71 /*
72  * We're now finished for good with this ioend structure.
73  * Update the page state via the associated buffer_heads,
74  * release holds on the inode and bio, and finally free
75  * up memory.  Do not use the ioend after this.
76  */
77 STATIC void
xfs_destroy_ioend(xfs_ioend_t * ioend)78 xfs_destroy_ioend(
79 	xfs_ioend_t		*ioend)
80 {
81 	struct buffer_head	*bh, *next;
82 
83 	for (bh = ioend->io_buffer_head; bh; bh = next) {
84 		next = bh->b_private;
85 		bh->b_end_io(bh, !ioend->io_error);
86 	}
87 
88 	mempool_free(ioend, xfs_ioend_pool);
89 }
90 
91 /*
92  * Fast and loose check if this write could update the on-disk inode size.
93  */
xfs_ioend_is_append(struct xfs_ioend * ioend)94 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
95 {
96 	return ioend->io_offset + ioend->io_size >
97 		XFS_I(ioend->io_inode)->i_d.di_size;
98 }
99 
100 STATIC int
xfs_setfilesize_trans_alloc(struct xfs_ioend * ioend)101 xfs_setfilesize_trans_alloc(
102 	struct xfs_ioend	*ioend)
103 {
104 	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
105 	struct xfs_trans	*tp;
106 	int			error;
107 
108 	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
109 
110 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
111 	if (error) {
112 		xfs_trans_cancel(tp);
113 		return error;
114 	}
115 
116 	ioend->io_append_trans = tp;
117 
118 	/*
119 	 * We may pass freeze protection with a transaction.  So tell lockdep
120 	 * we released it.
121 	 */
122 	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
123 	/*
124 	 * We hand off the transaction to the completion thread now, so
125 	 * clear the flag here.
126 	 */
127 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
128 	return 0;
129 }
130 
131 /*
132  * Update on-disk file size now that data has been written to disk.
133  */
134 STATIC int
xfs_setfilesize(struct xfs_inode * ip,struct xfs_trans * tp,xfs_off_t offset,size_t size)135 xfs_setfilesize(
136 	struct xfs_inode	*ip,
137 	struct xfs_trans	*tp,
138 	xfs_off_t		offset,
139 	size_t			size)
140 {
141 	xfs_fsize_t		isize;
142 
143 	xfs_ilock(ip, XFS_ILOCK_EXCL);
144 	isize = xfs_new_eof(ip, offset + size);
145 	if (!isize) {
146 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
147 		xfs_trans_cancel(tp);
148 		return 0;
149 	}
150 
151 	trace_xfs_setfilesize(ip, offset, size);
152 
153 	ip->i_d.di_size = isize;
154 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
155 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
156 
157 	return xfs_trans_commit(tp);
158 }
159 
160 STATIC int
xfs_setfilesize_ioend(struct xfs_ioend * ioend)161 xfs_setfilesize_ioend(
162 	struct xfs_ioend	*ioend)
163 {
164 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
165 	struct xfs_trans	*tp = ioend->io_append_trans;
166 
167 	/*
168 	 * The transaction may have been allocated in the I/O submission thread,
169 	 * thus we need to mark ourselves as being in a transaction manually.
170 	 * Similarly for freeze protection.
171 	 */
172 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
173 	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
174 
175 	/* we abort the update if there was an IO error */
176 	if (ioend->io_error) {
177 		xfs_trans_cancel(tp);
178 		return ioend->io_error;
179 	}
180 
181 	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
182 }
183 
184 /*
185  * Schedule IO completion handling on the final put of an ioend.
186  *
187  * If there is no work to do we might as well call it a day and free the
188  * ioend right now.
189  */
190 STATIC void
xfs_finish_ioend(struct xfs_ioend * ioend)191 xfs_finish_ioend(
192 	struct xfs_ioend	*ioend)
193 {
194 	if (atomic_dec_and_test(&ioend->io_remaining)) {
195 		struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
196 
197 		if (ioend->io_type == XFS_IO_UNWRITTEN)
198 			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
199 		else if (ioend->io_append_trans)
200 			queue_work(mp->m_data_workqueue, &ioend->io_work);
201 		else
202 			xfs_destroy_ioend(ioend);
203 	}
204 }
205 
206 /*
207  * IO write completion.
208  */
209 STATIC void
xfs_end_io(struct work_struct * work)210 xfs_end_io(
211 	struct work_struct *work)
212 {
213 	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
214 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
215 	int		error = 0;
216 
217 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
218 		ioend->io_error = -EIO;
219 		goto done;
220 	}
221 
222 	/*
223 	 * For unwritten extents we need to issue transactions to convert a
224 	 * range to normal written extens after the data I/O has finished.
225 	 * Detecting and handling completion IO errors is done individually
226 	 * for each case as different cleanup operations need to be performed
227 	 * on error.
228 	 */
229 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
230 		if (ioend->io_error)
231 			goto done;
232 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
233 						  ioend->io_size);
234 	} else if (ioend->io_append_trans) {
235 		error = xfs_setfilesize_ioend(ioend);
236 	} else {
237 		ASSERT(!xfs_ioend_is_append(ioend));
238 	}
239 
240 done:
241 	if (error)
242 		ioend->io_error = error;
243 	xfs_destroy_ioend(ioend);
244 }
245 
246 /*
247  * Allocate and initialise an IO completion structure.
248  * We need to track unwritten extent write completion here initially.
249  * We'll need to extend this for updating the ondisk inode size later
250  * (vs. incore size).
251  */
252 STATIC xfs_ioend_t *
xfs_alloc_ioend(struct inode * inode,unsigned int type)253 xfs_alloc_ioend(
254 	struct inode		*inode,
255 	unsigned int		type)
256 {
257 	xfs_ioend_t		*ioend;
258 
259 	ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
260 
261 	/*
262 	 * Set the count to 1 initially, which will prevent an I/O
263 	 * completion callback from happening before we have started
264 	 * all the I/O from calling the completion routine too early.
265 	 */
266 	atomic_set(&ioend->io_remaining, 1);
267 	ioend->io_error = 0;
268 	ioend->io_list = NULL;
269 	ioend->io_type = type;
270 	ioend->io_inode = inode;
271 	ioend->io_buffer_head = NULL;
272 	ioend->io_buffer_tail = NULL;
273 	ioend->io_offset = 0;
274 	ioend->io_size = 0;
275 	ioend->io_append_trans = NULL;
276 
277 	INIT_WORK(&ioend->io_work, xfs_end_io);
278 	return ioend;
279 }
280 
281 STATIC int
xfs_map_blocks(struct inode * inode,loff_t offset,struct xfs_bmbt_irec * imap,int type,int nonblocking)282 xfs_map_blocks(
283 	struct inode		*inode,
284 	loff_t			offset,
285 	struct xfs_bmbt_irec	*imap,
286 	int			type,
287 	int			nonblocking)
288 {
289 	struct xfs_inode	*ip = XFS_I(inode);
290 	struct xfs_mount	*mp = ip->i_mount;
291 	ssize_t			count = i_blocksize(inode);
292 	xfs_fileoff_t		offset_fsb, end_fsb;
293 	int			error = 0;
294 	int			bmapi_flags = XFS_BMAPI_ENTIRE;
295 	int			nimaps = 1;
296 
297 	if (XFS_FORCED_SHUTDOWN(mp))
298 		return -EIO;
299 
300 	if (type == XFS_IO_UNWRITTEN)
301 		bmapi_flags |= XFS_BMAPI_IGSTATE;
302 
303 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
304 		if (nonblocking)
305 			return -EAGAIN;
306 		xfs_ilock(ip, XFS_ILOCK_SHARED);
307 	}
308 
309 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
310 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
311 	ASSERT(offset <= mp->m_super->s_maxbytes);
312 
313 	if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes)
314 		count = mp->m_super->s_maxbytes - offset;
315 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
316 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
317 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
318 				imap, &nimaps, bmapi_flags);
319 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
320 
321 	if (error)
322 		return error;
323 
324 	if (type == XFS_IO_DELALLOC &&
325 	    (!nimaps || isnullstartblock(imap->br_startblock))) {
326 		error = xfs_iomap_write_allocate(ip, offset, imap);
327 		if (!error)
328 			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
329 		return error;
330 	}
331 
332 #ifdef DEBUG
333 	if (type == XFS_IO_UNWRITTEN) {
334 		ASSERT(nimaps);
335 		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
336 		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
337 	}
338 #endif
339 	if (nimaps)
340 		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
341 	return 0;
342 }
343 
344 STATIC int
xfs_imap_valid(struct inode * inode,struct xfs_bmbt_irec * imap,xfs_off_t offset)345 xfs_imap_valid(
346 	struct inode		*inode,
347 	struct xfs_bmbt_irec	*imap,
348 	xfs_off_t		offset)
349 {
350 	offset >>= inode->i_blkbits;
351 
352 	return offset >= imap->br_startoff &&
353 		offset < imap->br_startoff + imap->br_blockcount;
354 }
355 
356 /*
357  * BIO completion handler for buffered IO.
358  */
359 STATIC void
xfs_end_bio(struct bio * bio)360 xfs_end_bio(
361 	struct bio		*bio)
362 {
363 	xfs_ioend_t		*ioend = bio->bi_private;
364 
365 	if (!ioend->io_error)
366 		ioend->io_error = bio->bi_error;
367 
368 	/* Toss bio and pass work off to an xfsdatad thread */
369 	bio->bi_private = NULL;
370 	bio->bi_end_io = NULL;
371 	bio_put(bio);
372 
373 	xfs_finish_ioend(ioend);
374 }
375 
376 STATIC void
xfs_submit_ioend_bio(struct writeback_control * wbc,xfs_ioend_t * ioend,struct bio * bio)377 xfs_submit_ioend_bio(
378 	struct writeback_control *wbc,
379 	xfs_ioend_t		*ioend,
380 	struct bio		*bio)
381 {
382 	atomic_inc(&ioend->io_remaining);
383 	bio->bi_private = ioend;
384 	bio->bi_end_io = xfs_end_bio;
385 	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
386 }
387 
388 STATIC struct bio *
xfs_alloc_ioend_bio(struct buffer_head * bh)389 xfs_alloc_ioend_bio(
390 	struct buffer_head	*bh)
391 {
392 	struct bio		*bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
393 
394 	ASSERT(bio->bi_private == NULL);
395 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
396 	bio->bi_bdev = bh->b_bdev;
397 	return bio;
398 }
399 
400 STATIC void
xfs_start_buffer_writeback(struct buffer_head * bh)401 xfs_start_buffer_writeback(
402 	struct buffer_head	*bh)
403 {
404 	ASSERT(buffer_mapped(bh));
405 	ASSERT(buffer_locked(bh));
406 	ASSERT(!buffer_delay(bh));
407 	ASSERT(!buffer_unwritten(bh));
408 
409 	mark_buffer_async_write(bh);
410 	set_buffer_uptodate(bh);
411 	clear_buffer_dirty(bh);
412 }
413 
414 STATIC void
xfs_start_page_writeback(struct page * page,int clear_dirty,int buffers)415 xfs_start_page_writeback(
416 	struct page		*page,
417 	int			clear_dirty,
418 	int			buffers)
419 {
420 	ASSERT(PageLocked(page));
421 	ASSERT(!PageWriteback(page));
422 
423 	/*
424 	 * if the page was not fully cleaned, we need to ensure that the higher
425 	 * layers come back to it correctly. That means we need to keep the page
426 	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
427 	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
428 	 * write this page in this writeback sweep will be made.
429 	 */
430 	if (clear_dirty) {
431 		clear_page_dirty_for_io(page);
432 		set_page_writeback(page);
433 	} else
434 		set_page_writeback_keepwrite(page);
435 
436 	unlock_page(page);
437 
438 	/* If no buffers on the page are to be written, finish it here */
439 	if (!buffers)
440 		end_page_writeback(page);
441 }
442 
xfs_bio_add_buffer(struct bio * bio,struct buffer_head * bh)443 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
444 {
445 	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
446 }
447 
448 /*
449  * Submit all of the bios for all of the ioends we have saved up, covering the
450  * initial writepage page and also any probed pages.
451  *
452  * Because we may have multiple ioends spanning a page, we need to start
453  * writeback on all the buffers before we submit them for I/O. If we mark the
454  * buffers as we got, then we can end up with a page that only has buffers
455  * marked async write and I/O complete on can occur before we mark the other
456  * buffers async write.
457  *
458  * The end result of this is that we trip a bug in end_page_writeback() because
459  * we call it twice for the one page as the code in end_buffer_async_write()
460  * assumes that all buffers on the page are started at the same time.
461  *
462  * The fix is two passes across the ioend list - one to start writeback on the
463  * buffer_heads, and then submit them for I/O on the second pass.
464  *
465  * If @fail is non-zero, it means that we have a situation where some part of
466  * the submission process has failed after we have marked paged for writeback
467  * and unlocked them. In this situation, we need to fail the ioend chain rather
468  * than submit it to IO. This typically only happens on a filesystem shutdown.
469  */
470 STATIC void
xfs_submit_ioend(struct writeback_control * wbc,xfs_ioend_t * ioend,int fail)471 xfs_submit_ioend(
472 	struct writeback_control *wbc,
473 	xfs_ioend_t		*ioend,
474 	int			fail)
475 {
476 	xfs_ioend_t		*head = ioend;
477 	xfs_ioend_t		*next;
478 	struct buffer_head	*bh;
479 	struct bio		*bio;
480 	sector_t		lastblock = 0;
481 
482 	/* Pass 1 - start writeback */
483 	do {
484 		next = ioend->io_list;
485 		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
486 			xfs_start_buffer_writeback(bh);
487 	} while ((ioend = next) != NULL);
488 
489 	/* Pass 2 - submit I/O */
490 	ioend = head;
491 	do {
492 		next = ioend->io_list;
493 		bio = NULL;
494 
495 		/*
496 		 * If we are failing the IO now, just mark the ioend with an
497 		 * error and finish it. This will run IO completion immediately
498 		 * as there is only one reference to the ioend at this point in
499 		 * time.
500 		 */
501 		if (fail) {
502 			ioend->io_error = fail;
503 			xfs_finish_ioend(ioend);
504 			continue;
505 		}
506 
507 		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
508 
509 			if (!bio) {
510  retry:
511 				bio = xfs_alloc_ioend_bio(bh);
512 			} else if (bh->b_blocknr != lastblock + 1) {
513 				xfs_submit_ioend_bio(wbc, ioend, bio);
514 				goto retry;
515 			}
516 
517 			if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
518 				xfs_submit_ioend_bio(wbc, ioend, bio);
519 				goto retry;
520 			}
521 
522 			lastblock = bh->b_blocknr;
523 		}
524 		if (bio)
525 			xfs_submit_ioend_bio(wbc, ioend, bio);
526 		xfs_finish_ioend(ioend);
527 	} while ((ioend = next) != NULL);
528 }
529 
530 /*
531  * Cancel submission of all buffer_heads so far in this endio.
532  * Toss the endio too.  Only ever called for the initial page
533  * in a writepage request, so only ever one page.
534  */
535 STATIC void
xfs_cancel_ioend(xfs_ioend_t * ioend)536 xfs_cancel_ioend(
537 	xfs_ioend_t		*ioend)
538 {
539 	xfs_ioend_t		*next;
540 	struct buffer_head	*bh, *next_bh;
541 
542 	do {
543 		next = ioend->io_list;
544 		bh = ioend->io_buffer_head;
545 		do {
546 			next_bh = bh->b_private;
547 			clear_buffer_async_write(bh);
548 			/*
549 			 * The unwritten flag is cleared when added to the
550 			 * ioend. We're not submitting for I/O so mark the
551 			 * buffer unwritten again for next time around.
552 			 */
553 			if (ioend->io_type == XFS_IO_UNWRITTEN)
554 				set_buffer_unwritten(bh);
555 			unlock_buffer(bh);
556 		} while ((bh = next_bh) != NULL);
557 
558 		mempool_free(ioend, xfs_ioend_pool);
559 	} while ((ioend = next) != NULL);
560 }
561 
562 /*
563  * Test to see if we've been building up a completion structure for
564  * earlier buffers -- if so, we try to append to this ioend if we
565  * can, otherwise we finish off any current ioend and start another.
566  * Return true if we've finished the given ioend.
567  */
568 STATIC void
xfs_add_to_ioend(struct inode * inode,struct buffer_head * bh,xfs_off_t offset,unsigned int type,xfs_ioend_t ** result,int need_ioend)569 xfs_add_to_ioend(
570 	struct inode		*inode,
571 	struct buffer_head	*bh,
572 	xfs_off_t		offset,
573 	unsigned int		type,
574 	xfs_ioend_t		**result,
575 	int			need_ioend)
576 {
577 	xfs_ioend_t		*ioend = *result;
578 
579 	if (!ioend || need_ioend || type != ioend->io_type) {
580 		xfs_ioend_t	*previous = *result;
581 
582 		ioend = xfs_alloc_ioend(inode, type);
583 		ioend->io_offset = offset;
584 		ioend->io_buffer_head = bh;
585 		ioend->io_buffer_tail = bh;
586 		if (previous)
587 			previous->io_list = ioend;
588 		*result = ioend;
589 	} else {
590 		ioend->io_buffer_tail->b_private = bh;
591 		ioend->io_buffer_tail = bh;
592 	}
593 
594 	bh->b_private = NULL;
595 	ioend->io_size += bh->b_size;
596 }
597 
598 STATIC void
xfs_map_buffer(struct inode * inode,struct buffer_head * bh,struct xfs_bmbt_irec * imap,xfs_off_t offset)599 xfs_map_buffer(
600 	struct inode		*inode,
601 	struct buffer_head	*bh,
602 	struct xfs_bmbt_irec	*imap,
603 	xfs_off_t		offset)
604 {
605 	sector_t		bn;
606 	struct xfs_mount	*m = XFS_I(inode)->i_mount;
607 	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
608 	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
609 
610 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
611 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
612 
613 	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
614 	      ((offset - iomap_offset) >> inode->i_blkbits);
615 
616 	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
617 
618 	bh->b_blocknr = bn;
619 	set_buffer_mapped(bh);
620 }
621 
622 STATIC void
xfs_map_at_offset(struct inode * inode,struct buffer_head * bh,struct xfs_bmbt_irec * imap,xfs_off_t offset)623 xfs_map_at_offset(
624 	struct inode		*inode,
625 	struct buffer_head	*bh,
626 	struct xfs_bmbt_irec	*imap,
627 	xfs_off_t		offset)
628 {
629 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
630 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
631 
632 	xfs_map_buffer(inode, bh, imap, offset);
633 	set_buffer_mapped(bh);
634 	clear_buffer_delay(bh);
635 	clear_buffer_unwritten(bh);
636 }
637 
638 /*
639  * Test if a given page contains at least one buffer of a given @type.
640  * If @check_all_buffers is true, then we walk all the buffers in the page to
641  * try to find one of the type passed in. If it is not set, then the caller only
642  * needs to check the first buffer on the page for a match.
643  */
644 STATIC bool
xfs_check_page_type(struct page * page,unsigned int type,bool check_all_buffers)645 xfs_check_page_type(
646 	struct page		*page,
647 	unsigned int		type,
648 	bool			check_all_buffers)
649 {
650 	struct buffer_head	*bh;
651 	struct buffer_head	*head;
652 
653 	if (PageWriteback(page))
654 		return false;
655 	if (!page->mapping)
656 		return false;
657 	if (!page_has_buffers(page))
658 		return false;
659 
660 	bh = head = page_buffers(page);
661 	do {
662 		if (buffer_unwritten(bh)) {
663 			if (type == XFS_IO_UNWRITTEN)
664 				return true;
665 		} else if (buffer_delay(bh)) {
666 			if (type == XFS_IO_DELALLOC)
667 				return true;
668 		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
669 			if (type == XFS_IO_OVERWRITE)
670 				return true;
671 		}
672 
673 		/* If we are only checking the first buffer, we are done now. */
674 		if (!check_all_buffers)
675 			break;
676 	} while ((bh = bh->b_this_page) != head);
677 
678 	return false;
679 }
680 
681 /*
682  * Allocate & map buffers for page given the extent map. Write it out.
683  * except for the original page of a writepage, this is called on
684  * delalloc/unwritten pages only, for the original page it is possible
685  * that the page has no mapping at all.
686  */
687 STATIC int
xfs_convert_page(struct inode * inode,struct page * page,loff_t tindex,struct xfs_bmbt_irec * imap,xfs_ioend_t ** ioendp,struct writeback_control * wbc)688 xfs_convert_page(
689 	struct inode		*inode,
690 	struct page		*page,
691 	loff_t			tindex,
692 	struct xfs_bmbt_irec	*imap,
693 	xfs_ioend_t		**ioendp,
694 	struct writeback_control *wbc)
695 {
696 	struct buffer_head	*bh, *head;
697 	xfs_off_t		end_offset;
698 	unsigned long		p_offset;
699 	unsigned int		type;
700 	int			len, page_dirty;
701 	int			count = 0, done = 0, uptodate = 1;
702  	xfs_off_t		offset = page_offset(page);
703 
704 	if (page->index != tindex)
705 		goto fail;
706 	if (!trylock_page(page))
707 		goto fail;
708 	if (PageWriteback(page))
709 		goto fail_unlock_page;
710 	if (page->mapping != inode->i_mapping)
711 		goto fail_unlock_page;
712 	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
713 		goto fail_unlock_page;
714 
715 	/*
716 	 * page_dirty is initially a count of buffers on the page before
717 	 * EOF and is decremented as we move each into a cleanable state.
718 	 *
719 	 * Derivation:
720 	 *
721 	 * End offset is the highest offset that this page should represent.
722 	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
723 	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
724 	 * hence give us the correct page_dirty count. On any other page,
725 	 * it will be zero and in that case we need page_dirty to be the
726 	 * count of buffers on the page.
727 	 */
728 	end_offset = min_t(unsigned long long,
729 			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
730 			i_size_read(inode));
731 
732 	/*
733 	 * If the current map does not span the entire page we are about to try
734 	 * to write, then give up. The only way we can write a page that spans
735 	 * multiple mappings in a single writeback iteration is via the
736 	 * xfs_vm_writepage() function. Data integrity writeback requires the
737 	 * entire page to be written in a single attempt, otherwise the part of
738 	 * the page we don't write here doesn't get written as part of the data
739 	 * integrity sync.
740 	 *
741 	 * For normal writeback, we also don't attempt to write partial pages
742 	 * here as it simply means that write_cache_pages() will see it under
743 	 * writeback and ignore the page until some point in the future, at
744 	 * which time this will be the only page in the file that needs
745 	 * writeback.  Hence for more optimal IO patterns, we should always
746 	 * avoid partial page writeback due to multiple mappings on a page here.
747 	 */
748 	if (!xfs_imap_valid(inode, imap, end_offset))
749 		goto fail_unlock_page;
750 
751 	len = 1 << inode->i_blkbits;
752 	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
753 					PAGE_CACHE_SIZE);
754 	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
755 	page_dirty = p_offset / len;
756 
757 	/*
758 	 * The moment we find a buffer that doesn't match our current type
759 	 * specification or can't be written, abort the loop and start
760 	 * writeback. As per the above xfs_imap_valid() check, only
761 	 * xfs_vm_writepage() can handle partial page writeback fully - we are
762 	 * limited here to the buffers that are contiguous with the current
763 	 * ioend, and hence a buffer we can't write breaks that contiguity and
764 	 * we have to defer the rest of the IO to xfs_vm_writepage().
765 	 */
766 	bh = head = page_buffers(page);
767 	do {
768 		if (offset >= end_offset)
769 			break;
770 		if (!buffer_uptodate(bh))
771 			uptodate = 0;
772 		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
773 			done = 1;
774 			break;
775 		}
776 
777 		if (buffer_unwritten(bh) || buffer_delay(bh) ||
778 		    buffer_mapped(bh)) {
779 			if (buffer_unwritten(bh))
780 				type = XFS_IO_UNWRITTEN;
781 			else if (buffer_delay(bh))
782 				type = XFS_IO_DELALLOC;
783 			else
784 				type = XFS_IO_OVERWRITE;
785 
786 			/*
787 			 * imap should always be valid because of the above
788 			 * partial page end_offset check on the imap.
789 			 */
790 			ASSERT(xfs_imap_valid(inode, imap, offset));
791 
792 			lock_buffer(bh);
793 			if (type != XFS_IO_OVERWRITE)
794 				xfs_map_at_offset(inode, bh, imap, offset);
795 			xfs_add_to_ioend(inode, bh, offset, type,
796 					 ioendp, done);
797 
798 			page_dirty--;
799 			count++;
800 		} else {
801 			done = 1;
802 			break;
803 		}
804 	} while (offset += len, (bh = bh->b_this_page) != head);
805 
806 	if (uptodate && bh == head)
807 		SetPageUptodate(page);
808 
809 	if (count) {
810 		if (--wbc->nr_to_write <= 0 &&
811 		    wbc->sync_mode == WB_SYNC_NONE)
812 			done = 1;
813 	}
814 	xfs_start_page_writeback(page, !page_dirty, count);
815 
816 	return done;
817  fail_unlock_page:
818 	unlock_page(page);
819  fail:
820 	return 1;
821 }
822 
823 /*
824  * Convert & write out a cluster of pages in the same extent as defined
825  * by mp and following the start page.
826  */
827 STATIC void
xfs_cluster_write(struct inode * inode,pgoff_t tindex,struct xfs_bmbt_irec * imap,xfs_ioend_t ** ioendp,struct writeback_control * wbc,pgoff_t tlast)828 xfs_cluster_write(
829 	struct inode		*inode,
830 	pgoff_t			tindex,
831 	struct xfs_bmbt_irec	*imap,
832 	xfs_ioend_t		**ioendp,
833 	struct writeback_control *wbc,
834 	pgoff_t			tlast)
835 {
836 	struct pagevec		pvec;
837 	int			done = 0, i;
838 
839 	pagevec_init(&pvec, 0);
840 	while (!done && tindex <= tlast) {
841 		unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
842 
843 		if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
844 			break;
845 
846 		for (i = 0; i < pagevec_count(&pvec); i++) {
847 			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
848 					imap, ioendp, wbc);
849 			if (done)
850 				break;
851 		}
852 
853 		pagevec_release(&pvec);
854 		cond_resched();
855 	}
856 }
857 
858 STATIC void
xfs_vm_invalidatepage(struct page * page,unsigned int offset,unsigned int length)859 xfs_vm_invalidatepage(
860 	struct page		*page,
861 	unsigned int		offset,
862 	unsigned int		length)
863 {
864 	trace_xfs_invalidatepage(page->mapping->host, page, offset,
865 				 length);
866 	block_invalidatepage(page, offset, length);
867 }
868 
869 /*
870  * If the page has delalloc buffers on it, we need to punch them out before we
871  * invalidate the page. If we don't, we leave a stale delalloc mapping on the
872  * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
873  * is done on that same region - the delalloc extent is returned when none is
874  * supposed to be there.
875  *
876  * We prevent this by truncating away the delalloc regions on the page before
877  * invalidating it. Because they are delalloc, we can do this without needing a
878  * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
879  * truncation without a transaction as there is no space left for block
880  * reservation (typically why we see a ENOSPC in writeback).
881  *
882  * This is not a performance critical path, so for now just do the punching a
883  * buffer head at a time.
884  */
885 STATIC void
xfs_aops_discard_page(struct page * page)886 xfs_aops_discard_page(
887 	struct page		*page)
888 {
889 	struct inode		*inode = page->mapping->host;
890 	struct xfs_inode	*ip = XFS_I(inode);
891 	struct buffer_head	*bh, *head;
892 	loff_t			offset = page_offset(page);
893 
894 	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
895 		goto out_invalidate;
896 
897 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
898 		goto out_invalidate;
899 
900 	xfs_alert(ip->i_mount,
901 		"page discard on page %p, inode 0x%llx, offset %llu.",
902 			page, ip->i_ino, offset);
903 
904 	xfs_ilock(ip, XFS_ILOCK_EXCL);
905 	bh = head = page_buffers(page);
906 	do {
907 		int		error;
908 		xfs_fileoff_t	start_fsb;
909 
910 		if (!buffer_delay(bh))
911 			goto next_buffer;
912 
913 		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
914 		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
915 		if (error) {
916 			/* something screwed, just bail */
917 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
918 				xfs_alert(ip->i_mount,
919 			"page discard unable to remove delalloc mapping.");
920 			}
921 			break;
922 		}
923 next_buffer:
924 		offset += i_blocksize(inode);
925 
926 	} while ((bh = bh->b_this_page) != head);
927 
928 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
929 out_invalidate:
930 	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
931 	return;
932 }
933 
934 /*
935  * Write out a dirty page.
936  *
937  * For delalloc space on the page we need to allocate space and flush it.
938  * For unwritten space on the page we need to start the conversion to
939  * regular allocated space.
940  * For any other dirty buffer heads on the page we should flush them.
941  */
942 STATIC int
xfs_vm_writepage(struct page * page,struct writeback_control * wbc)943 xfs_vm_writepage(
944 	struct page		*page,
945 	struct writeback_control *wbc)
946 {
947 	struct inode		*inode = page->mapping->host;
948 	struct buffer_head	*bh, *head;
949 	struct xfs_bmbt_irec	imap;
950 	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
951 	loff_t			offset;
952 	unsigned int		type;
953 	__uint64_t              end_offset;
954 	pgoff_t                 end_index, last_index;
955 	ssize_t			len;
956 	int			err, imap_valid = 0, uptodate = 1;
957 	int			count = 0;
958 	int			nonblocking = 0;
959 
960 	trace_xfs_writepage(inode, page, 0, 0);
961 
962 	ASSERT(page_has_buffers(page));
963 
964 	/*
965 	 * Refuse to write the page out if we are called from reclaim context.
966 	 *
967 	 * This avoids stack overflows when called from deeply used stacks in
968 	 * random callers for direct reclaim or memcg reclaim.  We explicitly
969 	 * allow reclaim from kswapd as the stack usage there is relatively low.
970 	 *
971 	 * This should never happen except in the case of a VM regression so
972 	 * warn about it.
973 	 */
974 	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
975 			PF_MEMALLOC))
976 		goto redirty;
977 
978 	/*
979 	 * Given that we do not allow direct reclaim to call us, we should
980 	 * never be called while in a filesystem transaction.
981 	 */
982 	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
983 		goto redirty;
984 
985 	/* Is this page beyond the end of the file? */
986 	offset = i_size_read(inode);
987 	end_index = offset >> PAGE_CACHE_SHIFT;
988 	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
989 
990 	/*
991 	 * The page index is less than the end_index, adjust the end_offset
992 	 * to the highest offset that this page should represent.
993 	 * -----------------------------------------------------
994 	 * |			file mapping	       | <EOF> |
995 	 * -----------------------------------------------------
996 	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
997 	 * ^--------------------------------^----------|--------
998 	 * |     desired writeback range    |      see else    |
999 	 * ---------------------------------^------------------|
1000 	 */
1001 	if (page->index < end_index)
1002 		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
1003 	else {
1004 		/*
1005 		 * Check whether the page to write out is beyond or straddles
1006 		 * i_size or not.
1007 		 * -------------------------------------------------------
1008 		 * |		file mapping		        | <EOF>  |
1009 		 * -------------------------------------------------------
1010 		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
1011 		 * ^--------------------------------^-----------|---------
1012 		 * |				    |      Straddles     |
1013 		 * ---------------------------------^-----------|--------|
1014 		 */
1015 		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
1016 
1017 		/*
1018 		 * Skip the page if it is fully outside i_size, e.g. due to a
1019 		 * truncate operation that is in progress. We must redirty the
1020 		 * page so that reclaim stops reclaiming it. Otherwise
1021 		 * xfs_vm_releasepage() is called on it and gets confused.
1022 		 *
1023 		 * Note that the end_index is unsigned long, it would overflow
1024 		 * if the given offset is greater than 16TB on 32-bit system
1025 		 * and if we do check the page is fully outside i_size or not
1026 		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
1027 		 * will be evaluated to 0.  Hence this page will be redirtied
1028 		 * and be written out repeatedly which would result in an
1029 		 * infinite loop, the user program that perform this operation
1030 		 * will hang.  Instead, we can verify this situation by checking
1031 		 * if the page to write is totally beyond the i_size or if it's
1032 		 * offset is just equal to the EOF.
1033 		 */
1034 		if (page->index > end_index ||
1035 		    (page->index == end_index && offset_into_page == 0))
1036 			goto redirty;
1037 
1038 		/*
1039 		 * The page straddles i_size.  It must be zeroed out on each
1040 		 * and every writepage invocation because it may be mmapped.
1041 		 * "A file is mapped in multiples of the page size.  For a file
1042 		 * that is not a multiple of the page size, the remaining
1043 		 * memory is zeroed when mapped, and writes to that region are
1044 		 * not written out to the file."
1045 		 */
1046 		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
1047 
1048 		/* Adjust the end_offset to the end of file */
1049 		end_offset = offset;
1050 	}
1051 
1052 	len = 1 << inode->i_blkbits;
1053 
1054 	bh = head = page_buffers(page);
1055 	offset = page_offset(page);
1056 	type = XFS_IO_OVERWRITE;
1057 
1058 	if (wbc->sync_mode == WB_SYNC_NONE)
1059 		nonblocking = 1;
1060 
1061 	do {
1062 		int new_ioend = 0;
1063 
1064 		if (offset >= end_offset)
1065 			break;
1066 		if (!buffer_uptodate(bh))
1067 			uptodate = 0;
1068 
1069 		/*
1070 		 * set_page_dirty dirties all buffers in a page, independent
1071 		 * of their state.  The dirty state however is entirely
1072 		 * meaningless for holes (!mapped && uptodate), so skip
1073 		 * buffers covering holes here.
1074 		 */
1075 		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
1076 			imap_valid = 0;
1077 			continue;
1078 		}
1079 
1080 		if (buffer_unwritten(bh)) {
1081 			if (type != XFS_IO_UNWRITTEN) {
1082 				type = XFS_IO_UNWRITTEN;
1083 				imap_valid = 0;
1084 			}
1085 		} else if (buffer_delay(bh)) {
1086 			if (type != XFS_IO_DELALLOC) {
1087 				type = XFS_IO_DELALLOC;
1088 				imap_valid = 0;
1089 			}
1090 		} else if (buffer_uptodate(bh)) {
1091 			if (type != XFS_IO_OVERWRITE) {
1092 				type = XFS_IO_OVERWRITE;
1093 				imap_valid = 0;
1094 			}
1095 		} else {
1096 			if (PageUptodate(page))
1097 				ASSERT(buffer_mapped(bh));
1098 			/*
1099 			 * This buffer is not uptodate and will not be
1100 			 * written to disk.  Ensure that we will put any
1101 			 * subsequent writeable buffers into a new
1102 			 * ioend.
1103 			 */
1104 			imap_valid = 0;
1105 			continue;
1106 		}
1107 
1108 		if (imap_valid)
1109 			imap_valid = xfs_imap_valid(inode, &imap, offset);
1110 		if (!imap_valid) {
1111 			/*
1112 			 * If we didn't have a valid mapping then we need to
1113 			 * put the new mapping into a separate ioend structure.
1114 			 * This ensures non-contiguous extents always have
1115 			 * separate ioends, which is particularly important
1116 			 * for unwritten extent conversion at I/O completion
1117 			 * time.
1118 			 */
1119 			new_ioend = 1;
1120 			err = xfs_map_blocks(inode, offset, &imap, type,
1121 					     nonblocking);
1122 			if (err)
1123 				goto error;
1124 			imap_valid = xfs_imap_valid(inode, &imap, offset);
1125 		}
1126 		if (imap_valid) {
1127 			lock_buffer(bh);
1128 			if (type != XFS_IO_OVERWRITE)
1129 				xfs_map_at_offset(inode, bh, &imap, offset);
1130 			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
1131 					 new_ioend);
1132 			count++;
1133 		}
1134 
1135 		if (!iohead)
1136 			iohead = ioend;
1137 
1138 	} while (offset += len, ((bh = bh->b_this_page) != head));
1139 
1140 	if (uptodate && bh == head)
1141 		SetPageUptodate(page);
1142 
1143 	xfs_start_page_writeback(page, 1, count);
1144 
1145 	/* if there is no IO to be submitted for this page, we are done */
1146 	if (!ioend)
1147 		return 0;
1148 
1149 	ASSERT(iohead);
1150 
1151 	/*
1152 	 * Any errors from this point onwards need tobe reported through the IO
1153 	 * completion path as we have marked the initial page as under writeback
1154 	 * and unlocked it.
1155 	 */
1156 	if (imap_valid) {
1157 		xfs_off_t		end_index;
1158 
1159 		end_index = imap.br_startoff + imap.br_blockcount;
1160 
1161 		/* to bytes */
1162 		end_index <<= inode->i_blkbits;
1163 
1164 		/* to pages */
1165 		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
1166 
1167 		/* check against file size */
1168 		if (end_index > last_index)
1169 			end_index = last_index;
1170 
1171 		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
1172 				  wbc, end_index);
1173 	}
1174 
1175 
1176 	/*
1177 	 * Reserve log space if we might write beyond the on-disk inode size.
1178 	 */
1179 	err = 0;
1180 	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
1181 		err = xfs_setfilesize_trans_alloc(ioend);
1182 
1183 	xfs_submit_ioend(wbc, iohead, err);
1184 
1185 	return 0;
1186 
1187 error:
1188 	if (iohead)
1189 		xfs_cancel_ioend(iohead);
1190 
1191 	if (err == -EAGAIN)
1192 		goto redirty;
1193 
1194 	xfs_aops_discard_page(page);
1195 	ClearPageUptodate(page);
1196 	unlock_page(page);
1197 	return err;
1198 
1199 redirty:
1200 	redirty_page_for_writepage(wbc, page);
1201 	unlock_page(page);
1202 	return 0;
1203 }
1204 
1205 STATIC int
xfs_vm_writepages(struct address_space * mapping,struct writeback_control * wbc)1206 xfs_vm_writepages(
1207 	struct address_space	*mapping,
1208 	struct writeback_control *wbc)
1209 {
1210 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1211 	return generic_writepages(mapping, wbc);
1212 }
1213 
1214 /*
1215  * Called to move a page into cleanable state - and from there
1216  * to be released. The page should already be clean. We always
1217  * have buffer heads in this call.
1218  *
1219  * Returns 1 if the page is ok to release, 0 otherwise.
1220  */
1221 STATIC int
xfs_vm_releasepage(struct page * page,gfp_t gfp_mask)1222 xfs_vm_releasepage(
1223 	struct page		*page,
1224 	gfp_t			gfp_mask)
1225 {
1226 	int			delalloc, unwritten;
1227 
1228 	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1229 
1230 	xfs_count_page_state(page, &delalloc, &unwritten);
1231 
1232 	if (WARN_ON_ONCE(delalloc))
1233 		return 0;
1234 	if (WARN_ON_ONCE(unwritten))
1235 		return 0;
1236 
1237 	return try_to_free_buffers(page);
1238 }
1239 
1240 /*
1241  * When we map a DIO buffer, we may need to attach an ioend that describes the
1242  * type of write IO we are doing. This passes to the completion function the
1243  * operations it needs to perform. If the mapping is for an overwrite wholly
1244  * within the EOF then we don't need an ioend and so we don't allocate one.
1245  * This avoids the unnecessary overhead of allocating and freeing ioends for
1246  * workloads that don't require transactions on IO completion.
1247  *
1248  * If we get multiple mappings in a single IO, we might be mapping different
1249  * types. But because the direct IO can only have a single private pointer, we
1250  * need to ensure that:
1251  *
1252  * a) i) the ioend spans the entire region of unwritten mappings; or
1253  *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
1254  * b) if it contains unwritten extents, it is *permanently* marked as such
1255  *
1256  * We could do this by chaining ioends like buffered IO does, but we only
1257  * actually get one IO completion callback from the direct IO, and that spans
1258  * the entire IO regardless of how many mappings and IOs are needed to complete
1259  * the DIO. There is only going to be one reference to the ioend and its life
1260  * cycle is constrained by the DIO completion code. hence we don't need
1261  * reference counting here.
1262  *
1263  * Note that for DIO, an IO to the highest supported file block offset (i.e.
1264  * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
1265  * bit variable. Hence if we see this overflow, we have to assume that the IO is
1266  * extending the file size. We won't know for sure until IO completion is run
1267  * and the actual max write offset is communicated to the IO completion
1268  * routine.
1269  *
1270  * For DAX page faults, we are preparing to never see unwritten extents here,
1271  * nor should we ever extend the inode size. Hence we will soon have nothing to
1272  * do here for this case, ensuring we don't have to provide an IO completion
1273  * callback to free an ioend that we don't actually need for a fault into the
1274  * page at offset (2^63 - 1FSB) bytes.
1275  */
1276 
1277 static void
xfs_map_direct(struct inode * inode,struct buffer_head * bh_result,struct xfs_bmbt_irec * imap,xfs_off_t offset,bool dax_fault)1278 xfs_map_direct(
1279 	struct inode		*inode,
1280 	struct buffer_head	*bh_result,
1281 	struct xfs_bmbt_irec	*imap,
1282 	xfs_off_t		offset,
1283 	bool			dax_fault)
1284 {
1285 	struct xfs_ioend	*ioend;
1286 	xfs_off_t		size = bh_result->b_size;
1287 	int			type;
1288 
1289 	if (ISUNWRITTEN(imap))
1290 		type = XFS_IO_UNWRITTEN;
1291 	else
1292 		type = XFS_IO_OVERWRITE;
1293 
1294 	trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
1295 
1296 	if (dax_fault) {
1297 		ASSERT(type == XFS_IO_OVERWRITE);
1298 		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1299 					    imap);
1300 		return;
1301 	}
1302 
1303 	if (bh_result->b_private) {
1304 		ioend = bh_result->b_private;
1305 		ASSERT(ioend->io_size > 0);
1306 		ASSERT(offset >= ioend->io_offset);
1307 		if (offset + size > ioend->io_offset + ioend->io_size)
1308 			ioend->io_size = offset - ioend->io_offset + size;
1309 
1310 		if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1311 			ioend->io_type = XFS_IO_UNWRITTEN;
1312 
1313 		trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1314 					      ioend->io_size, ioend->io_type,
1315 					      imap);
1316 	} else if (type == XFS_IO_UNWRITTEN ||
1317 		   offset + size > i_size_read(inode) ||
1318 		   offset + size < 0) {
1319 		ioend = xfs_alloc_ioend(inode, type);
1320 		ioend->io_offset = offset;
1321 		ioend->io_size = size;
1322 
1323 		bh_result->b_private = ioend;
1324 		set_buffer_defer_completion(bh_result);
1325 
1326 		trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1327 					   imap);
1328 	} else {
1329 		trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1330 					    imap);
1331 	}
1332 }
1333 
1334 /*
1335  * If this is O_DIRECT or the mpage code calling tell them how large the mapping
1336  * is, so that we can avoid repeated get_blocks calls.
1337  *
1338  * If the mapping spans EOF, then we have to break the mapping up as the mapping
1339  * for blocks beyond EOF must be marked new so that sub block regions can be
1340  * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1341  * was just allocated or is unwritten, otherwise the callers would overwrite
1342  * existing data with zeros. Hence we have to split the mapping into a range up
1343  * to and including EOF, and a second mapping for beyond EOF.
1344  */
1345 static void
xfs_map_trim_size(struct inode * inode,sector_t iblock,struct buffer_head * bh_result,struct xfs_bmbt_irec * imap,xfs_off_t offset,ssize_t size)1346 xfs_map_trim_size(
1347 	struct inode		*inode,
1348 	sector_t		iblock,
1349 	struct buffer_head	*bh_result,
1350 	struct xfs_bmbt_irec	*imap,
1351 	xfs_off_t		offset,
1352 	ssize_t			size)
1353 {
1354 	xfs_off_t		mapping_size;
1355 
1356 	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1357 	mapping_size <<= inode->i_blkbits;
1358 
1359 	ASSERT(mapping_size > 0);
1360 	if (mapping_size > size)
1361 		mapping_size = size;
1362 	if (offset < i_size_read(inode) &&
1363 	    (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
1364 		/* limit mapping to block that spans EOF */
1365 		mapping_size = roundup_64(i_size_read(inode) - offset,
1366 					  i_blocksize(inode));
1367 	}
1368 	if (mapping_size > LONG_MAX)
1369 		mapping_size = LONG_MAX;
1370 
1371 	bh_result->b_size = mapping_size;
1372 }
1373 
1374 STATIC int
__xfs_get_blocks(struct inode * inode,sector_t iblock,struct buffer_head * bh_result,int create,bool direct,bool dax_fault)1375 __xfs_get_blocks(
1376 	struct inode		*inode,
1377 	sector_t		iblock,
1378 	struct buffer_head	*bh_result,
1379 	int			create,
1380 	bool			direct,
1381 	bool			dax_fault)
1382 {
1383 	struct xfs_inode	*ip = XFS_I(inode);
1384 	struct xfs_mount	*mp = ip->i_mount;
1385 	xfs_fileoff_t		offset_fsb, end_fsb;
1386 	int			error = 0;
1387 	int			lockmode = 0;
1388 	struct xfs_bmbt_irec	imap;
1389 	int			nimaps = 1;
1390 	xfs_off_t		offset;
1391 	ssize_t			size;
1392 	int			new = 0;
1393 
1394 	if (XFS_FORCED_SHUTDOWN(mp))
1395 		return -EIO;
1396 
1397 	offset = (xfs_off_t)iblock << inode->i_blkbits;
1398 	ASSERT(bh_result->b_size >= i_blocksize(inode));
1399 	size = bh_result->b_size;
1400 
1401 	if (!create && direct && offset >= i_size_read(inode))
1402 		return 0;
1403 
1404 	/*
1405 	 * Direct I/O is usually done on preallocated files, so try getting
1406 	 * a block mapping without an exclusive lock first.  For buffered
1407 	 * writes we already have the exclusive iolock anyway, so avoiding
1408 	 * a lock roundtrip here by taking the ilock exclusive from the
1409 	 * beginning is a useful micro optimization.
1410 	 */
1411 	if (create && !direct) {
1412 		lockmode = XFS_ILOCK_EXCL;
1413 		xfs_ilock(ip, lockmode);
1414 	} else {
1415 		lockmode = xfs_ilock_data_map_shared(ip);
1416 	}
1417 
1418 	ASSERT(offset <= mp->m_super->s_maxbytes);
1419 	if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes)
1420 		size = mp->m_super->s_maxbytes - offset;
1421 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1422 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
1423 
1424 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1425 				&imap, &nimaps, XFS_BMAPI_ENTIRE);
1426 	if (error)
1427 		goto out_unlock;
1428 
1429 	/*
1430 	 * The only time we can ever safely find delalloc blocks on direct I/O
1431 	 * is a dio write to post-eof speculative preallocation. All other
1432 	 * scenarios are indicative of a problem or misuse (such as mixing
1433 	 * direct and mapped I/O).
1434 	 *
1435 	 * The file may be unmapped by the time we get here so we cannot
1436 	 * reliably fail the I/O based on mapping. Instead, fail the I/O if this
1437 	 * is a read or a write within eof. Otherwise, carry on but warn as a
1438 	 * precuation if the file happens to be mapped.
1439 	 */
1440 	if (direct && imap.br_startblock == DELAYSTARTBLOCK) {
1441 	        if (!create || offset < i_size_read(VFS_I(ip))) {
1442 	                WARN_ON_ONCE(1);
1443 	                error = -EIO;
1444 	                goto out_unlock;
1445 	        }
1446 	        WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping));
1447 	}
1448 
1449 	/* for DAX, we convert unwritten extents directly */
1450 	if (create &&
1451 	    (!nimaps ||
1452 	     (imap.br_startblock == HOLESTARTBLOCK ||
1453 	      imap.br_startblock == DELAYSTARTBLOCK) ||
1454 	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1455 		if (direct || xfs_get_extsz_hint(ip)) {
1456 			/*
1457 			 * xfs_iomap_write_direct() expects the shared lock. It
1458 			 * is unlocked on return.
1459 			 */
1460 			if (lockmode == XFS_ILOCK_EXCL)
1461 				xfs_ilock_demote(ip, lockmode);
1462 
1463 			error = xfs_iomap_write_direct(ip, offset, size,
1464 						       &imap, nimaps);
1465 			if (error)
1466 				return error;
1467 			new = 1;
1468 
1469 		} else {
1470 			/*
1471 			 * Delalloc reservations do not require a transaction,
1472 			 * we can go on without dropping the lock here. If we
1473 			 * are allocating a new delalloc block, make sure that
1474 			 * we set the new flag so that we mark the buffer new so
1475 			 * that we know that it is newly allocated if the write
1476 			 * fails.
1477 			 */
1478 			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1479 				new = 1;
1480 			error = xfs_iomap_write_delay(ip, offset, size, &imap);
1481 			if (error)
1482 				goto out_unlock;
1483 
1484 			xfs_iunlock(ip, lockmode);
1485 		}
1486 		trace_xfs_get_blocks_alloc(ip, offset, size,
1487 				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1488 						   : XFS_IO_DELALLOC, &imap);
1489 	} else if (nimaps) {
1490 		trace_xfs_get_blocks_found(ip, offset, size,
1491 				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1492 						   : XFS_IO_OVERWRITE, &imap);
1493 		xfs_iunlock(ip, lockmode);
1494 	} else {
1495 		trace_xfs_get_blocks_notfound(ip, offset, size);
1496 		goto out_unlock;
1497 	}
1498 
1499 	if (IS_DAX(inode) && create) {
1500 		ASSERT(!ISUNWRITTEN(&imap));
1501 		/* zeroing is not needed at a higher layer */
1502 		new = 0;
1503 	}
1504 
1505 	/* trim mapping down to size requested */
1506 	if (direct || size > (1 << inode->i_blkbits))
1507 		xfs_map_trim_size(inode, iblock, bh_result,
1508 				  &imap, offset, size);
1509 
1510 	/*
1511 	 * For unwritten extents do not report a disk address in the buffered
1512 	 * read case (treat as if we're reading into a hole).
1513 	 */
1514 	if (imap.br_startblock != HOLESTARTBLOCK &&
1515 	    imap.br_startblock != DELAYSTARTBLOCK &&
1516 	    (create || !ISUNWRITTEN(&imap))) {
1517 		xfs_map_buffer(inode, bh_result, &imap, offset);
1518 		if (ISUNWRITTEN(&imap))
1519 			set_buffer_unwritten(bh_result);
1520 		/* direct IO needs special help */
1521 		if (create && direct)
1522 			xfs_map_direct(inode, bh_result, &imap, offset,
1523 				       dax_fault);
1524 	}
1525 
1526 	/*
1527 	 * If this is a realtime file, data may be on a different device.
1528 	 * to that pointed to from the buffer_head b_bdev currently.
1529 	 */
1530 	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1531 
1532 	/*
1533 	 * If we previously allocated a block out beyond eof and we are now
1534 	 * coming back to use it then we will need to flag it as new even if it
1535 	 * has a disk address.
1536 	 *
1537 	 * With sub-block writes into unwritten extents we also need to mark
1538 	 * the buffer as new so that the unwritten parts of the buffer gets
1539 	 * correctly zeroed.
1540 	 */
1541 	if (create &&
1542 	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1543 	     (offset >= i_size_read(inode)) ||
1544 	     (new || ISUNWRITTEN(&imap))))
1545 		set_buffer_new(bh_result);
1546 
1547 	if (imap.br_startblock == DELAYSTARTBLOCK) {
1548 		if (create) {
1549 			set_buffer_uptodate(bh_result);
1550 			set_buffer_mapped(bh_result);
1551 			set_buffer_delay(bh_result);
1552 		}
1553 	}
1554 
1555 	return 0;
1556 
1557 out_unlock:
1558 	xfs_iunlock(ip, lockmode);
1559 	return error;
1560 }
1561 
1562 int
xfs_get_blocks(struct inode * inode,sector_t iblock,struct buffer_head * bh_result,int create)1563 xfs_get_blocks(
1564 	struct inode		*inode,
1565 	sector_t		iblock,
1566 	struct buffer_head	*bh_result,
1567 	int			create)
1568 {
1569 	return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
1570 }
1571 
1572 int
xfs_get_blocks_direct(struct inode * inode,sector_t iblock,struct buffer_head * bh_result,int create)1573 xfs_get_blocks_direct(
1574 	struct inode		*inode,
1575 	sector_t		iblock,
1576 	struct buffer_head	*bh_result,
1577 	int			create)
1578 {
1579 	return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
1580 }
1581 
1582 int
xfs_get_blocks_dax_fault(struct inode * inode,sector_t iblock,struct buffer_head * bh_result,int create)1583 xfs_get_blocks_dax_fault(
1584 	struct inode		*inode,
1585 	sector_t		iblock,
1586 	struct buffer_head	*bh_result,
1587 	int			create)
1588 {
1589 	return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1590 }
1591 
1592 static void
__xfs_end_io_direct_write(struct inode * inode,struct xfs_ioend * ioend,loff_t offset,ssize_t size)1593 __xfs_end_io_direct_write(
1594 	struct inode		*inode,
1595 	struct xfs_ioend	*ioend,
1596 	loff_t			offset,
1597 	ssize_t			size)
1598 {
1599 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
1600 
1601 	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
1602 		goto out_end_io;
1603 
1604 	/*
1605 	 * dio completion end_io functions are only called on writes if more
1606 	 * than 0 bytes was written.
1607 	 */
1608 	ASSERT(size > 0);
1609 
1610 	/*
1611 	 * The ioend only maps whole blocks, while the IO may be sector aligned.
1612 	 * Hence the ioend offset/size may not match the IO offset/size exactly.
1613 	 * Because we don't map overwrites within EOF into the ioend, the offset
1614 	 * may not match, but only if the endio spans EOF.  Either way, write
1615 	 * the IO sizes into the ioend so that completion processing does the
1616 	 * right thing.
1617 	 */
1618 	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1619 	ioend->io_size = size;
1620 	ioend->io_offset = offset;
1621 
1622 	/*
1623 	 * The ioend tells us whether we are doing unwritten extent conversion
1624 	 * or an append transaction that updates the on-disk file size. These
1625 	 * cases are the only cases where we should *potentially* be needing
1626 	 * to update the VFS inode size.
1627 	 *
1628 	 * We need to update the in-core inode size here so that we don't end up
1629 	 * with the on-disk inode size being outside the in-core inode size. We
1630 	 * have no other method of updating EOF for AIO, so always do it here
1631 	 * if necessary.
1632 	 *
1633 	 * We need to lock the test/set EOF update as we can be racing with
1634 	 * other IO completions here to update the EOF. Failing to serialise
1635 	 * here can result in EOF moving backwards and Bad Things Happen when
1636 	 * that occurs.
1637 	 */
1638 	spin_lock(&XFS_I(inode)->i_flags_lock);
1639 	if (offset + size > i_size_read(inode))
1640 		i_size_write(inode, offset + size);
1641 	spin_unlock(&XFS_I(inode)->i_flags_lock);
1642 
1643 	/*
1644 	 * If we are doing an append IO that needs to update the EOF on disk,
1645 	 * do the transaction reserve now so we can use common end io
1646 	 * processing. Stashing the error (if there is one) in the ioend will
1647 	 * result in the ioend processing passing on the error if it is
1648 	 * possible as we can't return it from here.
1649 	 */
1650 	if (ioend->io_type == XFS_IO_OVERWRITE)
1651 		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1652 
1653 out_end_io:
1654 	xfs_end_io(&ioend->io_work);
1655 	return;
1656 }
1657 
1658 /*
1659  * Complete a direct I/O write request.
1660  *
1661  * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1662  * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1663  * wholly within the EOF and so there is nothing for us to do. Note that in this
1664  * case the completion can be called in interrupt context, whereas if we have an
1665  * ioend we will always be called in task context (i.e. from a workqueue).
1666  */
1667 STATIC void
xfs_end_io_direct_write(struct kiocb * iocb,loff_t offset,ssize_t size,void * private)1668 xfs_end_io_direct_write(
1669 	struct kiocb		*iocb,
1670 	loff_t			offset,
1671 	ssize_t			size,
1672 	void			*private)
1673 {
1674 	struct inode		*inode = file_inode(iocb->ki_filp);
1675 	struct xfs_ioend	*ioend = private;
1676 
1677 	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
1678 				     ioend ? ioend->io_type : 0, NULL);
1679 
1680 	if (!ioend) {
1681 		ASSERT(offset + size <= i_size_read(inode));
1682 		return;
1683 	}
1684 
1685 	__xfs_end_io_direct_write(inode, ioend, offset, size);
1686 }
1687 
1688 static inline ssize_t
xfs_vm_do_dio(struct inode * inode,struct kiocb * iocb,struct iov_iter * iter,loff_t offset,void (* endio)(struct kiocb * iocb,loff_t offset,ssize_t size,void * private),int flags)1689 xfs_vm_do_dio(
1690 	struct inode		*inode,
1691 	struct kiocb		*iocb,
1692 	struct iov_iter		*iter,
1693 	loff_t			offset,
1694 	void			(*endio)(struct kiocb	*iocb,
1695 					 loff_t		offset,
1696 					 ssize_t	size,
1697 					 void		*private),
1698 	int			flags)
1699 {
1700 	struct block_device	*bdev;
1701 
1702 	if (IS_DAX(inode))
1703 		return dax_do_io(iocb, inode, iter, offset,
1704 				 xfs_get_blocks_direct, endio, 0);
1705 
1706 	bdev = xfs_find_bdev_for_inode(inode);
1707 	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
1708 				     xfs_get_blocks_direct, endio, NULL, flags);
1709 }
1710 
1711 STATIC ssize_t
xfs_vm_direct_IO(struct kiocb * iocb,struct iov_iter * iter,loff_t offset)1712 xfs_vm_direct_IO(
1713 	struct kiocb		*iocb,
1714 	struct iov_iter		*iter,
1715 	loff_t			offset)
1716 {
1717 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
1718 
1719 	if (iov_iter_rw(iter) == WRITE)
1720 		return xfs_vm_do_dio(inode, iocb, iter, offset,
1721 				     xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
1722 	return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
1723 }
1724 
1725 /*
1726  * Punch out the delalloc blocks we have already allocated.
1727  *
1728  * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1729  * as the page is still locked at this point.
1730  */
1731 STATIC void
xfs_vm_kill_delalloc_range(struct inode * inode,loff_t start,loff_t end)1732 xfs_vm_kill_delalloc_range(
1733 	struct inode		*inode,
1734 	loff_t			start,
1735 	loff_t			end)
1736 {
1737 	struct xfs_inode	*ip = XFS_I(inode);
1738 	xfs_fileoff_t		start_fsb;
1739 	xfs_fileoff_t		end_fsb;
1740 	int			error;
1741 
1742 	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1743 	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1744 	if (end_fsb <= start_fsb)
1745 		return;
1746 
1747 	xfs_ilock(ip, XFS_ILOCK_EXCL);
1748 	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1749 						end_fsb - start_fsb);
1750 	if (error) {
1751 		/* something screwed, just bail */
1752 		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1753 			xfs_alert(ip->i_mount,
1754 		"xfs_vm_write_failed: unable to clean up ino %lld",
1755 					ip->i_ino);
1756 		}
1757 	}
1758 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1759 }
1760 
1761 STATIC void
xfs_vm_write_failed(struct inode * inode,struct page * page,loff_t pos,unsigned len)1762 xfs_vm_write_failed(
1763 	struct inode		*inode,
1764 	struct page		*page,
1765 	loff_t			pos,
1766 	unsigned		len)
1767 {
1768 	loff_t			block_offset;
1769 	loff_t			block_start;
1770 	loff_t			block_end;
1771 	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
1772 	loff_t			to = from + len;
1773 	struct buffer_head	*bh, *head;
1774 
1775 	/*
1776 	 * The request pos offset might be 32 or 64 bit, this is all fine
1777 	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
1778 	 * platform, the high 32-bit will be masked off if we evaluate the
1779 	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
1780 	 * 0xfffff000 as an unsigned long, hence the result is incorrect
1781 	 * which could cause the following ASSERT failed in most cases.
1782 	 * In order to avoid this, we can evaluate the block_offset of the
1783 	 * start of the page by using shifts rather than masks the mismatch
1784 	 * problem.
1785 	 */
1786 	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
1787 
1788 	ASSERT(block_offset + from == pos);
1789 
1790 	head = page_buffers(page);
1791 	block_start = 0;
1792 	for (bh = head; bh != head || !block_start;
1793 	     bh = bh->b_this_page, block_start = block_end,
1794 				   block_offset += bh->b_size) {
1795 		block_end = block_start + bh->b_size;
1796 
1797 		/* skip buffers before the write */
1798 		if (block_end <= from)
1799 			continue;
1800 
1801 		/* if the buffer is after the write, we're done */
1802 		if (block_start >= to)
1803 			break;
1804 
1805 		if (!buffer_delay(bh))
1806 			continue;
1807 
1808 		if (!buffer_new(bh) && block_offset < i_size_read(inode))
1809 			continue;
1810 
1811 		xfs_vm_kill_delalloc_range(inode, block_offset,
1812 					   block_offset + bh->b_size);
1813 
1814 		/*
1815 		 * This buffer does not contain data anymore. make sure anyone
1816 		 * who finds it knows that for certain.
1817 		 */
1818 		clear_buffer_delay(bh);
1819 		clear_buffer_uptodate(bh);
1820 		clear_buffer_mapped(bh);
1821 		clear_buffer_new(bh);
1822 		clear_buffer_dirty(bh);
1823 	}
1824 
1825 }
1826 
1827 /*
1828  * This used to call block_write_begin(), but it unlocks and releases the page
1829  * on error, and we need that page to be able to punch stale delalloc blocks out
1830  * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
1831  * the appropriate point.
1832  */
1833 STATIC int
xfs_vm_write_begin(struct file * file,struct address_space * mapping,loff_t pos,unsigned len,unsigned flags,struct page ** pagep,void ** fsdata)1834 xfs_vm_write_begin(
1835 	struct file		*file,
1836 	struct address_space	*mapping,
1837 	loff_t			pos,
1838 	unsigned		len,
1839 	unsigned		flags,
1840 	struct page		**pagep,
1841 	void			**fsdata)
1842 {
1843 	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
1844 	struct page		*page;
1845 	int			status;
1846 
1847 	ASSERT(len <= PAGE_CACHE_SIZE);
1848 
1849 	page = grab_cache_page_write_begin(mapping, index, flags);
1850 	if (!page)
1851 		return -ENOMEM;
1852 
1853 	status = __block_write_begin(page, pos, len, xfs_get_blocks);
1854 	if (unlikely(status)) {
1855 		struct inode	*inode = mapping->host;
1856 		size_t		isize = i_size_read(inode);
1857 
1858 		xfs_vm_write_failed(inode, page, pos, len);
1859 		unlock_page(page);
1860 
1861 		/*
1862 		 * If the write is beyond EOF, we only want to kill blocks
1863 		 * allocated in this write, not blocks that were previously
1864 		 * written successfully.
1865 		 */
1866 		if (pos + len > isize) {
1867 			ssize_t start = max_t(ssize_t, pos, isize);
1868 
1869 			truncate_pagecache_range(inode, start, pos + len);
1870 		}
1871 
1872 		page_cache_release(page);
1873 		page = NULL;
1874 	}
1875 
1876 	*pagep = page;
1877 	return status;
1878 }
1879 
1880 /*
1881  * On failure, we only need to kill delalloc blocks beyond EOF in the range of
1882  * this specific write because they will never be written. Previous writes
1883  * beyond EOF where block allocation succeeded do not need to be trashed, so
1884  * only new blocks from this write should be trashed. For blocks within
1885  * EOF, generic_write_end() zeros them so they are safe to leave alone and be
1886  * written with all the other valid data.
1887  */
1888 STATIC int
xfs_vm_write_end(struct file * file,struct address_space * mapping,loff_t pos,unsigned len,unsigned copied,struct page * page,void * fsdata)1889 xfs_vm_write_end(
1890 	struct file		*file,
1891 	struct address_space	*mapping,
1892 	loff_t			pos,
1893 	unsigned		len,
1894 	unsigned		copied,
1895 	struct page		*page,
1896 	void			*fsdata)
1897 {
1898 	int			ret;
1899 
1900 	ASSERT(len <= PAGE_CACHE_SIZE);
1901 
1902 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1903 	if (unlikely(ret < len)) {
1904 		struct inode	*inode = mapping->host;
1905 		size_t		isize = i_size_read(inode);
1906 		loff_t		to = pos + len;
1907 
1908 		if (to > isize) {
1909 			/* only kill blocks in this write beyond EOF */
1910 			if (pos > isize)
1911 				isize = pos;
1912 			xfs_vm_kill_delalloc_range(inode, isize, to);
1913 			truncate_pagecache_range(inode, isize, to);
1914 		}
1915 	}
1916 	return ret;
1917 }
1918 
1919 STATIC sector_t
xfs_vm_bmap(struct address_space * mapping,sector_t block)1920 xfs_vm_bmap(
1921 	struct address_space	*mapping,
1922 	sector_t		block)
1923 {
1924 	struct inode		*inode = (struct inode *)mapping->host;
1925 	struct xfs_inode	*ip = XFS_I(inode);
1926 
1927 	trace_xfs_vm_bmap(XFS_I(inode));
1928 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
1929 	filemap_write_and_wait(mapping);
1930 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1931 	return generic_block_bmap(mapping, block, xfs_get_blocks);
1932 }
1933 
1934 STATIC int
xfs_vm_readpage(struct file * unused,struct page * page)1935 xfs_vm_readpage(
1936 	struct file		*unused,
1937 	struct page		*page)
1938 {
1939 	return mpage_readpage(page, xfs_get_blocks);
1940 }
1941 
1942 STATIC int
xfs_vm_readpages(struct file * unused,struct address_space * mapping,struct list_head * pages,unsigned nr_pages)1943 xfs_vm_readpages(
1944 	struct file		*unused,
1945 	struct address_space	*mapping,
1946 	struct list_head	*pages,
1947 	unsigned		nr_pages)
1948 {
1949 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1950 }
1951 
1952 /*
1953  * This is basically a copy of __set_page_dirty_buffers() with one
1954  * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
1955  * dirty, we'll never be able to clean them because we don't write buffers
1956  * beyond EOF, and that means we can't invalidate pages that span EOF
1957  * that have been marked dirty. Further, the dirty state can leak into
1958  * the file interior if the file is extended, resulting in all sorts of
1959  * bad things happening as the state does not match the underlying data.
1960  *
1961  * XXX: this really indicates that bufferheads in XFS need to die. Warts like
1962  * this only exist because of bufferheads and how the generic code manages them.
1963  */
1964 STATIC int
xfs_vm_set_page_dirty(struct page * page)1965 xfs_vm_set_page_dirty(
1966 	struct page		*page)
1967 {
1968 	struct address_space	*mapping = page->mapping;
1969 	struct inode		*inode = mapping->host;
1970 	loff_t			end_offset;
1971 	loff_t			offset;
1972 	int			newly_dirty;
1973 	struct mem_cgroup	*memcg;
1974 
1975 	if (unlikely(!mapping))
1976 		return !TestSetPageDirty(page);
1977 
1978 	end_offset = i_size_read(inode);
1979 	offset = page_offset(page);
1980 
1981 	spin_lock(&mapping->private_lock);
1982 	if (page_has_buffers(page)) {
1983 		struct buffer_head *head = page_buffers(page);
1984 		struct buffer_head *bh = head;
1985 
1986 		do {
1987 			if (offset < end_offset)
1988 				set_buffer_dirty(bh);
1989 			bh = bh->b_this_page;
1990 			offset += i_blocksize(inode);
1991 		} while (bh != head);
1992 	}
1993 	/*
1994 	 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
1995 	 * per-memcg dirty page counters.
1996 	 */
1997 	memcg = mem_cgroup_begin_page_stat(page);
1998 	newly_dirty = !TestSetPageDirty(page);
1999 	spin_unlock(&mapping->private_lock);
2000 
2001 	if (newly_dirty) {
2002 		/* sigh - __set_page_dirty() is static, so copy it here, too */
2003 		unsigned long flags;
2004 
2005 		spin_lock_irqsave(&mapping->tree_lock, flags);
2006 		if (page->mapping) {	/* Race with truncate? */
2007 			WARN_ON_ONCE(!PageUptodate(page));
2008 			account_page_dirtied(page, mapping, memcg);
2009 			radix_tree_tag_set(&mapping->page_tree,
2010 					page_index(page), PAGECACHE_TAG_DIRTY);
2011 		}
2012 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
2013 	}
2014 	mem_cgroup_end_page_stat(memcg);
2015 	if (newly_dirty)
2016 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2017 	return newly_dirty;
2018 }
2019 
2020 const struct address_space_operations xfs_address_space_operations = {
2021 	.readpage		= xfs_vm_readpage,
2022 	.readpages		= xfs_vm_readpages,
2023 	.writepage		= xfs_vm_writepage,
2024 	.writepages		= xfs_vm_writepages,
2025 	.set_page_dirty		= xfs_vm_set_page_dirty,
2026 	.releasepage		= xfs_vm_releasepage,
2027 	.invalidatepage		= xfs_vm_invalidatepage,
2028 	.write_begin		= xfs_vm_write_begin,
2029 	.write_end		= xfs_vm_write_end,
2030 	.bmap			= xfs_vm_bmap,
2031 	.direct_IO		= xfs_vm_direct_IO,
2032 	.migratepage		= buffer_migrate_page,
2033 	.is_partially_uptodate  = block_is_partially_uptodate,
2034 	.error_remove_page	= generic_error_remove_page,
2035 };
2036