1  /*
2   * fs/fs-writeback.c
3   *
4   * Copyright (C) 2002, Linus Torvalds.
5   *
6   * Contains all the functions related to writing back and waiting
7   * upon dirty inodes against superblocks, and writing back dirty
8   * pages against inodes.  ie: data writeback.  Writeout of the
9   * inode itself is not handled here.
10   *
11   * 10Apr2002	Andrew Morton
12   *		Split out of fs/inode.c
13   *		Additions for address_space-based writeback
14   */
15  
16  #include <linux/kernel.h>
17  #include <linux/export.h>
18  #include <linux/spinlock.h>
19  #include <linux/slab.h>
20  #include <linux/sched.h>
21  #include <linux/fs.h>
22  #include <linux/mm.h>
23  #include <linux/pagemap.h>
24  #include <linux/kthread.h>
25  #include <linux/freezer.h>
26  #include <linux/writeback.h>
27  #include <linux/blkdev.h>
28  #include <linux/backing-dev.h>
29  #include <linux/tracepoint.h>
30  #include "internal.h"
31  
32  /*
33   * 4MB minimal write chunk size
34   */
35  #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
36  
37  /*
38   * Passed into wb_writeback(), essentially a subset of writeback_control
39   */
40  struct wb_writeback_work {
41  	long nr_pages;
42  	struct super_block *sb;
43  	unsigned long *older_than_this;
44  	enum writeback_sync_modes sync_mode;
45  	unsigned int tagged_writepages:1;
46  	unsigned int for_kupdate:1;
47  	unsigned int range_cyclic:1;
48  	unsigned int for_background:1;
49  	enum wb_reason reason;		/* why was writeback initiated? */
50  
51  	struct list_head list;		/* pending work list */
52  	struct completion *done;	/* set if the caller waits */
53  };
54  
55  /*
56   * We don't actually have pdflush, but this one is exported through /proc...
57   */
58  int nr_pdflush_threads;
59  
60  /**
61   * writeback_in_progress - determine whether there is writeback in progress
62   * @bdi: the device's backing_dev_info structure.
63   *
64   * Determine whether there is writeback waiting to be handled against a
65   * backing device.
66   */
67  int writeback_in_progress(struct backing_dev_info *bdi)
68  {
69  	return test_bit(BDI_writeback_running, &bdi->state);
70  }
71  EXPORT_SYMBOL(writeback_in_progress);
72  
73  static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
74  {
75  	struct super_block *sb = inode->i_sb;
76  
77  	if (strcmp(sb->s_type->name, "bdev") == 0)
78  		return inode->i_mapping->backing_dev_info;
79  
80  	return sb->s_bdi;
81  }
82  
83  static inline struct inode *wb_inode(struct list_head *head)
84  {
85  	return list_entry(head, struct inode, i_wb_list);
86  }
87  
88  /*
89   * Include the creation of the trace points after defining the
90   * wb_writeback_work structure and inline functions so that the definition
91   * remains local to this file.
92   */
93  #define CREATE_TRACE_POINTS
94  #include <trace/events/writeback.h>
95  
96  /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
97  static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
98  {
99  	if (bdi->wb.task) {
100  		wake_up_process(bdi->wb.task);
101  	} else {
102  		/*
103  		 * The bdi thread isn't there, wake up the forker thread which
104  		 * will create and run it.
105  		 */
106  		wake_up_process(default_backing_dev_info.wb.task);
107  	}
108  }
109  
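/*
 * Add @work to @bdi's pending work list and kick the flusher thread
 * (or the forker thread that will create it) to process the work.
 */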
110  static void bdi_queue_work(struct backing_dev_info *bdi,
111  			   struct wb_writeback_work *work)
112  {
113  	trace_writeback_queue(bdi, work);
114  
115  	spin_lock_bh(&bdi->wb_lock);
116  	list_add_tail(&work->list, &bdi->work_list);
117  	if (!bdi->wb.task)
118  		trace_writeback_nothread(bdi, work);
119  	bdi_wakeup_flusher(bdi);
120  	spin_unlock_bh(&bdi->wb_lock);
121  }
122  
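/*
 * Schedule WB_SYNC_NONE writeback of @nr_pages pages against @bdi.
 * If the work item cannot be allocated, fall back to waking the
 * flusher thread so that old dirty data still gets written out.
 */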
123  static void
124  __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
125  		      bool range_cyclic, enum wb_reason reason)
126  {
127  	struct wb_writeback_work *work;
128  
129  	/*
130  	 * This is WB_SYNC_NONE writeback, so if allocation fails just
131  	 * wakeup the thread for old dirty data writeback
132  	 */
133  	work = kzalloc(sizeof(*work), GFP_ATOMIC);
134  	if (!work) {
135  		if (bdi->wb.task) {
136  			trace_writeback_nowork(bdi);
137  			wake_up_process(bdi->wb.task);
138  		}
139  		return;
140  	}
141  
142  	work->sync_mode	= WB_SYNC_NONE;
143  	work->nr_pages	= nr_pages;
144  	work->range_cyclic = range_cyclic;
145  	work->reason	= reason;
146  
147  	bdi_queue_work(bdi, work);
148  }
149  
150  /**
151   * bdi_start_writeback - start writeback
152   * @bdi: the backing device to write from
153   * @nr_pages: the number of pages to write
154   * @reason: reason why some writeback work was initiated
155   *
156   * Description:
157   *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
158   *   started when this function returns; we make no guarantees on
159   *   completion. Caller need not hold sb s_umount semaphore.
160   *
161   */
162  void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
163  			enum wb_reason reason)
164  {
165  	__bdi_start_writeback(bdi, nr_pages, true, reason);
166  }
167  
168  /**
169   * bdi_start_background_writeback - start background writeback
170   * @bdi: the backing device to write from
171   *
172   * Description:
173   *   This makes sure WB_SYNC_NONE background writeback happens. When
174   *   this function returns, it is only guaranteed that for given BDI
175   *   some IO is happening if we are over background dirty threshold.
176   *   Caller need not hold sb s_umount semaphore.
177   */
178  void bdi_start_background_writeback(struct backing_dev_info *bdi)
179  {
180  	/*
181  	 * We just wake up the flusher thread. It will perform background
182  	 * writeback as soon as there is no other work to do.
183  	 */
184  	trace_writeback_wake_background(bdi);
185  	spin_lock_bh(&bdi->wb_lock);
186  	bdi_wakeup_flusher(bdi);
187  	spin_unlock_bh(&bdi->wb_lock);
188  }
189  
190  /*
191   * Remove the inode from the writeback list it is on.
192   */
193  void inode_wb_list_del(struct inode *inode)
194  {
195  	struct backing_dev_info *bdi = inode_to_bdi(inode);
196  
197  	spin_lock(&bdi->wb.list_lock);
198  	list_del_init(&inode->i_wb_list);
199  	spin_unlock(&bdi->wb.list_lock);
200  }
201  
202  /*
203   * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
204   * furthest end of its superblock's dirty-inode list.
205   *
206   * Before stamping the inode's ->dirtied_when, we check to see whether it is
207   * already the most-recently-dirtied inode on the b_dirty list.  If that is
208   * the case then the inode must have been redirtied while it was being written
209   * out and we don't reset its dirtied_when.
210   */
211  static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
212  {
213  	assert_spin_locked(&wb->list_lock);
214  	if (!list_empty(&wb->b_dirty)) {
215  		struct inode *tail;
216  
217  		tail = wb_inode(wb->b_dirty.next);
218  		if (time_before(inode->dirtied_when, tail->dirtied_when))
219  			inode->dirtied_when = jiffies;
220  	}
221  	list_move(&inode->i_wb_list, &wb->b_dirty);
222  }
223  
224  /*
225   * requeue inode for re-scanning after bdi->b_io list is exhausted.
226   */
227  static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
228  {
229  	assert_spin_locked(&wb->list_lock);
230  	list_move(&inode->i_wb_list, &wb->b_more_io);
231  }
232  
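/*
 * Wake up anyone waiting in inode_wait_for_writeback() once I_SYNC
 * has been cleared for this inode.
 */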
233  static void inode_sync_complete(struct inode *inode)
234  {
235  	/*
236  	 * Prevent speculative execution through
237  	 * spin_unlock(&wb->list_lock);
238  	 */
239  
240  	smp_mb();
241  	wake_up_bit(&inode->i_state, __I_SYNC);
242  }
243  
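/*
 * Return true if @inode was dirtied after time @t (and so has not yet
 * expired for writeback).
 */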
244  static bool inode_dirtied_after(struct inode *inode, unsigned long t)
245  {
246  	bool ret = time_after(inode->dirtied_when, t);
247  #ifndef CONFIG_64BIT
248  	/*
249  	 * For inodes being constantly redirtied, dirtied_when can get stuck.
250  	 * It _appears_ to be in the future, but is actually in distant past.
251  	 * This test is necessary to prevent such wrapped-around relative times
252  	 * from permanently stopping the whole bdi writeback.
253  	 */
254  	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
255  #endif
256  	return ret;
257  }
258  
259  /*
260   * Move expired (dirtied after work->older_than_this) dirty inodes from
261   * @delaying_queue to @dispatch_queue.
262   */
263  static int move_expired_inodes(struct list_head *delaying_queue,
264  			       struct list_head *dispatch_queue,
265  			       struct wb_writeback_work *work)
266  {
267  	LIST_HEAD(tmp);
268  	struct list_head *pos, *node;
269  	struct super_block *sb = NULL;
270  	struct inode *inode;
271  	int do_sb_sort = 0;
272  	int moved = 0;
273  
274  	while (!list_empty(delaying_queue)) {
275  		inode = wb_inode(delaying_queue->prev);
276  		if (work->older_than_this &&
277  		    inode_dirtied_after(inode, *work->older_than_this))
278  			break;
279  		if (sb && sb != inode->i_sb)
280  			do_sb_sort = 1;
281  		sb = inode->i_sb;
282  		list_move(&inode->i_wb_list, &tmp);
283  		moved++;
284  	}
285  
286  	/* just one sb in list, splice to dispatch_queue and we're done */
287  	if (!do_sb_sort) {
288  		list_splice(&tmp, dispatch_queue);
289  		goto out;
290  	}
291  
292  	/* Move inodes from one superblock together */
293  	while (!list_empty(&tmp)) {
294  		sb = wb_inode(tmp.prev)->i_sb;
295  		list_for_each_prev_safe(pos, node, &tmp) {
296  			inode = wb_inode(pos);
297  			if (inode->i_sb == sb)
298  				list_move(&inode->i_wb_list, dispatch_queue);
299  		}
300  	}
301  out:
302  	return moved;
303  }
304  
305  /*
306   * Queue all expired dirty inodes for io, eldest first.
307   * Before
308   *         newly dirtied     b_dirty    b_io    b_more_io
309   *         =============>    gf         edc     BA
310   * After
311   *         newly dirtied     b_dirty    b_io    b_more_io
312   *         =============>    g          fBAedc
313   *                                           |
314   *                                           +--> dequeue for IO
315   */
316  static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
317  {
318  	int moved;
319  	assert_spin_locked(&wb->list_lock);
320  	list_splice_init(&wb->b_more_io, &wb->b_io);
321  	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
322  	trace_writeback_queue_io(wb, work, moved);
323  }
324  
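/*
 * Write out the inode itself through the filesystem's ->write_inode
 * method, if the filesystem provides one and the inode is not bad.
 */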
325  static int write_inode(struct inode *inode, struct writeback_control *wbc)
326  {
327  	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
328  		return inode->i_sb->s_op->write_inode(inode, wbc);
329  	return 0;
330  }
331  
332  /*
333   * Wait for writeback on an inode to complete.
334   */
335  static void inode_wait_for_writeback(struct inode *inode,
336  				     struct bdi_writeback *wb)
337  {
338  	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
339  	wait_queue_head_t *wqh;
340  
341  	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
342  	while (inode->i_state & I_SYNC) {
343  		spin_unlock(&inode->i_lock);
344  		spin_unlock(&wb->list_lock);
345  		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
346  		spin_lock(&wb->list_lock);
347  		spin_lock(&inode->i_lock);
348  	}
349  }
350  
351  /*
352   * Write out an inode's dirty pages.  Called under wb->list_lock and
353   * inode->i_lock.  Either the caller has an active reference on the inode or
354   * the inode has I_WILL_FREE set.
355   *
356   * If `wait' is set, wait on the writeout.
357   *
358   * The whole writeout design is quite complex and fragile.  We want to avoid
359   * starvation of particular inodes when others are being redirtied, prevent
360   * livelocks, etc.
361   */
362  static int
363  writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
364  		       struct writeback_control *wbc)
365  {
366  	struct address_space *mapping = inode->i_mapping;
367  	long nr_to_write = wbc->nr_to_write;
368  	unsigned dirty;
369  	int ret;
370  
371  	assert_spin_locked(&wb->list_lock);
372  	assert_spin_locked(&inode->i_lock);
373  
374  	if (!atomic_read(&inode->i_count))
375  		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
376  	else
377  		WARN_ON(inode->i_state & I_WILL_FREE);
378  
379  	if (inode->i_state & I_SYNC) {
380  		/*
381  		 * If this inode is locked for writeback and we are not doing
382  		 * writeback-for-data-integrity, move it to b_more_io so that
383  		 * writeback can proceed with the other inodes on s_io.
384  		 *
385  		 * We'll have another go at writing back this inode when we
386  		 * completed a full scan of b_io.
387  		 */
388  		if (wbc->sync_mode != WB_SYNC_ALL) {
389  			requeue_io(inode, wb);
390  			trace_writeback_single_inode_requeue(inode, wbc,
391  							     nr_to_write);
392  			return 0;
393  		}
394  
395  		/*
396  		 * It's a data-integrity sync.  We must wait.
397  		 */
398  		inode_wait_for_writeback(inode, wb);
399  	}
400  
401  	BUG_ON(inode->i_state & I_SYNC);
402  
403  	/* Set I_SYNC, reset I_DIRTY_PAGES */
404  	inode->i_state |= I_SYNC;
405  	inode->i_state &= ~I_DIRTY_PAGES;
406  	spin_unlock(&inode->i_lock);
407  	spin_unlock(&wb->list_lock);
408  
409  	ret = do_writepages(mapping, wbc);
410  
411  	/*
412  	 * Make sure to wait on the data before writing out the metadata.
413  	 * This is important for filesystems that modify metadata on data
414  	 * I/O completion.
415  	 */
416  	if (wbc->sync_mode == WB_SYNC_ALL) {
417  		int err = filemap_fdatawait(mapping);
418  		if (ret == 0)
419  			ret = err;
420  	}
421  
422  	/*
423  	 * Some filesystems may redirty the inode during the writeback
424  	 * due to delalloc, clear dirty metadata flags right before
425  	 * write_inode()
426  	 */
427  	spin_lock(&inode->i_lock);
428  	dirty = inode->i_state & I_DIRTY;
429  	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
430  	spin_unlock(&inode->i_lock);
431  	/* Don't write the inode if only I_DIRTY_PAGES was set */
432  	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
433  		int err = write_inode(inode, wbc);
434  		if (ret == 0)
435  			ret = err;
436  	}
437  
438  	spin_lock(&wb->list_lock);
439  	spin_lock(&inode->i_lock);
440  	inode->i_state &= ~I_SYNC;
441  	if (!(inode->i_state & I_FREEING)) {
442  		/*
443  		 * Sync livelock prevention. Each inode is tagged and synced in
444  		 * one shot. If still dirty, it will be redirty_tail()'ed below.
445  		 * Update the dirty time to prevent enqueue and sync it again.
446  		 */
447  		if ((inode->i_state & I_DIRTY) &&
448  		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
449  			inode->dirtied_when = jiffies;
450  
451  		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
452  			/*
453  			 * We didn't write back all the pages.  nfs_writepages()
454  			 * sometimes bales out without doing anything.
455  			 */
456  			inode->i_state |= I_DIRTY_PAGES;
457  			if (wbc->nr_to_write <= 0) {
458  				/*
459  				 * slice used up: queue for next turn
460  				 */
461  				requeue_io(inode, wb);
462  			} else {
463  				/*
464  				 * Writeback blocked by something other than
465  				 * congestion. Delay the inode for some time to
466  				 * avoid spinning on the CPU (100% iowait)
467  				 * retrying writeback of the dirty page/inode
468  				 * that cannot be performed immediately.
469  				 */
470  				redirty_tail(inode, wb);
471  			}
472  		} else if (inode->i_state & I_DIRTY) {
473  			/*
474  			 * Filesystems can dirty the inode during writeback
475  			 * operations, such as delayed allocation during
476  			 * submission or metadata updates after data IO
477  			 * completion.
478  			 */
479  			redirty_tail(inode, wb);
480  		} else {
481  			/*
482  			 * The inode is clean.  At this point we either have
483  		 * a reference to the inode or it's on its way out.
484  			 * No need to add it back to the LRU.
485  			 */
486  			list_del_init(&inode->i_wb_list);
487  		}
488  	}
489  	inode_sync_complete(inode);
490  	trace_writeback_single_inode(inode, wbc, nr_to_write);
491  	return ret;
492  }
493  
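/*
 * Decide how many pages a single writeback_single_inode() call may
 * write for this work item.
 */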
494  static long writeback_chunk_size(struct backing_dev_info *bdi,
495  				 struct wb_writeback_work *work)
496  {
497  	long pages;
498  
499  	/*
500  	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
501  	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
502  	 * here avoids calling into writeback_inodes_wb() more than once.
503  	 *
504  	 * The intended call sequence for WB_SYNC_ALL writeback is:
505  	 *
506  	 *      wb_writeback()
507  	 *          writeback_sb_inodes()       <== called only once
508  	 *              write_cache_pages()     <== called once for each inode
509  	 *                   (quickly) tag currently dirty pages
510  	 *                   (maybe slowly) sync all tagged pages
511  	 */
512  	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
513  		pages = LONG_MAX;
514  	else {
515  		pages = min(bdi->avg_write_bandwidth / 2,
516  			    global_dirty_limit / DIRTY_SCOPE);
517  		pages = min(pages, work->nr_pages);
518  		pages = round_down(pages + MIN_WRITEBACK_PAGES,
519  				   MIN_WRITEBACK_PAGES);
520  	}
521  
522  	return pages;
523  }
524  
525  /*
526   * Write a portion of b_io inodes which belong to @sb.
527   *
528   * If @only_this_sb is true, then find and write all such
529   * inodes. Otherwise write only ones which go sequentially
530   * in reverse order.
531   *
532   * Return the number of pages and/or inodes written.
533   */
534  static long writeback_sb_inodes(struct super_block *sb,
535  				struct bdi_writeback *wb,
536  				struct wb_writeback_work *work)
537  {
538  	struct writeback_control wbc = {
539  		.sync_mode		= work->sync_mode,
540  		.tagged_writepages	= work->tagged_writepages,
541  		.for_kupdate		= work->for_kupdate,
542  		.for_background		= work->for_background,
543  		.range_cyclic		= work->range_cyclic,
544  		.range_start		= 0,
545  		.range_end		= LLONG_MAX,
546  	};
547  	unsigned long start_time = jiffies;
548  	long write_chunk;
549  	long wrote = 0;  /* count both pages and inodes */
550  
551  	while (!list_empty(&wb->b_io)) {
552  		struct inode *inode = wb_inode(wb->b_io.prev);
553  
554  		if (inode->i_sb != sb) {
555  			if (work->sb) {
556  				/*
557  				 * We only want to write back data for this
558  				 * superblock, move all inodes not belonging
559  				 * to it back onto the dirty list.
560  				 */
561  				redirty_tail(inode, wb);
562  				continue;
563  			}
564  
565  			/*
566  			 * The inode belongs to a different superblock.
567  			 * Bounce back to the caller to unpin this and
568  			 * pin the next superblock.
569  			 */
570  			break;
571  		}
572  
573  		/*
574  		 * Don't bother with new inodes or inodes beeing freed, first
575  		 * kind does not need peridic writeout yet, and for the latter
576  		 * kind writeout is handled by the freer.
577  		 */
578  		spin_lock(&inode->i_lock);
579  		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
580  			spin_unlock(&inode->i_lock);
581  			redirty_tail(inode, wb);
582  			continue;
583  		}
584  		__iget(inode);
585  		write_chunk = writeback_chunk_size(wb->bdi, work);
586  		wbc.nr_to_write = write_chunk;
587  		wbc.pages_skipped = 0;
588  
589  		writeback_single_inode(inode, wb, &wbc);
590  
591  		work->nr_pages -= write_chunk - wbc.nr_to_write;
592  		wrote += write_chunk - wbc.nr_to_write;
593  		if (!(inode->i_state & I_DIRTY))
594  			wrote++;
595  		if (wbc.pages_skipped) {
596  			/*
597  			 * writeback is not making progress due to locked
598  			 * buffers.  Skip this inode for now.
599  			 */
600  			redirty_tail(inode, wb);
601  		}
602  		spin_unlock(&inode->i_lock);
603  		spin_unlock(&wb->list_lock);
604  		iput(inode);
605  		cond_resched();
606  		spin_lock(&wb->list_lock);
607  		/*
608  		 * bail out to wb_writeback() often enough to check
609  		 * background threshold and other termination conditions.
610  		 */
611  		if (wrote) {
612  			if (time_is_before_jiffies(start_time + HZ / 10UL))
613  				break;
614  			if (work->nr_pages <= 0)
615  				break;
616  		}
617  	}
618  	return wrote;
619  }
620  
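/*
 * Write back b_io inodes regardless of which superblock they belong
 * to, pinning each superblock with grab_super_passive() while its
 * inodes are being written.
 */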
621  static long __writeback_inodes_wb(struct bdi_writeback *wb,
622  				  struct wb_writeback_work *work)
623  {
624  	unsigned long start_time = jiffies;
625  	long wrote = 0;
626  
627  	while (!list_empty(&wb->b_io)) {
628  		struct inode *inode = wb_inode(wb->b_io.prev);
629  		struct super_block *sb = inode->i_sb;
630  
631  		if (!grab_super_passive(sb)) {
632  			/*
633  			 * grab_super_passive() may fail consistently due to
634  			 * s_umount being grabbed by someone else. Don't use
635  			 * requeue_io() to avoid busy retrying the inode/sb.
636  			 */
637  			redirty_tail(inode, wb);
638  			continue;
639  		}
640  		wrote += writeback_sb_inodes(sb, wb, work);
641  		drop_super(sb);
642  
643  		/* refer to the same tests at the end of writeback_sb_inodes */
644  		if (wrote) {
645  			if (time_is_before_jiffies(start_time + HZ / 10UL))
646  				break;
647  			if (work->nr_pages <= 0)
648  				break;
649  		}
650  	}
651  	/* Leave any unwritten inodes on b_io */
652  	return wrote;
653  }
654  
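/*
 * Write back up to @nr_pages pages of old dirty data on this bdi and
 * return the number of pages written.
 */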
655  long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
656  				enum wb_reason reason)
657  {
658  	struct wb_writeback_work work = {
659  		.nr_pages	= nr_pages,
660  		.sync_mode	= WB_SYNC_NONE,
661  		.range_cyclic	= 1,
662  		.reason		= reason,
663  	};
664  
665  	spin_lock(&wb->list_lock);
666  	if (list_empty(&wb->b_io))
667  		queue_io(wb, &work);
668  	__writeback_inodes_wb(wb, &work);
669  	spin_unlock(&wb->list_lock);
670  
671  	return nr_pages - work.nr_pages;
672  }
673  
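/*
 * Return true if the amount of dirty memory, either system-wide or on
 * this bdi alone, exceeds the background writeback threshold.
 */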
674  static bool over_bground_thresh(struct backing_dev_info *bdi)
675  {
676  	unsigned long background_thresh, dirty_thresh;
677  
678  	global_dirty_limits(&background_thresh, &dirty_thresh);
679  
680  	if (global_page_state(NR_FILE_DIRTY) +
681  	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
682  		return true;
683  
684  	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
685  				bdi_dirty_limit(bdi, background_thresh))
686  		return true;
687  
688  	return false;
689  }
690  
691  /*
692   * Called under wb->list_lock. If there are multiple wb per bdi,
693   * only the flusher working on the first wb should do it.
694   */
695  static void wb_update_bandwidth(struct bdi_writeback *wb,
696  				unsigned long start_time)
697  {
698  	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
699  }
700  
701  /*
702   * Explicit flushing or periodic writeback of "old" data.
703   *
704   * Define "old": the first time one of an inode's pages is dirtied, we mark the
705   * dirtying-time in the inode's address_space.  So this periodic writeback code
706   * just walks the superblock inode list, writing back any inodes which are
707   * older than a specific point in time.
708   *
709   * Try to run once per dirty_writeback_interval.  But if a writeback event
710   * takes longer than a dirty_writeback_interval interval, then leave a
711   * one-second gap.
712   *
713   * older_than_this takes precedence over nr_to_write.  So we'll only write back
714   * all dirty pages if they are all attached to "old" mappings.
715   */
716  static long wb_writeback(struct bdi_writeback *wb,
717  			 struct wb_writeback_work *work)
718  {
719  	unsigned long wb_start = jiffies;
720  	long nr_pages = work->nr_pages;
721  	unsigned long oldest_jif;
722  	struct inode *inode;
723  	long progress;
724  
725  	oldest_jif = jiffies;
726  	work->older_than_this = &oldest_jif;
727  
728  	spin_lock(&wb->list_lock);
729  	for (;;) {
730  		/*
731  		 * Stop writeback when nr_pages has been consumed
732  		 */
733  		if (work->nr_pages <= 0)
734  			break;
735  
736  		/*
737  		 * Background writeout and kupdate-style writeback may
738  		 * run forever. Stop them if there is other work to do
739  		 * so that e.g. sync can proceed. They'll be restarted
740  		 * after the other works are all done.
741  		 */
742  		if ((work->for_background || work->for_kupdate) &&
743  		    !list_empty(&wb->bdi->work_list))
744  			break;
745  
746  		/*
747  		 * For background writeout, stop when we are below the
748  		 * background dirty threshold
749  		 */
750  		if (work->for_background && !over_bground_thresh(wb->bdi))
751  			break;
752  
753  		/*
754  		 * Kupdate and background works are special and we want to
755  		 * include all inodes that need writing. Livelock avoidance is
756  		 * handled by these works yielding to any other work so we are
757  		 * safe.
758  		 */
759  		if (work->for_kupdate) {
760  			oldest_jif = jiffies -
761  				msecs_to_jiffies(dirty_expire_interval * 10);
762  		} else if (work->for_background)
763  			oldest_jif = jiffies;
764  
765  		trace_writeback_start(wb->bdi, work);
766  		if (list_empty(&wb->b_io))
767  			queue_io(wb, work);
768  		if (work->sb)
769  			progress = writeback_sb_inodes(work->sb, wb, work);
770  		else
771  			progress = __writeback_inodes_wb(wb, work);
772  		trace_writeback_written(wb->bdi, work);
773  
774  		wb_update_bandwidth(wb, wb_start);
775  
776  		/*
777  		 * Did we write something? Try for more
778  		 *
779  		 * Dirty inodes are moved to b_io for writeback in batches.
780  		 * The completion of the current batch does not necessarily
781  		 * mean the overall work is done. So we keep looping as long
782  		 * as made some progress on cleaning pages or inodes.
783  		 */
784  		if (progress)
785  			continue;
786  		/*
787  		 * No more inodes for IO, bail
788  		 */
789  		if (list_empty(&wb->b_more_io))
790  			break;
791  		/*
792  		 * Nothing written. Wait for some inode to
793  		 * become available for writeback. Otherwise
794  		 * we'll just busyloop.
795  		 */
796  		if (!list_empty(&wb->b_more_io))  {
797  			trace_writeback_wait(wb->bdi, work);
798  			inode = wb_inode(wb->b_more_io.prev);
799  			spin_lock(&inode->i_lock);
800  			inode_wait_for_writeback(inode, wb);
801  			spin_unlock(&inode->i_lock);
802  		}
803  	}
804  	spin_unlock(&wb->list_lock);
805  
806  	return nr_pages - work->nr_pages;
807  }
808  
809  /*
810   * Return the next wb_writeback_work struct that hasn't been processed yet.
811   */
812  static struct wb_writeback_work *
813  get_next_work_item(struct backing_dev_info *bdi)
814  {
815  	struct wb_writeback_work *work = NULL;
816  
817  	spin_lock_bh(&bdi->wb_lock);
818  	if (!list_empty(&bdi->work_list)) {
819  		work = list_entry(bdi->work_list.next,
820  				  struct wb_writeback_work, list);
821  		list_del_init(&work->list);
822  	}
823  	spin_unlock_bh(&bdi->wb_lock);
824  	return work;
825  }
826  
827  /*
828   * Add in the number of potentially dirty inodes, because each inode
829   * write can dirty pagecache in the underlying blockdev.
830   */
831  static unsigned long get_nr_dirty_pages(void)
832  {
833  	return global_page_state(NR_FILE_DIRTY) +
834  		global_page_state(NR_UNSTABLE_NFS) +
835  		get_nr_dirty_inodes();
836  }
837  
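/*
 * Start background writeback if we are over the background dirty
 * threshold; it runs until we drop below it or other work is queued.
 */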
838  static long wb_check_background_flush(struct bdi_writeback *wb)
839  {
840  	if (over_bground_thresh(wb->bdi)) {
841  
842  		struct wb_writeback_work work = {
843  			.nr_pages	= LONG_MAX,
844  			.sync_mode	= WB_SYNC_NONE,
845  			.for_background	= 1,
846  			.range_cyclic	= 1,
847  			.reason		= WB_REASON_BACKGROUND,
848  		};
849  
850  		return wb_writeback(wb, &work);
851  	}
852  
853  	return 0;
854  }
855  
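/*
 * kupdate-style periodic writeback: flush old dirty data, at most once
 * every dirty_writeback_interval.
 */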
856  static long wb_check_old_data_flush(struct bdi_writeback *wb)
857  {
858  	unsigned long expired;
859  	long nr_pages;
860  
861  	/*
862  	 * When set to zero, disable periodic writeback
863  	 */
864  	if (!dirty_writeback_interval)
865  		return 0;
866  
867  	expired = wb->last_old_flush +
868  			msecs_to_jiffies(dirty_writeback_interval * 10);
869  	if (time_before(jiffies, expired))
870  		return 0;
871  
872  	wb->last_old_flush = jiffies;
873  	nr_pages = get_nr_dirty_pages();
874  
875  	if (nr_pages) {
876  		struct wb_writeback_work work = {
877  			.nr_pages	= nr_pages,
878  			.sync_mode	= WB_SYNC_NONE,
879  			.for_kupdate	= 1,
880  			.range_cyclic	= 1,
881  			.reason		= WB_REASON_PERIODIC,
882  		};
883  
884  		return wb_writeback(wb, &work);
885  	}
886  
887  	return 0;
888  }
889  
890  /*
891   * Retrieve work items and do the writeback they describe
892   */
893  long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
894  {
895  	struct backing_dev_info *bdi = wb->bdi;
896  	struct wb_writeback_work *work;
897  	long wrote = 0;
898  
899  	set_bit(BDI_writeback_running, &wb->bdi->state);
900  	while ((work = get_next_work_item(bdi)) != NULL) {
901  		/*
902  		 * Override sync mode, in case we must wait for completion
903  		 * because this thread is exiting now.
904  		 */
905  		if (force_wait)
906  			work->sync_mode = WB_SYNC_ALL;
907  
908  		trace_writeback_exec(bdi, work);
909  
910  		wrote += wb_writeback(wb, work);
911  
912  		/*
913  		 * Notify the caller of completion if this is a synchronous
914  		 * work item, otherwise just free it.
915  		 */
916  		if (work->done)
917  			complete(work->done);
918  		else
919  			kfree(work);
920  	}
921  
922  	/*
923  	 * Check for periodic writeback, kupdated() style
924  	 */
925  	wrote += wb_check_old_data_flush(wb);
926  	wrote += wb_check_background_flush(wb);
927  	clear_bit(BDI_writeback_running, &wb->bdi->state);
928  
929  	return wrote;
930  }
931  
932  /*
933   * Handle writeback of dirty data for the device backed by this bdi. Also
934   * wakes up periodically and does kupdated style flushing.
935   */
936  int bdi_writeback_thread(void *data)
937  {
938  	struct bdi_writeback *wb = data;
939  	struct backing_dev_info *bdi = wb->bdi;
940  	long pages_written;
941  
942  	current->flags |= PF_SWAPWRITE;
943  	set_freezable();
944  	wb->last_active = jiffies;
945  
946  	/*
947  	 * Our parent may run at a different priority, just set us to normal
948  	 */
949  	set_user_nice(current, 0);
950  
951  	trace_writeback_thread_start(bdi);
952  
953  	while (!kthread_freezable_should_stop(NULL)) {
954  		/*
955  		 * Remove own delayed wake-up timer, since we are already awake
956  		 * and we'll take care of the periodic write-back.
957  		 */
958  		del_timer(&wb->wakeup_timer);
959  
960  		pages_written = wb_do_writeback(wb, 0);
961  
962  		trace_writeback_pages_written(pages_written);
963  
964  		if (pages_written)
965  			wb->last_active = jiffies;
966  
967  		set_current_state(TASK_INTERRUPTIBLE);
968  		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
969  			__set_current_state(TASK_RUNNING);
970  			continue;
971  		}
972  
973  		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
974  			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
975  		else {
976  			/*
977  			 * We have nothing to do, so can go sleep without any
978  			 * timeout and save power. When a work is queued or
979  			 * something is made dirty - we will be woken up.
980  			 */
981  			schedule();
982  		}
983  	}
984  
985  	/* Flush any work that raced with us exiting */
986  	if (!list_empty(&bdi->work_list))
987  		wb_do_writeback(wb, 1);
988  
989  	trace_writeback_thread_stop(bdi);
990  	return 0;
991  }
992  
993  
994  /*
995   * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
996   * the whole world.
997   */
998  void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
999  {
1000  	struct backing_dev_info *bdi;
1001  
1002  	if (!nr_pages) {
1003  		nr_pages = global_page_state(NR_FILE_DIRTY) +
1004  				global_page_state(NR_UNSTABLE_NFS);
1005  	}
1006  
1007  	rcu_read_lock();
1008  	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1009  		if (!bdi_has_dirty_io(bdi))
1010  			continue;
1011  		__bdi_start_writeback(bdi, nr_pages, false, reason);
1012  	}
1013  	rcu_read_unlock();
1014  }
1015  
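/*
 * Log which task dirtied which inode, for the block_dump debugging
 * facility.
 */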
1016  static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1017  {
1018  	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
1019  		struct dentry *dentry;
1020  		const char *name = "?";
1021  
1022  		dentry = d_find_alias(inode);
1023  		if (dentry) {
1024  			spin_lock(&dentry->d_lock);
1025  			name = (const char *) dentry->d_name.name;
1026  		}
1027  		printk(KERN_DEBUG
1028  		       "%s(%d): dirtied inode %lu (%s) on %s\n",
1029  		       current->comm, task_pid_nr(current), inode->i_ino,
1030  		       name, inode->i_sb->s_id);
1031  		if (dentry) {
1032  			spin_unlock(&dentry->d_lock);
1033  			dput(dentry);
1034  		}
1035  	}
1036  }
1037  
1038  /**
1039   *	__mark_inode_dirty -	internal function
1040   *	@inode: inode to mark
1041   *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
1042   *	Mark an inode as dirty. Callers should use mark_inode_dirty or
1043   *  	mark_inode_dirty_sync.
1044   *
1045   * Put the inode on the super block's dirty list.
1046   *
1047   * CAREFUL! We mark it dirty unconditionally, but move it onto the
1048   * dirty list only if it is hashed or if it refers to a blockdev.
1049   * If it was not hashed, it will never be added to the dirty list
1050   * even if it is later hashed, as it will have been marked dirty already.
1051   *
1052   * In short, make sure you hash any inodes _before_ you start marking
1053   * them dirty.
1054   *
1055   * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
1056   * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
1057   * the kernel-internal blockdev inode represents the dirtying time of the
1058   * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
1059   * page->mapping->host, so the page-dirtying time is recorded in the internal
1060   * blockdev inode.
1061   */
1062  void __mark_inode_dirty(struct inode *inode, int flags)
1063  {
1064  	struct super_block *sb = inode->i_sb;
1065  	struct backing_dev_info *bdi = NULL;
1066  
1067  	/*
1068  	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
1069  	 * dirty the inode itself
1070  	 */
1071  	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1072  		if (sb->s_op->dirty_inode)
1073  			sb->s_op->dirty_inode(inode, flags);
1074  	}
1075  
1076  	/*
1077  	 * make sure that changes are seen by all cpus before we test i_state
1078  	 * -- mikulas
1079  	 */
1080  	smp_mb();
1081  
1082  	/* avoid the locking if we can */
1083  	if ((inode->i_state & flags) == flags)
1084  		return;
1085  
1086  	if (unlikely(block_dump > 1))
1087  		block_dump___mark_inode_dirty(inode);
1088  
1089  	spin_lock(&inode->i_lock);
1090  	if ((inode->i_state & flags) != flags) {
1091  		const int was_dirty = inode->i_state & I_DIRTY;
1092  
1093  		inode->i_state |= flags;
1094  
1095  		/*
1096  		 * If the inode is being synced, just update its dirty state.
1097  		 * The unlocker will place the inode on the appropriate
1098  		 * superblock list, based upon its state.
1099  		 */
1100  		if (inode->i_state & I_SYNC)
1101  			goto out_unlock_inode;
1102  
1103  		/*
1104  		 * Only add valid (hashed) inodes to the superblock's
1105  		 * dirty list.  Add blockdev inodes as well.
1106  		 */
1107  		if (!S_ISBLK(inode->i_mode)) {
1108  			if (inode_unhashed(inode))
1109  				goto out_unlock_inode;
1110  		}
1111  		if (inode->i_state & I_FREEING)
1112  			goto out_unlock_inode;
1113  
1114  		/*
1115  		 * If the inode was already on b_dirty/b_io/b_more_io, don't
1116  		 * reposition it (that would break b_dirty time-ordering).
1117  		 */
1118  		if (!was_dirty) {
1119  			bool wakeup_bdi = false;
1120  			bdi = inode_to_bdi(inode);
1121  
1122  			if (bdi_cap_writeback_dirty(bdi)) {
1123  				WARN(!test_bit(BDI_registered, &bdi->state),
1124  				     "bdi-%s not registered\n", bdi->name);
1125  
1126  				/*
1127  				 * If this is the first dirty inode for this
1128  				 * bdi, we have to wake-up the corresponding
1129  				 * bdi thread to make sure background
1130  				 * write-back happens later.
1131  				 */
1132  				if (!wb_has_dirty_io(&bdi->wb))
1133  					wakeup_bdi = true;
1134  			}
1135  
1136  			spin_unlock(&inode->i_lock);
1137  			spin_lock(&bdi->wb.list_lock);
1138  			inode->dirtied_when = jiffies;
1139  			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1140  			spin_unlock(&bdi->wb.list_lock);
1141  
1142  			if (wakeup_bdi)
1143  				bdi_wakeup_thread_delayed(bdi);
1144  			return;
1145  		}
1146  	}
1147  out_unlock_inode:
1148  	spin_unlock(&inode->i_lock);
1149  
1150  }
1151  EXPORT_SYMBOL(__mark_inode_dirty);
1152  
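/*
 * Wait for in-flight writeback against all inodes of @sb to complete.
 * Called by sync_inodes_sb() for data integrity sync.
 */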
1153  static void wait_sb_inodes(struct super_block *sb)
1154  {
1155  	struct inode *inode, *old_inode = NULL;
1156  
1157  	/*
1158  	 * We need to be protected against the filesystem going from
1159  	 * r/o to r/w or vice versa.
1160  	 */
1161  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
1162  
1163  	spin_lock(&inode_sb_list_lock);
1164  
1165  	/*
1166  	 * Data integrity sync. Must wait for all pages under writeback,
1167  	 * because there may have been pages dirtied before our sync
1168  	 * call, but which had writeout started before we write it out.
1169  	 * In which case, the inode may not be on the dirty list, but
1170  	 * we still have to wait for that writeout.
1171  	 */
1172  	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1173  		struct address_space *mapping = inode->i_mapping;
1174  
1175  		spin_lock(&inode->i_lock);
1176  		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1177  		    (mapping->nrpages == 0)) {
1178  			spin_unlock(&inode->i_lock);
1179  			continue;
1180  		}
1181  		__iget(inode);
1182  		spin_unlock(&inode->i_lock);
1183  		spin_unlock(&inode_sb_list_lock);
1184  
1185  		/*
1186  		 * We hold a reference to 'inode' so it couldn't have been
1187  		 * removed from s_inodes list while we dropped the
1188  		 * inode_sb_list_lock.  We cannot iput the inode now as we can
1189  		 * be holding the last reference and we cannot iput it under
1190  		 * inode_sb_list_lock. So we keep the reference and iput it
1191  		 * later.
1192  		 */
1193  		iput(old_inode);
1194  		old_inode = inode;
1195  
1196  		filemap_fdatawait(mapping);
1197  
1198  		cond_resched();
1199  
1200  		spin_lock(&inode_sb_list_lock);
1201  	}
1202  	spin_unlock(&inode_sb_list_lock);
1203  	iput(old_inode);
1204  }
1205  
1206  /**
1207   * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
1208   * @sb: the superblock
1209   * @nr: the number of pages to write
1210   * @reason: reason why some writeback work initiated
1211   *
1212   * Start writeback on some inodes on this super_block. No guarantees are made
1213   * on how many (if any) will be written, and this function does not wait
1214   * for IO completion of submitted IO.
1215   */
1216  void writeback_inodes_sb_nr(struct super_block *sb,
1217  			    unsigned long nr,
1218  			    enum wb_reason reason)
1219  {
1220  	DECLARE_COMPLETION_ONSTACK(done);
1221  	struct wb_writeback_work work = {
1222  		.sb			= sb,
1223  		.sync_mode		= WB_SYNC_NONE,
1224  		.tagged_writepages	= 1,
1225  		.done			= &done,
1226  		.nr_pages		= nr,
1227  		.reason			= reason,
1228  	};
1229  
1230  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
1231  	bdi_queue_work(sb->s_bdi, &work);
1232  	wait_for_completion(&done);
1233  }
1234  EXPORT_SYMBOL(writeback_inodes_sb_nr);
1235  
1236  /**
1237   * writeback_inodes_sb	-	writeback dirty inodes from given super_block
1238   * @sb: the superblock
1239   * @reason: reason why some writeback work was initiated
1240   *
1241   * Start writeback on some inodes on this super_block. No guarantees are made
1242   * on how many (if any) will be written, and this function does not wait
1243   * for IO completion of submitted IO.
1244   */
1245  void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1246  {
1247  	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1248  }
1249  EXPORT_SYMBOL(writeback_inodes_sb);
1250  
1251  /**
1252   * writeback_inodes_sb_if_idle	-	start writeback if none underway
1253   * @sb: the superblock
1254   * @reason: reason why some writeback work was initiated
1255   *
1256   * Invoke writeback_inodes_sb if no writeback is currently underway.
1257   * Returns 1 if writeback was started, 0 if not.
1258   */
1259  int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason)
1260  {
1261  	if (!writeback_in_progress(sb->s_bdi)) {
1262  		down_read(&sb->s_umount);
1263  		writeback_inodes_sb(sb, reason);
1264  		up_read(&sb->s_umount);
1265  		return 1;
1266  	} else
1267  		return 0;
1268  }
1269  EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
1270  
1271  /**
1272   * writeback_inodes_sb_nr_if_idle	-	start writeback if none underway
1273   * @sb: the superblock
1274   * @nr: the number of pages to write
1275   * @reason: reason why some writeback work was initiated
1276   *
1277   * Invoke writeback_inodes_sb if no writeback is currently underway.
1278   * Returns 1 if writeback was started, 0 if not.
1279   */
1280  int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
1281  				   unsigned long nr,
1282  				   enum wb_reason reason)
1283  {
1284  	if (!writeback_in_progress(sb->s_bdi)) {
1285  		down_read(&sb->s_umount);
1286  		writeback_inodes_sb_nr(sb, nr, reason);
1287  		up_read(&sb->s_umount);
1288  		return 1;
1289  	} else
1290  		return 0;
1291  }
1292  EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);
1293  
1294  /**
1295   * sync_inodes_sb	-	sync sb inode pages
1296   * @sb: the superblock
1297   *
1298   * This function writes and waits on any dirty inode belonging to this
1299   * super_block.
1300   */
1301  void sync_inodes_sb(struct super_block *sb)
1302  {
1303  	DECLARE_COMPLETION_ONSTACK(done);
1304  	struct wb_writeback_work work = {
1305  		.sb		= sb,
1306  		.sync_mode	= WB_SYNC_ALL,
1307  		.nr_pages	= LONG_MAX,
1308  		.range_cyclic	= 0,
1309  		.done		= &done,
1310  		.reason		= WB_REASON_SYNC,
1311  	};
1312  
1313  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
1314  
1315  	bdi_queue_work(sb->s_bdi, &work);
1316  	wait_for_completion(&done);
1317  
1318  	wait_sb_inodes(sb);
1319  }
1320  EXPORT_SYMBOL(sync_inodes_sb);
1321  
1322  /**
1323   * write_inode_now	-	write an inode to disk
1324   * @inode: inode to write to disk
1325   * @sync: whether the write should be synchronous or not
1326   *
1327   * This function commits an inode to disk immediately if it is dirty. This is
1328   * primarily needed by knfsd.
1329   *
1330   * The caller must either have a ref on the inode or must have set I_WILL_FREE.
1331   */
1332  int write_inode_now(struct inode *inode, int sync)
1333  {
1334  	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1335  	int ret;
1336  	struct writeback_control wbc = {
1337  		.nr_to_write = LONG_MAX,
1338  		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
1339  		.range_start = 0,
1340  		.range_end = LLONG_MAX,
1341  	};
1342  
1343  	if (!mapping_cap_writeback_dirty(inode->i_mapping))
1344  		wbc.nr_to_write = 0;
1345  
1346  	might_sleep();
1347  	spin_lock(&wb->list_lock);
1348  	spin_lock(&inode->i_lock);
1349  	ret = writeback_single_inode(inode, wb, &wbc);
1350  	spin_unlock(&inode->i_lock);
1351  	spin_unlock(&wb->list_lock);
1352  	return ret;
1353  }
1354  EXPORT_SYMBOL(write_inode_now);
1355  
1356  /**
1357   * sync_inode - write an inode and its pages to disk.
1358   * @inode: the inode to sync
1359   * @wbc: controls the writeback mode
1360   *
1361   * sync_inode() will write an inode and its pages to disk.  It will also
1362   * correctly update the inode on its superblock's dirty inode lists and will
1363   * update inode->i_state.
1364   *
1365   * The caller must have a ref on the inode.
1366   */
1367  int sync_inode(struct inode *inode, struct writeback_control *wbc)
1368  {
1369  	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1370  	int ret;
1371  
1372  	spin_lock(&wb->list_lock);
1373  	spin_lock(&inode->i_lock);
1374  	ret = writeback_single_inode(inode, wb, wbc);
1375  	spin_unlock(&inode->i_lock);
1376  	spin_unlock(&wb->list_lock);
1377  	return ret;
1378  }
1379  EXPORT_SYMBOL(sync_inode);
1380  
1381  /**
1382   * sync_inode_metadata - write an inode to disk
1383   * @inode: the inode to sync
1384   * @wait: wait for I/O to complete.
1385   *
1386   * Write an inode to disk and adjust its dirty state after completion.
1387   *
1388   * Note: only writes the actual inode, no associated data or other metadata.
1389   */
1390  int sync_inode_metadata(struct inode *inode, int wait)
1391  {
1392  	struct writeback_control wbc = {
1393  		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
1394  		.nr_to_write = 0, /* metadata-only */
1395  	};
1396  
1397  	return sync_inode(inode, &wbc);
1398  }
1399  EXPORT_SYMBOL(sync_inode_metadata);
1400