/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Decrement reference counter for data buffer. If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (buffer_freed(bh)) {
		WARN_ON_ONCE(buffer_dirty(bh));
		clear_buffer_freed(bh);
		clear_buffer_mapped(bh);
		clear_buffer_new(bh);
		clear_buffer_req(bh);
		bh->b_bdev = NULL;
		release_buffer_page(bh);
	} else
		put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
		schedule();
		return 0;
	}
	return 1;
}
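
/*
 * Typical call pattern for inverted_lock(), as used later in this file
 * (a sketch, not extra code): the caller holds j_list_lock, so on
 * failure it must take the bh_state lock blockingly in the documented
 * order, retake j_list_lock, and then re-validate the journal head,
 * since both locks were dropped meanwhile:
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 *	(re-check jh->b_transaction, jh->b_jlist etc. before use)
 */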

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	journal_header_t *header;
	int ret;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	header = (journal_header_t *)(bh->b_data);
	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);

	if (journal->j_flags & JFS_BARRIER)
		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
	else
		ret = sync_dirty_buffer(bh);

	put_bh(bh);		/* One for getblk() */
	journal_put_journal_head(descriptor);

	return (ret == -EIO);
}
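
/*
 * For reference, the commit block written above starts with the plain
 * journal_header_t from <linux/jbd.h> (three big-endian words; the
 * rest of the block is unused):
 *
 *	typedef struct journal_header_s {
 *		__be32	h_magic;	(JFS_MAGIC_NUMBER)
 *		__be32	h_blocktype;	(JFS_COMMIT_BLOCK here)
 *		__be32	h_sequence;	(tid of this transaction)
 *	} journal_header_t;
 */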

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
				   int write_op)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/*
		 * Here we write back pagecache data that may be mmaped. Since
		 * we cannot afford to clean the page and set PageWriteback
		 * here due to lock ordering (page lock ranks above transaction
		 * start), the data can change while IO is in flight. Tell the
		 * block layer it should bounce the bio pages if stable data
		 * during write is required.
		 *
		 * We use up our safety reference in submit_bh().
		 */
		_submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE);
	}
}

/*
 *  Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
				       transaction_t *commit_transaction,
				       int write_op)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;
	int err = 0;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (!trylock_buffer(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh) || bh2jh(bh) != jh
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			release_data_buffer(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			release_data_buffer(bh);
		}

		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	trace_jbd_do_submit_data(journal, commit_transaction);
	journal_do_submit_data(wbuf, bufs, write_op);

	return err;
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned int blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	struct blk_plug plug;
	int write_op = WRITE;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		journal_update_sb_log_tail(journal, journal->j_tail_sequence,
					   journal->j_tail, WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd_commit_locking(journal, commit_transaction);
	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction which is going to be started.
	 */
	journal_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	trace_jbd_commit_flushing(journal, commit_transaction);
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
		write_op = WRITE_SYNC;

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	blk_start_plug(&plug);
	err = journal_submit_data_buffers(journal, commit_transaction,
					  write_op);
	blk_finish_plug(&plug);

	/*
	 * Wait for all previously submitted IO to complete.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			spin_lock(&journal->j_list_lock);
		}
		if (unlikely(!buffer_uptodate(bh))) {
			if (!trylock_page(bh->b_page)) {
				spin_unlock(&journal->j_list_lock);
				lock_page(bh->b_page);
				spin_lock(&journal->j_list_lock);
			}
			if (bh->b_page->mapping)
				set_bit(AS_EIO, &bh->b_page->mapping->flags);

			unlock_page(bh->b_page);
			SetPageError(bh->b_page);
			err = -EIO;
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
		    jh->b_transaction == commit_transaction &&
		    jh->b_jlist == BJ_Locked)
			__journal_unfile_buffer(jh);
		jbd_unlock_bh_state(bh);
		release_data_buffer(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: Detected IO errors while flushing file data "
			"on %s\n", bdevname(journal->j_fs_dev, b));
		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
			journal_abort(journal, err);
		err = 0;
	}

	blk_start_plug(&plug);

	journal_write_revoke_records(journal, commit_transaction, write_op);

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	trace_jbd_commit_logging(journal, commit_transaction);
	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		get_bh(jh2bh(jh));

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

		set_buffer_jwrite(jh2bh(jh));
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_buffer_jwrite(jh2bh(new_jh));
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
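
		/*
		 * Shape of the descriptor block being filled in here, per
		 * the jbd on-disk format (a sketch; journal_block_tag_t is
		 * two __be32 fields, t_blocknr and t_flags):
		 *
		 *	journal_header_t  (JFS_DESCRIPTOR_BLOCK, t_tid)
		 *	tag 0             (followed by the 16-byte j_uuid,
		 *	                   since only the first tag omits
		 *	                   JFS_FLAG_SAME_UUID)
		 *	tag 1 .. tag n    (the last one gets
		 *	                   JFS_FLAG_LAST_TAG below)
		 *
		 * The space_left test below keeps room for one more tag
		 * plus a UUID, so an entry never straddles the block end.
		 */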

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				/*
				 * In data=journal mode, here we can end up
				 * writing pagecache data that might be
				 * mmapped. Since we can't afford to clean the
				 * page and set PageWriteback (see the comment
				 * near the other use of _submit_bh()), the
				 * data can change while the write is in
				 * flight.  Tell the block layer to bounce the
				 * bio pages if stable pages are required.
				 */
				_submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this
		 * IO to complete. The barrier must be here so that changes
		 * by journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}
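
	/*
	 * The consumer of the BH_Unshadow wakeup above is
	 * do_get_write_access() in fs/jbd/transaction.c, which waits for
	 * shadowed buffers roughly like this (a sketch from the
	 * counterpart code, abbreviated):
	 *
	 *	wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
	 *	for ( ; ; ) {
	 *		prepare_to_wait(wqh, &wait.wait,
	 *				TASK_UNINTERRUPTIBLE);
	 *		if (jh->b_jlist != BJ_Shadow)
	 *			break;
	 *		schedule();
	 *	}
	 *	finish_wait(wqh, &wait.wait);
	 *
	 * The smp_mb() guarantees that the refile to BJ_Forget is visible
	 * before wake_up_bit() performs its waitqueue check.
	 */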

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 6\n");

	/* All metadata is written, now write commit record and do cleanup */
	spin_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_RECORD;
	spin_unlock(&journal->j_state_lock);

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled by
		 * a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile freed buffer");
		__journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time*3 +
				journal->j_average_commit_time) / 4;
	else
		journal->j_average_commit_time = commit_time;
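
	/*
	 * Worked example of the 3:1 weighting: with an average of 40ms
	 * and a one-off 200ms commit, the new average is
	 * (200*3 + 40) / 4 = 160ms; a following 40ms commit pulls it
	 * back to (40*3 + 160) / 4 = 70ms.  Recent commits dominate,
	 * but one sample never erases the history entirely.
	 */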

	spin_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	trace_jbd_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}
1022