1  /*
2   * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3   * All Rights Reserved.
4   *
5   * This program is free software; you can redistribute it and/or
6   * modify it under the terms of the GNU General Public License as
7   * published by the Free Software Foundation.
8   *
9   * This program is distributed in the hope that it would be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License for more details.
13   *
14   * You should have received a copy of the GNU General Public License
15   * along with this program; if not, write the Free Software Foundation,
16   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17   */
18  #include "xfs.h"
19  #include "xfs_fs.h"
20  #include "xfs_shared.h"
21  #include "xfs_format.h"
22  #include "xfs_log_format.h"
23  #include "xfs_trans_resv.h"
24  #include "xfs_mount.h"
25  #include "xfs_error.h"
26  #include "xfs_trans.h"
27  #include "xfs_trans_priv.h"
28  #include "xfs_log.h"
29  #include "xfs_log_priv.h"
30  #include "xfs_log_recover.h"
31  #include "xfs_inode.h"
32  #include "xfs_trace.h"
33  #include "xfs_fsops.h"
34  #include "xfs_cksum.h"
35  #include "xfs_sysfs.h"
36  #include "xfs_sb.h"
37  
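/* Slab cache backing struct xlog_ticket allocations. */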
38  kmem_zone_t	*xfs_log_ticket_zone;
39  
40  /* Local miscellaneous function prototypes */
41  STATIC int
42  xlog_commit_record(
43  	struct xlog		*log,
44  	struct xlog_ticket	*ticket,
45  	struct xlog_in_core	**iclog,
46  	xfs_lsn_t		*commitlsnp);
47  
48  STATIC struct xlog *
49  xlog_alloc_log(
50  	struct xfs_mount	*mp,
51  	struct xfs_buftarg	*log_target,
52  	xfs_daddr_t		blk_offset,
53  	int			num_bblks);
54  STATIC int
55  xlog_space_left(
56  	struct xlog		*log,
57  	atomic64_t		*head);
58  STATIC int
59  xlog_sync(
60  	struct xlog		*log,
61  	struct xlog_in_core	*iclog);
62  STATIC void
63  xlog_dealloc_log(
64  	struct xlog		*log);
65  
66  /* local state machine functions */
67  STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
68  STATIC void
69  xlog_state_do_callback(
70  	struct xlog		*log,
71  	int			aborted,
72  	struct xlog_in_core	*iclog);
73  STATIC int
74  xlog_state_get_iclog_space(
75  	struct xlog		*log,
76  	int			len,
77  	struct xlog_in_core	**iclog,
78  	struct xlog_ticket	*ticket,
79  	int			*continued_write,
80  	int			*logoffsetp);
81  STATIC int
82  xlog_state_release_iclog(
83  	struct xlog		*log,
84  	struct xlog_in_core	*iclog);
85  STATIC void
86  xlog_state_switch_iclogs(
87  	struct xlog		*log,
88  	struct xlog_in_core	*iclog,
89  	int			eventual_size);
90  STATIC void
91  xlog_state_want_sync(
92  	struct xlog		*log,
93  	struct xlog_in_core	*iclog);
94  
95  STATIC void
96  xlog_grant_push_ail(
97  	struct xlog		*log,
98  	int			need_bytes);
99  STATIC void
100  xlog_regrant_reserve_log_space(
101  	struct xlog		*log,
102  	struct xlog_ticket	*ticket);
103  STATIC void
104  xlog_ungrant_log_space(
105  	struct xlog		*log,
106  	struct xlog_ticket	*ticket);
107  
108  #if defined(DEBUG)
109  STATIC void
110  xlog_verify_dest_ptr(
111  	struct xlog		*log,
112  	void			*ptr);
113  STATIC void
114  xlog_verify_grant_tail(
115  	struct xlog *log);
116  STATIC void
117  xlog_verify_iclog(
118  	struct xlog		*log,
119  	struct xlog_in_core	*iclog,
120  	int			count,
121  	bool                    syncing);
122  STATIC void
123  xlog_verify_tail_lsn(
124  	struct xlog		*log,
125  	struct xlog_in_core	*iclog,
126  	xfs_lsn_t		tail_lsn);
127  #else
128  #define xlog_verify_dest_ptr(a,b)
129  #define xlog_verify_grant_tail(a)
130  #define xlog_verify_iclog(a,b,c,d)
131  #define xlog_verify_tail_lsn(a,b,c)
132  #endif
133  
134  STATIC int
135  xlog_iclogs_empty(
136  	struct xlog		*log);
137  
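/*
 * Subtract "bytes" from a grant head.  The cycle and byte offset are packed
 * into a single 64-bit value, so the update is done with a lockless cmpxchg
 * retry loop; if the byte count underflows we borrow from the previous cycle.
 */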
138  static void
139  xlog_grant_sub_space(
140  	struct xlog		*log,
141  	atomic64_t		*head,
142  	int			bytes)
143  {
144  	int64_t	head_val = atomic64_read(head);
145  	int64_t new, old;
146  
147  	do {
148  		int	cycle, space;
149  
150  		xlog_crack_grant_head_val(head_val, &cycle, &space);
151  
152  		space -= bytes;
153  		if (space < 0) {
154  			space += log->l_logsize;
155  			cycle--;
156  		}
157  
158  		old = head_val;
159  		new = xlog_assign_grant_head_val(cycle, space);
160  		head_val = atomic64_cmpxchg(head, old, new);
161  	} while (head_val != old);
162  }
163  
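/*
 * Add "bytes" to a grant head, wrapping into the next cycle when the byte
 * count would pass the physical end of the log.  Uses the same lockless
 * cmpxchg scheme as xlog_grant_sub_space().  For example (hypothetical
 * numbers): with l_logsize = 1000, a head at (cycle 3, space 900) grown by
 * 250 bytes wraps to (cycle 4, space 150).
 */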
164  static void
165  xlog_grant_add_space(
166  	struct xlog		*log,
167  	atomic64_t		*head,
168  	int			bytes)
169  {
170  	int64_t	head_val = atomic64_read(head);
171  	int64_t new, old;
172  
173  	do {
174  		int		tmp;
175  		int		cycle, space;
176  
177  		xlog_crack_grant_head_val(head_val, &cycle, &space);
178  
179  		tmp = log->l_logsize - space;
180  		if (tmp > bytes)
181  			space += bytes;
182  		else {
183  			space = bytes - tmp;
184  			cycle++;
185  		}
186  
187  		old = head_val;
188  		new = xlog_assign_grant_head_val(cycle, space);
189  		head_val = atomic64_cmpxchg(head, old, new);
190  	} while (head_val != old);
191  }
192  
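/* Initialise a grant head to cycle 1, byte 0 and set up its waiter list and lock. */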
193  STATIC void
194  xlog_grant_head_init(
195  	struct xlog_grant_head	*head)
196  {
197  	xlog_assign_grant_head(&head->grant, 1, 0);
198  	INIT_LIST_HEAD(&head->waiters);
199  	spin_lock_init(&head->lock);
200  }
201  
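/* Wake every task queued on a grant head's waiter list. */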
202  STATIC void
203  xlog_grant_head_wake_all(
204  	struct xlog_grant_head	*head)
205  {
206  	struct xlog_ticket	*tic;
207  
208  	spin_lock(&head->lock);
209  	list_for_each_entry(tic, &head->waiters, t_queue)
210  		wake_up_process(tic->t_task);
211  	spin_unlock(&head->lock);
212  }
213  
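/*
 * Return how many bytes this ticket needs from the given grant head: a
 * single unit for the write head, or unit_res * t_cnt for a permanent
 * reservation on the reserve head.
 */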
214  static inline int
215  xlog_ticket_reservation(
216  	struct xlog		*log,
217  	struct xlog_grant_head	*head,
218  	struct xlog_ticket	*tic)
219  {
220  	if (head == &log->l_write_head) {
221  		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
222  		return tic->t_unit_res;
223  	} else {
224  		if (tic->t_flags & XLOG_TIC_PERM_RESERV)
225  			return tic->t_unit_res * tic->t_cnt;
226  		else
227  			return tic->t_unit_res;
228  	}
229  }
230  
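/*
 * Hand out *free_bytes to the queued waiters in FIFO order.  Returns false
 * as soon as a waiter cannot be satisfied, true if every queued ticket could
 * be woken.  Called with head->lock held.
 */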
231  STATIC bool
232  xlog_grant_head_wake(
233  	struct xlog		*log,
234  	struct xlog_grant_head	*head,
235  	int			*free_bytes)
236  {
237  	struct xlog_ticket	*tic;
238  	int			need_bytes;
239  
240  	list_for_each_entry(tic, &head->waiters, t_queue) {
241  		need_bytes = xlog_ticket_reservation(log, head, tic);
242  		if (*free_bytes < need_bytes)
243  			return false;
244  
245  		*free_bytes -= need_bytes;
246  		trace_xfs_log_grant_wake_up(log, tic);
247  		wake_up_process(tic->t_task);
248  	}
249  
250  	return true;
251  }
252  
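/*
 * Queue the ticket on the grant head and sleep until enough space becomes
 * available, pushing the AIL as needed.  head->lock is dropped across the
 * sleep.  Returns -EIO if the log is forced shut down while waiting.
 */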
253  STATIC int
254  xlog_grant_head_wait(
255  	struct xlog		*log,
256  	struct xlog_grant_head	*head,
257  	struct xlog_ticket	*tic,
258  	int			need_bytes) __releases(&head->lock)
259  					    __acquires(&head->lock)
260  {
261  	list_add_tail(&tic->t_queue, &head->waiters);
262  
263  	do {
264  		if (XLOG_FORCED_SHUTDOWN(log))
265  			goto shutdown;
266  		xlog_grant_push_ail(log, need_bytes);
267  
268  		__set_current_state(TASK_UNINTERRUPTIBLE);
269  		spin_unlock(&head->lock);
270  
271  		XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
272  
273  		trace_xfs_log_grant_sleep(log, tic);
274  		schedule();
275  		trace_xfs_log_grant_wake(log, tic);
276  
277  		spin_lock(&head->lock);
278  		if (XLOG_FORCED_SHUTDOWN(log))
279  			goto shutdown;
280  	} while (xlog_space_left(log, &head->grant) < need_bytes);
281  
282  	list_del_init(&tic->t_queue);
283  	return 0;
284  shutdown:
285  	list_del_init(&tic->t_queue);
286  	return -EIO;
287  }
288  
289  /*
290   * Atomically get the log space required for a log ticket.
291   *
292   * Once a ticket gets put onto head->waiters, it will only return after the
293   * needed reservation is satisfied.
294   *
295   * This function is structured so that it has a lock free fast path. This is
296   * necessary because every new transaction reservation will come through this
297   * path. Hence any lock will be globally hot if we take it unconditionally on
298   * every pass.
299   *
300   * As tickets are only ever moved on and off head->waiters under head->lock, we
301   * only need to take that lock if we are going to add the ticket to the queue
302   * and sleep. We can avoid taking the lock if the ticket was never added to
303   * head->waiters because the t_queue list head will be empty and we hold the
304   * only reference to it so it can safely be checked unlocked.
305   */
306  STATIC int
307  xlog_grant_head_check(
308  	struct xlog		*log,
309  	struct xlog_grant_head	*head,
310  	struct xlog_ticket	*tic,
311  	int			*need_bytes)
312  {
313  	int			free_bytes;
314  	int			error = 0;
315  
316  	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
317  
318  	/*
319  	 * If there are other waiters on the queue then give them a chance at
320  	 * logspace before us.  Wake up the first waiters; if we do not wake
321  	 * up all the waiters, go to sleep waiting for more free space;
322  	 * otherwise try to get some space for this transaction.
323  	 */
324  	*need_bytes = xlog_ticket_reservation(log, head, tic);
325  	free_bytes = xlog_space_left(log, &head->grant);
326  	if (!list_empty_careful(&head->waiters)) {
327  		spin_lock(&head->lock);
328  		if (!xlog_grant_head_wake(log, head, &free_bytes) ||
329  		    free_bytes < *need_bytes) {
330  			error = xlog_grant_head_wait(log, head, tic,
331  						     *need_bytes);
332  		}
333  		spin_unlock(&head->lock);
334  	} else if (free_bytes < *need_bytes) {
335  		spin_lock(&head->lock);
336  		error = xlog_grant_head_wait(log, head, tic, *need_bytes);
337  		spin_unlock(&head->lock);
338  	}
339  
340  	return error;
341  }
342  
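/* Reset the per-ticket region accounting counters. */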
343  static void
344  xlog_tic_reset_res(xlog_ticket_t *tic)
345  {
346  	tic->t_res_num = 0;
347  	tic->t_res_arr_sum = 0;
348  	tic->t_res_num_ophdrs = 0;
349  }
350  
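/*
 * Record an individual region reservation in the ticket's accounting array,
 * spilling into the overflow counter once XLOG_TIC_LEN_MAX entries are used.
 */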
351  static void
352  xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
353  {
354  	if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
355  		/* add to overflow and start again */
356  		tic->t_res_o_flow += tic->t_res_arr_sum;
357  		tic->t_res_num = 0;
358  		tic->t_res_arr_sum = 0;
359  	}
360  
361  	tic->t_res_arr[tic->t_res_num].r_len = len;
362  	tic->t_res_arr[tic->t_res_num].r_type = type;
363  	tic->t_res_arr_sum += len;
364  	tic->t_res_num++;
365  }
366  
367  /*
368   * Replenish the byte reservation required by moving the grant write head.
369   */
370  int
371  xfs_log_regrant(
372  	struct xfs_mount	*mp,
373  	struct xlog_ticket	*tic)
374  {
375  	struct xlog		*log = mp->m_log;
376  	int			need_bytes;
377  	int			error = 0;
378  
379  	if (XLOG_FORCED_SHUTDOWN(log))
380  		return -EIO;
381  
382  	XFS_STATS_INC(mp, xs_try_logspace);
383  
384  	/*
385  	 * This is a new transaction on the ticket, so we need to change the
386  	 * transaction ID so that the next transaction has a different TID in
387  	 * the log. Just add one to the existing tid so that we can see chains
388  	 * of rolling transactions in the log easily.
389  	 */
390  	tic->t_tid++;
391  
392  	xlog_grant_push_ail(log, tic->t_unit_res);
393  
394  	tic->t_curr_res = tic->t_unit_res;
395  	xlog_tic_reset_res(tic);
396  
397  	if (tic->t_cnt > 0)
398  		return 0;
399  
400  	trace_xfs_log_regrant(log, tic);
401  
402  	error = xlog_grant_head_check(log, &log->l_write_head, tic,
403  				      &need_bytes);
404  	if (error)
405  		goto out_error;
406  
407  	xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
408  	trace_xfs_log_regrant_exit(log, tic);
409  	xlog_verify_grant_tail(log);
410  	return 0;
411  
412  out_error:
413  	/*
414  	 * If we are failing, make sure the ticket doesn't have any current
415  	 * reservations.  We don't want to add this back when the ticket/
416  	 * transaction gets cancelled.
417  	 */
418  	tic->t_curr_res = 0;
419  	tic->t_cnt = 0;	/* ungrant will give back unit_res * t_cnt. */
420  	return error;
421  }
422  
423  /*
424   * Reserve log space and return a ticket corresponding the reservation.
425   *
426   * Each reservation is going to reserve extra space for a log record header.
427   * When writes happen to the on-disk log, we don't subtract the length of the
428   * log record header from any reservation.  By wasting space in each
429   * reservation, we prevent over allocation problems.
430   */
431  int
432  xfs_log_reserve(
433  	struct xfs_mount	*mp,
434  	int		 	unit_bytes,
435  	int		 	cnt,
436  	struct xlog_ticket	**ticp,
437  	__uint8_t	 	client,
438  	bool			permanent,
439  	uint		 	t_type)
440  {
441  	struct xlog		*log = mp->m_log;
442  	struct xlog_ticket	*tic;
443  	int			need_bytes;
444  	int			error = 0;
445  
446  	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
447  
448  	if (XLOG_FORCED_SHUTDOWN(log))
449  		return -EIO;
450  
451  	XFS_STATS_INC(mp, xs_try_logspace);
452  
453  	ASSERT(*ticp == NULL);
454  	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
455  				KM_SLEEP | KM_MAYFAIL);
456  	if (!tic)
457  		return -ENOMEM;
458  
459  	tic->t_trans_type = t_type;
460  	*ticp = tic;
461  
462  	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
463  					    : tic->t_unit_res);
464  
465  	trace_xfs_log_reserve(log, tic);
466  
467  	error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
468  				      &need_bytes);
469  	if (error)
470  		goto out_error;
471  
472  	xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
473  	xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
474  	trace_xfs_log_reserve_exit(log, tic);
475  	xlog_verify_grant_tail(log);
476  	return 0;
477  
478  out_error:
479  	/*
480  	 * If we are failing, make sure the ticket doesn't have any current
481  	 * reservations.  We don't want to add this back when the ticket/
482  	 * transaction gets cancelled.
483  	 */
484  	tic->t_curr_res = 0;
485  	tic->t_cnt = 0;	/* ungrant will give back unit_res * t_cnt. */
486  	return error;
487  }
488  
489  
490  /*
491   * NOTES:
492   *
493   *	1. currblock field gets updated at startup and after in-core logs
494   *		are marked with WANT_SYNC.
495   */
496  
497  /*
498   * This routine is called when a user of a log manager ticket is done with
499   * the reservation.  If the ticket was ever used, then a commit record for
500   * the associated transaction is written out as a log operation header with
501   * no data.  The flag XLOG_TIC_INITED is set when the first write occurs with
502   * a given ticket.  If the ticket was one with a permanent reservation, then
503   * a few operations are done differently.  Permanent reservation tickets by
504   * default don't release the reservation.  They just commit the current
505   * transaction with the belief that the reservation is still needed.  A flag
506   * must be passed in before permanent reservations are actually released.
507   * When these types of tickets are not released, they need to be set into
508   * the inited state again.  By doing this, a start record will be written
509   * out when the next write occurs.
510   */
511  xfs_lsn_t
512  xfs_log_done(
513  	struct xfs_mount	*mp,
514  	struct xlog_ticket	*ticket,
515  	struct xlog_in_core	**iclog,
516  	bool			regrant)
517  {
518  	struct xlog		*log = mp->m_log;
519  	xfs_lsn_t		lsn = 0;
520  
521  	if (XLOG_FORCED_SHUTDOWN(log) ||
522  	    /*
523  	     * If nothing was ever written, don't write out commit record.
524  	     * If we get an error, just continue and give back the log ticket.
525  	     */
526  	    (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
527  	     (xlog_commit_record(log, ticket, iclog, &lsn)))) {
528  		lsn = (xfs_lsn_t) -1;
529  		regrant = false;
530  	}
531  
532  
533  	if (!regrant) {
534  		trace_xfs_log_done_nonperm(log, ticket);
535  
536  		/*
537  		 * Release ticket if not permanent reservation or a specific
538  		 * request has been made to release a permanent reservation.
539  		 */
540  		xlog_ungrant_log_space(log, ticket);
541  	} else {
542  		trace_xfs_log_done_perm(log, ticket);
543  
544  		xlog_regrant_reserve_log_space(log, ticket);
545  		/* If this ticket was a permanent reservation and we aren't
546  		 * trying to release it, reset the inited flags; so next time
547  		 * we write, a start record will be written out.
548  		 */
549  		ticket->t_flags |= XLOG_TIC_INITED;
550  	}
551  
552  	xfs_log_ticket_put(ticket);
553  	return lsn;
554  }
555  
556  /*
557   * Attaches a new iclog I/O completion callback routine during
558   * transaction commit.  If the log is in error state, a non-zero
559   * return code is handed back and the caller is responsible for
560   * executing the callback at an appropriate time.
561   */
562  int
563  xfs_log_notify(
564  	struct xfs_mount	*mp,
565  	struct xlog_in_core	*iclog,
566  	xfs_log_callback_t	*cb)
567  {
568  	int	abortflg;
569  
570  	spin_lock(&iclog->ic_callback_lock);
571  	abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
572  	if (!abortflg) {
573  		ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
574  			      (iclog->ic_state == XLOG_STATE_WANT_SYNC));
575  		cb->cb_next = NULL;
576  		*(iclog->ic_callback_tail) = cb;
577  		iclog->ic_callback_tail = &(cb->cb_next);
578  	}
579  	spin_unlock(&iclog->ic_callback_lock);
580  	return abortflg;
581  }
582  
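/*
 * Release a reference to an iclog on behalf of an external caller and shut
 * the filesystem down if the release fails.
 */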
583  int
584  xfs_log_release_iclog(
585  	struct xfs_mount	*mp,
586  	struct xlog_in_core	*iclog)
587  {
588  	if (xlog_state_release_iclog(mp->m_log, iclog)) {
589  		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
590  		return -EIO;
591  	}
592  
593  	return 0;
594  }
595  
596  /*
597   * Mount a log filesystem
598   *
599   * mp		- ubiquitous xfs mount point structure
600   * log_target	- buftarg of on-disk log device
601   * blk_offset	- Start block # where block size is 512 bytes (BBSIZE)
602   * num_bblocks	- Number of BBSIZE blocks in on-disk log
603   *
604   * Return error or zero.
605   */
606  int
607  xfs_log_mount(
608  	xfs_mount_t	*mp,
609  	xfs_buftarg_t	*log_target,
610  	xfs_daddr_t	blk_offset,
611  	int		num_bblks)
612  {
613  	int		error = 0;
614  	int		min_logfsbs;
615  
616  	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
617  		xfs_notice(mp, "Mounting V%d Filesystem",
618  			   XFS_SB_VERSION_NUM(&mp->m_sb));
619  	} else {
620  		xfs_notice(mp,
621  "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
622  			   XFS_SB_VERSION_NUM(&mp->m_sb));
623  		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
624  	}
625  
626  	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
627  	if (IS_ERR(mp->m_log)) {
628  		error = PTR_ERR(mp->m_log);
629  		goto out;
630  	}
631  
632  	/*
633  	 * Validate the given log space and drop a critical message via syslog
634  	 * if the log size is too small, as that could lead to unexpected
635  	 * situations in the transaction log space reservation stage.
636  	 *
637  	 * Note: we can't just reject the mount if the validation fails.  This
638  	 * would mean that people would have to downgrade their kernel just to
639  	 * remedy the situation as there is no way to grow the log (short of
640  	 * black magic surgery with xfs_db).
641  	 *
642  	 * We can, however, reject mounts for CRC format filesystems, as the
643  	 * mkfs binary being used to make the filesystem should never create a
644  	 * filesystem with a log that is too small.
645  	 */
646  	min_logfsbs = xfs_log_calc_minimum_size(mp);
647  
648  	if (mp->m_sb.sb_logblocks < min_logfsbs) {
649  		xfs_warn(mp,
650  		"Log size %d blocks too small, minimum size is %d blocks",
651  			 mp->m_sb.sb_logblocks, min_logfsbs);
652  		error = -EINVAL;
653  	} else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
654  		xfs_warn(mp,
655  		"Log size %d blocks too large, maximum size is %lld blocks",
656  			 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
657  		error = -EINVAL;
658  	} else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
659  		xfs_warn(mp,
660  		"log size %lld bytes too large, maximum size is %lld bytes",
661  			 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
662  			 XFS_MAX_LOG_BYTES);
663  		error = -EINVAL;
664  	}
665  	if (error) {
666  		if (xfs_sb_version_hascrc(&mp->m_sb)) {
667  			xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
668  			ASSERT(0);
669  			goto out_free_log;
670  		}
671  		xfs_crit(mp, "Log size out of supported range.");
672  		xfs_crit(mp,
673  "Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
674  	}
675  
676  	/*
677  	 * Initialize the AIL now we have a log.
678  	 */
679  	error = xfs_trans_ail_init(mp);
680  	if (error) {
681  		xfs_warn(mp, "AIL initialisation failed: error %d", error);
682  		goto out_free_log;
683  	}
684  	mp->m_log->l_ailp = mp->m_ail;
685  
686  	/*
687  	 * skip log recovery on a norecovery mount.  pretend it all
688  	 * just worked.
689  	 */
690  	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
691  		int	readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
692  
693  		if (readonly)
694  			mp->m_flags &= ~XFS_MOUNT_RDONLY;
695  
696  		error = xlog_recover(mp->m_log);
697  
698  		if (readonly)
699  			mp->m_flags |= XFS_MOUNT_RDONLY;
700  		if (error) {
701  			xfs_warn(mp, "log mount/recovery failed: error %d",
702  				error);
703  			xlog_recover_cancel(mp->m_log);
704  			goto out_destroy_ail;
705  		}
706  	}
707  
708  	error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
709  			       "log");
710  	if (error)
711  		goto out_destroy_ail;
712  
713  	/* Normal transactions can now occur */
714  	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
715  
716  	/*
717  	 * Now the log has been fully initialised and we know where our
718  	 * space grant counters are, we can initialise the permanent ticket
719  	 * needed for delayed logging to work.
720  	 */
721  	xlog_cil_init_post_recovery(mp->m_log);
722  
723  	return 0;
724  
725  out_destroy_ail:
726  	xfs_trans_ail_destroy(mp);
727  out_free_log:
728  	xlog_dealloc_log(mp->m_log);
729  out:
730  	return error;
731  }
732  
733  /*
734   * Finish the recovery of the file system.  This is separate from the
735   * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
736   * in the root and real-time bitmap inodes between calling xfs_log_mount() and
737   * here.
738   *
739   * If we finish recovery successfully, start the background log work. If we are
740   * not doing recovery, then we have a RO filesystem and we don't need to start
741   * it.
742   */
743  int
744  xfs_log_mount_finish(
745  	struct xfs_mount	*mp)
746  {
747  	int	error = 0;
748  
749  	if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
750  		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
751  		return 0;
752  	}
753  
754  	error = xlog_recover_finish(mp->m_log);
755  	if (!error)
756  		xfs_log_work_queue(mp);
757  
758  	return error;
759  }
760  
761  /*
762   * The mount has failed. Cancel the recovery if it hasn't completed and destroy
763   * the log.
764   */
765  int
766  xfs_log_mount_cancel(
767  	struct xfs_mount	*mp)
768  {
769  	int			error;
770  
771  	error = xlog_recover_cancel(mp->m_log);
772  	xfs_log_unmount(mp);
773  
774  	return error;
775  }
776  
777  /*
778   * Final log writes as part of unmount.
779   *
780   * Mark the filesystem clean as unmount happens.  Note that during relocation
781   * this routine needs to be executed as part of source-bag while the
782   * deallocation must not be done until source-end.
783   */
784  
785  /*
786   * Unmount record used to have a string "Unmount filesystem--" in the
787   * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
788   * We just write the magic number now since that particular field isn't
789   * currently architecture converted and "Unmount" is a bit foo.
790   * As far as I know, there weren't any dependencies on the old behaviour.
791   */
792  
793  int
794  xfs_log_unmount_write(xfs_mount_t *mp)
795  {
796  	struct xlog	 *log = mp->m_log;
797  	xlog_in_core_t	 *iclog;
798  #ifdef DEBUG
799  	xlog_in_core_t	 *first_iclog;
800  #endif
801  	xlog_ticket_t	*tic = NULL;
802  	xfs_lsn_t	 lsn;
803  	int		 error;
804  
805  	/*
806  	 * Don't write out unmount record on read-only mounts.
807  	 * Or, if we are doing a forced umount (typically because of IO errors).
808  	 */
809  	if (mp->m_flags & XFS_MOUNT_RDONLY)
810  		return 0;
811  
812  	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
813  	ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
814  
815  #ifdef DEBUG
816  	first_iclog = iclog = log->l_iclog;
817  	do {
818  		if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
819  			ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
820  			ASSERT(iclog->ic_offset == 0);
821  		}
822  		iclog = iclog->ic_next;
823  	} while (iclog != first_iclog);
824  #endif
825  	if (! (XLOG_FORCED_SHUTDOWN(log))) {
826  		error = xfs_log_reserve(mp, 600, 1, &tic,
827  					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
828  		if (!error) {
829  			/* the data section must be 32 bit size aligned */
830  			struct {
831  			    __uint16_t magic;
832  			    __uint16_t pad1;
833  			    __uint32_t pad2; /* may as well make it 64 bits */
834  			} magic = {
835  				.magic = XLOG_UNMOUNT_TYPE,
836  			};
837  			struct xfs_log_iovec reg = {
838  				.i_addr = &magic,
839  				.i_len = sizeof(magic),
840  				.i_type = XLOG_REG_TYPE_UNMOUNT,
841  			};
842  			struct xfs_log_vec vec = {
843  				.lv_niovecs = 1,
844  				.lv_iovecp = &reg,
845  			};
846  
847  			/* remove inited flag, and account for space used */
848  			tic->t_flags = 0;
849  			tic->t_curr_res -= sizeof(magic);
850  			error = xlog_write(log, &vec, tic, &lsn,
851  					   NULL, XLOG_UNMOUNT_TRANS);
852  			/*
853  			 * At this point, we're umounting anyway,
854  			 * so there's no point in transitioning log state
855  			 * to IOERROR. Just continue...
856  			 */
857  		}
858  
859  		if (error)
860  			xfs_alert(mp, "%s: unmount record failed", __func__);
861  
862  
863  		spin_lock(&log->l_icloglock);
864  		iclog = log->l_iclog;
865  		atomic_inc(&iclog->ic_refcnt);
866  		xlog_state_want_sync(log, iclog);
867  		spin_unlock(&log->l_icloglock);
868  		error = xlog_state_release_iclog(log, iclog);
869  
870  		spin_lock(&log->l_icloglock);
871  		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
872  		      iclog->ic_state == XLOG_STATE_DIRTY)) {
873  			if (!XLOG_FORCED_SHUTDOWN(log)) {
874  				xlog_wait(&iclog->ic_force_wait,
875  							&log->l_icloglock);
876  			} else {
877  				spin_unlock(&log->l_icloglock);
878  			}
879  		} else {
880  			spin_unlock(&log->l_icloglock);
881  		}
882  		if (tic) {
883  			trace_xfs_log_umount_write(log, tic);
884  			xlog_ungrant_log_space(log, tic);
885  			xfs_log_ticket_put(tic);
886  		}
887  	} else {
888  		/*
889  		 * We're already in forced_shutdown mode, couldn't
890  		 * even attempt to write out the unmount transaction.
891  		 *
892  		 * Go through the motions of sync'ing and releasing
893  		 * the iclog, even though no I/O will actually happen,
894  		 * we need to wait for other log I/Os that may already
895  		 * be in progress.  Do this as a separate section of
896  		 * code so we'll know if we ever get stuck here that
897  		 * we're in this odd situation of trying to unmount
898  		 * a file system that went into forced_shutdown as
899  		 * the result of an unmount..
900  		 */
901  		spin_lock(&log->l_icloglock);
902  		iclog = log->l_iclog;
903  		atomic_inc(&iclog->ic_refcnt);
904  
905  		xlog_state_want_sync(log, iclog);
906  		spin_unlock(&log->l_icloglock);
907  		error =  xlog_state_release_iclog(log, iclog);
908  
909  		spin_lock(&log->l_icloglock);
910  
911  		if ( ! (   iclog->ic_state == XLOG_STATE_ACTIVE
912  			|| iclog->ic_state == XLOG_STATE_DIRTY
913  			|| iclog->ic_state == XLOG_STATE_IOERROR) ) {
914  
915  				xlog_wait(&iclog->ic_force_wait,
916  							&log->l_icloglock);
917  		} else {
918  			spin_unlock(&log->l_icloglock);
919  		}
920  	}
921  
922  	return error;
923  }	/* xfs_log_unmount_write */
924  
925  /*
926   * Empty the log for unmount/freeze.
927   *
928   * To do this, we first need to shut down the background log work so it is not
929   * trying to cover the log as we clean up. We then need to unpin all objects in
930   * the log so we can then flush them out. Once they have completed their IO and
931   * run the callbacks removing themselves from the AIL, we can write the unmount
932   * record.
933   */
934  void
935  xfs_log_quiesce(
936  	struct xfs_mount	*mp)
937  {
938  	cancel_delayed_work_sync(&mp->m_log->l_work);
939  	xfs_log_force(mp, XFS_LOG_SYNC);
940  
941  	/*
942  	 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
943  	 * will push it, xfs_wait_buftarg() will not wait for it. Further,
944  	 * xfs_buf_iowait() cannot be used because it was pushed with the
945  	 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
946  	 * the IO to complete.
947  	 */
948  	xfs_ail_push_all_sync(mp->m_ail);
949  	xfs_wait_buftarg(mp->m_ddev_targp);
950  	xfs_buf_lock(mp->m_sb_bp);
951  	xfs_buf_unlock(mp->m_sb_bp);
952  
953  	xfs_log_unmount_write(mp);
954  }
955  
956  /*
957   * Shut down and release the AIL and Log.
958   *
959   * During unmount, we need to ensure we flush all the dirty metadata objects
960   * from the AIL so that the log is empty before we write the unmount record to
961   * the log. Once this is done, we can tear down the AIL and the log.
962   */
963  void
964  xfs_log_unmount(
965  	struct xfs_mount	*mp)
966  {
967  	xfs_log_quiesce(mp);
968  
969  	xfs_trans_ail_destroy(mp);
970  
971  	xfs_sysfs_del(&mp->m_log->l_kobj);
972  
973  	xlog_dealloc_log(mp->m_log);
974  }
975  
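/* Initialise the generic fields of a log item before first use. */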
976  void
977  xfs_log_item_init(
978  	struct xfs_mount	*mp,
979  	struct xfs_log_item	*item,
980  	int			type,
981  	const struct xfs_item_ops *ops)
982  {
983  	item->li_mountp = mp;
984  	item->li_ailp = mp->m_ail;
985  	item->li_type = type;
986  	item->li_ops = ops;
987  	item->li_lv = NULL;
988  
989  	INIT_LIST_HEAD(&item->li_ail);
990  	INIT_LIST_HEAD(&item->li_cil);
991  }
992  
993  /*
994   * Wake up processes waiting for log space after we have moved the log tail.
995   */
996  void
997  xfs_log_space_wake(
998  	struct xfs_mount	*mp)
999  {
1000  	struct xlog		*log = mp->m_log;
1001  	int			free_bytes;
1002  
1003  	if (XLOG_FORCED_SHUTDOWN(log))
1004  		return;
1005  
1006  	if (!list_empty_careful(&log->l_write_head.waiters)) {
1007  		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
1008  
1009  		spin_lock(&log->l_write_head.lock);
1010  		free_bytes = xlog_space_left(log, &log->l_write_head.grant);
1011  		xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
1012  		spin_unlock(&log->l_write_head.lock);
1013  	}
1014  
1015  	if (!list_empty_careful(&log->l_reserve_head.waiters)) {
1016  		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
1017  
1018  		spin_lock(&log->l_reserve_head.lock);
1019  		free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
1020  		xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
1021  		spin_unlock(&log->l_reserve_head.lock);
1022  	}
1023  }
1024  
1025  /*
1026   * Determine if we have a transaction that has gone to disk that needs to be
1027   * covered. To begin the transition to the idle state firstly the log needs to
1028   * be idle. That means the CIL, the AIL and the iclogs need to be empty before
1029   * we start attempting to cover the log.
1030   *
1031   * Only if we are then in a state where covering is needed is the caller
1032   * informed that dummy transactions are required to move the log into the idle
1033   * state.
1034   *
1035   * If there are any items in the AIL or CIL, then we do not want to attempt to
1036   * cover the log as we may be in a situation where there isn't log space
1037   * available to run a dummy transaction and this can lead to deadlocks when the
1038   * tail of the log is pinned by an item that is modified in the CIL.  Hence
1039   * there's no point in running a dummy transaction at this point because we
1040   * can't start trying to idle the log until both the CIL and AIL are empty.
1041   */
1042  int
1043  xfs_log_need_covered(xfs_mount_t *mp)
1044  {
1045  	struct xlog	*log = mp->m_log;
1046  	int		needed = 0;
1047  
1048  	if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
1049  		return 0;
1050  
1051  	if (!xlog_cil_empty(log))
1052  		return 0;
1053  
1054  	spin_lock(&log->l_icloglock);
1055  	switch (log->l_covered_state) {
1056  	case XLOG_STATE_COVER_DONE:
1057  	case XLOG_STATE_COVER_DONE2:
1058  	case XLOG_STATE_COVER_IDLE:
1059  		break;
1060  	case XLOG_STATE_COVER_NEED:
1061  	case XLOG_STATE_COVER_NEED2:
1062  		if (xfs_ail_min_lsn(log->l_ailp))
1063  			break;
1064  		if (!xlog_iclogs_empty(log))
1065  			break;
1066  
1067  		needed = 1;
1068  		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
1069  			log->l_covered_state = XLOG_STATE_COVER_DONE;
1070  		else
1071  			log->l_covered_state = XLOG_STATE_COVER_DONE2;
1072  		break;
1073  	default:
1074  		needed = 1;
1075  		break;
1076  	}
1077  	spin_unlock(&log->l_icloglock);
1078  	return needed;
1079  }
1080  
1081  /*
1082   * We may be holding the log iclog lock upon entering this routine.
1083   */
1084  xfs_lsn_t
1085  xlog_assign_tail_lsn_locked(
1086  	struct xfs_mount	*mp)
1087  {
1088  	struct xlog		*log = mp->m_log;
1089  	struct xfs_log_item	*lip;
1090  	xfs_lsn_t		tail_lsn;
1091  
1092  	assert_spin_locked(&mp->m_ail->xa_lock);
1093  
1094  	/*
1095  	 * To make sure we always have a valid LSN for the log tail we keep
1096  	 * track of the last LSN which was committed in log->l_last_sync_lsn,
1097  	 * and use that when the AIL was empty.
1098  	 */
1099  	lip = xfs_ail_min(mp->m_ail);
1100  	if (lip)
1101  		tail_lsn = lip->li_lsn;
1102  	else
1103  		tail_lsn = atomic64_read(&log->l_last_sync_lsn);
1104  	trace_xfs_log_assign_tail_lsn(log, tail_lsn);
1105  	atomic64_set(&log->l_tail_lsn, tail_lsn);
1106  	return tail_lsn;
1107  }
1108  
1109  xfs_lsn_t
1110  xlog_assign_tail_lsn(
1111  	struct xfs_mount	*mp)
1112  {
1113  	xfs_lsn_t		tail_lsn;
1114  
1115  	spin_lock(&mp->m_ail->xa_lock);
1116  	tail_lsn = xlog_assign_tail_lsn_locked(mp);
1117  	spin_unlock(&mp->m_ail->xa_lock);
1118  
1119  	return tail_lsn;
1120  }
1121  
1122  /*
1123   * Return the space in the log between the tail and the head.  The head
1124   * is passed in the cycle/bytes formal parms.  In the special case where
1125   * the reserve head has wrapped past the tail, this calculation is no
1126   * longer valid.  In this case, just return 0 which means there is no space
1127   * in the log.  This works for all places where this function is called
1128   * with the reserve head.  Of course, if the write head were to ever
1129   * wrap the tail, we should blow up.  Rather than catch this case here,
1130   * we depend on other ASSERTions in other parts of the code.   XXXmiken
1131   *
1132   * This code also handles the case where the reservation head is behind
1133   * the tail.  The details of this case are described below, but the end
1134   * result is that we return the size of the log as the amount of space left.
1135   */
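/*
 * Illustrative example (hypothetical numbers): with a 1 MiB log
 * (l_logsize = 1048576), a tail at cycle 8, block 100 (51200 bytes) and a
 * grant head at cycle 8, byte 200000, head and tail are in the same cycle
 * and the head is ahead, so free_bytes = 1048576 - (200000 - 51200) = 899776.
 */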
1136  STATIC int
1137  xlog_space_left(
1138  	struct xlog	*log,
1139  	atomic64_t	*head)
1140  {
1141  	int		free_bytes;
1142  	int		tail_bytes;
1143  	int		tail_cycle;
1144  	int		head_cycle;
1145  	int		head_bytes;
1146  
1147  	xlog_crack_grant_head(head, &head_cycle, &head_bytes);
1148  	xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
1149  	tail_bytes = BBTOB(tail_bytes);
1150  	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
1151  		free_bytes = log->l_logsize - (head_bytes - tail_bytes);
1152  	else if (tail_cycle + 1 < head_cycle)
1153  		return 0;
1154  	else if (tail_cycle < head_cycle) {
1155  		ASSERT(tail_cycle == (head_cycle - 1));
1156  		free_bytes = tail_bytes - head_bytes;
1157  	} else {
1158  		/*
1159  		 * The reservation head is behind the tail.
1160  		 * In this case we just want to return the size of the
1161  		 * log as the amount of space left.
1162  		 */
1163  		xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
1164  		xfs_alert(log->l_mp,
1165  			  "  tail_cycle = %d, tail_bytes = %d",
1166  			  tail_cycle, tail_bytes);
1167  		xfs_alert(log->l_mp,
1168  			  "  GH   cycle = %d, GH   bytes = %d",
1169  			  head_cycle, head_bytes);
1170  		ASSERT(0);
1171  		free_bytes = log->l_logsize;
1172  	}
1173  	return free_bytes;
1174  }
1175  
1176  
1177  /*
1178   * Log function which is called when an io completes.
1179   *
1180   * The log manager needs its own routine, in order to control what
1181   * happens with the buffer after the write completes.
1182   */
1183  void
1184  xlog_iodone(xfs_buf_t *bp)
1185  {
1186  	struct xlog_in_core	*iclog = bp->b_fspriv;
1187  	struct xlog		*l = iclog->ic_log;
1188  	int			aborted = 0;
1189  
1190  	/*
1191  	 * Race to shutdown the filesystem if we see an error.
1192  	 */
1193  	if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
1194  			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
1195  		xfs_buf_ioerror_alert(bp, __func__);
1196  		xfs_buf_stale(bp);
1197  		xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
1198  		/*
1199  		 * This flag will be propagated to the trans-committed
1200  		 * callback routines to let them know that the log-commit
1201  		 * didn't succeed.
1202  		 */
1203  		aborted = XFS_LI_ABORTED;
1204  	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
1205  		aborted = XFS_LI_ABORTED;
1206  	}
1207  
1208  	/* log I/O is always issued ASYNC */
1209  	ASSERT(XFS_BUF_ISASYNC(bp));
1210  	xlog_state_done_syncing(iclog, aborted);
1211  
1212  	/*
1213  	 * drop the buffer lock now that we are done. Nothing references
1214  	 * the buffer after this, so an unmount waiting on this lock can now
1215  	 * tear it down safely. As such, it is unsafe to reference the buffer
1216  	 * (bp) after the unlock as we could race with it being freed.
1217  	 */
1218  	xfs_buf_unlock(bp);
1219  }
1220  
1221  /*
1222   * Return size of each in-core log record buffer.
1223   *
1224   * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
1225   *
1226   * If the filesystem blocksize is too large, we may need to choose a
1227   * larger size since the directory code currently logs entire blocks.
1228   */
1229  
1230  STATIC void
1231  xlog_get_iclog_buffer_size(
1232  	struct xfs_mount	*mp,
1233  	struct xlog		*log)
1234  {
1235  	int size;
1236  	int xhdrs;
1237  
1238  	if (mp->m_logbufs <= 0)
1239  		log->l_iclog_bufs = XLOG_MAX_ICLOGS;
1240  	else
1241  		log->l_iclog_bufs = mp->m_logbufs;
1242  
1243  	/*
1244  	 * Buffer size passed in from mount system call.
1245  	 */
1246  	if (mp->m_logbsize > 0) {
1247  		size = log->l_iclog_size = mp->m_logbsize;
1248  		log->l_iclog_size_log = 0;
1249  		while (size != 1) {
1250  			log->l_iclog_size_log++;
1251  			size >>= 1;
1252  		}
1253  
1254  		if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1255  			/* # headers = size / 32k
1256  			 * one header holds cycles from 32k of data
1257  			 */
1258  
1259  			xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
1260  			if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
1261  				xhdrs++;
1262  			log->l_iclog_hsize = xhdrs << BBSHIFT;
1263  			log->l_iclog_heads = xhdrs;
1264  		} else {
1265  			ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
1266  			log->l_iclog_hsize = BBSIZE;
1267  			log->l_iclog_heads = 1;
1268  		}
1269  		goto done;
1270  	}
1271  
1272  	/* All machines use 32kB buffers by default. */
1273  	log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1274  	log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1275  
1276  	/* the default log size is 16k or 32k which is one header sector */
1277  	log->l_iclog_hsize = BBSIZE;
1278  	log->l_iclog_heads = 1;
1279  
1280  done:
1281  	/* are we being asked to make the sizes selected above visible? */
1282  	if (mp->m_logbufs == 0)
1283  		mp->m_logbufs = log->l_iclog_bufs;
1284  	if (mp->m_logbsize == 0)
1285  		mp->m_logbsize = log->l_iclog_size;
1286  }	/* xlog_get_iclog_buffer_size */
1287  
1288  
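/*
 * (Re)arm the background log worker to run after the configured sync
 * interval (xfs_syncd_centisecs is in centiseconds, hence the * 10 to
 * convert to milliseconds).
 */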
1289  void
1290  xfs_log_work_queue(
1291  	struct xfs_mount        *mp)
1292  {
1293  	queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
1294  				msecs_to_jiffies(xfs_syncd_centisecs * 10));
1295  }
1296  
1297  /*
1298   * Every sync period we need to unpin all items in the AIL and push them to
1299   * disk. If there is nothing dirty, then we might need to cover the log to
1300   * indicate that the filesystem is idle.
1301   */
1302  void
1303  xfs_log_worker(
1304  	struct work_struct	*work)
1305  {
1306  	struct xlog		*log = container_of(to_delayed_work(work),
1307  						struct xlog, l_work);
1308  	struct xfs_mount	*mp = log->l_mp;
1309  
1310  	/* dgc: errors ignored - not fatal and nowhere to report them */
1311  	if (xfs_log_need_covered(mp)) {
1312  		/*
1313  		 * Dump a transaction into the log that contains no real change.
1314  		 * This is needed to stamp the current tail LSN into the log
1315  		 * during the covering operation.
1316  		 *
1317  		 * We cannot use an inode here for this - that will push dirty
1318  		 * state back up into the VFS and then periodic inode flushing
1319  		 * will prevent log covering from making progress. Hence we
1320  		 * synchronously log the superblock instead to ensure the
1321  		 * superblock is immediately unpinned and can be written back.
1322  		 */
1323  		xfs_sync_sb(mp, true);
1324  	} else
1325  		xfs_log_force(mp, 0);
1326  
1327  	/* start pushing all the metadata that is currently dirty */
1328  	xfs_ail_push_all(mp->m_ail);
1329  
1330  	/* queue us up again */
1331  	xfs_log_work_queue(mp);
1332  }
1333  
1334  /*
1335   * This routine initializes some of the log structure for a given mount point.
1336   * Its primary purpose is to fill in enough, so recovery can occur.  However,
1337   * some other stuff may be filled in too.
1338   */
1339  STATIC struct xlog *
1340  xlog_alloc_log(
1341  	struct xfs_mount	*mp,
1342  	struct xfs_buftarg	*log_target,
1343  	xfs_daddr_t		blk_offset,
1344  	int			num_bblks)
1345  {
1346  	struct xlog		*log;
1347  	xlog_rec_header_t	*head;
1348  	xlog_in_core_t		**iclogp;
1349  	xlog_in_core_t		*iclog, *prev_iclog=NULL;
1350  	xfs_buf_t		*bp;
1351  	int			i;
1352  	int			error = -ENOMEM;
1353  	uint			log2_size = 0;
1354  
1355  	log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
1356  	if (!log) {
1357  		xfs_warn(mp, "Log allocation failed: No memory!");
1358  		goto out;
1359  	}
1360  
1361  	log->l_mp	   = mp;
1362  	log->l_targ	   = log_target;
1363  	log->l_logsize     = BBTOB(num_bblks);
1364  	log->l_logBBstart  = blk_offset;
1365  	log->l_logBBsize   = num_bblks;
1366  	log->l_covered_state = XLOG_STATE_COVER_IDLE;
1367  	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;
1368  	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1369  
1370  	log->l_prev_block  = -1;
1371  	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1372  	xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
1373  	xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
1374  	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
1375  
1376  	xlog_grant_head_init(&log->l_reserve_head);
1377  	xlog_grant_head_init(&log->l_write_head);
1378  
1379  	error = -EFSCORRUPTED;
1380  	if (xfs_sb_version_hassector(&mp->m_sb)) {
1381  	        log2_size = mp->m_sb.sb_logsectlog;
1382  		if (log2_size < BBSHIFT) {
1383  			xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1384  				log2_size, BBSHIFT);
1385  			goto out_free_log;
1386  		}
1387  
1388  	        log2_size -= BBSHIFT;
1389  		if (log2_size > mp->m_sectbb_log) {
1390  			xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1391  				log2_size, mp->m_sectbb_log);
1392  			goto out_free_log;
1393  		}
1394  
1395  		/* for larger sector sizes, must have v2 or external log */
1396  		if (log2_size && log->l_logBBstart > 0 &&
1397  			    !xfs_sb_version_haslogv2(&mp->m_sb)) {
1398  			xfs_warn(mp,
1399  		"log sector size (0x%x) invalid for configuration.",
1400  				log2_size);
1401  			goto out_free_log;
1402  		}
1403  	}
1404  	log->l_sectBBsize = 1 << log2_size;
1405  
1406  	xlog_get_iclog_buffer_size(mp, log);
1407  
1408  	/*
1409  	 * Use a NULL block for the extra log buffer used during splits so that
1410  	 * it will trigger errors if we ever try to do IO on it without first
1411  	 * having set it up properly.
1412  	 */
1413  	error = -ENOMEM;
1414  	bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
1415  			   BTOBB(log->l_iclog_size), 0);
1416  	if (!bp)
1417  		goto out_free_log;
1418  
1419  	/*
1420  	 * The iclogbuf buffer locks are held over IO but we are not going to do
1421  	 * IO yet.  Hence unlock the buffer so that the log IO path can grab it
1422  	 * when appropriate.
1423  	 */
1424  	ASSERT(xfs_buf_islocked(bp));
1425  	xfs_buf_unlock(bp);
1426  
1427  	/* use high priority wq for log I/O completion */
1428  	bp->b_ioend_wq = mp->m_log_workqueue;
1429  	bp->b_iodone = xlog_iodone;
1430  	log->l_xbuf = bp;
1431  
1432  	spin_lock_init(&log->l_icloglock);
1433  	init_waitqueue_head(&log->l_flush_wait);
1434  
1435  	iclogp = &log->l_iclog;
1436  	/*
1437  	 * The amount of memory to allocate for the iclog structure is
1438  	 * rather funky due to the way the structure is defined.  It is
1439  	 * done this way so that we can use different sizes for machines
1440  	 * with different amounts of memory.  See the definition of
1441  	 * xlog_in_core_t in xfs_log_priv.h for details.
1442  	 */
1443  	ASSERT(log->l_iclog_size >= 4096);
1444  	for (i=0; i < log->l_iclog_bufs; i++) {
1445  		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
1446  		if (!*iclogp)
1447  			goto out_free_iclog;
1448  
1449  		iclog = *iclogp;
1450  		iclog->ic_prev = prev_iclog;
1451  		prev_iclog = iclog;
1452  
1453  		bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1454  						BTOBB(log->l_iclog_size), 0);
1455  		if (!bp)
1456  			goto out_free_iclog;
1457  
1458  		ASSERT(xfs_buf_islocked(bp));
1459  		xfs_buf_unlock(bp);
1460  
1461  		/* use high priority wq for log I/O completion */
1462  		bp->b_ioend_wq = mp->m_log_workqueue;
1463  		bp->b_iodone = xlog_iodone;
1464  		iclog->ic_bp = bp;
1465  		iclog->ic_data = bp->b_addr;
1466  #ifdef DEBUG
1467  		log->l_iclog_bak[i] = &iclog->ic_header;
1468  #endif
1469  		head = &iclog->ic_header;
1470  		memset(head, 0, sizeof(xlog_rec_header_t));
1471  		head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1472  		head->h_version = cpu_to_be32(
1473  			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1474  		head->h_size = cpu_to_be32(log->l_iclog_size);
1475  		/* new fields */
1476  		head->h_fmt = cpu_to_be32(XLOG_FMT);
1477  		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1478  
1479  		iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
1480  		iclog->ic_state = XLOG_STATE_ACTIVE;
1481  		iclog->ic_log = log;
1482  		atomic_set(&iclog->ic_refcnt, 0);
1483  		spin_lock_init(&iclog->ic_callback_lock);
1484  		iclog->ic_callback_tail = &(iclog->ic_callback);
1485  		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1486  
1487  		init_waitqueue_head(&iclog->ic_force_wait);
1488  		init_waitqueue_head(&iclog->ic_write_wait);
1489  
1490  		iclogp = &iclog->ic_next;
1491  	}
1492  	*iclogp = log->l_iclog;			/* complete ring */
1493  	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
1494  
1495  	error = xlog_cil_init(log);
1496  	if (error)
1497  		goto out_free_iclog;
1498  	return log;
1499  
1500  out_free_iclog:
1501  	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
1502  		prev_iclog = iclog->ic_next;
1503  		if (iclog->ic_bp)
1504  			xfs_buf_free(iclog->ic_bp);
1505  		kmem_free(iclog);
1506  		if (prev_iclog == log->l_iclog)
1507  			break;
1508  	}
1509  	spinlock_destroy(&log->l_icloglock);
1510  	xfs_buf_free(log->l_xbuf);
1511  out_free_log:
1512  	kmem_free(log);
1513  out:
1514  	return ERR_PTR(error);
1515  }	/* xlog_alloc_log */
1516  
1517  
1518  /*
1519   * Write out the commit record of a transaction associated with the given
1520   * ticket.  Return the lsn of the commit record.
1521   */
1522  STATIC int
1523  xlog_commit_record(
1524  	struct xlog		*log,
1525  	struct xlog_ticket	*ticket,
1526  	struct xlog_in_core	**iclog,
1527  	xfs_lsn_t		*commitlsnp)
1528  {
1529  	struct xfs_mount *mp = log->l_mp;
1530  	int	error;
1531  	struct xfs_log_iovec reg = {
1532  		.i_addr = NULL,
1533  		.i_len = 0,
1534  		.i_type = XLOG_REG_TYPE_COMMIT,
1535  	};
1536  	struct xfs_log_vec vec = {
1537  		.lv_niovecs = 1,
1538  		.lv_iovecp = &reg,
1539  	};
1540  
1541  	ASSERT_ALWAYS(iclog);
1542  	error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
1543  					XLOG_COMMIT_TRANS);
1544  	if (error)
1545  		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1546  	return error;
1547  }
1548  
1549  /*
1550   * Push on the buffer cache code if we ever use more than 75% of the on-disk
1551   * log space.  This code pushes on the lsn which would supposedly free up
1552   * the 25% which we want to leave free.  We may need to adopt a policy which
1553   * pushes on an lsn which is further along in the log once we reach the high
1554   * water mark.  In this manner, we would be creating a low water mark.
1555   */
1556  STATIC void
1557  xlog_grant_push_ail(
1558  	struct xlog	*log,
1559  	int		need_bytes)
1560  {
1561  	xfs_lsn_t	threshold_lsn = 0;
1562  	xfs_lsn_t	last_sync_lsn;
1563  	int		free_blocks;
1564  	int		free_bytes;
1565  	int		threshold_block;
1566  	int		threshold_cycle;
1567  	int		free_threshold;
1568  
1569  	ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1570  
1571  	free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
1572  	free_blocks = BTOBBT(free_bytes);
1573  
1574  	/*
1575  	 * Set the threshold for the minimum number of free blocks in the
1576  	 * log to the maximum of what the caller needs, one quarter of the
1577  	 * log, and 256 blocks.
1578  	 */
1579  	free_threshold = BTOBB(need_bytes);
1580  	free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1581  	free_threshold = MAX(free_threshold, 256);
1582  	if (free_blocks >= free_threshold)
1583  		return;
1584  
1585  	xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
1586  						&threshold_block);
1587  	threshold_block += free_threshold;
1588  	if (threshold_block >= log->l_logBBsize) {
1589  		threshold_block -= log->l_logBBsize;
1590  		threshold_cycle += 1;
1591  	}
1592  	threshold_lsn = xlog_assign_lsn(threshold_cycle,
1593  					threshold_block);
1594  	/*
1595  	 * Don't pass in an lsn greater than the lsn of the last
1596  	 * log record known to be on disk. Use a snapshot of the last sync lsn
1597  	 * so that it doesn't change between the compare and the set.
1598  	 */
1599  	last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
1600  	if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
1601  		threshold_lsn = last_sync_lsn;
1602  
1603  	/*
1604  	 * Get the transaction layer to kick the dirty buffers out to
1605  	 * disk asynchronously. No point in trying to do this if
1606  	 * the filesystem is shutting down.
1607  	 */
1608  	if (!XLOG_FORCED_SHUTDOWN(log))
1609  		xfs_ail_push(log->l_ailp, threshold_lsn);
1610  }
1611  
1612  /*
1613   * Stamp cycle number in every block
1614   */
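/*
 * The first word of every 512 byte block in the iclog data is saved into the
 * header's cycle data array and overwritten with the current cycle number,
 * so that recovery can detect partially written log records.
 */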
1615  STATIC void
1616  xlog_pack_data(
1617  	struct xlog		*log,
1618  	struct xlog_in_core	*iclog,
1619  	int			roundoff)
1620  {
1621  	int			i, j, k;
1622  	int			size = iclog->ic_offset + roundoff;
1623  	__be32			cycle_lsn;
1624  	char			*dp;
1625  
1626  	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1627  
1628  	dp = iclog->ic_datap;
1629  	for (i = 0; i < BTOBB(size); i++) {
1630  		if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1631  			break;
1632  		iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
1633  		*(__be32 *)dp = cycle_lsn;
1634  		dp += BBSIZE;
1635  	}
1636  
1637  	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1638  		xlog_in_core_2_t *xhdr = iclog->ic_data;
1639  
1640  		for ( ; i < BTOBB(size); i++) {
1641  			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1642  			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1643  			xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
1644  			*(__be32 *)dp = cycle_lsn;
1645  			dp += BBSIZE;
1646  		}
1647  
1648  		for (i = 1; i < log->l_iclog_heads; i++)
1649  			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1650  	}
1651  }
1652  
1653  /*
1654   * Calculate the checksum for a log buffer.
1655   *
1656   * This is a little more complicated than it should be because the various
1657   * headers and the actual data are non-contiguous.
1658   */
1659  __le32
1660  xlog_cksum(
1661  	struct xlog		*log,
1662  	struct xlog_rec_header	*rhead,
1663  	char			*dp,
1664  	int			size)
1665  {
1666  	__uint32_t		crc;
1667  
1668  	/* first generate the crc for the record header ... */
1669  	crc = xfs_start_cksum((char *)rhead,
1670  			      sizeof(struct xlog_rec_header),
1671  			      offsetof(struct xlog_rec_header, h_crc));
1672  
1673  	/* ... then for additional cycle data for v2 logs ... */
1674  	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1675  		union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
1676  		int		i;
1677  		int		xheads;
1678  
1679  		xheads = size / XLOG_HEADER_CYCLE_SIZE;
1680  		if (size % XLOG_HEADER_CYCLE_SIZE)
1681  			xheads++;
1682  
1683  		for (i = 1; i < xheads; i++) {
1684  			crc = crc32c(crc, &xhdr[i].hic_xheader,
1685  				     sizeof(struct xlog_rec_ext_header));
1686  		}
1687  	}
1688  
1689  	/* ... and finally for the payload */
1690  	crc = crc32c(crc, dp, size);
1691  
1692  	return xfs_end_cksum(crc);
1693  }
1694  
1695  /*
1696   * The bdstrat callback function for log bufs. This gives us a central
1697   * place to trap bufs in case we get hit by a log I/O error and need to
1698   * shutdown. Actually, in practice, even when we didn't get a log error,
1699   * we transition the iclogs to IOERROR state *after* flushing all existing
1700   * iclogs to disk. This is because we don't want any more new transactions to be
1701   * started or completed afterwards.
1702   *
1703   * We lock the iclogbufs here so that we can serialise against IO completion
1704   * during unmount. We might be processing a shutdown triggered during unmount,
1705   * and that can occur asynchronously to the unmount thread, and hence we need to
1706   * ensure that completes before tearing down the iclogbufs. Hence we need to
1707   * hold the buffer lock across the log IO to achieve that.
1708   */
1709  STATIC int
1710  xlog_bdstrat(
1711  	struct xfs_buf		*bp)
1712  {
1713  	struct xlog_in_core	*iclog = bp->b_fspriv;
1714  
1715  	xfs_buf_lock(bp);
1716  	if (iclog->ic_state & XLOG_STATE_IOERROR) {
1717  		xfs_buf_ioerror(bp, -EIO);
1718  		xfs_buf_stale(bp);
1719  		xfs_buf_ioend(bp);
1720  		/*
1721  		 * It would seem logical to return EIO here, but we rely on
1722  		 * the log state machine to propagate I/O errors instead of
1723  		 * doing it here. Similarly, IO completion will unlock the
1724  		 * buffer, so we don't do it here.
1725  		 */
1726  		return 0;
1727  	}
1728  
1729  	xfs_buf_submit(bp);
1730  	return 0;
1731  }
1732  
1733  /*
1734   * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
1735   * fashion.  Previously, we should have moved the current iclog
1736   * ptr in the log to point to the next available iclog.  This allows further
1737   * writes to continue while this code syncs out an iclog ready to go.
1738   * Before an in-core log can be written out, the data section must be scanned
1739   * to save away the 1st word of each BBSIZE block into the header.  We replace
1740   * it with the current cycle count.  Each BBSIZE block is tagged with the
1741   * cycle count because there is an implicit assumption that drives will
1742   * guarantee that entire 512 byte blocks get written at once.  In other words,
1743   * we can't have part of a 512 byte block written and part not written.  By
1744   * tagging each block, we will know which blocks are valid when recovering
1745   * after an unclean shutdown.
1746   *
1747   * This routine is single threaded on the iclog.  No other thread can be in
1748   * this routine with the same iclog.  Changing contents of iclog can there-
1749   * fore be done without grabbing the state machine lock.  Updating the global
1750   * log will require grabbing the lock though.
1751   *
1752   * The entire log manager uses a logical block numbering scheme.  Only
1753   * log_sync (and then only bwrite()) knows about the fact that the log may
1754   * not start with block zero on a given device.  The log block start offset
1755   * is added immediately before calling bwrite().
1756   */
1757  
1758  STATIC int
1759  xlog_sync(
1760  	struct xlog		*log,
1761  	struct xlog_in_core	*iclog)
1762  {
1763  	xfs_buf_t	*bp;
1764  	int		i;
1765  	uint		count;		/* byte count of bwrite */
1766  	uint		count_init;	/* initial count before roundup */
1767  	int		roundoff;       /* roundoff to BB or stripe */
1768  	int		split = 0;	/* split write into two regions */
1769  	int		error;
1770  	int		v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1771  	int		size;
1772  
1773  	XFS_STATS_INC(log->l_mp, xs_log_writes);
1774  	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
1775  
1776  	/* Add for LR header */
1777  	count_init = log->l_iclog_hsize + iclog->ic_offset;
1778  
1779  	/* Round out the log write size */
1780  	if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
1781  		/* we have a v2 stripe unit to use */
1782  		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
1783  	} else {
1784  		count = BBTOB(BTOBB(count_init));
1785  	}
1786  	roundoff = count - count_init;
1787  	ASSERT(roundoff >= 0);
1788  	ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
1789                  roundoff < log->l_mp->m_sb.sb_logsunit)
1790  		||
1791  		(log->l_mp->m_sb.sb_logsunit <= 1 &&
1792  		 roundoff < BBTOB(1)));
1793  
1794  	/* move grant heads by roundoff in sync */
1795  	xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
1796  	xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
1797  
1798  	/* put cycle number in every block */
1799  	xlog_pack_data(log, iclog, roundoff);
1800  
1801  	/* real byte length */
1802  	size = iclog->ic_offset;
1803  	if (v2)
1804  		size += roundoff;
1805  	iclog->ic_header.h_len = cpu_to_be32(size);
1806  
1807  	bp = iclog->ic_bp;
1808  	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
1809  
1810  	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
1811  
1812  	/* Do we need to split this write into 2 parts? */
1813  	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1814  		char		*dptr;
1815  
1816  		split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1817  		count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1818  		iclog->ic_bwritecnt = 2;
1819  
1820  		/*
1821  		 * Bump the cycle numbers at the start of each block in the
1822  		 * part of the iclog that ends up in the buffer that gets
1823  		 * written to the start of the log.
1824  		 *
1825  		 * Watch out for the header magic number case, though.
1826  		 */
1827  		dptr = (char *)&iclog->ic_header + count;
1828  		for (i = 0; i < split; i += BBSIZE) {
1829  			__uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
1830  			if (++cycle == XLOG_HEADER_MAGIC_NUM)
1831  				cycle++;
1832  			*(__be32 *)dptr = cpu_to_be32(cycle);
1833  
1834  			dptr += BBSIZE;
1835  		}
1836  	} else {
1837  		iclog->ic_bwritecnt = 1;
1838  	}
1839  
1840  	/* calculate the checksum */
1841  	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
1842  					    iclog->ic_datap, size);
1843  
1844  	bp->b_io_length = BTOBB(count);
1845  	bp->b_fspriv = iclog;
1846  	XFS_BUF_ZEROFLAGS(bp);
1847  	XFS_BUF_ASYNC(bp);
1848  	bp->b_flags |= XBF_SYNCIO;
1849  
1850  	if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1851  		bp->b_flags |= XBF_FUA;
1852  
1853  		/*
1854  		 * Flush the data device before flushing the log to make
1855  		 * sure all meta data written back from the AIL actually made
1856  		 * it to disk before stamping the new log tail LSN into the
1857  		 * log buffer.  For an external log we need to issue the
1858  		 * flush explicitly, and unfortunately synchronously here;
1859  		 * for an internal log we can simply use the block layer
1860  		 * state machine for preflushes.
1861  		 */
1862  		if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
1863  			xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
1864  		else
1865  			bp->b_flags |= XBF_FLUSH;
1866  	}
1867  
1868  	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1869  	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1870  
1871  	xlog_verify_iclog(log, iclog, count, true);
1872  
1873  	/* account for log which doesn't start at block #0 */
1874  	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1875  	/*
1876  	 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
1877  	 * is shutting down.
1878  	 */
1879  	XFS_BUF_WRITE(bp);
1880  
1881  	error = xlog_bdstrat(bp);
1882  	if (error) {
1883  		xfs_buf_ioerror_alert(bp, "xlog_sync");
1884  		return error;
1885  	}
1886  	if (split) {
1887  		bp = iclog->ic_log->l_xbuf;
1888  		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
1889  		xfs_buf_associate_memory(bp,
1890  				(char *)&iclog->ic_header + count, split);
1891  		bp->b_fspriv = iclog;
1892  		XFS_BUF_ZEROFLAGS(bp);
1893  		XFS_BUF_ASYNC(bp);
1894  		bp->b_flags |= XBF_SYNCIO;
1895  		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1896  			bp->b_flags |= XBF_FUA;
1897  
1898  		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1899  		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1900  
1901  		/* account for internal log which doesn't start at block #0 */
1902  		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1903  		XFS_BUF_WRITE(bp);
1904  		error = xlog_bdstrat(bp);
1905  		if (error) {
1906  			xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
1907  			return error;
1908  		}
1909  	}
1910  	return 0;
1911  }	/* xlog_sync */
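
/*
 * Illustrative sketch, not part of the build: the write-size round-up and
 * the split calculation performed by xlog_sync() above, with made-up
 * numbers.  A 32k v2 log stripe unit, a 512-byte basic block and an
 * 8192-block log are assumptions, not values read from any superblock.
 */
#if 0
#include <stdio.h>

#define EX_BBSIZE	512
#define EX_LSUNIT	32768

static unsigned int ex_roundup(unsigned int x, unsigned int unit)
{
	return ((x + unit - 1) / unit) * unit;
}

int main(void)
{
	unsigned int	count_init = 20000 + 512;	/* data + LR header */
	unsigned int	count = ex_roundup(count_init, EX_LSUNIT);
	unsigned int	log_size_bb = 8192;	/* log length in basic blocks */
	unsigned int	start_bb = 8180;	/* write starts near the end */
	unsigned int	split = 0;

	printf("count %u, roundoff %u\n", count, count - count_init);

	/* does the write wrap past the physical end of the log? */
	if (start_bb + count / EX_BBSIZE > log_size_bb)
		split = count - (log_size_bb - start_bb) * EX_BBSIZE;

	printf("first write %u bytes, wrapped write %u bytes\n",
	       count - split, split);
	return 0;
}
#endif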
1912  
1913  /*
1914   * Deallocate a log structure
1915   */
1916  STATIC void
1917  xlog_dealloc_log(
1918  	struct xlog	*log)
1919  {
1920  	xlog_in_core_t	*iclog, *next_iclog;
1921  	int		i;
1922  
1923  	xlog_cil_destroy(log);
1924  
1925  	/*
1926  	 * Cycle all the iclogbuf locks to make sure all log IO completion
1927  	 * is done before we tear down these buffers.
1928  	 */
1929  	iclog = log->l_iclog;
1930  	for (i = 0; i < log->l_iclog_bufs; i++) {
1931  		xfs_buf_lock(iclog->ic_bp);
1932  		xfs_buf_unlock(iclog->ic_bp);
1933  		iclog = iclog->ic_next;
1934  	}
1935  
1936  	/*
1937  	 * Always need to ensure that the extra buffer does not point to memory
1938  	 * owned by another log buffer before we free it. Also, cycle the lock
1939  	 * first to ensure we've completed IO on it.
1940  	 */
1941  	xfs_buf_lock(log->l_xbuf);
1942  	xfs_buf_unlock(log->l_xbuf);
1943  	xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
1944  	xfs_buf_free(log->l_xbuf);
1945  
1946  	iclog = log->l_iclog;
1947  	for (i = 0; i < log->l_iclog_bufs; i++) {
1948  		xfs_buf_free(iclog->ic_bp);
1949  		next_iclog = iclog->ic_next;
1950  		kmem_free(iclog);
1951  		iclog = next_iclog;
1952  	}
1953  	spinlock_destroy(&log->l_icloglock);
1954  
1955  	log->l_mp->m_log = NULL;
1956  	kmem_free(log);
1957  }	/* xlog_dealloc_log */
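
/*
 * Illustrative sketch, not part of the build: "cycling" a buffer lock to
 * wait out in-flight IO, as xlog_dealloc_log() does above before freeing
 * the iclog buffers.  The log IO completion path holds the buffer lock
 * across the IO, so taking and dropping the lock here acts as a barrier.
 * A plain pthread mutex stands in for the xfs_buf lock.
 */
#if 0
#include <pthread.h>

static void ex_wait_for_buffer_io(pthread_mutex_t *buf_lock)
{
	/* blocks until the IO completion path drops the lock ... */
	pthread_mutex_lock(buf_lock);
	/* ... at which point the buffer is quiescent and safe to free */
	pthread_mutex_unlock(buf_lock);
}
#endif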
1958  
1959  /*
1960   * Update counters atomically now that memcpy is done.
1961   */
1962  /* ARGSUSED */
1963  static inline void
1964  xlog_state_finish_copy(
1965  	struct xlog		*log,
1966  	struct xlog_in_core	*iclog,
1967  	int			record_cnt,
1968  	int			copy_bytes)
1969  {
1970  	spin_lock(&log->l_icloglock);
1971  
1972  	be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
1973  	iclog->ic_offset += copy_bytes;
1974  
1975  	spin_unlock(&log->l_icloglock);
1976  }	/* xlog_state_finish_copy */
1977  
1978  
1979  
1980  
1981  /*
1982   * print out info relating to regions written which consume
1983   * the reservation
1984   */
1985  void
1986  xlog_print_tic_res(
1987  	struct xfs_mount	*mp,
1988  	struct xlog_ticket	*ticket)
1989  {
1990  	uint i;
1991  	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
1992  
1993  	/* match with XLOG_REG_TYPE_* in xfs_log.h */
1994  	static char *res_type_str[XLOG_REG_TYPE_MAX] = {
1995  	    "bformat",
1996  	    "bchunk",
1997  	    "efi_format",
1998  	    "efd_format",
1999  	    "iformat",
2000  	    "icore",
2001  	    "iext",
2002  	    "ibroot",
2003  	    "ilocal",
2004  	    "iattr_ext",
2005  	    "iattr_broot",
2006  	    "iattr_local",
2007  	    "qformat",
2008  	    "dquot",
2009  	    "quotaoff",
2010  	    "LR header",
2011  	    "unmount",
2012  	    "commit",
2013  	    "trans header"
2014  	};
2015  	static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
2016  	    "SETATTR_NOT_SIZE",
2017  	    "SETATTR_SIZE",
2018  	    "INACTIVE",
2019  	    "CREATE",
2020  	    "CREATE_TRUNC",
2021  	    "TRUNCATE_FILE",
2022  	    "REMOVE",
2023  	    "LINK",
2024  	    "RENAME",
2025  	    "MKDIR",
2026  	    "RMDIR",
2027  	    "SYMLINK",
2028  	    "SET_DMATTRS",
2029  	    "GROWFS",
2030  	    "STRAT_WRITE",
2031  	    "DIOSTRAT",
2032  	    "WRITE_SYNC",
2033  	    "WRITEID",
2034  	    "ADDAFORK",
2035  	    "ATTRINVAL",
2036  	    "ATRUNCATE",
2037  	    "ATTR_SET",
2038  	    "ATTR_RM",
2039  	    "ATTR_FLAG",
2040  	    "CLEAR_AGI_BUCKET",
2041  	    "QM_SBCHANGE",
2042  	    "DUMMY1",
2043  	    "DUMMY2",
2044  	    "QM_QUOTAOFF",
2045  	    "QM_DQALLOC",
2046  	    "QM_SETQLIM",
2047  	    "QM_DQCLUSTER",
2048  	    "QM_QINOCREATE",
2049  	    "QM_QUOTAOFF_END",
2050  	    "SB_UNIT",
2051  	    "FSYNC_TS",
2052  	    "GROWFSRT_ALLOC",
2053  	    "GROWFSRT_ZERO",
2054  	    "GROWFSRT_FREE",
2055  	    "SWAPEXT"
2056  	};
2057  
2058  	xfs_warn(mp, "xlog_write: reservation summary:");
2059  	xfs_warn(mp, "  trans type  = %s (%u)",
2060  		 ((ticket->t_trans_type <= 0 ||
2061  		   ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
2062  		  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
2063  		 ticket->t_trans_type);
2064  	xfs_warn(mp, "  unit res    = %d bytes",
2065  		 ticket->t_unit_res);
2066  	xfs_warn(mp, "  current res = %d bytes",
2067  		 ticket->t_curr_res);
2068  	xfs_warn(mp, "  total reg   = %u bytes (o/flow = %u bytes)",
2069  		 ticket->t_res_arr_sum, ticket->t_res_o_flow);
2070  	xfs_warn(mp, "  ophdrs      = %u (ophdr space = %u bytes)",
2071  		 ticket->t_res_num_ophdrs, ophdr_spc);
2072  	xfs_warn(mp, "  ophdr + reg = %u bytes",
2073  		 ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
2074  	xfs_warn(mp, "  num regions = %u",
2075  		 ticket->t_res_num);
2076  
2077  	for (i = 0; i < ticket->t_res_num; i++) {
2078  		uint r_type = ticket->t_res_arr[i].r_type;
2079  		xfs_warn(mp, "region[%u]: %s - %u bytes", i,
2080  			    ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
2081  			    "bad-rtype" : res_type_str[r_type-1]),
2082  			    ticket->t_res_arr[i].r_len);
2083  	}
2084  
2085  	xfs_alert_tag(mp, XFS_PTAG_LOGRES,
2086  		"xlog_write: reservation ran out. Need to up reservation");
2087  	xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
2088  }
2089  
2090  /*
2091   * Calculate the potential space needed by the log vector.  Each region gets
2092   * its own xlog_op_header_t and may need to be double word aligned.
2093   */
2094  static int
2095  xlog_write_calc_vec_length(
2096  	struct xlog_ticket	*ticket,
2097  	struct xfs_log_vec	*log_vector)
2098  {
2099  	struct xfs_log_vec	*lv;
2100  	int			headers = 0;
2101  	int			len = 0;
2102  	int			i;
2103  
2104  	/* acct for start rec of xact */
2105  	if (ticket->t_flags & XLOG_TIC_INITED)
2106  		headers++;
2107  
2108  	for (lv = log_vector; lv; lv = lv->lv_next) {
2109  		/* we don't write ordered log vectors */
2110  		if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
2111  			continue;
2112  
2113  		headers += lv->lv_niovecs;
2114  
2115  		for (i = 0; i < lv->lv_niovecs; i++) {
2116  			struct xfs_log_iovec	*vecp = &lv->lv_iovecp[i];
2117  
2118  			len += vecp->i_len;
2119  			xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
2120  		}
2121  	}
2122  
2123  	ticket->t_res_num_ophdrs += headers;
2124  	len += headers * sizeof(struct xlog_op_header);
2125  
2126  	return len;
2127  }
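
/*
 * Illustrative sketch, not part of the build: the space calculation above
 * for one log vector carrying two regions (56 and 120 bytes) on a ticket
 * that still has XLOG_TIC_INITED set.  A 12-byte op header is assumed.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int	ophdr_size = 12;	/* assumed sizeof(xlog_op_header) */
	int	region_len[] = { 56, 120 };
	int	headers = 1;		/* start record for the transaction */
	int	len = 0;
	int	i;

	for (i = 0; i < 2; i++) {
		headers++;		/* one op header per region */
		len += region_len[i];
	}
	len += headers * ophdr_size;

	printf("iclog space needed: %d bytes\n", len);	/* 176 + 36 = 212 */
	return 0;
}
#endif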
2128  
2129  /*
2130   * If first write for transaction, insert start record.  We can't be trying to
2131   * commit if we are inited.  We can't have any "partial_copy" if we are inited.
2132   */
2133  static int
2134  xlog_write_start_rec(
2135  	struct xlog_op_header	*ophdr,
2136  	struct xlog_ticket	*ticket)
2137  {
2138  	if (!(ticket->t_flags & XLOG_TIC_INITED))
2139  		return 0;
2140  
2141  	ophdr->oh_tid	= cpu_to_be32(ticket->t_tid);
2142  	ophdr->oh_clientid = ticket->t_clientid;
2143  	ophdr->oh_len = 0;
2144  	ophdr->oh_flags = XLOG_START_TRANS;
2145  	ophdr->oh_res2 = 0;
2146  
2147  	ticket->t_flags &= ~XLOG_TIC_INITED;
2148  
2149  	return sizeof(struct xlog_op_header);
2150  }
2151  
2152  static xlog_op_header_t *
2153  xlog_write_setup_ophdr(
2154  	struct xlog		*log,
2155  	struct xlog_op_header	*ophdr,
2156  	struct xlog_ticket	*ticket,
2157  	uint			flags)
2158  {
2159  	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
2160  	ophdr->oh_clientid = ticket->t_clientid;
2161  	ophdr->oh_res2 = 0;
2162  
2163  	/* are we copying a commit or unmount record? */
2164  	ophdr->oh_flags = flags;
2165  
2166  	/*
2167  	 * We've seen logs corrupted with bad transaction client ids.  This
2168   * makes sure that XFS doesn't generate them.  Turn this into an EIO
2169  	 * and shut down the filesystem.
2170  	 */
2171  	switch (ophdr->oh_clientid)  {
2172  	case XFS_TRANSACTION:
2173  	case XFS_VOLUME:
2174  	case XFS_LOG:
2175  		break;
2176  	default:
2177  		xfs_warn(log->l_mp,
2178  			"Bad XFS transaction clientid 0x%x in ticket 0x%p",
2179  			ophdr->oh_clientid, ticket);
2180  		return NULL;
2181  	}
2182  
2183  	return ophdr;
2184  }
2185  
2186  /*
2187   * Set up the parameters of the region copy into the log. This has
2188   * to handle region write split across multiple log buffers - this
2189   * state is kept external to this function so that this code can
2190   * be written in an obvious, self documenting manner.
2191   */
2192  static int
2193  xlog_write_setup_copy(
2194  	struct xlog_ticket	*ticket,
2195  	struct xlog_op_header	*ophdr,
2196  	int			space_available,
2197  	int			space_required,
2198  	int			*copy_off,
2199  	int			*copy_len,
2200  	int			*last_was_partial_copy,
2201  	int			*bytes_consumed)
2202  {
2203  	int			still_to_copy;
2204  
2205  	still_to_copy = space_required - *bytes_consumed;
2206  	*copy_off = *bytes_consumed;
2207  
2208  	if (still_to_copy <= space_available) {
2209  		/* write of region completes here */
2210  		*copy_len = still_to_copy;
2211  		ophdr->oh_len = cpu_to_be32(*copy_len);
2212  		if (*last_was_partial_copy)
2213  			ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
2214  		*last_was_partial_copy = 0;
2215  		*bytes_consumed = 0;
2216  		return 0;
2217  	}
2218  
2219  	/* partial write of region, needs extra log op header reservation */
2220  	*copy_len = space_available;
2221  	ophdr->oh_len = cpu_to_be32(*copy_len);
2222  	ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
2223  	if (*last_was_partial_copy)
2224  		ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
2225  	*bytes_consumed += *copy_len;
2226  	(*last_was_partial_copy)++;
2227  
2228  	/* account for new log op header */
2229  	ticket->t_curr_res -= sizeof(struct xlog_op_header);
2230  	ticket->t_res_num_ophdrs++;
2231  
2232  	return sizeof(struct xlog_op_header);
2233  }
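
/*
 * Illustrative sketch, not part of the build: how the external state
 * (bytes_consumed / last_was_partial) drives two calls to the setup-copy
 * logic above when a 10000-byte region meets an iclog with only 6000
 * bytes free.  The op header flag and reservation bookkeeping are left
 * out; all ex_* names are hypothetical.
 */
#if 0
#include <stdio.h>

static void ex_setup_copy(int space_available, int space_required,
			  int *copy_off, int *copy_len,
			  int *last_was_partial, int *bytes_consumed)
{
	int	still_to_copy = space_required - *bytes_consumed;

	*copy_off = *bytes_consumed;
	if (still_to_copy <= space_available) {
		*copy_len = still_to_copy;	/* region completes here */
		*last_was_partial = 0;
		*bytes_consumed = 0;
		return;
	}
	*copy_len = space_available;		/* partial copy, continue later */
	*bytes_consumed += *copy_len;
	(*last_was_partial)++;
}

int main(void)
{
	int	off, len, partial = 0, consumed = 0;

	ex_setup_copy(6000, 10000, &off, &len, &partial, &consumed);
	printf("1st iclog: off %d len %d partial %d\n", off, len, partial);

	ex_setup_copy(6000, 10000, &off, &len, &partial, &consumed);
	printf("2nd iclog: off %d len %d partial %d\n", off, len, partial);
	return 0;
}
#endif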
2234  
2235  static int
2236  xlog_write_copy_finish(
2237  	struct xlog		*log,
2238  	struct xlog_in_core	*iclog,
2239  	uint			flags,
2240  	int			*record_cnt,
2241  	int			*data_cnt,
2242  	int			*partial_copy,
2243  	int			*partial_copy_len,
2244  	int			log_offset,
2245  	struct xlog_in_core	**commit_iclog)
2246  {
2247  	if (*partial_copy) {
2248  		/*
2249  		 * This iclog has already been marked WANT_SYNC by
2250  		 * xlog_state_get_iclog_space.
2251  		 */
2252  		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
2253  		*record_cnt = 0;
2254  		*data_cnt = 0;
2255  		return xlog_state_release_iclog(log, iclog);
2256  	}
2257  
2258  	*partial_copy = 0;
2259  	*partial_copy_len = 0;
2260  
2261  	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
2262  		/* no more space in this iclog - push it. */
2263  		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
2264  		*record_cnt = 0;
2265  		*data_cnt = 0;
2266  
2267  		spin_lock(&log->l_icloglock);
2268  		xlog_state_want_sync(log, iclog);
2269  		spin_unlock(&log->l_icloglock);
2270  
2271  		if (!commit_iclog)
2272  			return xlog_state_release_iclog(log, iclog);
2273  		ASSERT(flags & XLOG_COMMIT_TRANS);
2274  		*commit_iclog = iclog;
2275  	}
2276  
2277  	return 0;
2278  }
2279  
2280  /*
2281   * Write some region out to in-core log
2282   *
2283   * This will be called when writing externally provided regions or when
2284   * writing out a commit record for a given transaction.
2285   *
2286   * General algorithm:
2287   *	1. Find total length of this write.  This may include adding to the
2288   *		lengths passed in.
2289   *	2. Check whether we violate the ticket's reservation.
2290   *	3. While writing to this iclog
2291   *	    A. Reserve as much space in this iclog as we can get
2292   *	    B. If this is first write, save away start lsn
2293   *	    C. While writing this region:
2294   *		1. If first write of transaction, write start record
2295   *		2. Write log operation header (header per region)
2296   *		3. Find out if we can fit entire region into this iclog
2297   *		4. Potentially, verify destination memcpy ptr
2298   *		5. Memcpy (partial) region
2299   *		6. If partial copy, release iclog; otherwise, continue
2300   *			copying more regions into current iclog
2301   *	4. Mark want sync bit (in simulation mode)
2302   *	5. Release iclog for potential flush to on-disk log.
2303   *
2304   * ERRORS:
2305   * 1.	Panic if reservation is overrun.  This should never happen since
2306   *	reservation amounts are generated internal to the filesystem.
2307   * NOTES:
2308   * 1. Tickets are single threaded data structures.
2309   * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
2310   *	syncing routine.  When a single log_write region needs to span
2311   *	multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
2312   *	on all log operation writes which don't contain the end of the
2313   *	region.  The XLOG_END_TRANS bit is used for the in-core log
2314   *	operation which contains the end of the continued log_write region.
2315   * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
2316   *	we don't really know exactly how much space will be used.  As a result,
2317   *	we don't update ic_offset until the end when we know exactly how many
2318   *	bytes have been written out.
2319   */
2320  int
2321  xlog_write(
2322  	struct xlog		*log,
2323  	struct xfs_log_vec	*log_vector,
2324  	struct xlog_ticket	*ticket,
2325  	xfs_lsn_t		*start_lsn,
2326  	struct xlog_in_core	**commit_iclog,
2327  	uint			flags)
2328  {
2329  	struct xlog_in_core	*iclog = NULL;
2330  	struct xfs_log_iovec	*vecp;
2331  	struct xfs_log_vec	*lv;
2332  	int			len;
2333  	int			index;
2334  	int			partial_copy = 0;
2335  	int			partial_copy_len = 0;
2336  	int			contwr = 0;
2337  	int			record_cnt = 0;
2338  	int			data_cnt = 0;
2339  	int			error;
2340  
2341  	*start_lsn = 0;
2342  
2343  	len = xlog_write_calc_vec_length(ticket, log_vector);
2344  
2345  	/*
2346  	 * Region headers and bytes are already accounted for.
2347  	 * We only need to take into account start records and
2348  	 * split regions in this function.
2349  	 */
2350  	if (ticket->t_flags & XLOG_TIC_INITED)
2351  		ticket->t_curr_res -= sizeof(xlog_op_header_t);
2352  
2353  	/*
2354  	 * Commit record headers need to be accounted for. These
2355  	 * come in as separate writes so are easy to detect.
2356  	 */
2357  	if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
2358  		ticket->t_curr_res -= sizeof(xlog_op_header_t);
2359  
2360  	if (ticket->t_curr_res < 0)
2361  		xlog_print_tic_res(log->l_mp, ticket);
2362  
2363  	index = 0;
2364  	lv = log_vector;
2365  	vecp = lv->lv_iovecp;
2366  	while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2367  		void		*ptr;
2368  		int		log_offset;
2369  
2370  		error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
2371  						   &contwr, &log_offset);
2372  		if (error)
2373  			return error;
2374  
2375  		ASSERT(log_offset <= iclog->ic_size - 1);
2376  		ptr = iclog->ic_datap + log_offset;
2377  
2378  		/* start_lsn is the first lsn written to. That's all we need. */
2379  		if (!*start_lsn)
2380  			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2381  
2382  		/*
2383  		 * This loop writes out as many regions as can fit in the amount
2384  		 * of space which was allocated by xlog_state_get_iclog_space().
2385  		 */
2386  		while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2387  			struct xfs_log_iovec	*reg;
2388  			struct xlog_op_header	*ophdr;
2389  			int			start_rec_copy;
2390  			int			copy_len;
2391  			int			copy_off;
2392  			bool			ordered = false;
2393  
2394  			/* ordered log vectors have no regions to write */
2395  			if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
2396  				ASSERT(lv->lv_niovecs == 0);
2397  				ordered = true;
2398  				goto next_lv;
2399  			}
2400  
2401  			reg = &vecp[index];
2402  			ASSERT(reg->i_len % sizeof(__int32_t) == 0);
2403  			ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
2404  
2405  			start_rec_copy = xlog_write_start_rec(ptr, ticket);
2406  			if (start_rec_copy) {
2407  				record_cnt++;
2408  				xlog_write_adv_cnt(&ptr, &len, &log_offset,
2409  						   start_rec_copy);
2410  			}
2411  
2412  			ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
2413  			if (!ophdr)
2414  				return -EIO;
2415  
2416  			xlog_write_adv_cnt(&ptr, &len, &log_offset,
2417  					   sizeof(struct xlog_op_header));
2418  
2419  			len += xlog_write_setup_copy(ticket, ophdr,
2420  						     iclog->ic_size-log_offset,
2421  						     reg->i_len,
2422  						     &copy_off, &copy_len,
2423  						     &partial_copy,
2424  						     &partial_copy_len);
2425  			xlog_verify_dest_ptr(log, ptr);
2426  
2427  			/*
2428  			 * Copy region.
2429  			 *
2430  			 * Unmount records just log an opheader, so can have
2431  			 * empty payloads with no data region to copy. Hence we
2432  			 * only copy the payload if the vector says it has data
2433  			 * to copy.
2434  			 */
2435  			ASSERT(copy_len >= 0);
2436  			if (copy_len > 0) {
2437  				memcpy(ptr, reg->i_addr + copy_off, copy_len);
2438  				xlog_write_adv_cnt(&ptr, &len, &log_offset,
2439  						   copy_len);
2440  			}
2441  			copy_len += start_rec_copy + sizeof(xlog_op_header_t);
2442  			record_cnt++;
2443  			data_cnt += contwr ? copy_len : 0;
2444  
2445  			error = xlog_write_copy_finish(log, iclog, flags,
2446  						       &record_cnt, &data_cnt,
2447  						       &partial_copy,
2448  						       &partial_copy_len,
2449  						       log_offset,
2450  						       commit_iclog);
2451  			if (error)
2452  				return error;
2453  
2454  			/*
2455  			 * if we had a partial copy, we need to get more iclog
2456  			 * space but we don't want to increment the region
2457  			 * index because there is still more in this region to
2458  			 * write.
2459  			 *
2460  			 * If we completed writing this region, and we flushed
2461  			 * the iclog (indicated by resetting of the record
2462  			 * count), then we also need to get more log space. If
2463  			 * this was the last record, though, we are done and
2464  			 * can just return.
2465  			 */
2466  			if (partial_copy)
2467  				break;
2468  
2469  			if (++index == lv->lv_niovecs) {
2470  next_lv:
2471  				lv = lv->lv_next;
2472  				index = 0;
2473  				if (lv)
2474  					vecp = lv->lv_iovecp;
2475  			}
2476  			if (record_cnt == 0 && ordered == false) {
2477  				if (!lv)
2478  					return 0;
2479  				break;
2480  			}
2481  		}
2482  	}
2483  
2484  	ASSERT(len == 0);
2485  
2486  	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
2487  	if (!commit_iclog)
2488  		return xlog_state_release_iclog(log, iclog);
2489  
2490  	ASSERT(flags & XLOG_COMMIT_TRANS);
2491  	*commit_iclog = iclog;
2492  	return 0;
2493  }
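
/*
 * Illustrative sketch, not part of the build: the rough shape of a caller
 * of xlog_write(), modelled loosely on how single-region records (commit,
 * unmount) are issued - one iovec wrapped in one log vector, written with
 * an already reserved ticket.  The helper name is hypothetical and real
 * callers also pick the proper region type, flags and commit iclog.
 */
#if 0
static int ex_write_one_region(struct xlog *log, struct xlog_ticket *tic,
			       void *data, int len)
{
	struct xfs_log_iovec	reg = {
		.i_addr	= data,
		.i_len	= len,
		.i_type	= XLOG_REG_TYPE_TRANSHDR,
	};
	struct xfs_log_vec	vec = {
		.lv_niovecs	= 1,
		.lv_iovecp	= &reg,
	};
	xfs_lsn_t		start_lsn;

	/* no commit iclog wanted, no special transaction flags */
	return xlog_write(log, &vec, tic, &start_lsn, NULL, 0);
}
#endif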
2494  
2495  
2496  /*****************************************************************************
2497   *
2498   *		State Machine functions
2499   *
2500   *****************************************************************************
2501   */
2502  
2503  /* Clean iclogs starting from the head.  This ordering must be
2504   * maintained, so an iclog doesn't become ACTIVE beyond one that
2505   * is SYNCING.  This is also required to maintain the notion that we use
2506   * an ordered wait queue to hold off would-be writers to the log when every
2507   * iclog is trying to sync to disk.
2508   *
2509   * State Change: DIRTY -> ACTIVE
2510   */
2511  STATIC void
2512  xlog_state_clean_log(
2513  	struct xlog *log)
2514  {
2515  	xlog_in_core_t	*iclog;
2516  	int changed = 0;
2517  
2518  	iclog = log->l_iclog;
2519  	do {
2520  		if (iclog->ic_state == XLOG_STATE_DIRTY) {
2521  			iclog->ic_state	= XLOG_STATE_ACTIVE;
2522  			iclog->ic_offset       = 0;
2523  			ASSERT(iclog->ic_callback == NULL);
2524  			/*
2525  			 * If the number of ops in this iclog indicates it just
2526  			 * contains the dummy transaction, we can
2527  			 * change state into IDLE (the second time around).
2528  			 * Otherwise we should change the state into
2529  			 * NEED a dummy.
2530  			 * We don't need to cover the dummy.
2531  			 */
2532  			if (!changed &&
2533  			   (be32_to_cpu(iclog->ic_header.h_num_logops) ==
2534  			   		XLOG_COVER_OPS)) {
2535  				changed = 1;
2536  			} else {
2537  				/*
2538  				 * We have two dirty iclogs so start over.
2539  				 * This could also be because the num of ops
2540  				 * indicates this is not the dummy going out.
2541  				 */
2542  				changed = 2;
2543  			}
2544  			iclog->ic_header.h_num_logops = 0;
2545  			memset(iclog->ic_header.h_cycle_data, 0,
2546  			      sizeof(iclog->ic_header.h_cycle_data));
2547  			iclog->ic_header.h_lsn = 0;
2548  		} else if (iclog->ic_state == XLOG_STATE_ACTIVE)
2549  			/* do nothing */;
2550  		else
2551  			break;	/* stop cleaning */
2552  		iclog = iclog->ic_next;
2553  	} while (iclog != log->l_iclog);
2554  
2555  	/* log is locked when we are called */
2556  	/*
2557  	 * Change state for the dummy log recording.
2558  	 * We usually go to NEED. But we go to NEED2 if the change indicates
2559  	 * we are done writing the dummy record.
2560  	 * If we are done with the second dummy record (DONE2), then
2561  	 * we go to IDLE.
2562  	 */
2563  	if (changed) {
2564  		switch (log->l_covered_state) {
2565  		case XLOG_STATE_COVER_IDLE:
2566  		case XLOG_STATE_COVER_NEED:
2567  		case XLOG_STATE_COVER_NEED2:
2568  			log->l_covered_state = XLOG_STATE_COVER_NEED;
2569  			break;
2570  
2571  		case XLOG_STATE_COVER_DONE:
2572  			if (changed == 1)
2573  				log->l_covered_state = XLOG_STATE_COVER_NEED2;
2574  			else
2575  				log->l_covered_state = XLOG_STATE_COVER_NEED;
2576  			break;
2577  
2578  		case XLOG_STATE_COVER_DONE2:
2579  			if (changed == 1)
2580  				log->l_covered_state = XLOG_STATE_COVER_IDLE;
2581  			else
2582  				log->l_covered_state = XLOG_STATE_COVER_NEED;
2583  			break;
2584  
2585  		default:
2586  			ASSERT(0);
2587  		}
2588  	}
2589  }	/* xlog_state_clean_log */
2590  
2591  STATIC xfs_lsn_t
2592  xlog_get_lowest_lsn(
2593  	struct xlog	*log)
2594  {
2595  	xlog_in_core_t  *lsn_log;
2596  	xfs_lsn_t	lowest_lsn, lsn;
2597  
2598  	lsn_log = log->l_iclog;
2599  	lowest_lsn = 0;
2600  	do {
2601  	    if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
2602  		lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
2603  		if ((lsn && !lowest_lsn) ||
2604  		    (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
2605  			lowest_lsn = lsn;
2606  		}
2607  	    }
2608  	    lsn_log = lsn_log->ic_next;
2609  	} while (lsn_log != log->l_iclog);
2610  	return lowest_lsn;
2611  }
2612  
2613  
2614  STATIC void
2615  xlog_state_do_callback(
2616  	struct xlog		*log,
2617  	int			aborted,
2618  	struct xlog_in_core	*ciclog)
2619  {
2620  	xlog_in_core_t	   *iclog;
2621  	xlog_in_core_t	   *first_iclog;	/* used to know when we've
2622  						 * processed all iclogs once */
2623  	xfs_log_callback_t *cb, *cb_next;
2624  	int		   flushcnt = 0;
2625  	xfs_lsn_t	   lowest_lsn;
2626  	int		   ioerrors;	/* counter: iclogs with errors */
2627  	int		   loopdidcallbacks; /* flag: inner loop did callbacks*/
2628  	int		   funcdidcallbacks; /* flag: function did callbacks */
2629  	int		   repeats;	/* for issuing console warnings if
2630  					 * looping too many times */
2631  	int		   wake = 0;
2632  
2633  	spin_lock(&log->l_icloglock);
2634  	first_iclog = iclog = log->l_iclog;
2635  	ioerrors = 0;
2636  	funcdidcallbacks = 0;
2637  	repeats = 0;
2638  
2639  	do {
2640  		/*
2641  		 * Scan all iclogs starting with the one pointed to by the
2642  		 * log.  Reset this starting point each time the log is
2643  		 * unlocked (during callbacks).
2644  		 *
2645  		 * Keep looping through iclogs until one full pass is made
2646  		 * without running any callbacks.
2647  		 */
2648  		first_iclog = log->l_iclog;
2649  		iclog = log->l_iclog;
2650  		loopdidcallbacks = 0;
2651  		repeats++;
2652  
2653  		do {
2654  
2655  			/* skip all iclogs in the ACTIVE & DIRTY states */
2656  			if (iclog->ic_state &
2657  			    (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2658  				iclog = iclog->ic_next;
2659  				continue;
2660  			}
2661  
2662  			/*
2663  			 * Between marking a filesystem SHUTDOWN and stopping
2664  			 * the log, we do flush all iclogs to disk (if there
2665  			 * wasn't a log I/O error). So, we do want things to
2666  			 * go smoothly in case of just a SHUTDOWN  w/o a
2667  			 * LOG_IO_ERROR.
2668  			 */
2669  			if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2670  				/*
2671  				 * Can only perform callbacks in order.  Since
2672  				 * this iclog is not in the DONE_SYNC/
2673  				 * DO_CALLBACK state, we skip the rest and
2674  				 * just try to clean up.  If we set our iclog
2675  				 * to DO_CALLBACK, we will not process it when
2676  				 * we retry since a previous iclog is in the
2677  				 * CALLBACK and the state cannot change since
2678  				 * we are holding the l_icloglock.
2679  				 */
2680  				if (!(iclog->ic_state &
2681  					(XLOG_STATE_DONE_SYNC |
2682  						 XLOG_STATE_DO_CALLBACK))) {
2683  					if (ciclog && (ciclog->ic_state ==
2684  							XLOG_STATE_DONE_SYNC)) {
2685  						ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2686  					}
2687  					break;
2688  				}
2689  				/*
2690  				 * We now have an iclog that is in either the
2691  				 * DO_CALLBACK or DONE_SYNC states. The other
2692  				 * states (WANT_SYNC, SYNCING, or CALLBACK) were
2693  				 * caught by the above if and are going to be
2694  				 * cleaned (i.e. we aren't doing their callbacks);
2695  				 * see the above if.
2696  				 */
2697  
2698  				/*
2699  				 * We will do one more check here to see if we
2700  				 * have chased our tail around.
2701  				 */
2702  
2703  				lowest_lsn = xlog_get_lowest_lsn(log);
2704  				if (lowest_lsn &&
2705  				    XFS_LSN_CMP(lowest_lsn,
2706  						be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
2707  					iclog = iclog->ic_next;
2708  					continue; /* Leave this iclog for
2709  						   * another thread */
2710  				}
2711  
2712  				iclog->ic_state = XLOG_STATE_CALLBACK;
2713  
2714  
2715  				/*
2716  				 * Completion of an iclog IO does not imply that
2717  				 * a transaction has completed, as transactions
2718  				 * can be large enough to span many iclogs. We
2719  				 * cannot change the tail of the log half way
2720  				 * through a transaction as this may be the only
2721  				 * transaction in the log and moving the tail to
2722  				 * point to the middle of it will prevent
2723  				 * recovery from finding the start of the
2724  				 * transaction. Hence we should only update the
2725  				 * last_sync_lsn if this iclog contains
2726  				 * transaction completion callbacks on it.
2727  				 *
2728  				 * We have to do this before we drop the
2729  				 * icloglock to ensure we are the only one that
2730  				 * can update it.
2731  				 */
2732  				ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
2733  					be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
2734  				if (iclog->ic_callback)
2735  					atomic64_set(&log->l_last_sync_lsn,
2736  						be64_to_cpu(iclog->ic_header.h_lsn));
2737  
2738  			} else
2739  				ioerrors++;
2740  
2741  			spin_unlock(&log->l_icloglock);
2742  
2743  			/*
2744  			 * Keep processing entries in the callback list until
2745  			 * we come around and it is empty.  We need to
2746  			 * atomically see that the list is empty and change the
2747  			 * state to DIRTY so that we don't miss any more
2748  			 * callbacks being added.
2749  			 */
2750  			spin_lock(&iclog->ic_callback_lock);
2751  			cb = iclog->ic_callback;
2752  			while (cb) {
2753  				iclog->ic_callback_tail = &(iclog->ic_callback);
2754  				iclog->ic_callback = NULL;
2755  				spin_unlock(&iclog->ic_callback_lock);
2756  
2757  				/* perform callbacks in the order given */
2758  				for (; cb; cb = cb_next) {
2759  					cb_next = cb->cb_next;
2760  					cb->cb_func(cb->cb_arg, aborted);
2761  				}
2762  				spin_lock(&iclog->ic_callback_lock);
2763  				cb = iclog->ic_callback;
2764  			}
2765  
2766  			loopdidcallbacks++;
2767  			funcdidcallbacks++;
2768  
2769  			spin_lock(&log->l_icloglock);
2770  			ASSERT(iclog->ic_callback == NULL);
2771  			spin_unlock(&iclog->ic_callback_lock);
2772  			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2773  				iclog->ic_state = XLOG_STATE_DIRTY;
2774  
2775  			/*
2776  			 * Transition from DIRTY to ACTIVE if applicable.
2777  			 * NOP if STATE_IOERROR.
2778  			 */
2779  			xlog_state_clean_log(log);
2780  
2781  			/* wake up threads waiting in xfs_log_force() */
2782  			wake_up_all(&iclog->ic_force_wait);
2783  
2784  			iclog = iclog->ic_next;
2785  		} while (first_iclog != iclog);
2786  
2787  		if (repeats > 5000) {
2788  			flushcnt += repeats;
2789  			repeats = 0;
2790  			xfs_warn(log->l_mp,
2791  				"%s: possible infinite loop (%d iterations)",
2792  				__func__, flushcnt);
2793  		}
2794  	} while (!ioerrors && loopdidcallbacks);
2795  
2796  	/*
2797  	 * make one last gasp attempt to see if iclogs are being left in
2798  	 * limbo..
2799  	 */
2800  #ifdef DEBUG
2801  	if (funcdidcallbacks) {
2802  		first_iclog = iclog = log->l_iclog;
2803  		do {
2804  			ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2805  			/*
2806  			 * Terminate the loop if iclogs are found in states
2807  			 * which will cause other threads to clean up iclogs.
2808  			 *
2809  			 * SYNCING - i/o completion will go through logs
2810  			 * DONE_SYNC - interrupt thread should be waiting for
2811  			 *              l_icloglock
2812  			 * IOERROR - give up hope all ye who enter here
2813  			 */
2814  			if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2815  			    iclog->ic_state == XLOG_STATE_SYNCING ||
2816  			    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2817  			    iclog->ic_state == XLOG_STATE_IOERROR )
2818  				break;
2819  			iclog = iclog->ic_next;
2820  		} while (first_iclog != iclog);
2821  	}
2822  #endif
2823  
2824  	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
2825  		wake = 1;
2826  	spin_unlock(&log->l_icloglock);
2827  
2828  	if (wake)
2829  		wake_up_all(&log->l_flush_wait);
2830  }
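
/*
 * Illustrative sketch, not part of the build: the detach-then-run pattern
 * used above to drain ic_callback.  The list head is snapshotted and
 * cleared under the lock, the callbacks run unlocked, and the loop then
 * re-checks the head in case more callbacks were attached while the lock
 * was dropped.  Locking is shown as comments only; ex_* names are
 * hypothetical.
 */
#if 0
struct ex_cb {
	struct ex_cb	*cb_next;
	void		(*cb_func)(void *arg, int aborted);
	void		*cb_arg;
};

static void ex_run_callbacks(struct ex_cb **head, int aborted)
{
	struct ex_cb	*cb, *cb_next;

	/* lock(callback_lock) */
	cb = *head;
	while (cb) {
		*head = NULL;
		/* unlock(callback_lock) */

		/* perform callbacks in the order given */
		for (; cb; cb = cb_next) {
			cb_next = cb->cb_next;
			cb->cb_func(cb->cb_arg, aborted);
		}

		/* lock(callback_lock) */
		cb = *head;	/* anything added while we were unlocked? */
	}
	/* unlock(callback_lock) */
}
#endif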
2831  
2832  
2833  /*
2834   * Finish transitioning this iclog to the dirty state.
2835   *
2836   * Make sure that we completely execute this routine only when this is
2837   * the last call to the iclog.  There is a good chance that iclog flushes,
2838   * when we reach the end of the physical log, get turned into 2 separate
2839   * calls to bwrite.  Hence, one iclog flush could generate two calls to this
2840   * routine.  By using the reference count bwritecnt, we guarantee that only
2841   * the second completion goes through.
2842   *
2843   * Callbacks could take time, so they are done outside the scope of the
2844   * global state machine log lock.
2845   */
2846  STATIC void
2847  xlog_state_done_syncing(
2848  	xlog_in_core_t	*iclog,
2849  	int		aborted)
2850  {
2851  	struct xlog	   *log = iclog->ic_log;
2852  
2853  	spin_lock(&log->l_icloglock);
2854  
2855  	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
2856  	       iclog->ic_state == XLOG_STATE_IOERROR);
2857  	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
2858  	ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
2859  
2860  
2861  	/*
2862  	 * If we got an error, either on the first buffer, or in the case of
2863  	 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
2864  	 * and none should ever be attempted to be written to disk
2865  	 * again.
2866  	 */
2867  	if (iclog->ic_state != XLOG_STATE_IOERROR) {
2868  		if (--iclog->ic_bwritecnt == 1) {
2869  			spin_unlock(&log->l_icloglock);
2870  			return;
2871  		}
2872  		iclog->ic_state = XLOG_STATE_DONE_SYNC;
2873  	}
2874  
2875  	/*
2876  	 * Someone could be sleeping prior to writing out the next
2877  	 * iclog buffer. We wake them all; one will get to do the
2878  	 * I/O, the others get to wait for the result.
2879  	 */
2880  	wake_up_all(&iclog->ic_write_wait);
2881  	spin_unlock(&log->l_icloglock);
2882  	xlog_state_do_callback(log, aborted, iclog);	/* also cleans log */
2883  }	/* xlog_state_done_syncing */
2884  
2885  
2886  /*
2887   * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2888   * sleep.  We wait on the flush queue on the head iclog as that should be
2889   * the first iclog to complete flushing. Hence if all iclogs are syncing,
2890   * we will wait here and all new writes will sleep until a sync completes.
2891   *
2892   * The in-core logs are used in a circular fashion. They are not used
2893   * out-of-order even when an iclog past the head is free.
2894   *
2895   * return:
2896   *	* log_offset where xlog_write() can start writing into the in-core
2897   *		log's data space.
2898   *	* in-core log pointer to which xlog_write() should write.
2899   *	* boolean indicating this is a continued write to an in-core log.
2900   *		If this is the last write, then the in-core log's offset field
2901   *		needs to be incremented, depending on the amount of data which
2902   *		is copied.
2903   */
2904  STATIC int
2905  xlog_state_get_iclog_space(
2906  	struct xlog		*log,
2907  	int			len,
2908  	struct xlog_in_core	**iclogp,
2909  	struct xlog_ticket	*ticket,
2910  	int			*continued_write,
2911  	int			*logoffsetp)
2912  {
2913  	int		  log_offset;
2914  	xlog_rec_header_t *head;
2915  	xlog_in_core_t	  *iclog;
2916  	int		  error;
2917  
2918  restart:
2919  	spin_lock(&log->l_icloglock);
2920  	if (XLOG_FORCED_SHUTDOWN(log)) {
2921  		spin_unlock(&log->l_icloglock);
2922  		return -EIO;
2923  	}
2924  
2925  	iclog = log->l_iclog;
2926  	if (iclog->ic_state != XLOG_STATE_ACTIVE) {
2927  		XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
2928  
2929  		/* Wait for log writes to have flushed */
2930  		xlog_wait(&log->l_flush_wait, &log->l_icloglock);
2931  		goto restart;
2932  	}
2933  
2934  	head = &iclog->ic_header;
2935  
2936  	atomic_inc(&iclog->ic_refcnt);	/* prevents sync */
2937  	log_offset = iclog->ic_offset;
2938  
2939  	/* On the 1st write to an iclog, figure out lsn.  This works
2940  	 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
2941  	 * committing to.  If the offset is set, that's how many blocks
2942  	 * must be written.
2943  	 */
2944  	if (log_offset == 0) {
2945  		ticket->t_curr_res -= log->l_iclog_hsize;
2946  		xlog_tic_add_region(ticket,
2947  				    log->l_iclog_hsize,
2948  				    XLOG_REG_TYPE_LRHEADER);
2949  		head->h_cycle = cpu_to_be32(log->l_curr_cycle);
2950  		head->h_lsn = cpu_to_be64(
2951  			xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
2952  		ASSERT(log->l_curr_block >= 0);
2953  	}
2954  
2955  	/* If there is enough room to write everything, then do it.  Otherwise,
2956  	 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
2957  	 * bit is on, so this will get flushed out.  Don't update ic_offset
2958  	 * until you know exactly how many bytes get copied.  Therefore, wait
2959  	 * until later to update ic_offset.
2960  	 *
2961  	 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
2962  	 * can fit into remaining data section.
2963  	 */
2964  	if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
2965  		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2966  
2967  		/*
2968  		 * If I'm the only one writing to this iclog, sync it to disk.
2969  		 * We need to do an atomic compare and decrement here to avoid
2970  		 * racing with concurrent atomic_dec_and_lock() calls in
2971  		 * xlog_state_release_iclog() when there is more than one
2972  		 * reference to the iclog.
2973  		 */
2974  		if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
2975  			/* we are the only one */
2976  			spin_unlock(&log->l_icloglock);
2977  			error = xlog_state_release_iclog(log, iclog);
2978  			if (error)
2979  				return error;
2980  		} else {
2981  			spin_unlock(&log->l_icloglock);
2982  		}
2983  		goto restart;
2984  	}
2985  
2986  	/* Do we have enough room to write the full amount in the remainder
2987  	 * of this iclog?  Or must we continue a write on the next iclog and
2988  	 * mark this iclog as completely taken?  In the case where we switch
2989  	 * iclogs (to mark it taken), this particular iclog will release/sync
2990  	 * to disk in xlog_write().
2991  	 */
2992  	if (len <= iclog->ic_size - iclog->ic_offset) {
2993  		*continued_write = 0;
2994  		iclog->ic_offset += len;
2995  	} else {
2996  		*continued_write = 1;
2997  		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2998  	}
2999  	*iclogp = iclog;
3000  
3001  	ASSERT(iclog->ic_offset <= iclog->ic_size);
3002  	spin_unlock(&log->l_icloglock);
3003  
3004  	*logoffsetp = log_offset;
3005  	return 0;
3006  }	/* xlog_state_get_iclog_space */
3007  
3008  /* The first cnt-1 times through here we don't need to
3009   * move the grant write head because the permanent
3010   * reservation has reserved cnt times the unit amount.
3011   * Release part of current permanent unit reservation and
3012   * reset current reservation to be one unit's worth.  Also
3013   * move grant reservation head forward.
3014   */
3015  STATIC void
3016  xlog_regrant_reserve_log_space(
3017  	struct xlog		*log,
3018  	struct xlog_ticket	*ticket)
3019  {
3020  	trace_xfs_log_regrant_reserve_enter(log, ticket);
3021  
3022  	if (ticket->t_cnt > 0)
3023  		ticket->t_cnt--;
3024  
3025  	xlog_grant_sub_space(log, &log->l_reserve_head.grant,
3026  					ticket->t_curr_res);
3027  	xlog_grant_sub_space(log, &log->l_write_head.grant,
3028  					ticket->t_curr_res);
3029  	ticket->t_curr_res = ticket->t_unit_res;
3030  	xlog_tic_reset_res(ticket);
3031  
3032  	trace_xfs_log_regrant_reserve_sub(log, ticket);
3033  
3034  	/* just return if we still have some of the pre-reserved space */
3035  	if (ticket->t_cnt > 0)
3036  		return;
3037  
3038  	xlog_grant_add_space(log, &log->l_reserve_head.grant,
3039  					ticket->t_unit_res);
3040  
3041  	trace_xfs_log_regrant_reserve_exit(log, ticket);
3042  
3043  	ticket->t_curr_res = ticket->t_unit_res;
3044  	xlog_tic_reset_res(ticket);
3045  }	/* xlog_regrant_reserve_log_space */
3046  
3047  
3048  /*
3049   * Give back the space left from a reservation.
3050   *
3051   * All the information we need to make a correct determination of space left
3052   * is present.  For non-permanent reservations, things are quite easy.  The
3053   * count should have been decremented to zero.  We only need to deal with the
3054   * space remaining in the current reservation part of the ticket.  If the
3055   * ticket contains a permanent reservation, there may be left over space which
3056   * needs to be released.  A count of N means that N-1 refills of the current
3057   * reservation can be done before we need to ask for more space.  The first
3058   * one goes to fill up the first current reservation.  Once we run out of
3059   * space, the count will stay at zero and the only space remaining will be
3060   * in the current reservation field.
3061   */
3062  STATIC void
3063  xlog_ungrant_log_space(
3064  	struct xlog		*log,
3065  	struct xlog_ticket	*ticket)
3066  {
3067  	int	bytes;
3068  
3069  	if (ticket->t_cnt > 0)
3070  		ticket->t_cnt--;
3071  
3072  	trace_xfs_log_ungrant_enter(log, ticket);
3073  	trace_xfs_log_ungrant_sub(log, ticket);
3074  
3075  	/*
3076  	 * If this is a permanent reservation ticket, we may be able to free
3077  	 * up more space based on the remaining count.
3078  	 */
3079  	bytes = ticket->t_curr_res;
3080  	if (ticket->t_cnt > 0) {
3081  		ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
3082  		bytes += ticket->t_unit_res*ticket->t_cnt;
3083  	}
3084  
3085  	xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
3086  	xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
3087  
3088  	trace_xfs_log_ungrant_exit(log, ticket);
3089  
3090  	xfs_log_space_wake(log->l_mp);
3091  }
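
/*
 * Illustrative sketch, not part of the build: the byte count released by
 * xlog_ungrant_log_space() above for a permanent ticket with an assumed
 * 100k unit reservation, 30k unused in the current reservation and two
 * refills left after the decrement.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int	t_unit_res = 100 * 1024;
	int	t_curr_res = 30 * 1024;
	int	t_cnt = 3;		/* before the decrement */
	int	bytes;

	if (t_cnt > 0)
		t_cnt--;		/* now 2 */

	bytes = t_curr_res;
	if (t_cnt > 0)			/* permanent reservation */
		bytes += t_unit_res * t_cnt;

	printf("give back %d bytes\n", bytes);	/* 30k + 2 * 100k */
	return 0;
}
#endif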
3092  
3093  /*
3094   * Flush iclog to disk if this is the last reference to the given iclog and
3095   * the WANT_SYNC bit is set.
3096   *
3097   * When this function is entered, the iclog is not necessarily in the
3098   * WANT_SYNC state.  It may be sitting around waiting to get filled.
3099   *
3100   *
3101   */
3102  STATIC int
3103  xlog_state_release_iclog(
3104  	struct xlog		*log,
3105  	struct xlog_in_core	*iclog)
3106  {
3107  	int		sync = 0;	/* do we sync? */
3108  
3109  	if (iclog->ic_state & XLOG_STATE_IOERROR)
3110  		return -EIO;
3111  
3112  	ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
3113  	if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
3114  		return 0;
3115  
3116  	if (iclog->ic_state & XLOG_STATE_IOERROR) {
3117  		spin_unlock(&log->l_icloglock);
3118  		return -EIO;
3119  	}
3120  	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
3121  	       iclog->ic_state == XLOG_STATE_WANT_SYNC);
3122  
3123  	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
3124  		/* update tail before writing to iclog */
3125  		xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
3126  		sync++;
3127  		iclog->ic_state = XLOG_STATE_SYNCING;
3128  		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
3129  		xlog_verify_tail_lsn(log, iclog, tail_lsn);
3130  		/* cycle incremented when incrementing curr_block */
3131  	}
3132  	spin_unlock(&log->l_icloglock);
3133  
3134  	/*
3135  	 * We let the log lock go, so it's possible that we hit a log I/O
3136  	 * error or some other SHUTDOWN condition that marks the iclog
3137  	 * as XLOG_STATE_IOERROR before the bwrite. However, we know that
3138  	 * this iclog has consistent data, so we ignore IOERROR
3139  	 * flags after this point.
3140  	 */
3141  	if (sync)
3142  		return xlog_sync(log, iclog);
3143  	return 0;
3144  }	/* xlog_state_release_iclog */
3145  
3146  
3147  /*
3148   * This routine will mark the current iclog in the ring as WANT_SYNC
3149   * and move the current iclog pointer to the next iclog in the ring.
3150   * When this routine is called from xlog_state_get_iclog_space(), the
3151   * exact size of the iclog has not yet been determined.  All we know is
3152   * that we have run out of space in this log record.
3153   */
3154  STATIC void
3155  xlog_state_switch_iclogs(
3156  	struct xlog		*log,
3157  	struct xlog_in_core	*iclog,
3158  	int			eventual_size)
3159  {
3160  	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
3161  	if (!eventual_size)
3162  		eventual_size = iclog->ic_offset;
3163  	iclog->ic_state = XLOG_STATE_WANT_SYNC;
3164  	iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
3165  	log->l_prev_block = log->l_curr_block;
3166  	log->l_prev_cycle = log->l_curr_cycle;
3167  
3168  	/* roll log?: ic_offset changed later */
3169  	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
3170  
3171  	/* Round up to next log-sunit */
3172  	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
3173  	    log->l_mp->m_sb.sb_logsunit > 1) {
3174  		__uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
3175  		log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
3176  	}
3177  
3178  	if (log->l_curr_block >= log->l_logBBsize) {
3179  		/*
3180  		 * Rewind the current block before the cycle is bumped to make
3181  		 * sure that the combined LSN never transiently moves forward
3182  		 * when the log wraps to the next cycle. This is to support the
3183  		 * unlocked sample of these fields from xlog_valid_lsn(). Most
3184  		 * other cases should acquire l_icloglock.
3185  		 */
3186  		log->l_curr_block -= log->l_logBBsize;
3187  		ASSERT(log->l_curr_block >= 0);
3188  		smp_wmb();
3189  		log->l_curr_cycle++;
3190  		if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
3191  			log->l_curr_cycle++;
3192  	}
3193  	ASSERT(iclog == log->l_iclog);
3194  	log->l_iclog = iclog->ic_next;
3195  }	/* xlog_state_switch_iclogs */
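
/*
 * Illustrative sketch, not part of the build: advancing the current block
 * past a closed iclog, rounding up to the next log stripe unit, and
 * wrapping into the next cycle when the block number runs off the end of
 * the log, as xlog_state_switch_iclogs() does above.  A 32k stripe unit,
 * a 512-byte basic block and an 8192-block log are assumed; the special
 * XLOG_HEADER_MAGIC_NUM cycle skip is omitted.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int	bbsize = 512;
	int	sunit_bb = 32768 / bbsize;	/* 64 BBs per stripe unit */
	int	log_size_bb = 8192;
	int	curr_block = 8150, curr_cycle = 7;
	int	eventual_size = 24576, hsize = 512;

	curr_block += (eventual_size + hsize) / bbsize;

	/* round up to the next stripe unit boundary */
	curr_block = ((curr_block + sunit_bb - 1) / sunit_bb) * sunit_bb;

	if (curr_block >= log_size_bb) {
		curr_block -= log_size_bb;	/* wrap to the start ... */
		curr_cycle++;			/* ... of the next cycle */
	}

	printf("block %d cycle %d\n", curr_block, curr_cycle);
	return 0;
}
#endif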
3196  
3197  /*
3198   * Write out all data in the in-core log as of this exact moment in time.
3199   *
3200   * Data may be written to the in-core log during this call.  However,
3201   * we don't guarantee this data will be written out.  A change from past
3202   * implementation means this routine will *not* write out zero length LRs.
3203   *
3204   * Basically, we try and perform an intelligent scan of the in-core logs.
3205   * If we determine there is no flushable data, we just return.  There is no
3206   * flushable data if:
3207   *
3208   *	1. the current iclog is active and has no data; the previous iclog
3209   *		is in the active or dirty state.
3210   *	2. the current iclog is dirty, and the previous iclog is in the
3211   *		active or dirty state.
3212   *
3213   * We may sleep if:
3214   *
3215   *	1. the current iclog is not in the active nor dirty state.
3216   *	2. the current iclog is dirty, and the previous iclog is not in the
3217   *		active nor dirty state.
3218   *	3. the current iclog is active, and there is another thread writing
3219   *		to this particular iclog.
3220   *	4. a) the current iclog is active and has no other writers
3221   *	   b) when we return from flushing out this iclog, it is still
3222   *		not in the active nor dirty state.
3223   */
3224  int
3225  _xfs_log_force(
3226  	struct xfs_mount	*mp,
3227  	uint			flags,
3228  	int			*log_flushed)
3229  {
3230  	struct xlog		*log = mp->m_log;
3231  	struct xlog_in_core	*iclog;
3232  	xfs_lsn_t		lsn;
3233  
3234  	XFS_STATS_INC(mp, xs_log_force);
3235  
3236  	xlog_cil_force(log);
3237  
3238  	spin_lock(&log->l_icloglock);
3239  
3240  	iclog = log->l_iclog;
3241  	if (iclog->ic_state & XLOG_STATE_IOERROR) {
3242  		spin_unlock(&log->l_icloglock);
3243  		return -EIO;
3244  	}
3245  
3246  	/* If the head iclog is not active nor dirty, we just attach
3247  	 * ourselves to the head and go to sleep.
3248  	 */
3249  	if (iclog->ic_state == XLOG_STATE_ACTIVE ||
3250  	    iclog->ic_state == XLOG_STATE_DIRTY) {
3251  		/*
3252  		 * If the head is dirty or (active and empty), then
3253  		 * we need to look at the previous iclog.  If the previous
3254  		 * iclog is active or dirty we are done.  There is nothing
3255  		 * to sync out.  Otherwise, we attach ourselves to the
3256  		 * previous iclog and go to sleep.
3257  		 */
3258  		if (iclog->ic_state == XLOG_STATE_DIRTY ||
3259  		    (atomic_read(&iclog->ic_refcnt) == 0
3260  		     && iclog->ic_offset == 0)) {
3261  			iclog = iclog->ic_prev;
3262  			if (iclog->ic_state == XLOG_STATE_ACTIVE ||
3263  			    iclog->ic_state == XLOG_STATE_DIRTY)
3264  				goto no_sleep;
3265  			else
3266  				goto maybe_sleep;
3267  		} else {
3268  			if (atomic_read(&iclog->ic_refcnt) == 0) {
3269  				/* We are the only one with access to this
3270  				 * iclog.  Flush it out now.  There should
3271  				 * be a roundoff of zero to show that someone
3272  				 * has already taken care of the roundoff from
3273  				 * the previous sync.
3274  				 */
3275  				atomic_inc(&iclog->ic_refcnt);
3276  				lsn = be64_to_cpu(iclog->ic_header.h_lsn);
3277  				xlog_state_switch_iclogs(log, iclog, 0);
3278  				spin_unlock(&log->l_icloglock);
3279  
3280  				if (xlog_state_release_iclog(log, iclog))
3281  					return -EIO;
3282  
3283  				if (log_flushed)
3284  					*log_flushed = 1;
3285  				spin_lock(&log->l_icloglock);
3286  				if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
3287  				    iclog->ic_state != XLOG_STATE_DIRTY)
3288  					goto maybe_sleep;
3289  				else
3290  					goto no_sleep;
3291  			} else {
3292  				/* Someone else is writing to this iclog.
3293  				 * Use its call to flush out the data.  However,
3294  				 * the other thread may not force out this LR,
3295  				 * so we mark it WANT_SYNC.
3296  				 */
3297  				xlog_state_switch_iclogs(log, iclog, 0);
3298  				goto maybe_sleep;
3299  			}
3300  		}
3301  	}
3302  
3303  	/* By the time we come around again, the iclog could've been filled
3304  	 * which would give it another lsn.  If we have a new lsn, just
3305  	 * return because the relevant data has been flushed.
3306  	 */
3307  maybe_sleep:
3308  	if (flags & XFS_LOG_SYNC) {
3309  		/*
3310  		 * We must check if we're shutting down here, before
3311  		 * we wait, while we're holding the l_icloglock.
3312  		 * Then we check again after waking up, in case our
3313  		 * sleep was disturbed by bad news.
3314  		 */
3315  		if (iclog->ic_state & XLOG_STATE_IOERROR) {
3316  			spin_unlock(&log->l_icloglock);
3317  			return -EIO;
3318  		}
3319  		XFS_STATS_INC(mp, xs_log_force_sleep);
3320  		xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3321  		/*
3322  		 * No need to grab the log lock here since we're
3323  		 * only deciding whether or not to return EIO
3324  		 * and the memory read should be atomic.
3325  		 */
3326  		if (iclog->ic_state & XLOG_STATE_IOERROR)
3327  			return -EIO;
3328  	} else {
3329  
3330  no_sleep:
3331  		spin_unlock(&log->l_icloglock);
3332  	}
3333  	return 0;
3334  }
3335  
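/*
 * Example (sketch): a caller that needs everything committed so far to be
 * on stable storage, and also wants to know whether a flush was actually
 * issued, might do something like:
 *
 *	int	log_flushed = 0;
 *	int	error;
 *
 *	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
 *	if (error)
 *		return error;
 *
 * Passing a NULL log_flushed pointer is fine when the caller does not care
 * whether I/O was issued, and omitting XFS_LOG_SYNC makes the force
 * asynchronous: the iclog is pushed towards disk but we do not wait for
 * the write to complete.
 */
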
3336  /*
3337   * Wrapper for _xfs_log_force(), to be used when caller doesn't care
3338   * about errors or whether the log was flushed or not. This is the normal
3339   * interface to use when trying to unpin items or move the log forward.
3340   */
3341  void
3342  xfs_log_force(
3343  	xfs_mount_t	*mp,
3344  	uint		flags)
3345  {
3346  	int	error;
3347  
3348  	trace_xfs_log_force(mp, 0);
3349  	error = _xfs_log_force(mp, flags, NULL);
3350  	if (error)
3351  		xfs_warn(mp, "%s: error %d returned.", __func__, error);
3352  }
3353  
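/*
 * Example (sketch): fire-and-forget callers that just want to nudge the
 * log along, for instance to help unpin an item, would typically use the
 * wrapper above and ignore errors:
 *
 *	xfs_log_force(mp, 0);			(asynchronous push)
 *	xfs_log_force(mp, XFS_LOG_SYNC);	(wait for the iclog write)
 */
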
3354  /*
3355   * Force the in-core log to disk for a specific LSN.
3356   *
3357   * Find in-core log with lsn.
3358   *	If it is in the DIRTY state, just return.
3359   *	If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
3360   *		state and go to sleep or return.
3361   *	If it is in any other state, go to sleep or return.
3362   *
3363   * Synchronous forces are implemented with a signal variable. All callers
3364   * to force a given lsn to disk will wait on the sv attached to the
3365   * specific in-core log.  When the given in-core log finally completes its
3366   * write to disk, that thread will wake up all threads waiting on the
3367   * sv.
3368   */
3369  int
3370  _xfs_log_force_lsn(
3371  	struct xfs_mount	*mp,
3372  	xfs_lsn_t		lsn,
3373  	uint			flags,
3374  	int			*log_flushed)
3375  {
3376  	struct xlog		*log = mp->m_log;
3377  	struct xlog_in_core	*iclog;
3378  	int			already_slept = 0;
3379  
3380  	ASSERT(lsn != 0);
3381  
3382  	XFS_STATS_INC(mp, xs_log_force);
3383  
3384  	lsn = xlog_cil_force_lsn(log, lsn);
3385  	if (lsn == NULLCOMMITLSN)
3386  		return 0;
3387  
3388  try_again:
3389  	spin_lock(&log->l_icloglock);
3390  	iclog = log->l_iclog;
3391  	if (iclog->ic_state & XLOG_STATE_IOERROR) {
3392  		spin_unlock(&log->l_icloglock);
3393  		return -EIO;
3394  	}
3395  
3396  	do {
3397  		if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
3398  			iclog = iclog->ic_next;
3399  			continue;
3400  		}
3401  
3402  		if (iclog->ic_state == XLOG_STATE_DIRTY) {
3403  			spin_unlock(&log->l_icloglock);
3404  			return 0;
3405  		}
3406  
3407  		if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3408  			/*
3409  			 * We sleep here if we haven't already slept (e.g.
3410  			 * this is the first time we've looked at the correct
3411  			 * iclog buf) and the buffer before us is going to
3412  			 * be sync'ed. The reason for this is that if we
3413  			 * are doing sync transactions here, by waiting for
3414  			 * the previous I/O to complete, we can allow a few
3415  			 * more transactions into this iclog before we close
3416  			 * it down.
3417  			 *
3418  			 * Otherwise, we mark the buffer WANT_SYNC, and bump
3419  			 * up the refcnt so we can release the log (which
3420  			 * drops the ref count).  The state switch keeps new
3421  			 * transaction commits from using this buffer.  When
3422  			 * the current commits finish writing into the buffer,
3423  			 * the refcount will drop to zero and the buffer will
3424  			 * go out then.
3425  			 */
3426  			if (!already_slept &&
3427  			    (iclog->ic_prev->ic_state &
3428  			     (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
3429  				ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
3430  
3431  				XFS_STATS_INC(mp, xs_log_force_sleep);
3432  
3433  				xlog_wait(&iclog->ic_prev->ic_write_wait,
3434  							&log->l_icloglock);
3435  				already_slept = 1;
3436  				goto try_again;
3437  			}
3438  			atomic_inc(&iclog->ic_refcnt);
3439  			xlog_state_switch_iclogs(log, iclog, 0);
3440  			spin_unlock(&log->l_icloglock);
3441  			if (xlog_state_release_iclog(log, iclog))
3442  				return -EIO;
3443  			if (log_flushed)
3444  				*log_flushed = 1;
3445  			spin_lock(&log->l_icloglock);
3446  		}
3447  
3448  		if ((flags & XFS_LOG_SYNC) && /* sleep */
3449  		    !(iclog->ic_state &
3450  		      (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
3451  			/*
3452  			 * Don't wait on completion if we know that we've
3453  			 * gotten a log write error.
3454  			 */
3455  			if (iclog->ic_state & XLOG_STATE_IOERROR) {
3456  				spin_unlock(&log->l_icloglock);
3457  				return -EIO;
3458  			}
3459  			XFS_STATS_INC(mp, xs_log_force_sleep);
3460  			xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
3461  			/*
3462  			 * No need to grab the log lock here since we're
3463  			 * only deciding whether or not to return EIO
3464  			 * and the memory read should be atomic.
3465  			 */
3466  			if (iclog->ic_state & XLOG_STATE_IOERROR)
3467  				return -EIO;
3468  		} else {		/* just return */
3469  			spin_unlock(&log->l_icloglock);
3470  		}
3471  
3472  		return 0;
3473  	} while (iclog != log->l_iclog);
3474  
3475  	spin_unlock(&log->l_icloglock);
3476  	return 0;
3477  }
3478  
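/*
 * Example (sketch): an fsync-style path that has recorded the commit lsn
 * of the last transaction touching an inode could wait for just that much
 * of the log, rather than forcing everything, with something like:
 *
 *	int	log_flushed = 0;
 *	int	error;
 *
 *	error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
 *	if (error)
 *		return error;
 *
 * Here "lsn" is assumed to be the commit lsn the caller tracked earlier;
 * a zero lsn is not allowed (see the ASSERT above).
 */
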
3479  /*
3480   * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
3481   * about errors or whether the log was flushed or not. This is the normal
3482   * interface to use when trying to unpin items or move the log forward.
3483   */
3484  void
3485  xfs_log_force_lsn(
3486  	xfs_mount_t	*mp,
3487  	xfs_lsn_t	lsn,
3488  	uint		flags)
3489  {
3490  	int	error;
3491  
3492  	trace_xfs_log_force(mp, lsn);
3493  	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3494  	if (error)
3495  		xfs_warn(mp, "%s: error %d returned.", __func__, error);
3496  }
3497  
3498  /*
3499   * Called when we want to mark the current iclog as being ready to sync to
3500   * disk.
3501   */
3502  STATIC void
3503  xlog_state_want_sync(
3504  	struct xlog		*log,
3505  	struct xlog_in_core	*iclog)
3506  {
3507  	assert_spin_locked(&log->l_icloglock);
3508  
3509  	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3510  		xlog_state_switch_iclogs(log, iclog, 0);
3511  	} else {
3512  		ASSERT(iclog->ic_state &
3513  			(XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3514  	}
3515  }
3516  
3517  
3518  /*****************************************************************************
3519   *
3520   *		TICKET functions
3521   *
3522   *****************************************************************************
3523   */
3524  
3525  /*
3526   * Free a used ticket when its refcount falls to zero.
3527   */
3528  void
3529  xfs_log_ticket_put(
3530  	xlog_ticket_t	*ticket)
3531  {
3532  	ASSERT(atomic_read(&ticket->t_ref) > 0);
3533  	if (atomic_dec_and_test(&ticket->t_ref))
3534  		kmem_zone_free(xfs_log_ticket_zone, ticket);
3535  }
3536  
3537  xlog_ticket_t *
3538  xfs_log_ticket_get(
3539  	xlog_ticket_t	*ticket)
3540  {
3541  	ASSERT(atomic_read(&ticket->t_ref) > 0);
3542  	atomic_inc(&ticket->t_ref);
3543  	return ticket;
3544  }
3545  
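/*
 * Example (sketch): ticket reference counting follows the usual get/put
 * pattern.  A caller that wants to hold on to a ticket beyond the lifetime
 * of the reference it was handed would do something like:
 *
 *	tic = xfs_log_ticket_get(tic);
 *	... use the ticket ...
 *	xfs_log_ticket_put(tic);
 *
 * The final put frees the ticket back to xfs_log_ticket_zone.
 */
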
3546  /*
3547   * Figure out the total log space unit (in bytes) that would be
3548   * required for a log ticket.
3549   */
3550  int
3551  xfs_log_calc_unit_res(
3552  	struct xfs_mount	*mp,
3553  	int			unit_bytes)
3554  {
3555  	struct xlog		*log = mp->m_log;
3556  	int			iclog_space;
3557  	uint			num_headers;
3558  
3559  	/*
3560  	 * Permanent reservations have up to 'cnt'-1 active log operations
3561  	 * in the log.  A unit in this case is the amount of space for one
3562  	 * of these log operations.  Normal reservations have a cnt of 1
3563  	 * and their unit amount is the total amount of space required.
3564  	 *
3565  	 * The following lines of code account for non-transaction data
3566  	 * which occupy space in the on-disk log.
3567  	 *
3568  	 * Normal form of a transaction is:
3569  	 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
3570  	 * and then there are LR hdrs, split-recs and roundoff at end of syncs.
3571  	 *
3572  	 * We need to account for all the leadup data and trailer data
3573  	 * around the transaction data.
3574  	 * And then we need to account for the worst case in terms of using
3575  	 * more space.
3576  	 * The worst case will happen if:
3577  	 * - the placement of the transaction happens to be such that the
3578  	 *   roundoff is at its maximum
3579  	 * - the transaction data is synced before the commit record is synced
3580  	 *   i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
3581  	 *   Therefore the commit record is in its own Log Record.
3582  	 *   This can happen as the commit record is called with its
3583  	 *   own region to xlog_write().
3584  	 *   This then means that in the worst case, roundoff can happen for
3585  	 *   the commit-rec as well.
3586  	 *   The commit-rec is smaller than padding in this scenario and so it is
3587  	 *   not added separately.
3588  	 */
3589  
3590  	/* for trans header */
3591  	unit_bytes += sizeof(xlog_op_header_t);
3592  	unit_bytes += sizeof(xfs_trans_header_t);
3593  
3594  	/* for start-rec */
3595  	unit_bytes += sizeof(xlog_op_header_t);
3596  
3597  	/*
3598  	 * for LR headers - the space for data in an iclog is the size minus
3599  	 * the space used for the headers. If we use the iclog size, then we
3600  	 * undercalculate the number of headers required.
3601  	 *
3602  	 * Furthermore - the addition of op headers for split-recs might
3603  	 * increase the space required enough to require more log and op
3604  	 * headers, so take that into account too.
3605  	 *
3606  	 * IMPORTANT: This reservation makes the assumption that if this
3607  	 * transaction is the first in an iclog and hence has the LR headers
3608  	 * accounted to it, then the remaining space in the iclog is
3609  	 * exclusively for this transaction.  i.e. if the transaction is larger
3610  	 * than the iclog, it will be the only thing in that iclog.
3611  	 * Fundamentally, this means we must pass the entire log vector to
3612  	 * xlog_write to guarantee this.
3613  	 */
3614  	iclog_space = log->l_iclog_size - log->l_iclog_hsize;
3615  	num_headers = howmany(unit_bytes, iclog_space);
3616  
3617  	/* for split-recs - ophdrs added when data split over LRs */
3618  	unit_bytes += sizeof(xlog_op_header_t) * num_headers;
3619  
3620  	/* add extra header reservations if we overrun */
3621  	while (!num_headers ||
3622  	       howmany(unit_bytes, iclog_space) > num_headers) {
3623  		unit_bytes += sizeof(xlog_op_header_t);
3624  		num_headers++;
3625  	}
3626  	unit_bytes += log->l_iclog_hsize * num_headers;
3627  
3628  	/* for commit-rec LR header - note: padding will subsume the ophdr */
3629  	unit_bytes += log->l_iclog_hsize;
3630  
3631  	/* for roundoff padding for transaction data and one for commit record */
3632  	if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
3633  		/* log su roundoff */
3634  		unit_bytes += 2 * mp->m_sb.sb_logsunit;
3635  	} else {
3636  		/* BB roundoff */
3637  		unit_bytes += 2 * BBSIZE;
3638          }
3639  
3640  	return unit_bytes;
3641  }
3642  
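/*
 * Example (sketch): callers turn a raw transaction reservation into a
 * per-ticket unit reservation with the helper above, and the full grant
 * taken for a permanent ticket is then roughly the unit reservation
 * multiplied by the count of log operations:
 *
 *	unit_res = xfs_log_calc_unit_res(mp, unit_bytes);
 *	total_res = unit_res * cnt;
 *
 * xlog_ticket_alloc() below does the first step internally and stores the
 * result in t_unit_res/t_curr_res.
 */
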
3643  /*
3644   * Allocate and initialise a new log ticket.
3645   */
3646  struct xlog_ticket *
3647  xlog_ticket_alloc(
3648  	struct xlog		*log,
3649  	int			unit_bytes,
3650  	int			cnt,
3651  	char			client,
3652  	bool			permanent,
3653  	xfs_km_flags_t		alloc_flags)
3654  {
3655  	struct xlog_ticket	*tic;
3656  	int			unit_res;
3657  
3658  	tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3659  	if (!tic)
3660  		return NULL;
3661  
3662  	unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
3663  
3664  	atomic_set(&tic->t_ref, 1);
3665  	tic->t_task		= current;
3666  	INIT_LIST_HEAD(&tic->t_queue);
3667  	tic->t_unit_res		= unit_res;
3668  	tic->t_curr_res		= unit_res;
3669  	tic->t_cnt		= cnt;
3670  	tic->t_ocnt		= cnt;
3671  	tic->t_tid		= prandom_u32();
3672  	tic->t_clientid		= client;
3673  	tic->t_flags		= XLOG_TIC_INITED;
3674  	tic->t_trans_type	= 0;
3675  	if (permanent)
3676  		tic->t_flags |= XLOG_TIC_PERM_RESERV;
3677  
3678  	xlog_tic_reset_res(tic);
3679  
3680  	return tic;
3681  }
3682  
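/*
 * Example (sketch): the reservation path allocates a ticket along these
 * lines (the exact allocation flags are up to the caller):
 *
 *	tic = xlog_ticket_alloc(log, unit_bytes, cnt, XFS_TRANSACTION,
 *				permanent, KM_SLEEP | KM_MAYFAIL);
 *	if (!tic)
 *		return -ENOMEM;
 *
 * A permanent ticket (cnt > 1) is marked XLOG_TIC_PERM_RESERV so it can be
 * regranted across transaction rolls instead of being released after a
 * single use.
 */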
3683  
3684  /******************************************************************************
3685   *
3686   *		Log debug routines
3687   *
3688   ******************************************************************************
3689   */
3690  #if defined(DEBUG)
3691  /*
3692   * Make sure that the destination ptr is within the valid data region of
3693   * one of the iclogs.  This uses backup pointers stored in a different
3694   * part of the log in case we trash the log structure.
3695   */
3696  void
3697  xlog_verify_dest_ptr(
3698  	struct xlog	*log,
3699  	void		*ptr)
3700  {
3701  	int i;
3702  	int good_ptr = 0;
3703  
3704  	for (i = 0; i < log->l_iclog_bufs; i++) {
3705  		if (ptr >= log->l_iclog_bak[i] &&
3706  		    ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
3707  			good_ptr++;
3708  	}
3709  
3710  	if (!good_ptr)
3711  		xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3712  }
3713  
3714  /*
3715   * Check to make sure the grant write head didn't just overlap the tail.  If
3716   * the cycles are the same, we can't be overlapping.  Otherwise, make sure that
3717   * the cycles differ by exactly one and check the byte count.
3718   *
3719   * This check is run unlocked, so can give false positives. Rather than assert
3720   * on failures, use a warn-once flag and a panic tag to allow the admin to
3721   * determine if they want to panic the machine when such an error occurs. For
3722   * debug kernels this will have the same effect as using an assert but, unlike
3723   * an assert, it can be turned off at runtime.
3724   */
3725  STATIC void
3726  xlog_verify_grant_tail(
3727  	struct xlog	*log)
3728  {
3729  	int		tail_cycle, tail_blocks;
3730  	int		cycle, space;
3731  
3732  	xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
3733  	xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3734  	if (tail_cycle != cycle) {
3735  		if (cycle - 1 != tail_cycle &&
3736  		    !(log->l_flags & XLOG_TAIL_WARN)) {
3737  			xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3738  				"%s: cycle - 1 != tail_cycle", __func__);
3739  			log->l_flags |= XLOG_TAIL_WARN;
3740  		}
3741  
3742  		if (space > BBTOB(tail_blocks) &&
3743  		    !(log->l_flags & XLOG_TAIL_WARN)) {
3744  			xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3745  				"%s: space > BBTOB(tail_blocks)", __func__);
3746  			log->l_flags |= XLOG_TAIL_WARN;
3747  		}
3748  	}
3749  }
3750  
3751  /* check if it will fit */
3752  STATIC void
3753  xlog_verify_tail_lsn(
3754  	struct xlog		*log,
3755  	struct xlog_in_core	*iclog,
3756  	xfs_lsn_t		tail_lsn)
3757  {
3758      int blocks;
3759  
3760      if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
3761  	blocks =
3762  	    log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3763  	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3764  		xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3765      } else {
3766  	ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3767  
3768  	if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3769  		xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3770  
3771  	blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3772  	if (blocks < BTOBB(iclog->ic_offset) + 1)
3773  		xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3774      }
3775  }	/* xlog_verify_tail_lsn */
3776  
3777  /*
3778   * Perform a number of checks on the iclog before writing to disk.
3779   *
3780   * 1. Make sure the iclogs are still circular
3781   * 2. Make sure we have a good magic number
3782   * 3. Make sure we don't have magic numbers in the data
3783   * 4. Check fields of each log operation header for:
3784   *	A. Valid client identifier
3785   *	B. tid ptr value falls in valid ptr space (user space code)
3786   *	C. Length in log record header is correct according to the
3787   *		individual operation headers within record.
3788   * 5. When a bwrite will occur within 5 blocks of the front of the physical
3789   *	log, check the preceding blocks of the physical log to make sure all
3790   *	the cycle numbers agree with the current cycle number.
3791   */
3792  STATIC void
3793  xlog_verify_iclog(
3794  	struct xlog		*log,
3795  	struct xlog_in_core	*iclog,
3796  	int			count,
3797  	bool                    syncing)
3798  {
3799  	xlog_op_header_t	*ophead;
3800  	xlog_in_core_t		*icptr;
3801  	xlog_in_core_2_t	*xhdr;
3802  	void			*base_ptr, *ptr, *p;
3803  	ptrdiff_t		field_offset;
3804  	__uint8_t		clientid;
3805  	int			len, i, j, k, op_len;
3806  	int			idx;
3807  
3808  	/* check validity of iclog pointers */
3809  	spin_lock(&log->l_icloglock);
3810  	icptr = log->l_iclog;
3811  	for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
3812  		ASSERT(icptr);
3813  
3814  	if (icptr != log->l_iclog)
3815  		xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3816  	spin_unlock(&log->l_icloglock);
3817  
3818  	/* check log magic numbers */
3819  	if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3820  		xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3821  
3822  	base_ptr = ptr = &iclog->ic_header;
3823  	p = &iclog->ic_header;
3824  	for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
3825  		if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
3826  			xfs_emerg(log->l_mp, "%s: unexpected magic num",
3827  				__func__);
3828  	}
3829  
3830  	/* check fields */
3831  	len = be32_to_cpu(iclog->ic_header.h_num_logops);
3832  	base_ptr = ptr = iclog->ic_datap;
3833  	ophead = ptr;
3834  	xhdr = iclog->ic_data;
3835  	for (i = 0; i < len; i++) {
3836  		ophead = ptr;
3837  
3838  		/* clientid is only 1 byte */
3839  		p = &ophead->oh_clientid;
3840  		field_offset = p - base_ptr;
3841  		if (!syncing || (field_offset & 0x1ff)) {
3842  			clientid = ophead->oh_clientid;
3843  		} else {
3844  			idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
3845  			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3846  				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3847  				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3848  				clientid = xlog_get_client_id(
3849  					xhdr[j].hic_xheader.xh_cycle_data[k]);
3850  			} else {
3851  				clientid = xlog_get_client_id(
3852  					iclog->ic_header.h_cycle_data[idx]);
3853  			}
3854  		}
3855  		if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3856  			xfs_warn(log->l_mp,
3857  				"%s: invalid clientid %d op 0x%p offset 0x%lx",
3858  				__func__, clientid, ophead,
3859  				(unsigned long)field_offset);
3860  
3861  		/* check length */
3862  		p = &ophead->oh_len;
3863  		field_offset = p - base_ptr;
3864  		if (!syncing || (field_offset & 0x1ff)) {
3865  			op_len = be32_to_cpu(ophead->oh_len);
3866  		} else {
3867  			idx = BTOBBT((uintptr_t)&ophead->oh_len -
3868  				    (uintptr_t)iclog->ic_datap);
3869  			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3870  				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3871  				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3872  				op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
3873  			} else {
3874  				op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
3875  			}
3876  		}
3877  		ptr += sizeof(xlog_op_header_t) + op_len;
3878  	}
3879  }	/* xlog_verify_iclog */
3880  #endif
3881  
3882  /*
3883   * Mark all iclogs IOERROR. l_icloglock is held by the caller.
3884   */
3885  STATIC int
3886  xlog_state_ioerror(
3887  	struct xlog	*log)
3888  {
3889  	xlog_in_core_t	*iclog, *ic;
3890  
3891  	iclog = log->l_iclog;
3892  	if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
3893  		/*
3894  		 * Mark all the incore logs IOERROR.
3895  		 * From now on, no log flushes will result.
3896  		 */
3897  		ic = iclog;
3898  		do {
3899  			ic->ic_state = XLOG_STATE_IOERROR;
3900  			ic = ic->ic_next;
3901  		} while (ic != iclog);
3902  		return 0;
3903  	}
3904  	/*
3905  	 * Return non-zero, if state transition has already happened.
3906  	 */
3907  	return 1;
3908  }
3909  
3910  /*
3911   * This is called from xfs_force_shutdown, when we're forcibly
3912   * shutting down the filesystem, typically because of an IO error.
3913   * Our main objectives here are to make sure that:
3914   *	a. if !logerror, flush the logs to disk. Anything modified
3915   *	   after this is ignored.
3916   *	b. the filesystem gets marked 'SHUTDOWN' for all interested
3917   *	   parties to find out, 'atomically'.
3918   *	c. those who're sleeping on log reservations, pinned objects and
3919   *	    other resources get woken up and told the bad news.
3920   *	d. nothing new gets queued up after (b) and (c) are done.
3921   *
3922   * Note: for the !logerror case we need to flush the regions held in memory out
3923   * to disk first. This needs to be done before the log is marked as shutdown,
3924   * otherwise the iclog writes will fail.
3925   */
3926  int
3927  xfs_log_force_umount(
3928  	struct xfs_mount	*mp,
3929  	int			logerror)
3930  {
3931  	struct xlog	*log;
3932  	int		retval;
3933  
3934  	log = mp->m_log;
3935  
3936  	/*
3937  	 * If this happens during log recovery, don't worry about
3938  	 * locking; the log isn't open for business yet.
3939  	 */
3940  	if (!log ||
3941  	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
3942  		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3943  		if (mp->m_sb_bp)
3944  			XFS_BUF_DONE(mp->m_sb_bp);
3945  		return 0;
3946  	}
3947  
3948  	/*
3949  	 * Somebody could've already done the hard work for us.
3950  	 * No need to get locks for this.
3951  	 */
3952  	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
3953  		ASSERT(XLOG_FORCED_SHUTDOWN(log));
3954  		return 1;
3955  	}
3956  
3957  	/*
3958  	 * Flush all the completed transactions to disk before marking the log
3959  	 * as shut down. We need to do it in this order to ensure that
3960  	 * completed operations are safely on disk before we shut down, and that
3961  	 * we don't have to issue any buffer IO after the shutdown flags are set
3962  	 * to guarantee this.
3963  	 */
3964  	if (!logerror)
3965  		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3966  
3967  	/*
3968  	 * mark the filesystem and the log as in a shutdown state and wake
3969  	 * everybody up to tell them the bad news.
3970  	 */
3971  	spin_lock(&log->l_icloglock);
3972  	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3973  	if (mp->m_sb_bp)
3974  		XFS_BUF_DONE(mp->m_sb_bp);
3975  
3976  	/*
3977  	 * Mark the log and the iclogs with IO error flags to prevent any
3978  	 * further log IO from being issued or completed.
3979  	 */
3980  	log->l_flags |= XLOG_IO_ERROR;
3981  	retval = xlog_state_ioerror(log);
3982  	spin_unlock(&log->l_icloglock);
3983  
3984  	/*
3985  	 * We don't want anybody waiting for log reservations after this. That
3986  	 * means we have to wake up everybody queued up on reserveq as well as
3987  	 * writeq.  In addition, we make sure in xlog_{re}grant_log_space that
3988  	 * we don't enqueue anything once the SHUTDOWN flag is set, and this
3989  	 * action is protected by the grant locks.
3990  	 */
3991  	xlog_grant_head_wake_all(&log->l_reserve_head);
3992  	xlog_grant_head_wake_all(&log->l_write_head);
3993  
3994  	/*
3995  	 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
3996  	 * as if the log writes were completed. The abort handling in the log
3997  	 * item committed callback functions will do this again under lock to
3998  	 * avoid races.
3999  	 */
4000  	wake_up_all(&log->l_cilp->xc_commit_wait);
4001  	xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
4002  
4003  #ifdef XFSERRORDEBUG
4004  	{
4005  		xlog_in_core_t	*iclog;
4006  
4007  		spin_lock(&log->l_icloglock);
4008  		iclog = log->l_iclog;
4009  		do {
4010  			ASSERT(iclog->ic_callback == 0);
4011  			iclog = iclog->ic_next;
4012  		} while (iclog != log->l_iclog);
4013  		spin_unlock(&log->l_icloglock);
4014  	}
4015  #endif
4016  	/* return non-zero if log IOERROR transition had already happened */
4017  	return retval;
4018  }
4019  
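/*
 * Example (sketch): a shutdown path, such as one triggered by a failed log
 * write, ends up here roughly as follows, with logerror non-zero when the
 * log itself is the source of the failure:
 *
 *	if (xfs_log_force_umount(mp, logerror))
 *		return;		(shutdown already in progress)
 *
 * The return value only reports whether the IOERROR transition had already
 * happened, so callers generally use it to avoid repeating the
 * notification work, not to report an error.
 */
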
4020  STATIC int
4021  xlog_iclogs_empty(
4022  	struct xlog	*log)
4023  {
4024  	xlog_in_core_t	*iclog;
4025  
4026  	iclog = log->l_iclog;
4027  	do {
4028  		/* endianness does not matter here, zero is zero in
4029  		 * any language.
4030  		 */
4031  		if (iclog->ic_header.h_num_logops)
4032  			return 0;
4033  		iclog = iclog->ic_next;
4034  	} while (iclog != log->l_iclog);
4035  	return 1;
4036  }
4037  
4038  /*
4039   * Verify that an LSN stamped into a piece of metadata is valid. This is
4040   * intended for use in read verifiers on v5 superblocks.
4041   */
4042  bool
4043  xfs_log_check_lsn(
4044  	struct xfs_mount	*mp,
4045  	xfs_lsn_t		lsn)
4046  {
4047  	struct xlog		*log = mp->m_log;
4048  	bool			valid;
4049  
4050  	/*
4051  	 * norecovery mode skips mount-time log processing and unconditionally
4052  	 * resets the in-core LSN. We can't validate in this mode, but
4053  	 * modifications are not allowed anyway, so just return true.
4054  	 */
4055  	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
4056  		return true;
4057  
4058  	/*
4059  	 * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
4060  	 * handled by recovery and thus safe to ignore here.
4061  	 */
4062  	if (lsn == NULLCOMMITLSN)
4063  		return true;
4064  
4065  	valid = xlog_valid_lsn(mp->m_log, lsn);
4066  
4067  	/* warn the user about what's gone wrong before verifier failure */
4068  	if (!valid) {
4069  		spin_lock(&log->l_icloglock);
4070  		xfs_warn(mp,
4071  "Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
4072  "Please unmount and run xfs_repair (>= v4.3) to resolve.",
4073  			 CYCLE_LSN(lsn), BLOCK_LSN(lsn),
4074  			 log->l_curr_cycle, log->l_curr_block);
4075  		spin_unlock(&log->l_icloglock);
4076  	}
4077  
4078  	return valid;
4079  }
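
/*
 * Example (sketch): a v5 read verifier would typically reject metadata
 * stamped with an impossible (future) LSN with something like:
 *
 *	if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->lsn)))
 *		return false;
 *
 * where hdr->lsn stands in for whatever LSN field the particular on-disk
 * structure carries.
 */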
4080