• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_log.h"
22 #include "xfs_trans.h"
23 #include "xfs_sb.h"
24 #include "xfs_ag.h"
25 #include "xfs_mount.h"
26 #include "xfs_bmap_btree.h"
27 #include "xfs_dinode.h"
28 #include "xfs_inode.h"
29 #include "xfs_inode_item.h"
30 #include "xfs_bmap.h"
31 #include "xfs_itable.h"
32 #include "xfs_dfrag.h"
33 #include "xfs_error.h"
34 #include "xfs_vnodeops.h"
35 #include "xfs_trace.h"
36 
37 
38 static int xfs_swap_extents(
39 	xfs_inode_t	*ip,	/* target inode */
40 	xfs_inode_t	*tip,	/* tmp inode */
41 	xfs_swapext_t	*sxp);
42 
43 /*
44  * ioctl interface for swapext
45  */
46 int
xfs_swapext(xfs_swapext_t * sxp)47 xfs_swapext(
48 	xfs_swapext_t	*sxp)
49 {
50 	xfs_inode_t     *ip, *tip;
51 	struct fd	f, tmp;
52 	int		error = 0;
53 
54 	/* Pull information for the target fd */
55 	f = fdget((int)sxp->sx_fdtarget);
56 	if (!f.file) {
57 		error = XFS_ERROR(EINVAL);
58 		goto out;
59 	}
60 
61 	if (!(f.file->f_mode & FMODE_WRITE) ||
62 	    !(f.file->f_mode & FMODE_READ) ||
63 	    (f.file->f_flags & O_APPEND)) {
64 		error = XFS_ERROR(EBADF);
65 		goto out_put_file;
66 	}
67 
68 	tmp = fdget((int)sxp->sx_fdtmp);
69 	if (!tmp.file) {
70 		error = XFS_ERROR(EINVAL);
71 		goto out_put_file;
72 	}
73 
74 	if (!(tmp.file->f_mode & FMODE_WRITE) ||
75 	    !(tmp.file->f_mode & FMODE_READ) ||
76 	    (tmp.file->f_flags & O_APPEND)) {
77 		error = XFS_ERROR(EBADF);
78 		goto out_put_tmp_file;
79 	}
80 
81 	if (IS_SWAPFILE(file_inode(f.file)) ||
82 	    IS_SWAPFILE(file_inode(tmp.file))) {
83 		error = XFS_ERROR(EINVAL);
84 		goto out_put_tmp_file;
85 	}
86 
87 	ip = XFS_I(file_inode(f.file));
88 	tip = XFS_I(file_inode(tmp.file));
89 
90 	if (ip->i_mount != tip->i_mount) {
91 		error = XFS_ERROR(EINVAL);
92 		goto out_put_tmp_file;
93 	}
94 
95 	if (ip->i_ino == tip->i_ino) {
96 		error = XFS_ERROR(EINVAL);
97 		goto out_put_tmp_file;
98 	}
99 
100 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
101 		error = XFS_ERROR(EIO);
102 		goto out_put_tmp_file;
103 	}
104 
105 	error = xfs_swap_extents(ip, tip, sxp);
106 
107  out_put_tmp_file:
108 	fdput(tmp);
109  out_put_file:
110 	fdput(f);
111  out:
112 	return error;
113 }
114 
115 /*
116  * We need to check that the format of the data fork in the temporary inode is
117  * valid for the target inode before doing the swap. This is not a problem with
118  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
119  * data fork depending on the space the attribute fork is taking so we can get
120  * invalid formats on the target inode.
121  *
122  * E.g. target has space for 7 extents in extent format, temp inode only has
123  * space for 6.  If we defragment down to 7 extents, then the tmp format is a
124  * btree, but when swapped it needs to be in extent format. Hence we can't just
125  * blindly swap data forks on attr2 filesystems.
126  *
127  * Note that we check the swap in both directions so that we don't end up with
128  * a corrupt temporary inode, either.
129  *
130  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
131  * inode will prevent this situation from occurring, so all we do here is
132  * reject and log the attempt. basically we are putting the responsibility on
133  * userspace to get this right.
134  */
135 static int
xfs_swap_extents_check_format(xfs_inode_t * ip,xfs_inode_t * tip)136 xfs_swap_extents_check_format(
137 	xfs_inode_t	*ip,	/* target inode */
138 	xfs_inode_t	*tip)	/* tmp inode */
139 {
140 
141 	/* Should never get a local format */
142 	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
143 	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
144 		return EINVAL;
145 
146 	/*
147 	 * if the target inode has less extents that then temporary inode then
148 	 * why did userspace call us?
149 	 */
150 	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
151 		return EINVAL;
152 
153 	/*
154 	 * if the target inode is in extent form and the temp inode is in btree
155 	 * form then we will end up with the target inode in the wrong format
156 	 * as we already know there are less extents in the temp inode.
157 	 */
158 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
159 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
160 		return EINVAL;
161 
162 	/* Check temp in extent form to max in target */
163 	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
164 	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
165 			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
166 		return EINVAL;
167 
168 	/* Check target in extent form to max in temp */
169 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
170 	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
171 			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
172 		return EINVAL;
173 
174 	/*
175 	 * If we are in a btree format, check that the temp root block will fit
176 	 * in the target and that it has enough extents to be in btree format
177 	 * in the target.
178 	 *
179 	 * Note that we have to be careful to allow btree->extent conversions
180 	 * (a common defrag case) which will occur when the temp inode is in
181 	 * extent format...
182 	 */
183 	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 		if (XFS_IFORK_BOFF(ip) &&
185 		    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
186 			return EINVAL;
187 		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
188 		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
189 			return EINVAL;
190 	}
191 
192 	/* Reciprocal target->temp btree format checks */
193 	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
194 		if (XFS_IFORK_BOFF(tip) &&
195 		    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
196 			return EINVAL;
197 
198 		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
199 		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
200 			return EINVAL;
201 	}
202 
203 	return 0;
204 }
205 
206 static int
xfs_swap_extents(xfs_inode_t * ip,xfs_inode_t * tip,xfs_swapext_t * sxp)207 xfs_swap_extents(
208 	xfs_inode_t	*ip,	/* target inode */
209 	xfs_inode_t	*tip,	/* tmp inode */
210 	xfs_swapext_t	*sxp)
211 {
212 	xfs_mount_t	*mp = ip->i_mount;
213 	xfs_trans_t	*tp;
214 	xfs_bstat_t	*sbp = &sxp->sx_stat;
215 	xfs_ifork_t	*tempifp, *ifp, *tifp;
216 	int		src_log_flags, target_log_flags;
217 	int		error = 0;
218 	int		aforkblks = 0;
219 	int		taforkblks = 0;
220 	__uint64_t	tmp;
221 
222 	/*
223 	 * We have no way of updating owner information in the BMBT blocks for
224 	 * each inode on CRC enabled filesystems, so to avoid corrupting the
225 	 * this metadata we simply don't allow extent swaps to occur.
226 	 */
227 	if (xfs_sb_version_hascrc(&mp->m_sb))
228 		return XFS_ERROR(EINVAL);
229 
230 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
231 	if (!tempifp) {
232 		error = XFS_ERROR(ENOMEM);
233 		goto out;
234 	}
235 
236 	/*
237 	 * we have to do two separate lock calls here to keep lockdep
238 	 * happy. If we try to get all the locks in one call, lock will
239 	 * report false positives when we drop the ILOCK and regain them
240 	 * below.
241 	 */
242 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
243 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
244 
245 	/* Verify that both files have the same format */
246 	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
247 		error = XFS_ERROR(EINVAL);
248 		goto out_unlock;
249 	}
250 
251 	/* Verify both files are either real-time or non-realtime */
252 	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
253 		error = XFS_ERROR(EINVAL);
254 		goto out_unlock;
255 	}
256 
257 	error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
258 	if (error)
259 		goto out_unlock;
260 	truncate_pagecache_range(VFS_I(tip), 0, -1);
261 
262 	/* Verify O_DIRECT for ftmp */
263 	if (VN_CACHED(VFS_I(tip)) != 0) {
264 		error = XFS_ERROR(EINVAL);
265 		goto out_unlock;
266 	}
267 
268 	/* Verify all data are being swapped */
269 	if (sxp->sx_offset != 0 ||
270 	    sxp->sx_length != ip->i_d.di_size ||
271 	    sxp->sx_length != tip->i_d.di_size) {
272 		error = XFS_ERROR(EFAULT);
273 		goto out_unlock;
274 	}
275 
276 	trace_xfs_swap_extent_before(ip, 0);
277 	trace_xfs_swap_extent_before(tip, 1);
278 
279 	/* check inode formats now that data is flushed */
280 	error = xfs_swap_extents_check_format(ip, tip);
281 	if (error) {
282 		xfs_notice(mp,
283 		    "%s: inode 0x%llx format is incompatible for exchanging.",
284 				__func__, ip->i_ino);
285 		goto out_unlock;
286 	}
287 
288 	/*
289 	 * Compare the current change & modify times with that
290 	 * passed in.  If they differ, we abort this swap.
291 	 * This is the mechanism used to ensure the calling
292 	 * process that the file was not changed out from
293 	 * under it.
294 	 */
295 	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
296 	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
297 	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
298 	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
299 		error = XFS_ERROR(EBUSY);
300 		goto out_unlock;
301 	}
302 
303 	/* We need to fail if the file is memory mapped.  Once we have tossed
304 	 * all existing pages, the page fault will have no option
305 	 * but to go to the filesystem for pages. By making the page fault call
306 	 * vop_read (or write in the case of autogrow) they block on the iolock
307 	 * until we have switched the extents.
308 	 */
309 	if (VN_MAPPED(VFS_I(ip))) {
310 		error = XFS_ERROR(EBUSY);
311 		goto out_unlock;
312 	}
313 
314 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
315 	xfs_iunlock(tip, XFS_ILOCK_EXCL);
316 
317 	/*
318 	 * There is a race condition here since we gave up the
319 	 * ilock.  However, the data fork will not change since
320 	 * we have the iolock (locked for truncation too) so we
321 	 * are safe.  We don't really care if non-io related
322 	 * fields change.
323 	 */
324 	truncate_pagecache_range(VFS_I(ip), 0, -1);
325 
326 	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
327 	if ((error = xfs_trans_reserve(tp, 0,
328 				     XFS_ICHANGE_LOG_RES(mp), 0,
329 				     0, 0))) {
330 		xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
331 		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
332 		xfs_trans_cancel(tp, 0);
333 		goto out;
334 	}
335 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
336 
337 	/*
338 	 * Count the number of extended attribute blocks
339 	 */
340 	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
341 	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
342 		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
343 		if (error)
344 			goto out_trans_cancel;
345 	}
346 	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
347 	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
348 		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
349 			&taforkblks);
350 		if (error)
351 			goto out_trans_cancel;
352 	}
353 
354 	/*
355 	 * Swap the data forks of the inodes
356 	 */
357 	ifp = &ip->i_df;
358 	tifp = &tip->i_df;
359 	*tempifp = *ifp;	/* struct copy */
360 	*ifp = *tifp;		/* struct copy */
361 	*tifp = *tempifp;	/* struct copy */
362 
363 	/*
364 	 * Fix the on-disk inode values
365 	 */
366 	tmp = (__uint64_t)ip->i_d.di_nblocks;
367 	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
368 	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
369 
370 	tmp = (__uint64_t) ip->i_d.di_nextents;
371 	ip->i_d.di_nextents = tip->i_d.di_nextents;
372 	tip->i_d.di_nextents = tmp;
373 
374 	tmp = (__uint64_t) ip->i_d.di_format;
375 	ip->i_d.di_format = tip->i_d.di_format;
376 	tip->i_d.di_format = tmp;
377 
378 	/*
379 	 * The extents in the source inode could still contain speculative
380 	 * preallocation beyond EOF (e.g. the file is open but not modified
381 	 * while defrag is in progress). In that case, we need to copy over the
382 	 * number of delalloc blocks the data fork in the source inode is
383 	 * tracking beyond EOF so that when the fork is truncated away when the
384 	 * temporary inode is unlinked we don't underrun the i_delayed_blks
385 	 * counter on that inode.
386 	 */
387 	ASSERT(tip->i_delayed_blks == 0);
388 	tip->i_delayed_blks = ip->i_delayed_blks;
389 	ip->i_delayed_blks = 0;
390 
391 	src_log_flags = XFS_ILOG_CORE;
392 	switch (ip->i_d.di_format) {
393 	case XFS_DINODE_FMT_EXTENTS:
394 		/* If the extents fit in the inode, fix the
395 		 * pointer.  Otherwise it's already NULL or
396 		 * pointing to the extent.
397 		 */
398 		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
399 			ifp->if_u1.if_extents =
400 				ifp->if_u2.if_inline_ext;
401 		}
402 		src_log_flags |= XFS_ILOG_DEXT;
403 		break;
404 	case XFS_DINODE_FMT_BTREE:
405 		src_log_flags |= XFS_ILOG_DBROOT;
406 		break;
407 	}
408 
409 	target_log_flags = XFS_ILOG_CORE;
410 	switch (tip->i_d.di_format) {
411 	case XFS_DINODE_FMT_EXTENTS:
412 		/* If the extents fit in the inode, fix the
413 		 * pointer.  Otherwise it's already NULL or
414 		 * pointing to the extent.
415 		 */
416 		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
417 			tifp->if_u1.if_extents =
418 				tifp->if_u2.if_inline_ext;
419 		}
420 		target_log_flags |= XFS_ILOG_DEXT;
421 		break;
422 	case XFS_DINODE_FMT_BTREE:
423 		target_log_flags |= XFS_ILOG_DBROOT;
424 		break;
425 	}
426 
427 
428 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
429 	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
430 
431 	xfs_trans_log_inode(tp, ip,  src_log_flags);
432 	xfs_trans_log_inode(tp, tip, target_log_flags);
433 
434 	/*
435 	 * If this is a synchronous mount, make sure that the
436 	 * transaction goes to disk before returning to the user.
437 	 */
438 	if (mp->m_flags & XFS_MOUNT_WSYNC)
439 		xfs_trans_set_sync(tp);
440 
441 	error = xfs_trans_commit(tp, 0);
442 
443 	trace_xfs_swap_extent_after(ip, 0);
444 	trace_xfs_swap_extent_after(tip, 1);
445 out:
446 	kmem_free(tempifp);
447 	return error;
448 
449 out_unlock:
450 	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
451 	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
452 	goto out;
453 
454 out_trans_cancel:
455 	xfs_trans_cancel(tp, 0);
456 	goto out_unlock;
457 }
458