/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * file.c
 *
 * File open, close, extend, truncate
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/sched.h>
#include <linux/splice.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>

#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
#include "xattr.h"
#include "acl.h"
#include "quota.h"

#include "buffer_head_io.h"

static int ocfs2_sync_inode(struct inode *inode)
{
	filemap_fdatawrite(inode->i_mapping);
	return sync_mapping_buffers(inode->i_mapping);
}

static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp;

	fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->fp_file = file;
	mutex_init(&fp->fp_mutex);
	ocfs2_file_lock_res_init(&fp->fp_flock, fp);
	file->private_data = fp;

	return 0;
}

static void ocfs2_free_file_private(struct inode *inode, struct file *file)
{
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (fp) {
		ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
		ocfs2_lock_res_free(&fp->fp_flock);
		kfree(fp);
		file->private_data = NULL;
	}
}

static int ocfs2_file_open(struct inode *inode, struct file *file)
{
	int status;
	int mode = file->f_flags;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
		   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);

	spin_lock(&oi->ip_lock);

	/* Check that the inode hasn't been wiped from disk by another
	 * node. If it hasn't then we're safe as long as we hold the
	 * spin lock until our increment of open count. */
	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
		spin_unlock(&oi->ip_lock);

		status = -ENOENT;
		goto leave;
	}

	if (mode & O_DIRECT)
		oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

	oi->ip_open_count++;
	spin_unlock(&oi->ip_lock);

	status = ocfs2_init_file_private(inode, file);
	if (status) {
		/*
		 * We want to set open count back if we're failing the
		 * open.
		 */
		spin_lock(&oi->ip_lock);
		oi->ip_open_count--;
		spin_unlock(&oi->ip_lock);
	}

leave:
	mlog_exit(status);
	return status;
}

static int ocfs2_file_release(struct inode *inode, struct file *file)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
		       file->f_path.dentry->d_name.len,
		       file->f_path.dentry->d_name.name);

	spin_lock(&oi->ip_lock);
	if (!--oi->ip_open_count)
		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
	spin_unlock(&oi->ip_lock);

	ocfs2_free_file_private(inode, file);

	mlog_exit(0);

	return 0;
}

static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	return ocfs2_init_file_private(inode, file);
}

static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);
	return 0;
}

static int ocfs2_sync_file(struct file *file,
			   struct dentry *dentry,
			   int datasync)
{
	int err = 0;
	journal_t *journal;
	struct inode *inode = dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
		   dentry->d_name.len, dentry->d_name.name);

	err = ocfs2_sync_inode(dentry->d_inode);
	if (err)
		goto bail;

	journal = osb->journal->j_journal;
	err = jbd2_journal_force_commit(journal);

bail:
	mlog_exit(err);

	return (err < 0) ? -EIO : 0;
}

int ocfs2_should_update_atime(struct inode *inode,
			      struct vfsmount *vfsmnt)
{
	struct timespec now;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return 0;

	if ((inode->i_flags & S_NOATIME) ||
	    ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

	/*
	 * We can be called with no vfsmnt structure - NFSD will
	 * sometimes do this.
	 *
	 * Note that our action here is different than touch_atime() -
	 * if we can't tell whether this is a noatime mount, then we
	 * don't know whether to trust the value of s_atime_quantum.
	 */
	if (vfsmnt == NULL)
		return 0;

	if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
	    ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
		return 0;

	if (vfsmnt->mnt_flags & MNT_RELATIME) {
		if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
		    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
			return 1;

		return 0;
	}

	now = CURRENT_TIME;
	if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
		return 0;
	else
		return 1;
}
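
/*
 * Illustrative example (not from the original source): with
 * s_atime_quantum = 60 on a plain (non-relatime) mount, an atime that
 * is 45 seconds stale makes ocfs2_should_update_atime() return 0 (skip
 * the update), while one 61 seconds stale returns 1, so cluster-wide
 * atime writes happen at most once per quantum.
 */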

int ocfs2_update_inode_atime(struct inode *inode,
			     struct buffer_head *bh)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;

	mlog_entry_void();

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Don't use ocfs2_mark_inode_dirty() here as we don't always
	 * have i_mutex to guard against concurrent changes to other
	 * inode fields.
	 */
	inode->i_atime = CURRENT_TIME;
	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret < 0)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
	mlog_exit(ret);
	return ret;
}

static int ocfs2_set_inode_size(handle_t *handle,
				struct inode *inode,
				struct buffer_head *fe_bh,
				u64 new_i_size)
{
	int status;

	mlog_entry_void();
	i_size_write(inode, new_i_size);
	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	mlog_exit(status);
	return status;
}

int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	handle_t *handle = NULL;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				   new_i_size);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}

static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				     struct inode *inode,
				     struct buffer_head *fe_bh,
				     u64 new_i_size)
{
	int status;
	handle_t *handle;
	struct ocfs2_dinode *di;
	u64 cluster_bytes;

	mlog_entry_void();

	/* TODO: This needs to actually orphan the inode in this
	 * transaction. */

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_journal_access_di(handle, inode, fe_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/*
	 * Do this before setting i_size.
	 */
	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
					       cluster_bytes);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	i_size_write(inode, new_i_size);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di = (struct ocfs2_dinode *) fe_bh->b_data;
	di->i_size = cpu_to_le64(new_i_size);
	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0)
		mlog_errno(status);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:

	mlog_exit(status);
	return status;
}

static int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
{
	int status = 0;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_truncate_context *tc = NULL;

	mlog_entry("(inode = %llu, new_i_size = %llu)\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
		   (unsigned long long)new_i_size);

	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
	 * already validated it */
	fe = (struct ocfs2_dinode *) di_bh->b_data;

	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
			"Inode %llu, inode i_size = %lld != di "
			"i_size = %llu, i_flags = 0x%x\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			i_size_read(inode),
			(unsigned long long)le64_to_cpu(fe->i_size),
			le32_to_cpu(fe->i_flags));

	if (new_i_size > le64_to_cpu(fe->i_size)) {
		mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
		     (unsigned long long)le64_to_cpu(fe->i_size),
		     (unsigned long long)new_i_size);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
	     (unsigned long long)le64_to_cpu(fe->i_blkno),
	     (unsigned long long)le64_to_cpu(fe->i_size),
	     (unsigned long long)new_i_size);

	/* Let's handle the simple truncate cases before doing any more
	 * cluster locking. */
	if (new_i_size == le64_to_cpu(fe->i_size))
		goto bail;

	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	/*
	 * The inode lock forced other nodes to sync and drop their
	 * pages, which (correctly) happens even if we have a truncate
	 * without allocation change - ocfs2 cluster sizes can be much
	 * greater than page size, so we have to truncate them
	 * anyway.
	 */
	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(inode->i_mapping, new_i_size);

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
					       i_size_read(inode), 1);
		if (status)
			mlog_errno(status);

		goto bail_unlock_sem;
	}

	/* alright, we're going to need to do a full blown alloc size
	 * change. Orphan the inode so that recovery can complete the
	 * truncate if necessary. This does the task of marking
	 * i_size. */
	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
	if (status < 0) {
		mlog_errno(status);
		goto bail_unlock_sem;
	}

	/* TODO: orphan dir cleanup here. */
bail_unlock_sem:
	up_write(&OCFS2_I(inode)->ip_alloc_sem);

bail:

	mlog_exit(status);
	return status;
}

/*
 * Extend file allocation only here.
 * We'll update all the disk stuff, and oip->alloc_size.
 *
 * Expects stuff to be locked, a transaction started and enough data /
 * metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason will always be set, even on error.
 */
int ocfs2_add_inode_data(struct ocfs2_super *osb,
			 struct inode *inode,
			 u32 *logical_offset,
			 u32 clusters_to_add,
			 int mark_unwritten,
			 struct buffer_head *fe_bh,
			 handle_t *handle,
			 struct ocfs2_alloc_context *data_ac,
			 struct ocfs2_alloc_context *meta_ac,
			 enum ocfs2_alloc_restarted *reason_ret)
{
	int ret;
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
					   clusters_to_add, mark_unwritten,
					   &et, handle,
					   data_ac, meta_ac, reason_ret);

	return ret;
}
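
/*
 * Illustrative note (not from the original source): the restart
 * protocol above is driven by *reason_ret. A caller sketch, which
 * __ocfs2_extend_allocation() below implements in full:
 *
 *	ret = ocfs2_add_inode_data(..., &why);
 *	if (why == RESTART_TRANS)
 *		// extend the handle's credits and retry the add
 *	else if (why == RESTART_META)
 *		// commit, re-reserve allocators, restart from the top
 */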

static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				     u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

	/*
	 * This function only exists for file systems which don't
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
	     "clusters_to_add = %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
	     clusters_to_add);
	ocfs2_init_dinode_extent_tree(&et, inode, bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
					    clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
	    clusters_to_add))) {
		status = -EDQUOT;
		goto leave;
	}
	did_quota = 1;

	/* reserve a write to the file entry early on - that way if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, inode, bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	vfs_dq_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			mlog(0, "restarting function.\n");
			restart_func = 1;
		} else {
			BUG_ON(why != RESTART_TRANS);

			mlog(0, "restarting transaction.\n");
			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    &fe->id2.i_list,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size));
	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));

leave:
	if (status < 0 && did_quota)
		vfs_dq_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	mlog_exit(status);
	return status;
}

/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode,
				 u64 size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index;
	unsigned int offset;
	handle_t *handle = NULL;
	int ret;

	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
	/* ugh.  in prepare/commit_write, if from==to==start of block, we
	** skip the prepare.  make sure we never send an offset for the start
	** of a block
	*/
	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
		offset++;
	}
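	/*
	 * Illustrative example (not from the original source): with 4K
	 * pages and 1K blocks, size = 5120 gives offset = 1024, which
	 * is block-aligned and so gets bumped to 1025; index = 1. The
	 * zero-length "write" at that offset then forces the prepare
	 * path to map and zero the block instead of skipping it.
	 */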
	index = size >> PAGE_CACHE_SHIFT;

	page = grab_cache_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	if (ocfs2_should_order_data(inode)) {
		handle = ocfs2_start_walk_page_trans(inode, page, offset,
						     offset);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			handle = NULL;
			goto out_unlock;
		}
	}

	/* must not update i_size! */
	ret = block_commit_write(page, offset, offset);
	if (ret < 0)
		mlog_errno(ret);
	else
		ret = 0;

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}

static int ocfs2_zero_extend(struct inode *inode,
			     u64 zero_to_size)
{
	int ret = 0;
	u64 start_off;
	struct super_block *sb = inode->i_sb;

	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
	while (start_off < zero_to_size) {
		ret = ocfs2_write_zero_page(inode, start_off);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		start_off += sb->s_blocksize;

		/*
		 * Very large extends have the potential to lock up
		 * the cpu for extended periods of time.
		 */
		cond_resched();
	}

out:
	return ret;
}

int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
{
	int ret;
	u32 clusters_to_add;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
	if (clusters_to_add < oi->ip_clusters)
		clusters_to_add = 0;
	else
		clusters_to_add -= oi->ip_clusters;

	if (clusters_to_add) {
		ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
						clusters_to_add, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * Call this even if we don't add any clusters to the tree. We
	 * still need to zero the area between the old i_size and the
	 * new i_size.
	 */
	ret = ocfs2_zero_extend(inode, zero_to);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	BUG_ON(!di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));

	/*
	 * Fall through for converting inline data, even if the fs
	 * supports sparse files.
	 *
	 * The check for inline data here is legal - nobody can add
	 * the feature since we have i_mutex. We must check it again
	 * after acquiring ip_alloc_sem though, as paths like mmap
	 * might have raced us to converting the inode to extents.
	 */
	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		goto out_update_size;

	/*
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * i_mutex to block other extend/truncate calls while we're
	 * here.
	 */
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inode's
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);

			mlog_errno(ret);
			goto out;
		}
	}

	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);

	up_write(&oi->ip_alloc_sem);

	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out_update_size:
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}

int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
	int status = 0, size_change;
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_super *osb = OCFS2_SB(sb);
	struct buffer_head *bh = NULL;
	handle_t *handle = NULL;
	int locked[MAXQUOTAS] = {0, 0};
	int credits, qtype;
	struct ocfs2_mem_dqinfo *oinfo;

	mlog_entry("(0x%p, '%.*s')\n", dentry,
	           dentry->d_name.len, dentry->d_name.name);

	/* ensuring we don't even attempt to truncate a symlink */
	if (S_ISLNK(inode->i_mode))
		attr->ia_valid &= ~ATTR_SIZE;

	if (attr->ia_valid & ATTR_MODE)
		mlog(0, "mode change: %d\n", attr->ia_mode);
	if (attr->ia_valid & ATTR_UID)
		mlog(0, "uid change: %d\n", attr->ia_uid);
	if (attr->ia_valid & ATTR_GID)
		mlog(0, "gid change: %d\n", attr->ia_gid);
	if (attr->ia_valid & ATTR_SIZE)
		mlog(0, "size change...\n");
	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
		mlog(0, "time change...\n");

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
			   | ATTR_GID | ATTR_UID | ATTR_MODE)
	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
		return 0;
	}

	status = inode_change_ok(inode, attr);
	if (status)
		return status;

	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
	if (size_change) {
		status = ocfs2_rw_lock(inode, 1);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	status = ocfs2_inode_lock(inode, &bh, 1);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail_unlock_rw;
	}

	if (size_change && attr->ia_size != i_size_read(inode)) {
		if (attr->ia_size > sb->s_maxbytes) {
			status = -EFBIG;
			goto bail_unlock;
		}

		if (i_size_read(inode) > attr->ia_size) {
			if (ocfs2_should_order_data(inode)) {
				status = ocfs2_begin_ordered_truncate(inode,
								      attr->ia_size);
				if (status)
					goto bail_unlock;
			}
			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
		} else
			status = ocfs2_extend_file(inode, bh, attr->ia_size);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			status = -ENOSPC;
			goto bail_unlock;
		}
	}

	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
		credits = OCFS2_INODE_UPDATE_CREDITS;
		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
			oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
			status = ocfs2_lock_global_qf(oinfo, 1);
			if (status < 0)
				goto bail_unlock;
			credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
				ocfs2_calc_qdel_credits(sb, USRQUOTA);
			locked[USRQUOTA] = 1;
		}
		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
			oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
			status = ocfs2_lock_global_qf(oinfo, 1);
			if (status < 0)
				goto bail_unlock;
			credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
				   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
			locked[GRPQUOTA] = 1;
		}
		handle = ocfs2_start_trans(osb, credits);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
		status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
		if (status < 0)
			goto bail_commit;
	} else {
		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_unlock;
		}
	}

	/*
	 * This will intentionally not wind up calling vmtruncate(),
	 * since all the work for a size change has been done above.
	 * Otherwise, we could get into problems with truncate as
	 * ip_alloc_sem is used there to protect against i_size
	 * changes.
	 */
	status = inode_setattr(inode, attr);
	if (status < 0) {
		mlog_errno(status);
		goto bail_commit;
	}

	status = ocfs2_mark_inode_dirty(handle, inode, bh);
	if (status < 0)
		mlog_errno(status);

bail_commit:
	ocfs2_commit_trans(osb, handle);
bail_unlock:
	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
		if (!locked[qtype])
			continue;
		oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
		ocfs2_unlock_global_qf(oinfo, 1);
	}
	ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
	if (size_change)
		ocfs2_rw_unlock(inode, 1);
bail:
	brelse(bh);

	if (!status && attr->ia_valid & ATTR_MODE) {
		status = ocfs2_acl_chmod(inode);
		if (status < 0)
			mlog_errno(status);
	}

	mlog_exit(status);
	return status;
}

int ocfs2_getattr(struct vfsmount *mnt,
		  struct dentry *dentry,
		  struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	struct super_block *sb = dentry->d_inode->i_sb;
	struct ocfs2_super *osb = sb->s_fs_info;
	int err;

	mlog_entry_void();

	err = ocfs2_inode_revalidate(dentry);
	if (err) {
		if (err != -ENOENT)
			mlog_errno(err);
		goto bail;
	}

	generic_fillattr(inode, stat);

	/* We set the blksize from the cluster size for performance */
	stat->blksize = osb->s_clustersize;

bail:
	mlog_exit(err);

	return err;
}

int ocfs2_permission(struct inode *inode, int mask)
{
	int ret;

	mlog_entry_void();

	ret = ocfs2_inode_lock(inode, NULL, 0);
	if (ret) {
		if (ret != -ENOENT)
			mlog_errno(ret);
		goto out;
	}

	ret = generic_permission(inode, mask, ocfs2_check_acl);

	ocfs2_inode_unlock(inode, 0);
out:
	mlog_exit(ret);
	return ret;
}

static int __ocfs2_write_remove_suid(struct inode *inode,
				     struct buffer_head *bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di;

	mlog_entry("(Inode %llu, mode 0%o)\n",
		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_trans;
	}

	inode->i_mode &= ~S_ISUID;
	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
		inode->i_mode &= ~S_ISGID;

	di = (struct ocfs2_dinode *) bh->b_data;
	di->i_mode = cpu_to_le16(inode->i_mode);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret < 0)
		mlog_errno(ret);

out_trans:
	ocfs2_commit_trans(osb, handle);
out:
	mlog_exit(ret);
	return ret;
}

/*
 * Will look for holes and unwritten extents in the range starting at
 * pos for count bytes (inclusive). Returns 1 if any are found, 0 if
 * none are, and a negative error code on failure.
 */
static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
				       size_t count)
{
	int ret = 0;
	unsigned int extent_flags;
	u32 cpos, clusters, extent_len, phys_cpos;
	struct super_block *sb = inode->i_sb;

	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;

	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
					 &extent_flags);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
			ret = 1;
			break;
		}

		if (extent_len > clusters)
			extent_len = clusters;

		clusters -= extent_len;
		cpos += extent_len;
	}
out:
	return ret;
}

static int ocfs2_write_remove_suid(struct inode *inode)
{
	int ret;
	struct buffer_head *bh = NULL;

	ret = ocfs2_read_inode_block(inode, &bh);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = __ocfs2_write_remove_suid(inode, bh);
out:
	brelse(bh);
	return ret;
}

/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes. Existing extents are skipped, any extents
 * added are marked as "unwritten".
 */
static int ocfs2_allocate_unwritten_extents(struct inode *inode,
					    u64 start, u64 len)
{
	int ret;
	u32 cpos, phys_cpos, clusters, alloc_size;
	u64 end = start + len;
	struct buffer_head *di_bh = NULL;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_read_inode_block(inode, &di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Nothing to do if the requested reservation range
		 * fits within the inode.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, end))
			goto out;

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/*
	 * We consider both start and len to be inclusive.
	 */
	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
	clusters -= cpos;

	while (clusters) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Hole or existing extent len can be arbitrary, so
		 * cap it to our own allocation request.
		 */
		if (alloc_size > clusters)
			alloc_size = clusters;

		if (phys_cpos) {
			/*
			 * We already have an allocation at this
			 * region so we can safely skip it.
			 */
			goto next;
		}

		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
		if (ret) {
			if (ret != -ENOSPC)
				mlog_errno(ret);
			goto out;
		}

next:
		cpos += alloc_size;
		clusters -= alloc_size;
	}

	ret = 0;
out:

	brelse(di_bh);
	return ret;
}
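
/*
 * Illustrative walk-through (not from the original source): with 256K
 * clusters, reserving start = 0, len = 1M covers clusters 0-3. If
 * ocfs2_get_clusters() reports cluster 0 as a hole, cluster 1 as
 * already mapped, and clusters 2-3 as a hole, the loop above makes two
 * __ocfs2_extend_allocation() calls (for cluster 0, then clusters 2-3)
 * and skips cluster 1 entirely.
 */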

/*
 * Truncate a byte range, avoiding pages within partial clusters. This
 * preserves those pages for the zeroing code to write to.
 */
static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
					 u64 byte_len)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	loff_t start, end;
	struct address_space *mapping = inode->i_mapping;

	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
	end = byte_start + byte_len;
	end = end & ~(osb->s_clustersize - 1);

	if (start < end) {
		unmap_mapping_range(mapping, start, end - start, 0);
		truncate_inode_pages_range(mapping, start, end - 1);
	}
}
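
/*
 * Illustrative example (not from the original source): with a 32K
 * cluster size, byte_start = 10000 and byte_len = 80000 give
 * start = 32768 (aligned up) and end = 90000 & ~32767 = 65536, so only
 * the pages of the fully-covered cluster [32768, 65536) are dropped;
 * the partial clusters at either end are kept for
 * ocfs2_zero_partial_clusters() below.
 */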

static int ocfs2_zero_partial_clusters(struct inode *inode,
				       u64 start, u64 len)
{
	int ret = 0;
	u64 tmpend, end = start + len;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int csize = osb->s_clustersize;
	handle_t *handle;

	/*
	 * The "start" and "end" values are NOT necessarily part of
	 * the range whose allocation is being deleted. Rather, this
	 * is what the user passed in with the request. We must zero
	 * partial clusters here. There's no need to worry about
	 * physical allocation - the zeroing code knows to skip holes.
	 */
	mlog(0, "byte start: %llu, end: %llu\n",
	     (unsigned long long)start, (unsigned long long)end);

	/*
	 * If both edges are on a cluster boundary then there's no
	 * zeroing required as the region is part of the allocation to
	 * be truncated.
	 */
	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
		goto out;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/*
	 * We want to get the byte offset of the end of the 1st cluster.
	 */
	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
	if (tmpend > end)
		tmpend = end;

	mlog(0, "1st range: start: %llu, tmpend: %llu\n",
	     (unsigned long long)start, (unsigned long long)tmpend);

	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
	if (ret)
		mlog_errno(ret);

	if (tmpend < end) {
		/*
		 * This may make start and end equal, but the zeroing
		 * code will skip any work in that case so there's no
		 * need to catch it up here.
		 */
		start = end & ~(osb->s_clustersize - 1);

		mlog(0, "2nd range: start: %llu, end: %llu\n",
		     (unsigned long long)start, (unsigned long long)end);

		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
		if (ret)
			mlog_errno(ret);
	}

	ocfs2_commit_trans(osb, handle);
out:
	return ret;
}
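
/*
 * Continuing the illustrative 32K-cluster example: for start = 10000
 * and end = 90000, tmpend = 32768, so the first call zeroes
 * [10000, 32768) in the head cluster; tmpend < end, so the second call
 * zeroes [65536, 90000) in the tail cluster. The whole clusters in
 * between are freed, not zeroed.
 */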

static int ocfs2_remove_inode_range(struct inode *inode,
				    struct buffer_head *di_bh, u64 byte_start,
				    u64 byte_len)
{
	int ret = 0;
	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct address_space *mapping = inode->i_mapping;
	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
	ocfs2_init_dealloc_ctxt(&dealloc);

	if (byte_len == 0)
		return 0;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
					    byte_start + byte_len, 0);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/*
		 * There's no need to get fancy with the page cache
		 * truncate of an inline-data inode. We're talking
		 * about less than a page here, which will be cached
		 * in the dinode buffer anyway.
		 */
		unmap_mapping_range(mapping, 0, 0, 0);
		truncate_inode_pages(mapping, 0);
		goto out;
	}

	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
	if (trunc_len >= trunc_start)
		trunc_len -= trunc_start;
	else
		trunc_len = 0;

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)byte_start,
	     (unsigned long long)byte_len, trunc_start, trunc_len);

	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	cpos = trunc_start;
	while (trunc_len) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
					 &alloc_size, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (alloc_size > trunc_len)
			alloc_size = trunc_len;

		/* Only do work for non-holes */
		if (phys_cpos != 0) {
			ret = ocfs2_remove_btree_range(inode, &et, cpos,
						       phys_cpos, alloc_size,
						       &dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		cpos += alloc_size;
		trunc_len -= alloc_size;
	}

	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);

out:
	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &dealloc);

	return ret;
}

/*
 * Parts of this function taken from xfs_change_file_space()
 */
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
				     loff_t f_pos, unsigned int cmd,
				     struct ocfs2_space_resv *sr,
				     int change_size)
{
	int ret;
	s64 llen;
	loff_t size;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *di_bh = NULL;
	handle_t *handle;
	unsigned long long max_off = inode->i_sb->s_maxbytes;

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	mutex_lock(&inode->i_mutex);

	/*
	 * This prevents concurrent writes on other nodes
	 */
	ret = ocfs2_rw_lock(inode, 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_rw_unlock;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		ret = -EPERM;
		goto out_inode_unlock;
	}

	switch (sr->l_whence) {
	case 0: /*SEEK_SET*/
		break;
	case 1: /*SEEK_CUR*/
		sr->l_start += f_pos;
		break;
	case 2: /*SEEK_END*/
		sr->l_start += i_size_read(inode);
		break;
	default:
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	sr->l_whence = 0;

	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;

	if (sr->l_start < 0
	    || sr->l_start > max_off
	    || (sr->l_start + llen) < 0
	    || (sr->l_start + llen) > max_off) {
		ret = -EINVAL;
		goto out_inode_unlock;
	}
	size = sr->l_start + sr->l_len;

	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
		if (sr->l_len <= 0) {
			ret = -EINVAL;
			goto out_inode_unlock;
		}
	}

	if (file && should_remove_suid(file->f_path.dentry)) {
		ret = __ocfs2_write_remove_suid(inode, di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_inode_unlock;
		}
	}

	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	switch (cmd) {
	case OCFS2_IOC_RESVSP:
	case OCFS2_IOC_RESVSP64:
		/*
		 * This takes unsigned offsets, but the signed ones we
		 * pass have been checked against overflow above.
		 */
		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
						       sr->l_len);
		break;
	case OCFS2_IOC_UNRESVSP:
	case OCFS2_IOC_UNRESVSP64:
		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
					       sr->l_len);
		break;
	default:
		ret = -EINVAL;
	}
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (ret) {
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	/*
	 * We update c/mtime for these changes
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_inode_unlock;
	}

	if (change_size && i_size_read(inode) < size)
		i_size_write(inode, size);

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
	if (ret < 0)
		mlog_errno(ret);

	ocfs2_commit_trans(osb, handle);

out_inode_unlock:
	brelse(di_bh);
	ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);

out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}

int ocfs2_change_file_space(struct file *file, unsigned int cmd,
			    struct ocfs2_space_resv *sr)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
	    !ocfs2_writes_unwritten_extents(osb))
		return -ENOTTY;
	else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
		 !ocfs2_sparse_alloc(osb))
		return -ENOTTY;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;

	return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
}
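
/*
 * Illustrative userspace sketch (not from the original source; the
 * ioctl numbers and struct ocfs2_space_resv layout live in
 * ocfs2_fs.h). Reserving 1MB of unwritten extents at the start of a
 * file via the space reservation ioctl handled above might look like:
 *
 *	struct ocfs2_space_resv sr = {
 *		.l_whence = 0,			// SEEK_SET
 *		.l_start  = 0,
 *		.l_len    = 1024 * 1024,	// reserve 1MB
 *	};
 *	if (ioctl(fd, OCFS2_IOC_RESVSP64, &sr) < 0)
 *		perror("OCFS2_IOC_RESVSP64");
 */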

static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
			    loff_t len)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_space_resv sr;
	int change_size = 1;

	if (!ocfs2_writes_unwritten_extents(osb))
		return -EOPNOTSUPP;

	if (S_ISDIR(inode->i_mode))
		return -ENODEV;

	if (mode & FALLOC_FL_KEEP_SIZE)
		change_size = 0;

	sr.l_whence = 0;
	sr.l_start = (s64)offset;
	sr.l_len = (s64)len;

	return __ocfs2_change_file_space(NULL, inode, offset,
					 OCFS2_IOC_RESVSP64, &sr, change_size);
}

static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
					 loff_t *ppos,
					 size_t count,
					 int appending,
					 int *direct_io)
{
	int ret = 0, meta_level = 0;
	struct inode *inode = dentry->d_inode;
	loff_t saved_pos, end;

	/*
	 * We start with a read level meta lock and only jump to an ex
	 * if we need to make modifications here.
	 */
	for(;;) {
		ret = ocfs2_inode_lock(inode, NULL, meta_level);
		if (ret < 0) {
			meta_level = -1;
			mlog_errno(ret);
			goto out;
		}

		/* Clear suid / sgid if necessary. We do this here
		 * instead of later in the write path because
		 * remove_suid() calls ->setattr without any hint that
		 * we may have already done our cluster locking. Since
		 * ocfs2_setattr() *must* take cluster locks to
		 * proceed, this will lead us to recursively lock the
		 * inode. There's also the dinode i_size state which
		 * can be lost via setattr during extending writes (we
		 * set inode->i_size at the end of a write). */
		if (should_remove_suid(dentry)) {
			if (meta_level == 0) {
				ocfs2_inode_unlock(inode, meta_level);
				meta_level = 1;
				continue;
			}

			ret = ocfs2_write_remove_suid(inode);
			if (ret < 0) {
				mlog_errno(ret);
				goto out_unlock;
			}
		}

		/* work on a copy of ppos until we're sure that we won't have
		 * to recalculate it due to relocking. */
		if (appending) {
			saved_pos = i_size_read(inode);
			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
		} else {
			saved_pos = *ppos;
		}

		end = saved_pos + count;

		/*
		 * Skip the O_DIRECT checks if we don't need
		 * them.
		 */
		if (!direct_io || !(*direct_io))
			break;

		/*
		 * There's no sane way to do direct writes to an inode
		 * with inline data.
		 */
		if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
			*direct_io = 0;
			break;
		}

		/*
		 * Allowing concurrent direct writes means
		 * i_size changes wouldn't be synchronized, so
		 * one node could wind up truncating another
		 * node's writes.
		 */
1728 		if (end > i_size_read(inode)) {
1729 			*direct_io = 0;
1730 			break;
1731 		}
1732 
1733 		/*
1734 		 * We don't fill holes during direct io, so
1735 		 * check for them here. If any are found, the
1736 		 * caller will have to retake some cluster
1737 		 * locks and initiate the io as buffered.
1738 		 */
1739 		ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
1740 		if (ret == 1) {
1741 			*direct_io = 0;
1742 			ret = 0;
1743 		} else if (ret < 0)
1744 			mlog_errno(ret);
1745 		break;
1746 	}
1747 
1748 	if (appending)
1749 		*ppos = saved_pos;
1750 
1751 out_unlock:
1752 	ocfs2_inode_unlock(inode, meta_level);
1753 
1754 out:
1755 	return ret;
1756 }
1757 
ocfs2_file_aio_write(struct kiocb * iocb,const struct iovec * iov,unsigned long nr_segs,loff_t pos)1758 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1759 				    const struct iovec *iov,
1760 				    unsigned long nr_segs,
1761 				    loff_t pos)
1762 {
1763 	int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
1764 	int can_do_direct;
1765 	ssize_t written = 0;
1766 	size_t ocount;		/* original count */
1767 	size_t count;		/* after file limit checks */
1768 	loff_t old_size, *ppos = &iocb->ki_pos;
1769 	u32 old_clusters;
1770 	struct file *file = iocb->ki_filp;
1771 	struct inode *inode = file->f_path.dentry->d_inode;
1772 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1773 
1774 	mlog_entry("(0x%p, %u, '%.*s')\n", file,
1775 		   (unsigned int)nr_segs,
1776 		   file->f_path.dentry->d_name.len,
1777 		   file->f_path.dentry->d_name.name);
1778 
1779 	if (iocb->ki_left == 0)
1780 		return 0;
1781 
1782 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1783 
1784 	appending = file->f_flags & O_APPEND ? 1 : 0;
1785 	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1786 
1787 	mutex_lock(&inode->i_mutex);
1788 
1789 relock:
1790 	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1791 	if (direct_io) {
1792 		down_read(&inode->i_alloc_sem);
1793 		have_alloc_sem = 1;
1794 	}
1795 
1796 	/* concurrent O_DIRECT writes are allowed */
1797 	rw_level = !direct_io;
1798 	ret = ocfs2_rw_lock(inode, rw_level);
1799 	if (ret < 0) {
1800 		mlog_errno(ret);
1801 		goto out_sems;
1802 	}
1803 
1804 	can_do_direct = direct_io;
1805 	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1806 					    iocb->ki_left, appending,
1807 					    &can_do_direct);
1808 	if (ret < 0) {
1809 		mlog_errno(ret);
1810 		goto out;
1811 	}
1812 
1813 	/*
1814 	 * We can't complete the direct I/O as requested, fall back to
1815 	 * buffered I/O.
1816 	 */
1817 	if (direct_io && !can_do_direct) {
1818 		ocfs2_rw_unlock(inode, rw_level);
1819 		up_read(&inode->i_alloc_sem);
1820 
1821 		have_alloc_sem = 0;
1822 		rw_level = -1;
1823 
1824 		direct_io = 0;
1825 		goto relock;
1826 	}
1827 
1828 	/*
1829 	 * To later detect whether a journal commit for sync writes is
1830 	 * necessary, we sample i_size, and cluster count here.
1831 	 */
1832 	old_size = i_size_read(inode);
1833 	old_clusters = OCFS2_I(inode)->ip_clusters;
1834 
1835 	/* communicate with ocfs2_dio_end_io */
1836 	ocfs2_iocb_set_rw_locked(iocb, rw_level);
1837 
1838 	if (direct_io) {
1839 		ret = generic_segment_checks(iov, &nr_segs, &ocount,
1840 					     VERIFY_READ);
1841 		if (ret)
1842 			goto out_dio;
1843 
1844 		ret = generic_write_checks(file, ppos, &count,
1845 					   S_ISBLK(inode->i_mode));
1846 		if (ret)
1847 			goto out_dio;
1848 
1849 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1850 						    ppos, count, ocount);
1851 		if (written < 0) {
1852 			/*
1853 			 * direct write may have instantiated a few
1854 			 * blocks outside i_size. Trim these off again.
1855 			 * Don't need i_size_read because we hold i_mutex.
1856 			 */
1857 			if (*ppos + count > inode->i_size)
1858 				vmtruncate(inode, inode->i_size);
1859 			ret = written;
1860 			goto out_dio;
1861 		}
1862 	} else {
1863 		written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
1864 							*ppos);
1865 	}

out_dio:
	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));

	if (((file->f_flags & O_SYNC) && !direct_io) || IS_SYNC(inode)) {
		/*
		 * The generic write paths have handled getting data
		 * to disk, but since we don't make use of the dirty
		 * inode list, a manual journal commit is necessary
		 * here.
		 */
		if (old_size != i_size_read(inode) ||
		    old_clusters != OCFS2_I(inode)->ip_clusters) {
			ret = jbd2_journal_force_commit(osb->journal->j_journal);
			if (ret < 0)
				written = ret;
		}
	}

	/*
	 * Deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
	 * function pointer which is called when O_DIRECT io completes so that
	 * it can unlock our rw lock.  (It's the clustered equivalent of
	 * i_alloc_sem; it protects truncate from racing with pending ios.)
	 * Unfortunately there are error cases which call end_io and others
	 * that don't, so we mustn't unlock the rw_lock here if either an
	 * async dio is going to do it later or an end_io after an error
	 * has already done it.
	 */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

out:
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);

out_sems:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);

	mutex_unlock(&inode->i_mutex);

	mlog_exit(ret);
	return written ? written : ret;
}

static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
				       struct file *out,
				       loff_t *ppos,
				       size_t len,
				       unsigned int flags)
{
	int ret;
	struct inode *inode = out->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
		   (unsigned int)len,
		   out->f_path.dentry->d_name.len,
		   out->f_path.dentry->d_name.name);

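	/*
	 * Lock ordering mirrors the write path above: the VFS-level
	 * mutexes (both the inode's and the pipe's, ordered for us by
	 * inode_double_lock()) are taken before the cluster rw lock.
	 */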
	inode_double_lock(inode, pipe->inode);

	ret = ocfs2_rw_lock(inode, 1);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
					    NULL);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);

out_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	inode_double_unlock(inode, pipe->inode);

	mlog_exit(ret);
	return ret;
}

static ssize_t ocfs2_file_splice_read(struct file *in,
				      loff_t *ppos,
				      struct pipe_inode_info *pipe,
				      size_t len,
				      unsigned int flags)
{
	int ret = 0;
	struct inode *inode = in->f_path.dentry->d_inode;

	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
		   (unsigned int)len,
		   in->f_path.dentry->d_name.len,
		   in->f_path.dentry->d_name.name);

	/*
	 * Take and drop the inode lock to refresh inode fields such as
	 * i_size - see the comment in ocfs2_file_aio_read().
	 */
	ret = ocfs2_inode_lock(inode, NULL, 0);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	ocfs2_inode_unlock(inode, 0);

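	/*
	 * Like a buffered read, the splice path works against the page
	 * cache, which protects itself in ->readpage(), so no rw_lock
	 * is held across the actual copy.
	 */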
	ret = generic_file_splice_read(in, ppos, pipe, len, flags);

bail:
	mlog_exit(ret);
	return ret;
}

static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				   const struct iovec *iov,
				   unsigned long nr_segs,
				   loff_t pos)
{
	int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;

	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
		   (unsigned int)nr_segs,
		   filp->f_path.dentry->d_name.len,
		   filp->f_path.dentry->d_name.name);

	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * Buffered reads protect themselves in ->readpage().  O_DIRECT reads
	 * need locks to protect pending reads from racing with truncate.
	 */
	if (filp->f_flags & O_DIRECT) {
		down_read(&inode->i_alloc_sem);
		have_alloc_sem = 1;

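		/*
		 * Level 0 is a shared rw lock (the same level that
		 * concurrent O_DIRECT writes take), so other readers
		 * are let in while cluster-wide truncates are held off
		 * until the pending direct read completes.
		 */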
		ret = ocfs2_rw_lock(inode, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto bail;
		}
		rw_level = 0;
		/* communicate with ocfs2_dio_end_io */
		ocfs2_iocb_set_rw_locked(iocb, rw_level);
	}

	/*
	 * We're fine letting folks race truncates and extending
	 * writes with reads across the cluster, just like they can
	 * locally.  Hence no rw_lock during read.
	 *
	 * Take and drop the metadata lock to update inode fields
	 * like i_size.  This gives the checks down in
	 * generic_file_aio_read() a chance of actually working.
	 */
	ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
	if (ret < 0) {
		mlog_errno(ret);
		goto bail;
	}
	ocfs2_inode_unlock(inode, lock_level);

	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
	if (ret == -EINVAL)
		mlog(0, "generic_file_aio_read returned -EINVAL\n");

	/* buffered aio wouldn't have proper lock coverage today */
	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

	/* see ocfs2_file_aio_write */
	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;
		have_alloc_sem = 0;
	}

bail:
	if (have_alloc_sem)
		up_read(&inode->i_alloc_sem);
	if (rw_level != -1)
		ocfs2_rw_unlock(inode, rw_level);
	mlog_exit(ret);

	return ret;
}

const struct inode_operations ocfs2_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ocfs2_listxattr,
	.removexattr	= generic_removexattr,
	.fallocate	= ocfs2_fallocate,
	.fiemap		= ocfs2_fiemap,
};

const struct inode_operations ocfs2_special_file_iops = {
	.setattr	= ocfs2_setattr,
	.getattr	= ocfs2_getattr,
	.permission	= ocfs2_permission,
};

/*
 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
 */
const struct file_operations ocfs2_fops = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.lock		= ocfs2_lock,
	.flock		= ocfs2_flock,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};

const struct file_operations ocfs2_dops = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.lock		= ocfs2_lock,
	.flock		= ocfs2_flock,
};

/*
 * POSIX-lockless variants of our file_operations.
 *
 * These will be used if the underlying cluster stack does not support
 * posix file locking, if the user passes the "localflocks" mount
 * option, or if we have a local-only fs.
 *
 * ocfs2_flock is in here because all stacks handle UNIX file locks,
 * so we still want it in the case of no stack support for
 * plocks. Internally, it will do the right thing when asked to ignore
 * the cluster.
 */
const struct file_operations ocfs2_fops_no_plocks = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.mmap		= ocfs2_mmap,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_file_release,
	.open		= ocfs2_file_open,
	.aio_read	= ocfs2_file_aio_read,
	.aio_write	= ocfs2_file_aio_write,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.flock		= ocfs2_flock,
	.splice_read	= ocfs2_file_splice_read,
	.splice_write	= ocfs2_file_splice_write,
};

const struct file_operations ocfs2_dops_no_plocks = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= ocfs2_readdir,
	.fsync		= ocfs2_sync_file,
	.release	= ocfs2_dir_release,
	.open		= ocfs2_dir_open,
	.unlocked_ioctl	= ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = ocfs2_compat_ioctl,
#endif
	.flock		= ocfs2_flock,
};