• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2012, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42 
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
50 
51 #include "../include/cl_object.h"
52 
/* Forward declarations for helpers defined later in this file. */
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken);

static enum llioc_iter
ll_iocontrol_call(struct inode *inode, struct file *file,
		  unsigned int cmd, unsigned long arg, int *rcp);
62 
ll_file_data_get(void)63 static struct ll_file_data *ll_file_data_get(void)
64 {
65 	struct ll_file_data *fd;
66 
67 	fd = kmem_cache_alloc(ll_file_data_slab, GFP_NOFS | __GFP_ZERO);
68 	if (fd == NULL)
69 		return NULL;
70 	fd->fd_write_failed = false;
71 	return fd;
72 }
73 
ll_file_data_put(struct ll_file_data * fd)74 static void ll_file_data_put(struct ll_file_data *fd)
75 {
76 	if (fd != NULL)
77 		kmem_cache_free(ll_file_data_slab, fd);
78 }
79 
ll_pack_inode2opdata(struct inode * inode,struct md_op_data * op_data,struct lustre_handle * fh)80 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 			  struct lustre_handle *fh)
82 {
83 	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 	op_data->op_attr.ia_mode = inode->i_mode;
85 	op_data->op_attr.ia_atime = inode->i_atime;
86 	op_data->op_attr.ia_mtime = inode->i_mtime;
87 	op_data->op_attr.ia_ctime = inode->i_ctime;
88 	op_data->op_attr.ia_size = i_size_read(inode);
89 	op_data->op_attr_blocks = inode->i_blocks;
90 	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 					ll_inode_to_ext_flags(inode->i_flags);
92 	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
93 	if (fh)
94 		op_data->op_handle = *fh;
95 
96 	if (ll_i2info(inode)->lli_flags & LLIF_DATA_MODIFIED)
97 		op_data->op_bias |= MDS_DATA_MODIFIED;
98 }
99 
100 /**
101  * Closes the IO epoch and packs all the attributes into @op_data for
102  * the CLOSE rpc.
103  */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	/* Mode and all timestamps are always sent back on close. */
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
					ATTR_MTIME | ATTR_MTIME_SET |
					ATTR_CTIME | ATTR_CTIME_SET;

	/* Size/blocks only need updating if this handle was writable. */
	if (!(och->och_flags & FMODE_WRITE))
		goto out;

	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	else
		/* NOTE(review): &och passes the address of the local handle
		 * pointer; ll_ioepoch_close() presumably may clear or replace
		 * it — confirm against the callee before refactoring. */
		ll_ioepoch_close(inode, op_data, &och, 0);

out:
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
}
124 
/*
 * Send an MDS close RPC for open handle @och and release it.
 *
 * When @data_version is non-NULL the close is an HSM release: the data
 * version is packed into the request and MDS_HSM_RELEASE is set, and the
 * close fails with -EBUSY if the server did not actually release the file.
 * Returns 0 on success or a negative errno.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och,
				     const __u64 *data_version)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle %#llx\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		rc = 0;
		goto out;
	}

	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
	if (!op_data) {
		/* XXX We leak openhandle and request here. */
		rc = -ENOMEM;
		goto out;
	}

	ll_prepare_close(inode, op_data, och);
	if (data_version != NULL) {
		/* Pass in data_version implies release. */
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *data_version;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	}
	epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr to back to MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
			       inode->i_ino, rc);
			/* SOM update failure is not fatal for the close. */
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	if (rc == 0) {
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}
	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
		struct mdt_body *body;

		/* NOTE(review): assumes a successful close reply always
		 * carries an MDT body — confirm req_capsule_server_get()
		 * cannot return NULL here. */
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->valid & OBD_MD_FLRELEASED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	/* Under SOM, a writable handle whose epoch is still open must wait
	 * for DONE_WRITING before it can be freed. */
	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		kfree(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
221 
/*
 * Close the MDS open handle for mode @fmode on @inode if it is no
 * longer referenced by any file descriptor.  Returns 0 or a negative
 * errno from the close RPC.
 */
int ll_md_real_close(struct inode *inode, fmode_t fmode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **slot;
	struct obd_client_handle *handle;
	__u64 *usecount;
	int rc = 0;

	/* Pick the per-mode handle slot and its reference counter. */
	if (fmode & FMODE_WRITE) {
		slot = &lli->lli_mds_write_och;
		usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		slot = &lli->lli_mds_exec_och;
		usecount = &lli->lli_open_fd_exec_count;
	} else {
		LASSERT(fmode & FMODE_READ);
		slot = &lli->lli_mds_read_och;
		usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*usecount > 0) {
		/* Other users still hold this handle; keep it open. */
		mutex_unlock(&lli->lli_och_mutex);
		return 0;
	}
	handle = *slot;
	*slot = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	/* A racing thread may already have closed the handle. */
	if (handle)
		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
					       inode, handle, NULL);

	return rc;
}
263 
/*
 * Per-file-descriptor close: release group lock and lease state held by
 * @file, drop the per-mode open reference, and close the MDS handle if
 * no matching OPEN lock lets us skip talking to the MDS.  Frees the
 * ll_file_data attached to @file in all cases.
 */
static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
		       struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	int lockmode;
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct lustre_handle lockh;
	ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN} };
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
			PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	if (fd->fd_och != NULL) {
		/* A private handle (taken over for a lease) is closed
		 * directly, bypassing the shared-handle accounting below. */
		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
		fd->fd_och = NULL;
		goto out;
	}

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */

	/* Drop the per-mode open-fd count under the och mutex. */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		lockmode = LCK_CW;
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		lockmode = LCK_PR;
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
	} else {
		lockmode = LCK_CR;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	}
	mutex_unlock(&lli->lli_och_mutex);

	/* Only do the real close if no matching OPEN ibits lock is held. */
	if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);

	return rc;
}
326 
327 /* While this returns an error code, fput() the caller does not, so we need
328  * to make every effort to clean up all of our state here.  Also, applications
329  * rarely check close errors and even if an error is returned they will not
330  * re-try the close call.
331  */
ll_file_release(struct inode * inode,struct file * file)332 int ll_file_release(struct inode *inode, struct file *file)
333 {
334 	struct ll_file_data *fd;
335 	struct ll_sb_info *sbi = ll_i2sbi(inode);
336 	struct ll_inode_info *lli = ll_i2info(inode);
337 	int rc;
338 
339 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
340 	       inode->i_generation, inode);
341 
342 #ifdef CONFIG_FS_POSIX_ACL
343 	if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
344 		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
345 
346 		LASSERT(fd != NULL);
347 		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
348 			fd->fd_flags &= ~LL_FILE_RMTACL;
349 			rct_del(&sbi->ll_rct, current_pid());
350 			et_search_free(&sbi->ll_et, current_pid());
351 		}
352 	}
353 #endif
354 
355 	if (!is_root_inode(inode))
356 		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
357 	fd = LUSTRE_FPRIVATE(file);
358 	LASSERT(fd != NULL);
359 
360 	/* The last ref on @file, maybe not the owner pid of statahead.
361 	 * Different processes can open the same dir, "ll_opendir_key" means:
362 	 * it is me that should stop the statahead thread. */
363 	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
364 	    lli->lli_opendir_pid != 0)
365 		ll_stop_statahead(inode, lli->lli_opendir_key);
366 
367 	if (is_root_inode(inode)) {
368 		LUSTRE_FPRIVATE(file) = NULL;
369 		ll_file_data_put(fd);
370 		return 0;
371 	}
372 
373 	if (!S_ISDIR(inode->i_mode)) {
374 		lov_read_and_clear_async_rc(lli->lli_clob);
375 		lli->lli_async_rc = 0;
376 	}
377 
378 	rc = ll_md_close(sbi->ll_md_exp, inode, file);
379 
380 	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
381 		libcfs_debug_dumplog();
382 
383 	return rc;
384 }
385 
/*
 * Send an open intent to the MDS for @dentry.  With @lmm/@lmmsize set
 * this only updates stripe info; otherwise an OPEN lock is requested.
 * On success the inode is refreshed from the reply.  Returns 0 or a
 * negative errno.
 */
static int ll_intent_file_open(struct dentry *dentry, void *lmm,
			       int lmmsize, struct lookup_intent *itp)
{
	struct inode *inode = d_inode(dentry);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct dentry *parent = dentry->d_parent;
	const char *name = dentry->d_name.name;
	const int len = dentry->d_name.len;
	struct md_op_data *op_data;
	/* NOTE(review): @req is passed uninitialized to md_intent_lock();
	 * presumably the callee always sets it before the out: path calls
	 * ptlrpc_req_finished() — confirm against md_intent_lock(). */
	struct ptlrpc_request *req;
	__u32 opc = LUSTRE_OPC_ANY;
	int rc;

	/* Usually we come here only for NFSD, and we want open lock.
	   But we can also get here with pre 2.6.15 patchless kernels, and in
	   that case that lock is also ok */
	/* We can also get here if there was cached open handle in revalidate_it
	 * but it disappeared while we were getting from there to ll_file_open.
	 * But this means this file was closed and immediately opened which
	 * makes a good candidate for using OPEN lock */
	/* If lmmsize & lmm are not 0, we are just setting stripe info
	 * parameters. No need for the open lock */
	if (lmm == NULL && lmmsize == 0) {
		itp->it_flags |= MDS_OPEN_LOCK;
		if (itp->it_flags & FMODE_WRITE)
			opc = LUSTRE_OPC_CREATE;
	}

	op_data  = ll_prep_md_op_data(NULL, d_inode(parent),
				      inode, name, len,
				      O_RDWR, opc, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	itp->it_flags |= MDS_OPEN_BY_FID;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
			    0 /*unused */, &req, ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason for keep own exit path - don`t flood log
		* with messages with -ESTALE errors.
		*/
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		     it_open_error(DISP_OPEN_OPEN, itp))
			goto out;
		ll_release_openhandle(inode, itp);
		goto out;
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG)) {
		rc = -ENOENT;
		goto out;
	}

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		goto out;
	}

	/* Refresh the inode from the reply and attach the lock, if any. */
	rc = ll_prep_inode(&inode, req, NULL, itp);
	if (!rc && itp->d.lustre.it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);

out:
	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	return rc;
}
456 
457 /**
458  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
459  * not believe attributes if a few ioepoch holders exist. Attributes for
460  * previous ioepoch if new one is opened are also skipped by MDS.
461  */
/* Record a newly obtained @ioepoch on the inode, ignoring zero or
 * already-current epochs. */
void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
{
	if (ioepoch == 0 || lli->lli_ioepoch == ioepoch)
		return;

	lli->lli_ioepoch = ioepoch;
	CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
	       ioepoch, PFID(&lli->lli_fid));
}
470 
ll_och_fill(struct obd_export * md_exp,struct lookup_intent * it,struct obd_client_handle * och)471 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
472 		       struct obd_client_handle *och)
473 {
474 	struct ptlrpc_request *req = it->d.lustre.it_data;
475 	struct mdt_body *body;
476 
477 	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
478 	och->och_fh = body->handle;
479 	och->och_fid = body->fid1;
480 	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
481 	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
482 	och->och_flags = it->it_flags;
483 
484 	return md_set_open_replay_data(md_exp, och, it);
485 }
486 
ll_local_open(struct file * file,struct lookup_intent * it,struct ll_file_data * fd,struct obd_client_handle * och)487 static int ll_local_open(struct file *file, struct lookup_intent *it,
488 			 struct ll_file_data *fd, struct obd_client_handle *och)
489 {
490 	struct inode *inode = file_inode(file);
491 	struct ll_inode_info *lli = ll_i2info(inode);
492 
493 	LASSERT(!LUSTRE_FPRIVATE(file));
494 
495 	LASSERT(fd != NULL);
496 
497 	if (och) {
498 		struct ptlrpc_request *req = it->d.lustre.it_data;
499 		struct mdt_body *body;
500 		int rc;
501 
502 		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
503 		if (rc != 0)
504 			return rc;
505 
506 		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
507 		ll_ioepoch_open(lli, body->ioepoch);
508 	}
509 
510 	LUSTRE_FPRIVATE(file) = fd;
511 	ll_readahead_init(inode, &fd->fd_ras);
512 	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
513 	return 0;
514 }
515 
516 /* Open a file, and (for the very first open) create objects on the OSTs at
517  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
518  * creation or open until ll_lov_setstripe() ioctl is called.
519  *
520  * If we already have the stripe MD locally then we don't request it in
521  * md_open(), by passing a lmm_size = 0.
522  *
523  * It is up to the application to ensure no other processes open this file
524  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
525  * used.  We might be able to avoid races of that sort by getting lli_open_sem
526  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
527  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
528  */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	/* @oit is a locally built intent used when the caller did not
	 * stash one in file->private_data. */
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL) {
		rc = -ENOMEM;
		goto out_openerr;
	}

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* First opener of a directory becomes the statahead owner. */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	if (is_root_inode(inode)) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open.  filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	 } else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				goto out_openerr;
			}

			ll_release_openhandle(inode, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			goto out_openerr;
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				goto out_openerr;

			/* Re-evaluate from the top; the handle may have been
			 * created by another opener in the meantime. */
			goto restart;
		}
		*och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
		if (!*och_p) {
			rc = -ENOMEM;
			goto out_och_free;
		}

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			goto out_och_free;

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			goto out_och_free;
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* Ownership of @fd passed to the file in ll_local_open(). */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		goto out_och_free;

	if (!lli->lli_has_smd &&
	    (cl_is_lov_delay_create(file->f_flags) ||
	     (file->f_mode & FMODE_WRITE) == 0)) {
		CDEBUG(D_INODE, "object creation was delayed\n");
		goto out_och_free;
	}
	cl_lov_delay_create_clear(&file->f_flags);
	goto out_och_free;

out_och_free:
	/* On the success path rc == 0 and this only tallies stats. */
	if (rc) {
		if (och_p && *och_p) {
			kfree(*och_p);
			*och_p = NULL;
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
713 
ll_md_blocking_lease_ast(struct ldlm_lock * lock,struct ldlm_lock_desc * desc,void * data,int flag)714 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
715 			struct ldlm_lock_desc *desc, void *data, int flag)
716 {
717 	int rc;
718 	struct lustre_handle lockh;
719 
720 	switch (flag) {
721 	case LDLM_CB_BLOCKING:
722 		ldlm_lock2handle(lock, &lockh);
723 		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
724 		if (rc < 0) {
725 			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
726 			return rc;
727 		}
728 		break;
729 	case LDLM_CB_CANCELING:
730 		/* do nothing */
731 		break;
732 	}
733 	return 0;
734 }
735 
736 /**
737  * Acquire a lease and open the file.
738  */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	/* Leases are only granted for plain read or plain write opens. */
	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		return ERR_PTR(-EINVAL);

	if (file != NULL) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;
		__u64 *och_usecount;

		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			return ERR_PTR(-EPERM);

		/* Get the openhandle of the file */
		rc = -EBUSY;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			mutex_unlock(&lli->lli_och_mutex);
			return ERR_PTR(rc);
		}

		if (fd->fd_och == NULL) {
			/* Take over the shared handle only when this file
			 * descriptor is its sole user. */
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och != NULL);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
			} else {
				LASSERT(lli->lli_mds_read_och != NULL);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			}
			if (*och_usecount == 1) {
				fd->fd_och = *och_p;
				*och_p = NULL;
				*och_usecount = 0;
				rc = 0;
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */
			return ERR_PTR(rc);

		LASSERT(fd->fd_och != NULL);
		old_handle = fd->fd_och->och_fh;
	}

	och = kzalloc(sizeof(*och), GFP_NOFS);
	if (!och)
		return ERR_PTR(-ENOMEM);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
					LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		rc = PTR_ERR(op_data);
		goto out;
	}

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
				ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
				LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		goto out_release_it;

	if (it_disposition(&it, DISP_LOOKUP_NEG)) {
		rc = -ENOENT;
		goto out_release_it;
	}

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		goto out_release_it;

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
		rc = -EOPNOTSUPP;
		goto out_close;
	}

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.d.lustre.it_lock_mode == 0 ||
	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
			PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
			it.d.lustre.it_lock_bits);
		rc = -EPROTO;
		goto out_close;
	}

	ll_intent_release(&it);
	return och;

out_close:
	/* Lease was granted but unusable: close the handle again. */
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
	if (rc2)
		CERROR("Close openhandle returned %d\n", rc2);

	/* cancel open lock */
	if (it.d.lustre.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
						it.d.lustre.it_lock_mode);
		it.d.lustre.it_lock_mode = 0;
	}
out_release_it:
	ll_intent_release(&it);
out:
	kfree(och);
	return ERR_PTR(rc);
}
876 
877 /**
878  * Release lease and close the file.
879  * It will check if the lease has ever broken.
880  */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken)
{
	struct ldlm_lock *lease_lock;
	bool broken = true;

	/* Inspect the lease lock to see whether it was already cancelled. */
	lease_lock = ldlm_handle2lock(&och->och_lease_handle);
	if (lease_lock) {
		lock_res_and_lock(lease_lock);
		broken = ldlm_is_cancel(lease_lock);
		unlock_res_and_lock(lease_lock);
		ldlm_lock_put(lease_lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
		PFID(&ll_i2info(inode)->lli_fid), broken);

	/* An intact lease must be cancelled before closing the handle. */
	if (!broken)
		ldlm_cli_cancel(&och->och_lease_handle, 0);
	if (lease_broken)
		*lease_broken = broken;

	return ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode,
					 och, NULL);
}
908 
909 /* Fills the obdo with the attributes for the lsm */
/* Fills the obdo with the attributes for the lsm */
static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
			  struct obdo *obdo, __u64 ioepoch, int sync)
{
	struct ptlrpc_request_set *set;
	struct obd_info oinfo = { };
	int rc;

	LASSERT(lsm != NULL);

	oinfo.oi_md = lsm;
	oinfo.oi_oa = obdo;
	oinfo.oi_oa->o_oi = lsm->lsm_oi;
	oinfo.oi_oa->o_mode = S_IFREG;
	oinfo.oi_oa->o_ioepoch = ioepoch;
	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
			       OBD_MD_FLDATAVERSION;
	if (sync) {
		/* Ask the OST to perform the getattr under its own lock. */
		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	}

	set = ptlrpc_prep_set();
	if (!set) {
		CERROR("can't allocate ptlrpc set\n");
		return -ENOMEM;
	}

	rc = obd_getattr_async(exp, &oinfo, set);
	if (rc == 0)
		rc = ptlrpc_set_wait(set);
	ptlrpc_set_destroy(set);

	/* On success, keep only the fields the OSTs are authoritative for. */
	if (rc == 0)
		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
					 OBD_MD_FLDATAVERSION);
	return rc;
}
952 
953 /**
954   * Performs the getattr on the inode and updates its fields.
955   * If @sync != 0, perform the getattr under the server-side lock.
956   */
ll_inode_getattr(struct inode * inode,struct obdo * obdo,__u64 ioepoch,int sync)957 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
958 		     __u64 ioepoch, int sync)
959 {
960 	struct lov_stripe_md *lsm;
961 	int rc;
962 
963 	lsm = ccc_inode_lsm_get(inode);
964 	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
965 			    obdo, ioepoch, sync);
966 	if (rc == 0) {
967 		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
968 
969 		obdo_refresh_inode(inode, obdo, obdo->o_valid);
970 		CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
971 		       POSTID(oi), i_size_read(inode),
972 		       (unsigned long long)inode->i_blocks,
973 		       1UL << inode->i_blkbits);
974 	}
975 	ccc_inode_lsm_put(inode, lsm);
976 	return rc;
977 }
978 
/* Merge the size/time attributes cached from the MDS with those held by
 * the cl_object (OST) layer and store the merged values in the VFS inode.
 * The whole merge runs under the ll inode size lock so that size and
 * timestamps are updated atomically with respect to other size users. */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;

	/* snapshot the inode's current view as the merge baseline */
	lvb.lvb_size = i_size_read(inode);
	lvb.lvb_blocks = inode->i_blocks;
	lvb.lvb_mtime = LTIME_S(inode->i_mtime);
	lvb.lvb_atime = LTIME_S(inode->i_atime);
	lvb.lvb_ctime = LTIME_S(inode->i_ctime);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* keep whichever timestamp is newer, MDS or OST */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
				PFID(&lli->lli_fid), attr->cat_size);
		/* size lock is already held, so write size without it */
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
1026 
/* Glimpse the OST attributes for @lsm and report them through @st. */
int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
		     lstat_t *st)
{
	struct obdo attrs = { 0 };
	int rc;

	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, &attrs, 0, 0);
	if (rc != 0)
		return rc;

	st->st_size   = attrs.o_size;
	st->st_blocks = attrs.o_blocks;
	st->st_mtime  = attrs.o_mtime;
	st->st_atime  = attrs.o_atime;
	st->st_ctime  = attrs.o_ctime;
	return 0;
}
1043 
file_is_noatime(const struct file * file)1044 static bool file_is_noatime(const struct file *file)
1045 {
1046 	const struct vfsmount *mnt = file->f_path.mnt;
1047 	const struct inode *inode = file_inode(file);
1048 
1049 	/* Adapted from file_accessed() and touch_atime().*/
1050 	if (file->f_flags & O_NOATIME)
1051 		return true;
1052 
1053 	if (inode->i_flags & S_NOATIME)
1054 		return true;
1055 
1056 	if (IS_NOATIME(inode))
1057 		return true;
1058 
1059 	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1060 		return true;
1061 
1062 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1063 		return true;
1064 
1065 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1066 		return true;
1067 
1068 	return false;
1069 }
1070 
ll_io_init(struct cl_io * io,const struct file * file,int write)1071 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1072 {
1073 	struct inode *inode = file_inode(file);
1074 
1075 	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1076 	if (write) {
1077 		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1078 		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1079 				      file->f_flags & O_DIRECT ||
1080 				      IS_SYNC(inode);
1081 	}
1082 	io->ci_obj     = ll_i2info(inode)->lli_clob;
1083 	io->ci_lockreq = CILR_MAYBE;
1084 	if (ll_file_nolock(file)) {
1085 		io->ci_lockreq = CILR_NEVER;
1086 		io->ci_no_srvlock = 1;
1087 	} else if (file->f_flags & O_APPEND) {
1088 		io->ci_lockreq = CILR_MANDATORY;
1089 	}
1090 
1091 	io->ci_noatime = file_is_noatime(file);
1092 }
1093 
1094 static ssize_t
ll_file_io_generic(const struct lu_env * env,struct vvp_io_args * args,struct file * file,enum cl_io_type iot,loff_t * ppos,size_t count)1095 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1096 		   struct file *file, enum cl_io_type iot,
1097 		   loff_t *ppos, size_t count)
1098 {
1099 	struct ll_inode_info *lli = ll_i2info(file_inode(file));
1100 	struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1101 	struct cl_io	 *io;
1102 	ssize_t	       result;
1103 
1104 restart:
1105 	io = ccc_env_thread_io(env);
1106 	ll_io_init(io, file, iot == CIT_WRITE);
1107 
1108 	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1109 		struct vvp_io *vio = vvp_env_io(env);
1110 		struct ccc_io *cio = ccc_env_io(env);
1111 		int write_mutex_locked = 0;
1112 
1113 		cio->cui_fd  = LUSTRE_FPRIVATE(file);
1114 		vio->cui_io_subtype = args->via_io_subtype;
1115 
1116 		switch (vio->cui_io_subtype) {
1117 		case IO_NORMAL:
1118 			cio->cui_iter = args->u.normal.via_iter;
1119 			cio->cui_iocb = args->u.normal.via_iocb;
1120 			if ((iot == CIT_WRITE) &&
1121 			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1122 				if (mutex_lock_interruptible(&lli->
1123 							       lli_write_mutex)) {
1124 					result = -ERESTARTSYS;
1125 					goto out;
1126 				}
1127 				write_mutex_locked = 1;
1128 			} else if (iot == CIT_READ) {
1129 				down_read(&lli->lli_trunc_sem);
1130 			}
1131 			break;
1132 		case IO_SPLICE:
1133 			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1134 			vio->u.splice.cui_flags = args->u.splice.via_flags;
1135 			break;
1136 		default:
1137 			CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1138 			LBUG();
1139 		}
1140 		result = cl_io_loop(env, io);
1141 		if (write_mutex_locked)
1142 			mutex_unlock(&lli->lli_write_mutex);
1143 		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1144 			up_read(&lli->lli_trunc_sem);
1145 	} else {
1146 		/* cl_io_rw_init() handled IO */
1147 		result = io->ci_result;
1148 	}
1149 
1150 	if (io->ci_nob > 0) {
1151 		result = io->ci_nob;
1152 		*ppos = io->u.ci_wr.wr.crw_pos;
1153 	}
1154 	goto out;
1155 out:
1156 	cl_io_fini(env, io);
1157 	/* If any bit been read/written (result != 0), we just return
1158 	 * short read/write instead of restart io. */
1159 	if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1160 		CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1161 		       iot == CIT_READ ? "read" : "write",
1162 		       file, *ppos, count);
1163 		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1164 		goto restart;
1165 	}
1166 
1167 	if (iot == CIT_READ) {
1168 		if (result >= 0)
1169 			ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1170 					   LPROC_LL_READ_BYTES, result);
1171 	} else if (iot == CIT_WRITE) {
1172 		if (result >= 0) {
1173 			ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1174 					   LPROC_LL_WRITE_BYTES, result);
1175 			fd->fd_write_failed = false;
1176 		} else if (result != -ERESTARTSYS) {
1177 			fd->fd_write_failed = true;
1178 		}
1179 	}
1180 
1181 	return result;
1182 }
1183 
ll_file_read_iter(struct kiocb * iocb,struct iov_iter * to)1184 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1185 {
1186 	struct lu_env      *env;
1187 	struct vvp_io_args *args;
1188 	ssize_t	     result;
1189 	int		 refcheck;
1190 
1191 	env = cl_env_get(&refcheck);
1192 	if (IS_ERR(env))
1193 		return PTR_ERR(env);
1194 
1195 	args = vvp_env_args(env, IO_NORMAL);
1196 	args->u.normal.via_iter = to;
1197 	args->u.normal.via_iocb = iocb;
1198 
1199 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1200 				    &iocb->ki_pos, iov_iter_count(to));
1201 	cl_env_put(env, &refcheck);
1202 	return result;
1203 }
1204 
1205 /*
1206  * Write to a file (through the page cache).
1207  */
ll_file_write_iter(struct kiocb * iocb,struct iov_iter * from)1208 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1209 {
1210 	struct lu_env      *env;
1211 	struct vvp_io_args *args;
1212 	ssize_t	     result;
1213 	int		 refcheck;
1214 
1215 	env = cl_env_get(&refcheck);
1216 	if (IS_ERR(env))
1217 		return PTR_ERR(env);
1218 
1219 	args = vvp_env_args(env, IO_NORMAL);
1220 	args->u.normal.via_iter = from;
1221 	args->u.normal.via_iocb = iocb;
1222 
1223 	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1224 				  &iocb->ki_pos, iov_iter_count(from));
1225 	cl_env_put(env, &refcheck);
1226 	return result;
1227 }
1228 
1229 /*
1230  * Send file content (through pagecache) somewhere with helper
1231  */
ll_file_splice_read(struct file * in_file,loff_t * ppos,struct pipe_inode_info * pipe,size_t count,unsigned int flags)1232 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1233 				   struct pipe_inode_info *pipe, size_t count,
1234 				   unsigned int flags)
1235 {
1236 	struct lu_env      *env;
1237 	struct vvp_io_args *args;
1238 	ssize_t	     result;
1239 	int		 refcheck;
1240 
1241 	env = cl_env_get(&refcheck);
1242 	if (IS_ERR(env))
1243 		return PTR_ERR(env);
1244 
1245 	args = vvp_env_args(env, IO_SPLICE);
1246 	args->u.splice.via_pipe = pipe;
1247 	args->u.splice.via_flags = flags;
1248 
1249 	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1250 	cl_env_put(env, &refcheck);
1251 	return result;
1252 }
1253 
/*
 * Ask the data stack to recreate the OST object @oi of @inode on OST
 * @ost_idx (admin recovery path).
 *
 * Fix: the redundant `goto out;` that immediately preceded the `out:`
 * label has been removed (pure fall-through, no behavior change).
 *
 * \retval 0 on success, negative errno on failure
 */
static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct obd_trans_info oti = { 0 };
	struct obdo *oa = NULL;
	int lsm_size;
	int rc = 0;
	struct lov_stripe_md *lsm = NULL, *lsm2;

	oa = kmem_cache_alloc(obdo_cachep, GFP_NOFS | __GFP_ZERO);
	if (oa == NULL)
		return -ENOMEM;

	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm)) {
		rc = -ENOENT;
		goto out;
	}

	/* lsm2 is a scratch copy of the layout handed to obd_create() */
	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
		   (lsm->lsm_stripe_count));

	lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
	if (lsm2 == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	oa->o_oi = *oi;
	oa->o_nlink = ost_idx;	/* o_nlink carries the OST index here */
	oa->o_flags |= OBD_FL_RECREATE_OBJS;
	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
	memcpy(lsm2, lsm, lsm_size);
	ll_inode_size_lock(inode);
	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
	ll_inode_size_unlock(inode);

	kvfree(lsm2);
out:
	ccc_inode_lsm_put(inode, lsm);
	kmem_cache_free(obdo_cachep, oa);
	return rc;
}
1301 
ll_lov_recreate_obj(struct inode * inode,unsigned long arg)1302 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1303 {
1304 	struct ll_recreate_obj ucreat;
1305 	struct ost_id		oi;
1306 
1307 	if (!capable(CFS_CAP_SYS_ADMIN))
1308 		return -EPERM;
1309 
1310 	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1311 			   sizeof(ucreat)))
1312 		return -EFAULT;
1313 
1314 	ostid_set_seq_mdt0(&oi);
1315 	ostid_set_id(&oi, ucreat.lrc_id);
1316 	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1317 }
1318 
ll_lov_recreate_fid(struct inode * inode,unsigned long arg)1319 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1320 {
1321 	struct lu_fid	fid;
1322 	struct ost_id	oi;
1323 	u32		ost_idx;
1324 
1325 	if (!capable(CFS_CAP_SYS_ADMIN))
1326 		return -EPERM;
1327 
1328 	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1329 		return -EFAULT;
1330 
1331 	fid_to_ostid(&fid, &oi);
1332 	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1333 	return ll_lov_recreate(inode, &oi, ost_idx);
1334 }
1335 
/* Install the striping EA described by @lum on @inode by doing an
 * intent-open carrying the layout, then closing the transient handle.
 * The striping is write-once: -EEXIST if an lsm already exists. */
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     int flags, struct lov_user_md *lum, int lum_size)
{
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
	int rc = 0;

	lsm = ccc_inode_lsm_get(inode);
	if (lsm != NULL) {
		/* layout already set; cannot be changed this way */
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
		       inode->i_ino);
		rc = -EEXIST;
		goto out;
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
	if (rc)
		goto out_unlock;
	rc = oit.d.lustre.it_status;
	if (rc < 0)
		goto out_req_free;

	/* open was only needed to install the layout; drop the handle */
	ll_release_openhandle(inode, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	/* lsm is NULL on this path; the put is a harmless no-op */
	ccc_inode_lsm_put(inode, lsm);
out:
	return rc;
out_req_free:
	/* NOTE(review): this path jumps straight to 'out', apparently
	 * skipping ll_inode_size_unlock()/ll_intent_release() — looks
	 * like a lock leak; confirm against later upstream versions. */
	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
	goto out;
}
1372 
/* Fetch the LOV EA of @filename (child of @inode) from the MDS.
 *
 * On success *lmmp points into the reply buffer of *request, converted
 * to host endianness; the caller owns *request and must finish it after
 * consuming the EA.
 *
 * \retval 0 on success, -ENODATA when no striping EA exists,
 *	   -EPROTO on an unknown layout magic, other negative errno
 *	   on RPC failure
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body  *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		return rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	/* ask for the striping EA (FLDIREA covers directories) */
	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
		       filename, rc);
		goto out;
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
			lmmsize == 0) {
		rc = -ENODATA;
		goto out;
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		rc = -EPROTO;
		goto out;
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian.  We convert it to host endian before
	 * passing it to userspace.
	 */
	if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
		int stripe_count;

		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		/* released files have no OST objects to swab */
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
			stripe_count = 0;

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 stripe_count);
		}
	}

out:
	/* outputs are valid even on error (lmm may be NULL, req is
	 * returned for the caller to finish) */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1458 
ll_lov_setea(struct inode * inode,struct file * file,unsigned long arg)1459 static int ll_lov_setea(struct inode *inode, struct file *file,
1460 			    unsigned long arg)
1461 {
1462 	int			 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1463 	struct lov_user_md	*lump;
1464 	int			 lum_size = sizeof(struct lov_user_md) +
1465 					    sizeof(struct lov_user_ost_data);
1466 	int			 rc;
1467 
1468 	if (!capable(CFS_CAP_SYS_ADMIN))
1469 		return -EPERM;
1470 
1471 	lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
1472 	if (lump == NULL)
1473 		return -ENOMEM;
1474 
1475 	if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1476 		kvfree(lump);
1477 		return -EFAULT;
1478 	}
1479 
1480 	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1481 				     lum_size);
1482 	cl_lov_delay_create_clear(&file->f_flags);
1483 
1484 	kvfree(lump);
1485 	return rc;
1486 }
1487 
ll_lov_setstripe(struct inode * inode,struct file * file,unsigned long arg)1488 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1489 			    unsigned long arg)
1490 {
1491 	struct lov_user_md_v3	 lumv3;
1492 	struct lov_user_md_v1	*lumv1 = (struct lov_user_md_v1 *)&lumv3;
1493 	struct lov_user_md_v1	*lumv1p = (struct lov_user_md_v1 *)arg;
1494 	struct lov_user_md_v3	*lumv3p = (struct lov_user_md_v3 *)arg;
1495 	int			 lum_size, rc;
1496 	int			 flags = FMODE_WRITE;
1497 
1498 	/* first try with v1 which is smaller than v3 */
1499 	lum_size = sizeof(struct lov_user_md_v1);
1500 	if (copy_from_user(lumv1, lumv1p, lum_size))
1501 		return -EFAULT;
1502 
1503 	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1504 		lum_size = sizeof(struct lov_user_md_v3);
1505 		if (copy_from_user(&lumv3, lumv3p, lum_size))
1506 			return -EFAULT;
1507 	}
1508 
1509 	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1510 				      lum_size);
1511 	cl_lov_delay_create_clear(&file->f_flags);
1512 	if (rc == 0) {
1513 		struct lov_stripe_md *lsm;
1514 		__u32 gen;
1515 
1516 		put_user(0, &lumv1p->lmm_stripe_count);
1517 
1518 		ll_layout_refresh(inode, &gen);
1519 		lsm = ccc_inode_lsm_get(inode);
1520 		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1521 				   0, lsm, (void *)arg);
1522 		ccc_inode_lsm_put(inode, lsm);
1523 	}
1524 	return rc;
1525 }
1526 
ll_lov_getstripe(struct inode * inode,unsigned long arg)1527 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1528 {
1529 	struct lov_stripe_md *lsm;
1530 	int rc = -ENODATA;
1531 
1532 	lsm = ccc_inode_lsm_get(inode);
1533 	if (lsm != NULL)
1534 		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1535 				   lsm, (void *)arg);
1536 	ccc_inode_lsm_put(inode, lsm);
1537 	return rc;
1538 }
1539 
/* LL_IOC_GROUP_LOCK: take a group (gid-based) extent lock on @file.
 * Only one group lock per file descriptor; the enqueue can block, so
 * the LL_FILE_GROUP_LOCKED flag is checked both before and after it. */
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;
	int		     rc;

	if (arg == 0) {
		CWARN("group id for group lock must not be 0\n");
		return -EINVAL;
	}

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* enqueue outside lli_lock because it may block; the flag is
	 * re-checked below to close the race window */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		return rc;

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		/* a concurrent ioctl installed its lock while we were
		 * enqueuing; drop ours */
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		return -EINVAL;
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	return 0;
}
1586 
/* LL_IOC_GROUP_UNLOCK: release the group lock held on @file for gid
 * @arg. Fails if no group lock is held or the gid does not match. */
static int ll_put_grouplock(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct ll_inode_info   *lli = ll_i2info(inode);
	struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock    grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		       arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}

	/* detach the lock from the fd under lli_lock, release it after */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	return 0;
}
1618 
1619 /**
1620  * Close inode open handle
1621  *
1622  * \param inode  [in]     inode in question
1623  * \param it     [in,out] intent which contains open info and result
1624  *
1625  * \retval 0     success
1626  * \retval <0    failure
1627  */
ll_release_openhandle(struct inode * inode,struct lookup_intent * it)1628 int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1629 {
1630 	struct obd_client_handle *och;
1631 	int rc;
1632 
1633 	LASSERT(inode);
1634 
1635 	/* Root ? Do nothing. */
1636 	if (is_root_inode(inode))
1637 		return 0;
1638 
1639 	/* No open handle to close? Move away */
1640 	if (!it_disposition(it, DISP_OPEN_OPEN))
1641 		return 0;
1642 
1643 	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1644 
1645 	och = kzalloc(sizeof(*och), GFP_NOFS);
1646 	if (!och) {
1647 		rc = -ENOMEM;
1648 		goto out;
1649 	}
1650 
1651 	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1652 
1653 	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1654 				       inode, och, NULL);
1655 out:
1656 	/* this one is in place of ll_file_open */
1657 	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1658 		ptlrpc_req_finished(it->d.lustre.it_data);
1659 		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1660 	}
1661 	return rc;
1662 }
1663 
/**
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 *
 * \param inode     inode being mapped
 * \param fiemap    in/out fiemap request/result buffer
 * \param num_bytes total size of @fiemap including the extent array
 *
 * \retval 0 on success, negative errno on failure
 */
static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
			size_t num_bytes)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct lov_stripe_md *lsm = NULL;
	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
	__u32 vallen = num_bytes;
	int rc;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* report back which flags we do support */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
		return -EBADR;
	}

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);
		if (rc)
			return rc;
	}

	lsm = ccc_inode_lsm_get(inode);
	if (lsm == NULL)
		return -ENOENT;

	/* If the stripe_count > 1 and the application does not understand
	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
	 */
	if (lsm->lsm_stripe_count > 1 &&
	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
		rc = -EOPNOTSUPP;
		goto out;
	}

	fm_key.oa.o_oi = lsm->lsm_oi;
	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;

	/* make sure we have an up-to-date size before deciding whether
	 * there is anything to map */
	if (i_size_read(inode) == 0) {
		rc = ll_glimpse_size(inode);
		if (rc)
			goto out;
	}

	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
	/* If filesize is 0, then there would be no objects for mapping */
	if (fm_key.oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;
		rc = 0;
		goto out;
	}

	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));

	/* the LOV layer fills @fiemap (up to @vallen bytes) in place */
	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
			  fiemap, lsm);
	if (rc)
		CERROR("obd_get_info failed: rc = %d\n", rc);

out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1732 
/* OBD_IOC_FID2PATH: resolve a FID to a path via the MDC.
 *
 * Fix (TOCTOU): gf_pathlen was read and validated with get_user(), but
 * copy_from_user() then re-read the whole structure, so a racing thread
 * could enlarge gf_pathlen after validation and make the MDC write past
 * the @outsize-sized allocation. Re-pin gf_pathlen to the validated
 * value after the copy.
 */
int ll_fid2path(struct inode *inode, void __user *arg)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	const struct getinfo_fid2path __user *gfin = arg;
	struct getinfo_fid2path *gfout;
	u32 pathlen;
	size_t outsize;
	int rc;

	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
		return -EPERM;

	/* Only need to get the buflen */
	if (get_user(pathlen, &gfin->gf_pathlen))
		return -EFAULT;

	if (pathlen > PATH_MAX)
		return -EINVAL;

	outsize = sizeof(*gfout) + pathlen;

	gfout = kzalloc(outsize, GFP_NOFS);
	if (!gfout)
		return -ENOMEM;

	if (copy_from_user(gfout, arg, sizeof(*gfout))) {
		rc = -EFAULT;
		goto gf_free;
	}
	/* userspace may have changed gf_pathlen since get_user(); force
	 * it back to the value the allocation was sized for */
	gfout->gf_pathlen = pathlen;

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
	if (rc != 0)
		goto gf_free;

	if (copy_to_user(arg, gfout, outsize))
		rc = -EFAULT;

gf_free:
	kfree(gfout);
	return rc;
}
1776 
ll_ioctl_fiemap(struct inode * inode,unsigned long arg)1777 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1778 {
1779 	struct ll_user_fiemap *fiemap_s;
1780 	size_t num_bytes, ret_bytes;
1781 	unsigned int extent_count;
1782 	int rc = 0;
1783 
1784 	/* Get the extent count so we can calculate the size of
1785 	 * required fiemap buffer */
1786 	if (get_user(extent_count,
1787 	    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1788 		return -EFAULT;
1789 
1790 	if (extent_count >=
1791 	    (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1792 		return -EINVAL;
1793 	num_bytes = sizeof(*fiemap_s) + (extent_count *
1794 					 sizeof(struct ll_fiemap_extent));
1795 
1796 	fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
1797 	if (fiemap_s == NULL)
1798 		return -ENOMEM;
1799 
1800 	/* get the fiemap value */
1801 	if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1802 			   sizeof(*fiemap_s))) {
1803 		rc = -EFAULT;
1804 		goto error;
1805 	}
1806 
1807 	/* If fm_extent_count is non-zero, read the first extent since
1808 	 * it is used to calculate end_offset and device from previous
1809 	 * fiemap call. */
1810 	if (extent_count) {
1811 		if (copy_from_user(&fiemap_s->fm_extents[0],
1812 		    (char __user *)arg + sizeof(*fiemap_s),
1813 		    sizeof(struct ll_fiemap_extent))) {
1814 			rc = -EFAULT;
1815 			goto error;
1816 		}
1817 	}
1818 
1819 	rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1820 	if (rc)
1821 		goto error;
1822 
1823 	ret_bytes = sizeof(struct ll_user_fiemap);
1824 
1825 	if (extent_count != 0)
1826 		ret_bytes += (fiemap_s->fm_mapped_extents *
1827 				 sizeof(struct ll_fiemap_extent));
1828 
1829 	if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1830 		rc = -EFAULT;
1831 
1832 error:
1833 	kvfree(fiemap_s);
1834 	return rc;
1835 }
1836 
1837 /*
1838  * Read the data_version for inode.
1839  *
1840  * This value is computed using stripe object version on OST.
1841  * Version is computed using server side locking.
1842  *
1843  * @param extent_lock  Take extent lock. Not needed if a process is already
1844  *		       holding the OST object group locks.
1845  */
ll_data_version(struct inode * inode,__u64 * data_version,int extent_lock)1846 int ll_data_version(struct inode *inode, __u64 *data_version,
1847 		    int extent_lock)
1848 {
1849 	struct lov_stripe_md	*lsm = NULL;
1850 	struct ll_sb_info	*sbi = ll_i2sbi(inode);
1851 	struct obdo		*obdo = NULL;
1852 	int			 rc;
1853 
1854 	/* If no stripe, we consider version is 0. */
1855 	lsm = ccc_inode_lsm_get(inode);
1856 	if (!lsm_has_objects(lsm)) {
1857 		*data_version = 0;
1858 		CDEBUG(D_INODE, "No object for inode\n");
1859 		rc = 0;
1860 		goto out;
1861 	}
1862 
1863 	obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1864 	if (!obdo) {
1865 		rc = -ENOMEM;
1866 		goto out;
1867 	}
1868 
1869 	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, obdo, 0, extent_lock);
1870 	if (rc == 0) {
1871 		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1872 			rc = -EOPNOTSUPP;
1873 		else
1874 			*data_version = obdo->o_data_version;
1875 	}
1876 
1877 	kfree(obdo);
1878 out:
1879 	ccc_inode_lsm_put(inode, lsm);
1880 	return rc;
1881 }
1882 
/*
 * Trigger a HSM release request for the provided inode.
 *
 * Takes a write lease, refreshes data_version and cached attributes,
 * then closes the lease handle with MDS_OPEN_RELEASE semantics so the
 * MDT releases the file's OST objects.
 */
int ll_hsm_release(struct inode *inode)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;
	int rc;

	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
	if (IS_ERR(och)) {
		rc = PTR_ERR(och);
		goto out;
	}

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, 1);
	if (rc != 0)
		goto out;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env)) {
		rc = PTR_ERR(env);
		goto out;
	}

	ll_merge_lvb(env, inode);
	cl_env_nested_put(&nest, env);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       &data_version);
	/* och is consumed by the close above; NULL it so the error-path
	 * lease close below is skipped */
	och = NULL;

out:
	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);

	return rc;
}
1931 
/* Scratch state for ll_swap_layouts(): attributes and data versions of
 * the two files whose layouts are being swapped. */
struct ll_swap_stack {
	struct iattr		 ia1, ia2;	/* saved [am]time attrs */
	__u64			 dv1, dv2;	/* data versions to compare */
	struct inode		*inode1, *inode2;
	bool			 check_dv1, check_dv2;	/* verify dv flags */
};
1938 
ll_swap_layouts(struct file * file1,struct file * file2,struct lustre_swap_layouts * lsl)1939 static int ll_swap_layouts(struct file *file1, struct file *file2,
1940 			   struct lustre_swap_layouts *lsl)
1941 {
1942 	struct mdc_swap_layouts	 msl;
1943 	struct md_op_data	*op_data;
1944 	__u32			 gid;
1945 	__u64			 dv;
1946 	struct ll_swap_stack	*llss = NULL;
1947 	int			 rc;
1948 
1949 	llss = kzalloc(sizeof(*llss), GFP_NOFS);
1950 	if (!llss)
1951 		return -ENOMEM;
1952 
1953 	llss->inode1 = file_inode(file1);
1954 	llss->inode2 = file_inode(file2);
1955 
1956 	if (!S_ISREG(llss->inode2->i_mode)) {
1957 		rc = -EINVAL;
1958 		goto free;
1959 	}
1960 
1961 	if (inode_permission(llss->inode1, MAY_WRITE) ||
1962 	    inode_permission(llss->inode2, MAY_WRITE)) {
1963 		rc = -EPERM;
1964 		goto free;
1965 	}
1966 
1967 	if (llss->inode2->i_sb != llss->inode1->i_sb) {
1968 		rc = -EXDEV;
1969 		goto free;
1970 	}
1971 
1972 	/* we use 2 bool because it is easier to swap than 2 bits */
1973 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1974 		llss->check_dv1 = true;
1975 
1976 	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1977 		llss->check_dv2 = true;
1978 
1979 	/* we cannot use lsl->sl_dvX directly because we may swap them */
1980 	llss->dv1 = lsl->sl_dv1;
1981 	llss->dv2 = lsl->sl_dv2;
1982 
1983 	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1984 	if (rc == 0) /* same file, done! */ {
1985 		rc = 0;
1986 		goto free;
1987 	}
1988 
1989 	if (rc < 0) { /* sequentialize it */
1990 		swap(llss->inode1, llss->inode2);
1991 		swap(file1, file2);
1992 		swap(llss->dv1, llss->dv2);
1993 		swap(llss->check_dv1, llss->check_dv2);
1994 	}
1995 
1996 	gid = lsl->sl_gid;
1997 	if (gid != 0) { /* application asks to flush dirty cache */
1998 		rc = ll_get_grouplock(llss->inode1, file1, gid);
1999 		if (rc < 0)
2000 			goto free;
2001 
2002 		rc = ll_get_grouplock(llss->inode2, file2, gid);
2003 		if (rc < 0) {
2004 			ll_put_grouplock(llss->inode1, file1, gid);
2005 			goto free;
2006 		}
2007 	}
2008 
2009 	/* to be able to restore mtime and atime after swap
2010 	 * we need to first save them */
2011 	if (lsl->sl_flags &
2012 	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2013 		llss->ia1.ia_mtime = llss->inode1->i_mtime;
2014 		llss->ia1.ia_atime = llss->inode1->i_atime;
2015 		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2016 		llss->ia2.ia_mtime = llss->inode2->i_mtime;
2017 		llss->ia2.ia_atime = llss->inode2->i_atime;
2018 		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2019 	}
2020 
2021 	/* ultimate check, before swapping the layouts we check if
2022 	 * dataversion has changed (if requested) */
2023 	if (llss->check_dv1) {
2024 		rc = ll_data_version(llss->inode1, &dv, 0);
2025 		if (rc)
2026 			goto putgl;
2027 		if (dv != llss->dv1) {
2028 			rc = -EAGAIN;
2029 			goto putgl;
2030 		}
2031 	}
2032 
2033 	if (llss->check_dv2) {
2034 		rc = ll_data_version(llss->inode2, &dv, 0);
2035 		if (rc)
2036 			goto putgl;
2037 		if (dv != llss->dv2) {
2038 			rc = -EAGAIN;
2039 			goto putgl;
2040 		}
2041 	}
2042 
2043 	/* struct md_op_data is used to send the swap args to the mdt
2044 	 * only flags is missing, so we use struct mdc_swap_layouts
2045 	 * through the md_op_data->op_data */
2046 	/* flags from user space have to be converted before they are send to
2047 	 * server, no flag is sent today, they are only used on the client */
2048 	msl.msl_flags = 0;
2049 	rc = -ENOMEM;
2050 	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2051 				     0, LUSTRE_OPC_ANY, &msl);
2052 	if (IS_ERR(op_data)) {
2053 		rc = PTR_ERR(op_data);
2054 		goto free;
2055 	}
2056 
2057 	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2058 			   sizeof(*op_data), op_data, NULL);
2059 	ll_finish_md_op_data(op_data);
2060 
2061 putgl:
2062 	if (gid != 0) {
2063 		ll_put_grouplock(llss->inode2, file2, gid);
2064 		ll_put_grouplock(llss->inode1, file1, gid);
2065 	}
2066 
2067 	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2068 	if (rc != 0)
2069 		goto free;
2070 
2071 	/* clear useless flags */
2072 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2073 		llss->ia1.ia_valid &= ~ATTR_MTIME;
2074 		llss->ia2.ia_valid &= ~ATTR_MTIME;
2075 	}
2076 
2077 	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2078 		llss->ia1.ia_valid &= ~ATTR_ATIME;
2079 		llss->ia2.ia_valid &= ~ATTR_ATIME;
2080 	}
2081 
2082 	/* update time if requested */
2083 	rc = 0;
2084 	if (llss->ia2.ia_valid != 0) {
2085 		mutex_lock(&llss->inode1->i_mutex);
2086 		rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2087 		mutex_unlock(&llss->inode1->i_mutex);
2088 	}
2089 
2090 	if (llss->ia1.ia_valid != 0) {
2091 		int rc1;
2092 
2093 		mutex_lock(&llss->inode2->i_mutex);
2094 		rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2095 		mutex_unlock(&llss->inode2->i_mutex);
2096 		if (rc == 0)
2097 			rc = rc1;
2098 	}
2099 
2100 free:
2101 	kfree(llss);
2102 
2103 	return rc;
2104 }
2105 
ll_hsm_state_set(struct inode * inode,struct hsm_state_set * hss)2106 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2107 {
2108 	struct md_op_data	*op_data;
2109 	int			 rc;
2110 
2111 	/* Detect out-of range masks */
2112 	if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2113 		return -EINVAL;
2114 
2115 	/* Non-root users are forbidden to set or clear flags which are
2116 	 * NOT defined in HSM_USER_MASK. */
2117 	if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2118 	    !capable(CFS_CAP_SYS_ADMIN))
2119 		return -EPERM;
2120 
2121 	/* Detect out-of range archive id */
2122 	if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2123 	    (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2124 		return -EINVAL;
2125 
2126 	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2127 				     LUSTRE_OPC_ANY, hss);
2128 	if (IS_ERR(op_data))
2129 		return PTR_ERR(op_data);
2130 
2131 	rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2132 			   sizeof(*op_data), op_data, NULL);
2133 
2134 	ll_finish_md_op_data(op_data);
2135 
2136 	return rc;
2137 }
2138 
ll_hsm_import(struct inode * inode,struct file * file,struct hsm_user_import * hui)2139 static int ll_hsm_import(struct inode *inode, struct file *file,
2140 			 struct hsm_user_import *hui)
2141 {
2142 	struct hsm_state_set	*hss = NULL;
2143 	struct iattr		*attr = NULL;
2144 	int			 rc;
2145 
2146 	if (!S_ISREG(inode->i_mode))
2147 		return -EINVAL;
2148 
2149 	/* set HSM flags */
2150 	hss = kzalloc(sizeof(*hss), GFP_NOFS);
2151 	if (!hss)
2152 		return -ENOMEM;
2153 
2154 	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2155 	hss->hss_archive_id = hui->hui_archive_id;
2156 	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2157 	rc = ll_hsm_state_set(inode, hss);
2158 	if (rc != 0)
2159 		goto free_hss;
2160 
2161 	attr = kzalloc(sizeof(*attr), GFP_NOFS);
2162 	if (!attr) {
2163 		rc = -ENOMEM;
2164 		goto free_hss;
2165 	}
2166 
2167 	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2168 	attr->ia_mode |= S_IFREG;
2169 	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2170 	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2171 	attr->ia_size = hui->hui_size;
2172 	attr->ia_mtime.tv_sec = hui->hui_mtime;
2173 	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2174 	attr->ia_atime.tv_sec = hui->hui_atime;
2175 	attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2176 
2177 	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2178 			 ATTR_UID | ATTR_GID |
2179 			 ATTR_MTIME | ATTR_MTIME_SET |
2180 			 ATTR_ATIME | ATTR_ATIME_SET;
2181 
2182 	mutex_lock(&inode->i_mutex);
2183 
2184 	rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2185 	if (rc == -ENODATA)
2186 		rc = 0;
2187 
2188 	mutex_unlock(&inode->i_mutex);
2189 
2190 	kfree(attr);
2191 free_hss:
2192 	kfree(hss);
2193 	return rc;
2194 }
2195 
/*
 * Main ioctl entry point for Lustre regular files.
 *
 * Dispatches Lustre-specific ioctls: striping get/set, layout swap,
 * group locks, HSM state/import/action, open leases, fid<->path
 * translation and data version queries.  Unrecognized commands are
 * first offered to dynamically registered handlers through
 * ll_iocontrol_call() and finally forwarded to the data export via
 * obd_iocontrol().
 */
static long
ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode		*inode = file_inode(file);
	struct ll_file_data	*fd = LUSTRE_FPRIVATE(file);
	int			 flags, rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
	       inode->i_generation, inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
		return -ENOTTY;

	switch (cmd) {
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 *     not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int *) arg))
			return -EFAULT;

		if (cmd == LL_IOC_SETFLAGS) {
			/* disabling locking only makes sense with O_DIRECT
			 * I/O, where the page cache is bypassed */
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
				       current->comm);
				return -EINVAL;
			}

			fd->fd_flags |= flags;
		} else {
			fd->fd_flags &= ~flags;
		}
		return 0;
	case LL_IOC_LOV_SETSTRIPE:
		return ll_lov_setstripe(inode, file, arg);
	case LL_IOC_LOV_SETEA:
		return ll_lov_setea(inode, file, arg);
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct file *file2;
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char *)arg,
				   sizeof(struct lustre_swap_layouts)))
			return -EFAULT;

		/* both files must be writable to swap their layouts */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
			return -EPERM;

		file2 = fget(lsl.sl_fd);
		if (file2 == NULL)
			return -EBADF;

		rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);
		fput(file2);
		return rc;
	}
	case LL_IOC_LOV_GETSTRIPE:
		return ll_lov_getstripe(inode, arg);
	case LL_IOC_RECREATE_OBJ:
		return ll_lov_recreate_obj(inode, arg);
	case LL_IOC_RECREATE_FID:
		return ll_lov_recreate_fid(inode, arg);
	case FSFILT_IOC_FIEMAP:
		return ll_ioctl_fiemap(inode, arg);
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		return ll_iocontrol(inode, file, cmd, arg);
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		return put_user(inode->i_generation, (int *)arg);
	case LL_IOC_GROUP_LOCK:
		return ll_get_grouplock(inode, file, arg);
	case LL_IOC_GROUP_UNLOCK:
		return ll_put_grouplock(inode, file, arg);
	case IOC_OBD_STATFS:
		return ll_obd_statfs(inode, (void *)arg);

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		return ll_flush_ctx(inode);
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_FID2PATH:
		return ll_fid2path(inode, (void *)arg);
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version	idv;
		int			rc;

		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
			return -EFAULT;

		/* LL_DV_NOFLUSH skips flushing dirty pages before reading
		 * the data version */
		rc = ll_data_version(inode, &idv.idv_version,
				!(idv.idv_flags & LL_DV_NOFLUSH));

		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
			return -EFAULT;

		return rc;
	}

	case LL_IOC_GET_MDTIDX: {
		int mdtidx;

		mdtidx = ll_get_mdt_idx(inode);
		if (mdtidx < 0)
			return mdtidx;

		if (put_user((int)mdtidx, (int *)arg))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		return ll_get_obd_name(inode, cmd, arg);
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data	*op_data;
		struct hsm_user_state	*hus;
		int			 rc;

		hus = kzalloc(sizeof(*hus), GFP_NOFS);
		if (!hus)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			kfree(hus);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		/* hus was filled in by the MDT through op_data */
		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		kfree(hus);
		return rc;
	}
	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set	*hss;
		int			 rc;

		hss = memdup_user((char *)arg, sizeof(*hss));
		if (IS_ERR(hss))
			return PTR_ERR(hss);

		rc = ll_hsm_state_set(inode, hss);

		kfree(hss);
		return rc;
	}
	case LL_IOC_HSM_ACTION: {
		struct md_op_data		*op_data;
		struct hsm_current_action	*hca;
		int				 rc;

		hca = kzalloc(sizeof(*hca), GFP_NOFS);
		if (!hca)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			kfree(hca);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		kfree(hca);
		return rc;
	}
	case LL_IOC_SET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct obd_client_handle *och = NULL;
		bool lease_broken;
		fmode_t mode = 0;

		switch (arg) {
		case F_WRLCK:
			if (!(file->f_mode & FMODE_WRITE))
				return -EPERM;
			mode = FMODE_WRITE;
			break;
		case F_RDLCK:
			if (!(file->f_mode & FMODE_READ))
				return -EPERM;
			mode = FMODE_READ;
			break;
		case F_UNLCK:
			/* detach any lease held on this fd and close it */
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			}
			mutex_unlock(&lli->lli_och_mutex);

			if (och != NULL) {
				mode = och->och_flags &
				       (FMODE_READ|FMODE_WRITE);
				rc = ll_lease_close(och, inode, &lease_broken);
				if (rc == 0 && lease_broken)
					mode = 0;
			} else {
				rc = -ENOLCK;
			}

			/* return the type of lease or error */
			return rc < 0 ? rc : (int)mode;
		default:
			return -EINVAL;
		}

		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);

		/* apply for lease */
		och = ll_lease_open(inode, file, mode, 0);
		if (IS_ERR(och))
			return PTR_ERR(och);

		/* only one lease per fd; if another was installed
		 * concurrently, close ours and report busy */
		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och == NULL) {
			fd->fd_lease_och = och;
			och = NULL;
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (och != NULL) {
			/* impossible now that only excl is supported for now */
			ll_lease_close(och, inode, &lease_broken);
			rc = -EBUSY;
		}
		return rc;
	}
	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;

		/* report the mode of the lease held on this fd, or 0 if
		 * the lease lock has been cancelled */
		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;

			lock = ldlm_handle2lock(&och->och_lease_handle);
			if (lock != NULL) {
				lock_res_and_lock(lock);
				if (!ldlm_is_cancel(lock))
					rc = och->och_flags &
						(FMODE_READ | FMODE_WRITE);
				unlock_res_and_lock(lock);
				ldlm_lock_put(lock);
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		return rc;
	}
	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;

		hui = memdup_user((void *)arg, sizeof(*hui));
		if (IS_ERR(hui))
			return PTR_ERR(hui);

		rc = ll_hsm_import(inode, file, hui);

		kfree(hui);
		return rc;
	}
	default: {
		int err;

		/* give dynamically registered handlers first shot at
		 * unknown commands, then fall back to the data export */
		if (ll_iocontrol_call(inode, file, cmd, arg, &err) ==
		     LLIOC_STOP)
			return err;

		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void *)arg);
	}
	}
}
2504 
/*
 * ->llseek() handler.  For SEEK_END/SEEK_HOLE/SEEK_DATA the cluster
 * size must be refreshed with a glimpse before the generic helper can
 * compute the new position.
 */
static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file_inode(file);
	loff_t base = 0;
	loff_t rc;
	loff_t eof = 0;

	/* absolute target, computed only for the trace message below */
	if (origin == SEEK_END)
		base = i_size_read(inode);
	else if (origin == SEEK_CUR)
		base = file->f_pos;
	rc = offset + base;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
	       inode->i_ino, inode->i_generation, inode, rc, rc,
	       origin);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);

	/* these origins depend on an up-to-date file size */
	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
		rc = ll_glimpse_size(inode);
		if (rc != 0)
			return rc;
		eof = i_size_read(inode);
	}

	return generic_file_llseek_size(file, offset, origin,
					ll_file_maxbytes(inode), eof);
}
2528 
/*
 * ->flush() handler: surface asynchronous writeback errors recorded
 * against this file's mapping.
 */
static int ll_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	int rc, rc2;

	LASSERT(!S_ISDIR(inode->i_mode));

	/* Read-and-clear the async error recorded on the inode; the lov
	 * layer keeps its own per-object error which is read (and
	 * cleared) unconditionally as well. */
	rc = lli->lli_async_rc;
	lli->lli_async_rc = 0;
	rc2 = lov_read_and_clear_async_rc(lli->lli_clob);
	if (rc == 0)
		rc = rc2;

	/* The application has been told write failure already.
	 * Do not report failure again. */
	return (fd->fd_write_failed || rc == 0) ? 0 : -EIO;
}
2552 
2553 /**
2554  * Called to make sure a portion of file has been written out.
2555  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2556  *
2557  * Return how many pages have been written.
2558  */
cl_sync_file_range(struct inode * inode,loff_t start,loff_t end,enum cl_fsync_mode mode,int ignore_layout)2559 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2560 		       enum cl_fsync_mode mode, int ignore_layout)
2561 {
2562 	struct cl_env_nest nest;
2563 	struct lu_env *env;
2564 	struct cl_io *io;
2565 	struct cl_fsync_io *fio;
2566 	int result;
2567 
2568 	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2569 	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2570 		return -EINVAL;
2571 
2572 	env = cl_env_nested_get(&nest);
2573 	if (IS_ERR(env))
2574 		return PTR_ERR(env);
2575 
2576 	io = ccc_env_thread_io(env);
2577 	io->ci_obj = cl_i2info(inode)->lli_clob;
2578 	io->ci_ignore_layout = ignore_layout;
2579 
2580 	/* initialize parameters for sync */
2581 	fio = &io->u.ci_fsync;
2582 	fio->fi_start = start;
2583 	fio->fi_end = end;
2584 	fio->fi_fid = ll_inode2fid(inode);
2585 	fio->fi_mode = mode;
2586 	fio->fi_nr_written = 0;
2587 
2588 	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2589 		result = cl_io_loop(env, io);
2590 	else
2591 		result = io->ci_result;
2592 	if (result == 0)
2593 		result = fio->fi_nr_written;
2594 	cl_io_fini(env, io);
2595 	cl_env_nested_put(&nest, env);
2596 
2597 	return result;
2598 }
2599 
/*
 * ->fsync() handler.
 *
 * Writes back the page cache range, collects async writeback errors
 * recorded against the inode, syncs metadata on the MDT and, for
 * regular files, forces the data to the OSTs.  The first error seen
 * wins, but every step still runs so all pending error state is
 * cleared.
 */
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	/* sync metadata on the MDT; the reply is only kept on success */
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
	if (!rc)
		rc = err;
	if (!err)
		ptlrpc_req_finished(req);

	if (S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		/* push the data to the OSTs and remember the outcome so a
		 * later ll_flush() does not report the same failure twice */
		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2647 
/*
 * ->lock()/->flock() handler: translate a VFS byte-range or flock
 * request into an LDLM flock enqueue on the MDT, then mirror the
 * result into the local VFS lock state.  If the local bookkeeping
 * fails after a successful enqueue, the server lock is rolled back
 * with an LCK_NL enqueue.
 */
static int
ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file_inode(file);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = { {0} };
	__u64 flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	/* flock requests only come in as F_SETLK/F_SETLKW; anything else
	 * must be a POSIX lock */
	if (file_lock->fl_flags & FL_FLOCK)
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
	else if (!(file_lock->fl_flags & FL_POSIX))
		return -EINVAL;

	flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
	flock.l_flock.pid = file_lock->fl_pid;
	flock.l_flock.start = file_lock->fl_start;
	flock.l_flock.end = file_lock->fl_end;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	/* map the VFS lock type to an LDLM mode */
	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
			file_lock->fl_type);
		return -ENOTSUPP;
	}

	/* map the fcntl command to enqueue flags */
	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
	       inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
	       flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* mirror the server result into the local VFS lock list, except
	 * for pure tests (F_GETLK) */
	if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = locks_lock_file_wait(file, file_lock);

	if (rc2 && file_lock->fl_type != F_UNLCK) {
		/* local bookkeeping failed: undo the server lock by
		 * enqueueing it again as LCK_NL (i.e. unlock) */
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2768 
/*
 * Stub lock handler that rejects all lock requests — presumably
 * installed when file locking is disabled for the mount (TODO confirm
 * against the file_operations tables using it).
 */
static int
ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	return -ENOSYS;
}
2774 
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 * \param inode [IN]
 * \param bits [IN] searched lock bits [IN]
 * \param l_req_mode [IN] searched lock mode
 * \retval boolean, true iff all bits are found
 */
int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
{
	struct lustre_handle lockh;
	ldlm_policy_data_t policy;
	/* LCK_MINMODE means "any mode": match all of CR/CW/PR/PW */
	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
				(LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
	struct lu_fid *fid;
	__u64 flags;
	int i;

	if (!inode)
		return 0;

	fid = &ll_i2info(inode)->lli_fid;
	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
	       ldlm_lockname[mode]);

	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
	/* test each requested bit with its own match, since the bits may
	 * be covered by different locks; stop early once all are found */
	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
		policy.l_inodebits.bits = *bits & (1 << i);
		if (policy.l_inodebits.bits == 0)
			continue;

		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
				  &policy, mode, &lockh)) {
			struct ldlm_lock *lock;

			lock = ldlm_handle2lock(&lockh);
			if (lock) {
				/* clear every bit the matched lock covers,
				 * not just the one we probed for */
				*bits &=
				      ~(lock->l_policy_data.l_inodebits.bits);
				LDLM_LOCK_PUT(lock);
			} else {
				*bits &= ~policy.l_inodebits.bits;
			}
		}
	}
	return *bits == 0;
}
2824 
/*
 * Try to match an already-granted MDC inodebits lock covering @bits on
 * this inode.  The result of md_lock_match() is returned unchanged and
 * the matched lock handle, if any, is stored in @lockh.
 */
ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
			    struct lustre_handle *lockh, __u64 flags,
			    ldlm_mode_t mode)
{
	ldlm_policy_data_t policy = { .l_inodebits = {bits} };
	struct lu_fid *fid = &ll_i2info(inode)->lli_fid;

	CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));

	return md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED,
			     fid, LDLM_IBITS, &policy, mode, lockh);
}
2841 
ll_inode_revalidate_fini(struct inode * inode,int rc)2842 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2843 {
2844 	/* Already unlinked. Just update nlink and return success */
2845 	if (rc == -ENOENT) {
2846 		clear_nlink(inode);
2847 		/* This path cannot be hit for regular files unless in
2848 		 * case of obscure races, so no need to validate size.
2849 		 */
2850 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2851 			return 0;
2852 	} else if (rc != 0) {
2853 		CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2854 			     "%s: revalidate FID "DFID" error: rc = %d\n",
2855 			     ll_get_fsname(inode->i_sb, NULL, 0),
2856 			     PFID(ll_inode2fid(inode)), rc);
2857 	}
2858 
2859 	return rc;
2860 }
2861 
/*
 * Revalidate the cached inode attributes against the MDT.
 *
 * When the server supports getattr-by-fid (OBD_CONNECT_ATTRFID) an
 * intent lock is taken (IT_GETATTR, or IT_LOOKUP when only the lookup
 * bit is requested); otherwise, if no covering MD lock is already
 * held, a plain md_getattr() is issued and the inode is refreshed
 * from the reply.
 */
static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = d_inode(dentry);
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
	       inode->i_ino, inode->i_generation, inode, dentry);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 *      But under CMD case, it caused some lock issues, should be fixed
	 *      with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, inode,
					     inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			goto out;
		}

		rc = ll_revalidate_it_finish(req, &oit, inode);
		if (rc != 0) {
			ll_intent_release(&oit);
			goto out;
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!d_inode(dentry)->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, inode);
	} else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
		struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
		u64 valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		/* regular files additionally need the striping EA, sized
		 * at the default MD size */
		if (S_ISREG(inode->i_mode)) {
			rc = ll_get_default_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		/* refresh the inode from the getattr reply */
		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2953 
/* Revalidate the dentry's MD attributes, then bring the size/times of the
 * inode up to date: regular files get a glimpse of the OST size, everything
 * else takes the timestamps cached in the lock value block. */
static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = d_inode(dentry);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	rc = __ll_inode_revalidate(dentry, ibits);
	if (rc != 0)
		return rc;

	if (S_ISREG(inode->i_mode)) {
		/* In case of restore, the MDT has the right size and has
		 * already send it back without granting the layout lock,
		 * inode is up-to-date so glimpse is useless.
		 * Also to glimpse we need the layout, in case of a running
		 * restore the MDT holds the layout lock so the glimpse will
		 * block up to the end of restore (getattr will block)
		 */
		if (!(lli->lli_flags & LLIF_FILE_RESTORING))
			rc = ll_glimpse_size(inode);
	} else {
		/* non-regular objects have no OST size; use the LVB times */
		LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
		LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
		LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
	}
	return rc;
}
2981 
ll_getattr(struct vfsmount * mnt,struct dentry * de,struct kstat * stat)2982 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2983 {
2984 	struct inode *inode = d_inode(de);
2985 	struct ll_sb_info *sbi = ll_i2sbi(inode);
2986 	struct ll_inode_info *lli = ll_i2info(inode);
2987 	int res;
2988 
2989 	res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
2990 				      MDS_INODELOCK_LOOKUP);
2991 	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2992 
2993 	if (res)
2994 		return res;
2995 
2996 	stat->dev = inode->i_sb->s_dev;
2997 	if (ll_need_32bit_api(sbi))
2998 		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2999 	else
3000 		stat->ino = inode->i_ino;
3001 	stat->mode = inode->i_mode;
3002 	stat->nlink = inode->i_nlink;
3003 	stat->uid = inode->i_uid;
3004 	stat->gid = inode->i_gid;
3005 	stat->rdev = inode->i_rdev;
3006 	stat->atime = inode->i_atime;
3007 	stat->mtime = inode->i_mtime;
3008 	stat->ctime = inode->i_ctime;
3009 	stat->blksize = 1 << inode->i_blkbits;
3010 
3011 	stat->size = i_size_read(inode);
3012 	stat->blocks = inode->i_blocks;
3013 
3014 	return 0;
3015 }
3016 
/* ->fiemap: translate the VFS fiemap_extent_info into Lustre's
 * ll_user_fiemap request, run the mapping, and copy the results back. */
static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		     __u64 start, __u64 len)
{
	int rc;
	size_t num_bytes;
	struct ll_user_fiemap *fiemap;
	unsigned int extent_count = fieinfo->fi_extents_max;

	/* one header plus room for every extent the caller asked for */
	num_bytes = sizeof(*fiemap) + (extent_count *
				       sizeof(struct ll_fiemap_extent));
	fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);

	if (fiemap == NULL)
		return -ENOMEM;

	fiemap->fm_flags = fieinfo->fi_flags;
	fiemap->fm_extent_count = fieinfo->fi_extents_max;
	fiemap->fm_start = start;
	fiemap->fm_length = len;
	/* NOTE(review): only the FIRST extent is copied in as input —
	 * presumably it carries continuation state from a previous call;
	 * confirm against ll_do_fiemap() before changing this. */
	if (extent_count > 0)
		memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
		       sizeof(struct ll_fiemap_extent));

	rc = ll_do_fiemap(inode, fiemap, num_bytes);

	/* copy flags, mapped-extent count and the mapped extents back out */
	fieinfo->fi_flags = fiemap->fm_flags;
	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
	if (extent_count > 0)
		memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
		       fiemap->fm_mapped_extents *
		       sizeof(struct ll_fiemap_extent));

	kvfree(fiemap);
	return rc;
}
3052 
ll_get_acl(struct inode * inode,int type)3053 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3054 {
3055 	struct ll_inode_info *lli = ll_i2info(inode);
3056 	struct posix_acl *acl = NULL;
3057 
3058 	spin_lock(&lli->lli_lock);
3059 	/* VFS' acl_permission_check->check_acl will release the refcount */
3060 	acl = posix_acl_dup(lli->lli_posix_acl);
3061 	spin_unlock(&lli->lli_lock);
3062 
3063 	return acl;
3064 }
3065 
ll_inode_permission(struct inode * inode,int mask)3066 int ll_inode_permission(struct inode *inode, int mask)
3067 {
3068 	int rc = 0;
3069 
3070 #ifdef MAY_NOT_BLOCK
3071 	if (mask & MAY_NOT_BLOCK)
3072 		return -ECHILD;
3073 #endif
3074 
3075        /* as root inode are NOT getting validated in lookup operation,
3076 	* need to do it before permission check. */
3077 
3078 	if (is_root_inode(inode)) {
3079 		rc = __ll_inode_revalidate(inode->i_sb->s_root,
3080 					   MDS_INODELOCK_LOOKUP);
3081 		if (rc)
3082 			return rc;
3083 	}
3084 
3085 	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3086 	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3087 
3088 	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3089 		return lustre_check_remote_perm(inode, mask);
3090 
3091 	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3092 	rc = generic_permission(inode, mask);
3093 
3094 	return rc;
3095 }
3096 
3097 /* -o localflock - only provides locally consistent flock locks */
3098 struct file_operations ll_file_operations = {
3099 	.read_iter = ll_file_read_iter,
3100 	.write_iter = ll_file_write_iter,
3101 	.unlocked_ioctl = ll_file_ioctl,
3102 	.open	   = ll_file_open,
3103 	.release	= ll_file_release,
3104 	.mmap	   = ll_file_mmap,
3105 	.llseek	 = ll_file_seek,
3106 	.splice_read    = ll_file_splice_read,
3107 	.fsync	  = ll_fsync,
3108 	.flush	  = ll_flush
3109 };
3110 
3111 struct file_operations ll_file_operations_flock = {
3112 	.read_iter    = ll_file_read_iter,
3113 	.write_iter   = ll_file_write_iter,
3114 	.unlocked_ioctl = ll_file_ioctl,
3115 	.open	   = ll_file_open,
3116 	.release	= ll_file_release,
3117 	.mmap	   = ll_file_mmap,
3118 	.llseek	 = ll_file_seek,
3119 	.splice_read    = ll_file_splice_read,
3120 	.fsync	  = ll_fsync,
3121 	.flush	  = ll_flush,
3122 	.flock	  = ll_file_flock,
3123 	.lock	   = ll_file_flock
3124 };
3125 
3126 /* These are for -o noflock - to return ENOSYS on flock calls */
3127 struct file_operations ll_file_operations_noflock = {
3128 	.read_iter    = ll_file_read_iter,
3129 	.write_iter   = ll_file_write_iter,
3130 	.unlocked_ioctl = ll_file_ioctl,
3131 	.open	   = ll_file_open,
3132 	.release	= ll_file_release,
3133 	.mmap	   = ll_file_mmap,
3134 	.llseek	 = ll_file_seek,
3135 	.splice_read    = ll_file_splice_read,
3136 	.fsync	  = ll_fsync,
3137 	.flush	  = ll_flush,
3138 	.flock	  = ll_file_noflock,
3139 	.lock	   = ll_file_noflock
3140 };
3141 
3142 struct inode_operations ll_file_inode_operations = {
3143 	.setattr	= ll_setattr,
3144 	.getattr	= ll_getattr,
3145 	.permission	= ll_inode_permission,
3146 	.setxattr	= ll_setxattr,
3147 	.getxattr	= ll_getxattr,
3148 	.listxattr	= ll_listxattr,
3149 	.removexattr	= ll_removexattr,
3150 	.fiemap		= ll_fiemap,
3151 	.get_acl	= ll_get_acl,
3152 };
3153 
3154 /* dynamic ioctl number support routines */
3155 static struct llioc_ctl_data {
3156 	struct rw_semaphore	ioc_sem;
3157 	struct list_head	      ioc_head;
3158 } llioc = {
3159 	__RWSEM_INITIALIZER(llioc.ioc_sem),
3160 	LIST_HEAD_INIT(llioc.ioc_head)
3161 };
3162 
3163 struct llioc_data {
3164 	struct list_head	      iocd_list;
3165 	unsigned int	    iocd_size;
3166 	llioc_callback_t	iocd_cb;
3167 	unsigned int	    iocd_count;
3168 	unsigned int	    iocd_cmd[0];
3169 };
3170 
/**
 * Register a dynamic ioctl handler.
 *
 * \param cb	callback invoked when one of \a cmd is seen
 * \param count	number of entries in \a cmd (0 <= count <= LLIOC_MAX_CMD)
 * \param cmd	array of ioctl command numbers handled by \a cb
 *
 * \retval opaque cookie for ll_iocontrol_unregister(), or NULL on bad
 *	   arguments / allocation failure
 */
void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
{
	unsigned int size;
	struct llioc_data *in_data = NULL;

	if (!cb || !cmd || count > LLIOC_MAX_CMD || count < 0)
		return NULL;

	size = sizeof(*in_data) + count * sizeof(unsigned int);
	in_data = kzalloc(size, GFP_NOFS);
	if (!in_data)
		return NULL;

	/* kzalloc() already zeroed the buffer; no memset() needed */
	in_data->iocd_size = size;
	in_data->iocd_cb = cb;
	in_data->iocd_count = count;
	memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);

	down_write(&llioc.ioc_sem);
	list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
	up_write(&llioc.ioc_sem);

	return in_data;
}
EXPORT_SYMBOL(ll_iocontrol_register);
3198 
ll_iocontrol_unregister(void * magic)3199 void ll_iocontrol_unregister(void *magic)
3200 {
3201 	struct llioc_data *tmp;
3202 
3203 	if (magic == NULL)
3204 		return;
3205 
3206 	down_write(&llioc.ioc_sem);
3207 	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3208 		if (tmp == magic) {
3209 			list_del(&tmp->iocd_list);
3210 			up_write(&llioc.ioc_sem);
3211 
3212 			kfree(tmp);
3213 			return;
3214 		}
3215 	}
3216 	up_write(&llioc.ioc_sem);
3217 
3218 	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3219 }
3220 EXPORT_SYMBOL(ll_iocontrol_unregister);
3221 
3222 static enum llioc_iter
ll_iocontrol_call(struct inode * inode,struct file * file,unsigned int cmd,unsigned long arg,int * rcp)3223 ll_iocontrol_call(struct inode *inode, struct file *file,
3224 		  unsigned int cmd, unsigned long arg, int *rcp)
3225 {
3226 	enum llioc_iter ret = LLIOC_CONT;
3227 	struct llioc_data *data;
3228 	int rc = -EINVAL, i;
3229 
3230 	down_read(&llioc.ioc_sem);
3231 	list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3232 		for (i = 0; i < data->iocd_count; i++) {
3233 			if (cmd != data->iocd_cmd[i])
3234 				continue;
3235 
3236 			ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3237 			break;
3238 		}
3239 
3240 		if (ret == LLIOC_STOP)
3241 			break;
3242 	}
3243 	up_read(&llioc.ioc_sem);
3244 
3245 	if (rcp)
3246 		*rcp = rc;
3247 	return ret;
3248 }
3249 
ll_layout_conf(struct inode * inode,const struct cl_object_conf * conf)3250 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3251 {
3252 	struct ll_inode_info *lli = ll_i2info(inode);
3253 	struct cl_env_nest nest;
3254 	struct lu_env *env;
3255 	int result;
3256 
3257 	if (lli->lli_clob == NULL)
3258 		return 0;
3259 
3260 	env = cl_env_nested_get(&nest);
3261 	if (IS_ERR(env))
3262 		return PTR_ERR(env);
3263 
3264 	result = cl_conf_set(env, lli->lli_clob, conf);
3265 	cl_env_nested_put(&nest, env);
3266 
3267 	if (conf->coc_opc == OBJECT_CONF_SET) {
3268 		struct ldlm_lock *lock = conf->coc_lock;
3269 
3270 		LASSERT(lock != NULL);
3271 		LASSERT(ldlm_has_layout(lock));
3272 		if (result == 0) {
3273 			/* it can only be allowed to match after layout is
3274 			 * applied to inode otherwise false layout would be
3275 			 * seen. Applying layout should happen before dropping
3276 			 * the intent lock. */
3277 			ldlm_lock_allow_match(lock);
3278 		}
3279 	}
3280 	return result;
3281 }
3282 
/* Fetch layout from MDT with getxattr request, if it's not ready yet.
 * On success the fetched layout is installed as the lock's LVB data
 * (l_lvb_data / l_lvb_len), replacing any previous buffer.
 * Returns 0 on success (including an empty layout) or a negative errno. */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)

{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* layout already present in the lock's LVB: nothing to fetch */
	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
		return 0;

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
				 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
				 lmmsize, 0, &req);
	if (rc < 0)
		return rc;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL) {
		rc = -EPROTO;
		goto out;
	}

	/* use the server-reported EA size from here on */
	lmmsize = body->eadatasize;
	if (lmmsize == 0) /* empty layout */ {
		rc = 0;
		goto out;
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL) {
		rc = -EFAULT;
		goto out;
	}

	/* copy into a fresh buffer so it can outlive the RPC reply */
	lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
	if (lvbdata == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	memcpy(lvbdata, lmm, lmmsize);
	/* swap in the new LVB under the resource lock */
	lock_res_and_lock(lock);
	if (lock->l_lvb_data != NULL)
		kvfree(lock->l_lvb_data);

	lock->l_lvb_data = lvbdata;
	lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

out:
	ptlrpc_req_finished(req);
	return rc;
}
3352 
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in this function.
 *
 * \param lockh	 handle of the granted layout lock; always decref'd here
 * \param mode	 LDLM mode \a lockh was granted in
 * \param inode	 inode the layout belongs to
 * \param gen	 out: layout generation that was applied
 * \param reconf if true, (re)configure the cl_object from the lock's LVB
 *
 * \retval 0	    layout applied (or already valid); *gen is set
 * \retval -ENODATA LVB not ready and \a reconf was false
 * \retval -EAGAIN  layout was busy; caller should re-enqueue and retry
 * \retval other    negative errno from fetch/unpack/configure
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
				struct inode *inode, __u32 *gen, bool reconf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info    *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct lustre_md md = { NULL };
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
		   inode, PFID(&lli->lli_fid), reconf);

	/* in case this is a caching lock and reinstate with new inode */
	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
	unlock_res_and_lock(lock);
	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multi processes may configure the file on the same time. */
	if (lvb_ready || !reconf) {
		rc = -ENODATA;
		if (lvb_ready) {
			/* layout_gen must be valid if layout lock is not
			 * cancelled and stripe has already set */
			*gen = ll_layout_version_get(lli);
			rc = 0;
		}
		goto out;
	}

	/* make sure the layout is in the lock's LVB before unpacking it */
	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		goto out;

	/* for layout lock, lmm is returned in lock's lvb.
	 * lvb_data is immutable if the lock is held so it's safe to access it
	 * without res lock. See the description in ldlm_lock_decref_internal()
	 * for the condition to free lvb_data of layout lock */
	if (lock->l_lvb_data != NULL) {
		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
				  lock->l_lvb_data, lock->l_lvb_len);
		if (rc >= 0) {
			/* a NULL lsm means an empty (stripeless) layout */
			*gen = LL_LAYOUT_GEN_EMPTY;
			if (md.lsm != NULL)
				*gen = md.lsm->lsm_layout_gen;
			rc = 0;
		} else {
			CERROR("%s: file "DFID" unpackmd error: %d\n",
				ll_get_fsname(inode->i_sb, NULL, 0),
				PFID(&lli->lli_fid), rc);
		}
	}
	if (rc < 0)
		goto out;

	/* set layout to file. Unlikely this will fail as old layout was
	 * surely eliminated */
	memset(&conf, 0, sizeof(conf));
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_md = &md;
	rc = ll_layout_conf(inode, &conf);

	if (md.lsm != NULL)
		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;

out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
			ll_get_fsname(inode->i_sb, NULL, 0),
			inode, PFID(&lli->lli_fid));

		/* OBJECT_CONF_WAIT blocks until outstanding IO drains, then
		 * -EAGAIN tells the caller to retry the whole enqueue */
		memset(&conf, 0, sizeof(conf));
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
			PFID(&lli->lli_fid), rc);
	}
	return rc;
}
3459 
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone
 * in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 *
 * \param inode	 inode to refresh the layout for (must be a regular file)
 * \param gen	 out: current layout generation
 *
 * \retval 0 on success, negative errno on failure
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info  *lli = ll_i2info(inode);
	struct ll_sb_info     *sbi = ll_i2sbi(inode);
	struct md_op_data     *op_data;
	struct lookup_intent   it;
	struct lustre_handle   lockh;
	ldlm_mode_t	       mode;
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_IBITS,
		.ei_mode = LCK_CR,
		.ei_cb_bl = ll_md_blocking_ast,
		.ei_cb_cp = ldlm_completion_ast,
	};
	int rc;

	/* fast path: layout lock disabled, or generation already known */
	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
		return 0;

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

again:
	/* mostly layout lock is caching on the local side, so try to match
	 * it before grabbing layout lock mutex. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
			       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
	if (mode != 0) { /* hit cached lock */
		/* -EAGAIN means the layout was busy; retry from the match */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;

		mutex_unlock(&lli->lli_layout_mutex);
		return rc;
	}

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
			0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		mutex_unlock(&lli->lli_layout_mutex);
		return PTR_ERR(op_data);
	}

	/* have to enqueue one */
	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	lockh.cookie = 0ULL;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
			ll_get_fsname(inode->i_sb, NULL, 0), inode,
			PFID(&lli->lli_fid));

	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
			NULL, 0, NULL, 0);
	/* the intent's request is not needed; drop its reference */
	if (it.d.lustre.it_data != NULL)
		ptlrpc_req_finished(it.d.lustre.it_data);
	it.d.lustre.it_data = NULL;

	ll_finish_md_op_data(op_data);

	/* keep the granted mode; clear it in the intent so
	 * ll_intent_drop_lock() doesn't release the reference */
	mode = it.d.lustre.it_lock_mode;
	it.d.lustre.it_lock_mode = 0;
	ll_intent_drop_lock(&it);

	if (rc == 0) {
		/* set lock data in case this is a new lock */
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;
	}
	mutex_unlock(&lli->lli_layout_mutex);

	return rc;
}
3553 
3554 /**
3555  *  This function send a restore request to the MDT
3556  */
ll_layout_restore(struct inode * inode)3557 int ll_layout_restore(struct inode *inode)
3558 {
3559 	struct hsm_user_request	*hur;
3560 	int			 len, rc;
3561 
3562 	len = sizeof(struct hsm_user_request) +
3563 	      sizeof(struct hsm_user_item);
3564 	hur = kzalloc(len, GFP_NOFS);
3565 	if (!hur)
3566 		return -ENOMEM;
3567 
3568 	hur->hur_request.hr_action = HUA_RESTORE;
3569 	hur->hur_request.hr_archive_id = 0;
3570 	hur->hur_request.hr_flags = 0;
3571 	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3572 	       sizeof(hur->hur_user_item[0].hui_fid));
3573 	hur->hur_user_item[0].hui_extent.length = -1;
3574 	hur->hur_request.hr_itemcount = 1;
3575 	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3576 			   len, hur, NULL);
3577 	kfree(hur);
3578 	return rc;
3579 }
3580