1 /*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26 /*
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32 /*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/include/lustre_disk.h
37 *
38 * Lustre disk format definitions.
39 *
40 * Author: Nathan Rutman <nathan@clusterfs.com>
41 */
42
43 #ifndef _LUSTRE_DISK_H
44 #define _LUSTRE_DISK_H
45
46 /** \defgroup disk disk
47 *
48 * @{
49 */
50
51 #include "../../include/linux/libcfs/libcfs.h"
52 #include "../../include/linux/lnet/types.h"
53 #include <linux/backing-dev.h>
54
/****************** persistent mount data *********************/

/*
 * Server-type bits.  MDT, OST and MGS each have their own bit, so the
 * values may be OR-ed together.
 */
#define LDD_F_SV_TYPE_MDT	0x0001
#define LDD_F_SV_TYPE_OST	0x0002
#define LDD_F_SV_TYPE_MGS	0x0004
/* Union of the three server-type bits above. */
#define LDD_F_SV_TYPE_MASK	(LDD_F_SV_TYPE_MDT | \
				 LDD_F_SV_TYPE_OST | \
				 LDD_F_SV_TYPE_MGS)
/* NOTE(review): separate bit, not part of LDD_F_SV_TYPE_MASK; its exact
 * meaning is not visible in this header - confirm against users. */
#define LDD_F_SV_ALL		0x0008
64
/****************** mount command *********************/

/*
 * The lmd is only used internally by Lustre; mount simply passes
 * everything as string options.
 */

/* NOTE(review): presumably the expected value of
 * lustre_mount_data::lmd_magic - confirm against the mount code. */
#define LMD_MAGIC		0xbdacbd03
/* NOTE(review): presumably the size limit for lmd_params - confirm. */
#define LMD_PARAMS_MAXLEN	4096
72
73 /* gleaned from the mount command - no persistent info here */
74 struct lustre_mount_data {
75 __u32 lmd_magic;
76 __u32 lmd_flags; /* lustre mount flags */
77 int lmd_mgs_failnodes; /* mgs failover node count */
78 int lmd_exclude_count;
79 int lmd_recovery_time_soft;
80 int lmd_recovery_time_hard;
81 char *lmd_dev; /* device name */
82 char *lmd_profile; /* client only */
83 char *lmd_mgssec; /* sptlrpc flavor to mgs */
84 char *lmd_opts; /* lustre mount options (as opposed to
85 _device_ mount options) */
86 char *lmd_params; /* lustre params */
87 __u32 *lmd_exclude; /* array of OSTs to ignore */
88 char *lmd_mgs; /* MGS nid */
89 char *lmd_osd_type; /* OSD type */
90 };
91
/*
 * LMD_FLG_* mount flag bits.  lmd_is_client() below tests LMD_FLG_CLIENT
 * in lustre_mount_data::lmd_flags.  Bit 0x0004 is not assigned in this
 * header.
 */
#define LMD_FLG_SERVER		0x0001	/* Mounting a server */
#define LMD_FLG_CLIENT		0x0002	/* Mounting a client */
#define LMD_FLG_ABORT_RECOV	0x0008	/* Abort recovery */
#define LMD_FLG_NOSVC		0x0010	/* Only start MGS/MGC for servers,
					 * no other services */
#define LMD_FLG_NOMGS		0x0020	/* Only start target for servers,
					 * reusing existing MGS services */
#define LMD_FLG_WRITECONF	0x0040	/* Rewrite config log */
#define LMD_FLG_NOIR		0x0080	/* NO imperative recovery */
#define LMD_FLG_NOSCRUB		0x0100	/* Do not trigger scrub automatically */
#define LMD_FLG_MGS		0x0200	/* Also start MGS along with server */
#define LMD_FLG_IAM		0x0400	/* IAM dir */
#define LMD_FLG_NO_PRIMNODE	0x0800	/* all nodes are service nodes */
#define LMD_FLG_VIRGIN		0x1000	/* the service registers first time */
#define LMD_FLG_UPDATE		0x2000	/* update parameters */
#define LMD_FLG_HSM		0x4000	/* Start coordinator */

/*
 * Non-zero when LMD_FLG_CLIENT is set in (x)->lmd_flags.  The result is the
 * masked bit itself (LMD_FLG_CLIENT), not a normalised 0/1 value.
 */
#define lmd_is_client(x)	((x)->lmd_flags & LMD_FLG_CLIENT)
110
/****************** last_rcvd file *********************/

/** version recovery epoch */
#define LR_EPOCH_BITS	32
/*
 * Shifts right by LR_EPOCH_BITS, so the argument must have a type wider
 * than 32 bits.
 */
#define lr_epoch(a)	((a) >> LR_EPOCH_BITS)
#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */
#define ENOENT_VERSION	1 /**< 'virtual' version of non-existent object */

/*
 * Layout constants for last_rcvd.  LR_SERVER_SIZE and LR_CLIENT_SIZE are
 * the sizes that struct lr_server_data and struct lsd_client_data are
 * padded to; the check below keeps the client area from starting inside
 * the server area.
 */
#define LR_SERVER_SIZE	512
#define LR_CLIENT_START	8192
#define LR_CLIENT_SIZE	128
#if LR_CLIENT_START < LR_SERVER_SIZE
#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
#endif

/*
 * This limit is arbitrary (131072 clients on x86), but it is convenient to use
 * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n
 * allocation.  If we need more than 131072 clients (order-2 allocation on
 * x86) then this should become an array of single-page pointers that are
 * allocated on demand.
 *
 * The selection below takes the larger of 128K and the number of bits in
 * one page.
 */
#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8)
#define LR_MAX_CLIENTS	(128 * 1024UL)
#else
#define LR_MAX_CLIENTS	(PAGE_CACHE_SIZE * 8)
#endif
137
/*
 * Feature flag bits.  The three groups below are separate bit spaces
 * (compatible, read-only compatible, incompatible), matching the three
 * lsd_feature_* fields of struct lr_server_data.
 */

/** COMPAT_146: this is an OST (temporary) */
#define OBD_COMPAT_OST		0x00000002
/** COMPAT_146: this is an MDT (temporary) */
#define OBD_COMPAT_MDT		0x00000004
/** 2.0 server, interop flag to show server version is changed */
#define OBD_COMPAT_20		0x00000008

/** MDS handles LOV_OBJID file */
#define OBD_ROCOMPAT_LOVOBJID	0x00000001

/** OST handles group subdirs */
#define OBD_INCOMPAT_GROUPS	0x00000001
/** this is an OST */
#define OBD_INCOMPAT_OST	0x00000002
/** this is an MDT */
#define OBD_INCOMPAT_MDT	0x00000004
/** common last_rcvd format */
#define OBD_INCOMPAT_COMMON_LR	0x00000008
/** FID is enabled */
#define OBD_INCOMPAT_FID	0x00000010
/** Size-on-MDS is enabled */
#define OBD_INCOMPAT_SOM	0x00000020
/** filesystem using iam format to store directory entries */
#define OBD_INCOMPAT_IAM_DIR	0x00000040
/** LMA attribute contains per-inode incompatible flags */
#define OBD_INCOMPAT_LMA	0x00000080
/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16
 * bits are now used to store a generation. Once we start changing the layout
 * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count
 * will be confused by interpreting stripe_count | gen << 16 as the actual
 * stripe count */
#define OBD_INCOMPAT_LMM_VER	0x00000100
/** multiple OI files for MDT */
#define OBD_INCOMPAT_MULTI_OI	0x00000200
172
173 /* Data stored per server at the head of the last_rcvd file. In le32 order.
174 This should be common to filter_internal.h, lustre_mds.h */
175 struct lr_server_data {
176 __u8 lsd_uuid[40]; /* server UUID */
177 __u64 lsd_last_transno; /* last completed transaction ID */
178 __u64 lsd_compat14; /* reserved - compat with old last_rcvd */
179 __u64 lsd_mount_count; /* incarnation number */
180 __u32 lsd_feature_compat; /* compatible feature flags */
181 __u32 lsd_feature_rocompat;/* read-only compatible feature flags */
182 __u32 lsd_feature_incompat;/* incompatible feature flags */
183 __u32 lsd_server_size; /* size of server data area */
184 __u32 lsd_client_start; /* start of per-client data area */
185 __u16 lsd_client_size; /* size of per-client data area */
186 __u16 lsd_subdir_count; /* number of subdirectories for objects */
187 __u64 lsd_catalog_oid; /* recovery catalog object id */
188 __u32 lsd_catalog_ogen; /* recovery catalog inode generation */
189 __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */
190 __u32 lsd_osd_index; /* index number of OST in LOV */
191 __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */
192 __u32 lsd_start_epoch; /* VBR: start epoch from last boot */
193 /** transaction values since lsd_trans_table_time */
194 __u64 lsd_trans_table[LR_EXPIRE_INTERVALS];
195 /** start point of transno table below */
196 __u32 lsd_trans_table_time; /* time of first slot in table above */
197 __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */
198 __u8 lsd_padding[LR_SERVER_SIZE - 288];
199 };
200
201 /* Data stored per client in the last_rcvd file. In le32 order. */
202 struct lsd_client_data {
203 __u8 lcd_uuid[40]; /* client UUID */
204 __u64 lcd_last_transno; /* last completed transaction ID */
205 __u64 lcd_last_xid; /* xid for the last transaction */
206 __u32 lcd_last_result; /* result from last RPC */
207 __u32 lcd_last_data; /* per-op data (disposition for open &c.) */
208 /* for MDS_CLOSE requests */
209 __u64 lcd_last_close_transno; /* last completed transaction ID */
210 __u64 lcd_last_close_xid; /* xid for the last transaction */
211 __u32 lcd_last_close_result; /* result from last RPC */
212 __u32 lcd_last_close_data; /* per-op data */
213 /* VBR: last versions */
214 __u64 lcd_pre_versions[4];
215 __u32 lcd_last_epoch;
216 /** orphans handling for delayed export rely on that */
217 __u32 lcd_first_epoch;
218 __u8 lcd_padding[LR_CLIENT_SIZE - 128];
219 };
220
221 /* bug20354: the lcd_uuid for export of clients may be wrong */
check_lcd(char * obd_name,int index,struct lsd_client_data * lcd)222 static inline void check_lcd(char *obd_name, int index,
223 struct lsd_client_data *lcd)
224 {
225 int length = sizeof(lcd->lcd_uuid);
226
227 if (strnlen((char *)lcd->lcd_uuid, length) == length) {
228 lcd->lcd_uuid[length - 1] = '\0';
229
230 LCONSOLE_ERROR("the client UUID (%s) on %s for exports stored in last_rcvd(index = %d) is bad!\n",
231 lcd->lcd_uuid, obd_name, index);
232 }
233 }
234
235 /* last_rcvd handling */
lsd_le_to_cpu(struct lr_server_data * buf,struct lr_server_data * lsd)236 static inline void lsd_le_to_cpu(struct lr_server_data *buf,
237 struct lr_server_data *lsd)
238 {
239 int i;
240
241 memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid));
242 lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno);
243 lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14);
244 lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count);
245 lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat);
246 lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat);
247 lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat);
248 lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size);
249 lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start);
250 lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size);
251 lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count);
252 lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid);
253 lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen);
254 memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid));
255 lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index);
256 lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1);
257 lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch);
258 for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
259 lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]);
260 lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time);
261 lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals);
262 }
263
lsd_cpu_to_le(struct lr_server_data * lsd,struct lr_server_data * buf)264 static inline void lsd_cpu_to_le(struct lr_server_data *lsd,
265 struct lr_server_data *buf)
266 {
267 int i;
268
269 memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid));
270 buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno);
271 buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14);
272 buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count);
273 buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat);
274 buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat);
275 buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat);
276 buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size);
277 buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start);
278 buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size);
279 buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count);
280 buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid);
281 buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen);
282 memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid));
283 buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index);
284 buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1);
285 buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch);
286 for (i = 0; i < LR_EXPIRE_INTERVALS; i++)
287 buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]);
288 buf->lsd_trans_table_time = cpu_to_le32(lsd->lsd_trans_table_time);
289 buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals);
290 }
291
lcd_le_to_cpu(struct lsd_client_data * buf,struct lsd_client_data * lcd)292 static inline void lcd_le_to_cpu(struct lsd_client_data *buf,
293 struct lsd_client_data *lcd)
294 {
295 memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid));
296 lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno);
297 lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid);
298 lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result);
299 lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data);
300 lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno);
301 lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid);
302 lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result);
303 lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data);
304 lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]);
305 lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]);
306 lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]);
307 lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]);
308 lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch);
309 lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch);
310 }
311
lcd_cpu_to_le(struct lsd_client_data * lcd,struct lsd_client_data * buf)312 static inline void lcd_cpu_to_le(struct lsd_client_data *lcd,
313 struct lsd_client_data *buf)
314 {
315 memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid));
316 buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno);
317 buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid);
318 buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result);
319 buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data);
320 buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno);
321 buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid);
322 buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result);
323 buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data);
324 buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]);
325 buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]);
326 buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]);
327 buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]);
328 buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch);
329 buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch);
330 }
331
lcd_last_transno(struct lsd_client_data * lcd)332 static inline __u64 lcd_last_transno(struct lsd_client_data *lcd)
333 {
334 return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ?
335 lcd->lcd_last_transno : lcd->lcd_last_close_transno);
336 }
337
lcd_last_xid(struct lsd_client_data * lcd)338 static inline __u64 lcd_last_xid(struct lsd_client_data *lcd)
339 {
340 return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ?
341 lcd->lcd_last_xid : lcd->lcd_last_close_xid);
342 }
343
344 /****************** superblock additional info *********************/
345
346 struct ll_sb_info;
347
348 struct lustre_sb_info {
349 int lsi_flags;
350 struct obd_device *lsi_mgc; /* mgc obd */
351 struct lustre_mount_data *lsi_lmd; /* mount command info */
352 struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */
353 struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/
354 struct vfsmount *lsi_srv_mnt; /* the one server mount */
355 atomic_t lsi_mounts; /* references to the srv_mnt */
356 char lsi_svname[MTI_NAME_MAXLEN];
357 char lsi_osd_obdname[64];
358 char lsi_osd_uuid[64];
359 struct obd_export *lsi_osd_exp;
360 char lsi_osd_type[16];
361 char lsi_fstype[16];
362 struct backing_dev_info lsi_bdi; /* each client mountpoint needs
363 own backing_dev_info */
364 };
365
/* NOTE(review): presumably bits for lustre_sb_info::lsi_flags - confirm. */
#define LSI_UMOUNT_FAILOVER	0x00200000
#define LSI_BDI_INITIALIZED	0x00400000

/* sb->s_fs_info viewed as a struct lustre_sb_info pointer. */
#define     s2lsi(sb)	((struct lustre_sb_info *)((sb)->s_fs_info))
/* sb->s_fs_info itself, without the cast. */
#define     s2lsi_nocast(sb) ((sb)->s_fs_info)

/*
 * Shortcuts into the mount data reached through the superblock.  Neither
 * s_fs_info nor lsi_lmd is checked for NULL here, so both must be valid
 * when these are used.
 */
#define     get_profile_name(sb)   (s2lsi(sb)->lsi_lmd->lmd_profile)
#define	    get_mount_flags(sb)	   (s2lsi(sb)->lsi_lmd->lmd_flags)
#define	    get_mntdev_name(sb)	   (s2lsi(sb)->lsi_lmd->lmd_dev)
375
376 /****************** mount lookup info *********************/
377
378 struct lustre_mount_info {
379 char *lmi_name;
380 struct super_block *lmi_sb;
381 struct vfsmount *lmi_mnt;
382 struct list_head lmi_list_chain;
383 };
384
/****************** prototypes *********************/

/*
 * Forward declarations so the struct tags used in the parameter lists
 * below have file scope even when this header is included before the
 * headers that define them.
 */
struct super_block;
struct vfsmount;
struct ldlm_res_id;

/* obd_mount.c */

int lustre_start_mgc(struct super_block *sb);
/* Register the callback \a cfs, which takes a super block and a vfsmount. */
void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
						  struct vfsmount *mnt));
/* Register the callback \a cfs, which takes a super block. */
void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb));
int lustre_common_put_super(struct super_block *sb);

int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type);
396
397 /** @} disk */
398
399 #endif /* _LUSTRE_DISK_H */
400