/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2019 Google LLC
 */
5 #ifndef _INCFS_DATA_MGMT_H
6 #define _INCFS_DATA_MGMT_H
7 
8 #include <linux/cred.h>
9 #include <linux/fs.h>
10 #include <linux/types.h>
11 #include <linux/mutex.h>
12 #include <linux/spinlock.h>
13 #include <linux/rcupdate.h>
14 #include <linux/completion.h>
15 #include <linux/wait.h>
16 #include <linux/zstd.h>
17 #include <crypto/hash.h>
18 #include <linux/rwsem.h>
19 
20 #include <uapi/linux/incrementalfs.h>
21 
22 #include "internal.h"
23 #include "pseudo_files.h"
24 
25 #define SEGMENTS_PER_FILE 3
26 
/*
 * Tag stored in the 3-bit "type" bitfield that opens each read log record
 * layout. The enumerators rely on their implicit values (FULL == 0, ...),
 * so reordering them changes the encoded values.
 */
enum LOG_RECORD_TYPE {
	FULL,
	SAME_FILE,
	SAME_FILE_CLOSE_BLOCK,
	SAME_FILE_CLOSE_BLOCK_SHORT,
	SAME_FILE_NEXT_BLOCK,
	SAME_FILE_NEXT_BLOCK_SHORT,
};
35 
36 struct full_record {
37 	enum LOG_RECORD_TYPE type : 3; /* FULL */
38 	u32 block_index : 29;
39 	incfs_uuid_t file_id;
40 	u64 absolute_ts_us;
41 	uid_t uid;
42 } __packed; /* 32 bytes */
43 
44 struct same_file {
45 	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE */
46 	u32 block_index : 29;
47 	uid_t uid;
48 	u16 relative_ts_us; /* max 2^16 us ~= 64 ms */
49 } __packed; /* 10 bytes */
50 
51 struct same_file_close_block {
52 	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK */
53 	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
54 	s16 block_index_delta;
55 } __packed; /* 4 bytes */
56 
57 struct same_file_close_block_short {
58 	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK_SHORT */
59 	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
60 	s8 block_index_delta;
61 } __packed; /* 2 bytes */
62 
63 struct same_file_next_block {
64 	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK */
65 	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
66 } __packed; /* 2 bytes */
67 
68 struct same_file_next_block_short {
69 	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK_SHORT */
70 	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
71 } __packed; /* 1 byte */
72 
73 union log_record {
74 	struct full_record full_record;
75 	struct same_file same_file;
76 	struct same_file_close_block same_file_close_block;
77 	struct same_file_close_block_short same_file_close_block_short;
78 	struct same_file_next_block same_file_next_block;
79 	struct same_file_next_block_short same_file_next_block_short;
80 };
81 
82 struct read_log_state {
83 	/* Log buffer generation id, incremented on configuration changes */
84 	u32 generation_id;
85 
86 	/* Offset in rl_ring_buf to write into. */
87 	u32 next_offset;
88 
89 	/* Current number of writer passes over rl_ring_buf */
90 	u32 current_pass_no;
91 
92 	/* Current full_record to diff against */
93 	struct full_record base_record;
94 
95 	/* Current record number counting from configuration change */
96 	u64 current_record_no;
97 };
98 
99 /* A ring buffer to save records about data blocks which were recently read. */
100 struct read_log {
101 	void *rl_ring_buf;
102 
103 	int rl_size;
104 
105 	struct read_log_state rl_head;
106 
107 	struct read_log_state rl_tail;
108 
109 	/* A lock to protect the above fields */
110 	spinlock_t rl_lock;
111 
112 	/* A queue of waiters who want to be notified about reads */
113 	wait_queue_head_t ml_notif_wq;
114 
115 	/* A work item to wake up those waiters without slowing down readers */
116 	struct delayed_work ml_wakeup_work;
117 };
118 
119 struct mount_options {
120 	unsigned int read_timeout_ms;
121 	unsigned int readahead_pages;
122 	unsigned int read_log_pages;
123 	unsigned int read_log_wakeup_count;
124 	bool report_uid;
125 	char *sysfs_name;
126 };
127 
128 struct mount_info {
129 	struct super_block *mi_sb;
130 
131 	struct path mi_backing_dir_path;
132 
133 	struct dentry *mi_index_dir;
134 	/* For stacking mounts, if true, this indicates if the index dir needs
135 	 * to be freed for this SB otherwise it was created by lower level SB */
136 	bool mi_index_free;
137 
138 	struct dentry *mi_incomplete_dir;
139 	/* For stacking mounts, if true, this indicates if the incomplete dir
140 	 * needs to be freed for this SB. Similar to mi_index_free */
141 	bool mi_incomplete_free;
142 
143 	const struct cred *mi_owner;
144 
145 	struct mount_options mi_options;
146 
147 	/* This mutex is to be taken before create, rename, delete */
148 	struct mutex mi_dir_struct_mutex;
149 
150 	/*
151 	 * A queue of waiters who want to be notified about new pending reads.
152 	 */
153 	wait_queue_head_t mi_pending_reads_notif_wq;
154 
155 	/*
156 	 * Protects - RCU safe:
157 	 *  - reads_list_head
158 	 *  - mi_pending_reads_count
159 	 *  - mi_last_pending_read_number
160 	 *  - data_file_segment.reads_list_head
161 	 */
162 	spinlock_t pending_read_lock;
163 
164 	/* List of active pending_read objects */
165 	struct list_head mi_reads_list_head;
166 
167 	/* Total number of items in reads_list_head */
168 	int mi_pending_reads_count;
169 
170 	/*
171 	 * Last serial number that was assigned to a pending read.
172 	 * 0 means no pending reads have been seen yet.
173 	 */
174 	int mi_last_pending_read_number;
175 
176 	/* Temporary buffer for read logger. */
177 	struct read_log mi_log;
178 
179 	/* SELinux needs special xattrs on our pseudo files */
180 	struct mem_range pseudo_file_xattr[PSEUDO_FILE_COUNT];
181 
182 	/* A queue of waiters who want to be notified about blocks_written */
183 	wait_queue_head_t mi_blocks_written_notif_wq;
184 
185 	/* Number of blocks written since mount */
186 	atomic_t mi_blocks_written;
187 
188 	/* Per UID read timeouts */
189 	spinlock_t mi_per_uid_read_timeouts_lock;
190 	struct incfs_per_uid_read_timeouts *mi_per_uid_read_timeouts;
191 	int mi_per_uid_read_timeouts_size;
192 
193 	/* zstd workspace */
194 	struct mutex mi_zstd_workspace_mutex;
195 	void *mi_zstd_workspace;
196 	ZSTD_DStream *mi_zstd_stream;
197 	struct delayed_work mi_zstd_cleanup_work;
198 
199 	/* sysfs node */
200 	struct incfs_sysfs_node *mi_sysfs_node;
201 
202 	/* Last error information */
203 	struct mutex	mi_le_mutex;
204 	incfs_uuid_t	mi_le_file_id;
205 	u64		mi_le_time_us;
206 	u32		mi_le_page;
207 	u32		mi_le_errno;
208 	uid_t		mi_le_uid;
209 
210 	/* Number of reads timed out */
211 	u32 mi_reads_failed_timed_out;
212 
213 	/* Number of reads failed because hash verification failed */
214 	u32 mi_reads_failed_hash_verification;
215 
216 	/* Number of reads failed for another reason */
217 	u32 mi_reads_failed_other;
218 
219 	/* Number of reads delayed because page had to be fetched */
220 	u32 mi_reads_delayed_pending;
221 
222 	/* Total time waiting for pages to be fetched */
223 	u64 mi_reads_delayed_pending_us;
224 
225 	/*
226 	 * Number of reads delayed because of per-uid min_time_us or
227 	 * min_pending_time_us settings
228 	 */
229 	u32 mi_reads_delayed_min;
230 
231 	/* Total time waiting because of per-uid min_time_us or
232 	 * min_pending_time_us settings.
233 	 *
234 	 * Note that if a read is initially delayed because we have to wait for
235 	 * the page, then further delayed because of min_pending_time_us
236 	 * setting, this counter gets incremented by only the further delay
237 	 * time.
238 	 */
239 	u64 mi_reads_delayed_min_us;
240 };
241 
242 struct data_file_block {
243 	loff_t db_backing_file_data_offset;
244 
245 	size_t db_stored_size;
246 
247 	enum incfs_compression_alg db_comp_alg;
248 };
249 
250 struct pending_read {
251 	incfs_uuid_t file_id;
252 
253 	s64 timestamp_us;
254 
255 	atomic_t done;
256 
257 	int block_index;
258 
259 	int serial_number;
260 
261 	uid_t uid;
262 
263 	struct list_head mi_reads_list;
264 
265 	struct list_head segment_reads_list;
266 
267 	struct rcu_head rcu;
268 };
269 
270 struct data_file_segment {
271 	wait_queue_head_t new_data_arrival_wq;
272 
273 	/* Protects reads and writes from the blockmap */
274 	struct rw_semaphore rwsem;
275 
276 	/* List of active pending_read objects belonging to this segment */
277 	/* Protected by mount_info.pending_reads_mutex */
278 	struct list_head reads_list_head;
279 };
280 
281 /*
282  * Extra info associated with a file. Just a few bytes set by a user.
283  */
284 struct file_attr {
285 	loff_t fa_value_offset;
286 
287 	size_t fa_value_size;
288 
289 	u32 fa_crc;
290 };
291 
292 
293 struct data_file {
294 	struct backing_file_context *df_backing_file_context;
295 
296 	struct mount_info *df_mount_info;
297 
298 	incfs_uuid_t df_id;
299 
300 	/*
301 	 * Array of segments used to reduce lock contention for the file.
302 	 * Segment is chosen for a block depends on the block's index.
303 	 */
304 	struct data_file_segment df_segments[SEGMENTS_PER_FILE];
305 
306 	/* Base offset of the first metadata record. */
307 	loff_t df_metadata_off;
308 
309 	/* Base offset of the block map. */
310 	loff_t df_blockmap_off;
311 
312 	/* File size in bytes */
313 	loff_t df_size;
314 
315 	/* File header flags */
316 	u32 df_header_flags;
317 
318 	/* File size in DATA_FILE_BLOCK_SIZE blocks */
319 	int df_data_block_count;
320 
321 	/* Total number of blocks, data + hash */
322 	int df_total_block_count;
323 
324 	/* For mapped files, the offset into the actual file */
325 	loff_t df_mapped_offset;
326 
327 	/* Number of data blocks written to file */
328 	atomic_t df_data_blocks_written;
329 
330 	/* Number of data blocks in the status block */
331 	u32 df_initial_data_blocks_written;
332 
333 	/* Number of hash blocks written to file */
334 	atomic_t df_hash_blocks_written;
335 
336 	/* Number of hash blocks in the status block */
337 	u32 df_initial_hash_blocks_written;
338 
339 	/* Offset to status metadata header */
340 	loff_t df_status_offset;
341 
342 	/*
343 	 * Mutex acquired while enabling verity. Note that df_hash_tree is set
344 	 * by enable verity.
345 	 *
346 	 * The backing file mutex bc_mutex  may be taken while this mutex is
347 	 * held.
348 	 */
349 	struct mutex df_enable_verity;
350 
351 	/*
352 	 * Set either at construction time or during enabling verity. In the
353 	 * latter case, set via smp_store_release, so use smp_load_acquire to
354 	 * read it.
355 	 */
356 	struct mtree *df_hash_tree;
357 
358 	/* Guaranteed set if df_hash_tree is set. */
359 	struct incfs_df_signature *df_signature;
360 
361 	/*
362 	 * The verity file digest, set when verity is enabled and the file has
363 	 * been opened
364 	 */
365 	struct mem_range df_verity_file_digest;
366 
367 	struct incfs_df_verity_signature *df_verity_signature;
368 };
369 
/*
 * Directory state: the owning mount and the backing directory file.
 * get_incfs_dir_file() returns this from file->private_data.
 */
struct dir_file {
	struct mount_info *mount_info;

	struct file *backing_dir;
};
375 
376 struct inode_info {
377 	struct mount_info *n_mount_info; /* A mount, this file belongs to */
378 
379 	struct inode *n_backing_inode;
380 
381 	struct data_file *n_file;
382 
383 	struct inode n_vfs_inode;
384 };
385 
386 struct dentry_info {
387 	struct path backing_path;
388 };
389 
/* Value of incfs_file_data.fd_fill_permission. */
enum FILL_PERMISSION {
	CANT_FILL = 0,
	CAN_FILL = 1,
};
394 
395 struct incfs_file_data {
396 	/* Does this file handle have INCFS_IOC_FILL_BLOCKS permission */
397 	enum FILL_PERMISSION fd_fill_permission;
398 
399 	/* If INCFS_IOC_GET_FILLED_BLOCKS has been called, where are we */
400 	int fd_get_block_pos;
401 
402 	/* And how many filled blocks are there up to that point */
403 	int fd_filled_data_blocks;
404 	int fd_filled_hash_blocks;
405 };
406 
407 struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
408 					  struct mount_options *options,
409 					  struct path *backing_dir_path);
410 
411 int incfs_realloc_mount_info(struct mount_info *mi,
412 			     struct mount_options *options);
413 
414 void incfs_free_mount_info(struct mount_info *mi);
415 
416 char *file_id_to_str(incfs_uuid_t id);
417 struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name);
418 struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf);
419 void incfs_free_data_file(struct data_file *df);
420 
421 struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf);
422 void incfs_free_dir_file(struct dir_file *dir);
423 
424 struct incfs_read_data_file_timeouts {
425 	u32 min_time_us;
426 	u32 min_pending_time_us;
427 	u32 max_pending_time_us;
428 };
429 
430 ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f,
431 			int index, struct mem_range tmp,
432 			struct incfs_read_data_file_timeouts *timeouts,
433 			unsigned int *delayed_min_us);
434 
435 ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst,
436 				      struct data_file *df, size_t offset);
437 
438 int incfs_get_filled_blocks(struct data_file *df,
439 			    struct incfs_file_data *fd,
440 			    struct incfs_get_filled_blocks_args *arg);
441 
442 int incfs_read_file_signature(struct data_file *df, struct mem_range dst);
443 
444 int incfs_process_new_data_block(struct data_file *df,
445 				 struct incfs_fill_block *block, u8 *data,
446 				 bool *complete);
447 
448 int incfs_process_new_hash_block(struct data_file *df,
449 				 struct incfs_fill_block *block, u8 *data);
450 
451 bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number);
452 
453 /*
454  * Collects pending reads and saves them into the array (reads/reads_size).
455  * Only reads with serial_number > sn_lowerbound are reported.
456  * Returns how many reads were saved into the array.
457  */
458 int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound,
459 				struct incfs_pending_read_info *reads,
460 				struct incfs_pending_read_info2 *reads2,
461 				int reads_size, int *new_max_sn);
462 
463 int incfs_collect_logged_reads(struct mount_info *mi,
464 			       struct read_log_state *start_state,
465 			       struct incfs_pending_read_info *reads,
466 			       struct incfs_pending_read_info2 *reads2,
467 			       int reads_size);
468 struct read_log_state incfs_get_log_state(struct mount_info *mi);
469 int incfs_get_uncollected_logs_count(struct mount_info *mi,
470 				     const struct read_log_state *state);
471 
get_incfs_node(struct inode * inode)472 static inline struct inode_info *get_incfs_node(struct inode *inode)
473 {
474 	if (!inode)
475 		return NULL;
476 
477 	if (inode->i_sb->s_magic != INCFS_MAGIC_NUMBER) {
478 		/* This inode doesn't belong to us. */
479 		pr_warn_once("incfs: %s on an alien inode.", __func__);
480 		return NULL;
481 	}
482 
483 	return container_of(inode, struct inode_info, n_vfs_inode);
484 }
485 
get_incfs_data_file(struct file * f)486 static inline struct data_file *get_incfs_data_file(struct file *f)
487 {
488 	struct inode_info *node = NULL;
489 
490 	if (!f)
491 		return NULL;
492 
493 	if (!S_ISREG(f->f_inode->i_mode))
494 		return NULL;
495 
496 	node = get_incfs_node(f->f_inode);
497 	if (!node)
498 		return NULL;
499 
500 	return node->n_file;
501 }
502 
get_incfs_dir_file(struct file * f)503 static inline struct dir_file *get_incfs_dir_file(struct file *f)
504 {
505 	if (!f)
506 		return NULL;
507 
508 	if (!S_ISDIR(f->f_inode->i_mode))
509 		return NULL;
510 
511 	return (struct dir_file *)f->private_data;
512 }
513 
/*
 * Make sure that inode_info.n_file is initialized and inode can be used
 * for reading and writing data from/to the backing file.
 */
int make_inode_ready_for_data_ops(struct mount_info *mi,
				struct inode *inode,
				struct file *backing_file);
521 
get_incfs_dentry(const struct dentry * d)522 static inline struct dentry_info *get_incfs_dentry(const struct dentry *d)
523 {
524 	if (!d)
525 		return NULL;
526 
527 	return (struct dentry_info *)d->d_fsdata;
528 }
529 
get_incfs_backing_path(const struct dentry * d,struct path * path)530 static inline void get_incfs_backing_path(const struct dentry *d,
531 					  struct path *path)
532 {
533 	struct dentry_info *di = get_incfs_dentry(d);
534 
535 	if (!di) {
536 		*path = (struct path) {};
537 		return;
538 	}
539 
540 	*path = di->backing_path;
541 	path_get(path);
542 }
543 
/*
 * Number of INCFS_DATA_FILE_BLOCK_SIZE blocks needed to hold size bytes,
 * i.e. size divided by the block size, rounded up. Returns 0 for size 0.
 */
static inline int get_blocks_count_for_size(u64 size)
{
	if (size == 0)
		return 0;

	/* (size - 1) / block + 1 rounds up without overflowing u64. */
	return 1 + (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE;
}
550 
551 #endif /* _INCFS_DATA_MGMT_H */
552