// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
#include <linux/statfs.h>
#include <linux/exportfs.h>

#include <asm/ioctls.h>

#include "../fsnotify.h"
#include "../fdinfo.h"
#include "fanotify.h"

#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * The legacy fanotify mark limit (8192) is per group and we introduced a
 * tunable limit of marks per user, similar to inotify. Effectively, the
 * legacy limit of fanotify marks per user is
 * <max marks per group> * <max groups per user>.
 * This default limit (1M) also happens to match the increased limit of
 * inotify max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long ft_zero = 0;
static long ft_int_max = INT_MAX;

static struct ctl_table fanotify_table[] = {
	{
		.procname	= "max_user_groups",
		.data		= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_user_marks",
		.data		= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
};

static void __init fanotify_sysctls_init(void)
{
	register_sysctl("fs/fanotify", fanotify_table);
}
#else
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
 * excluded.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS	( \
		O_ACCMODE | O_APPEND | O_NONBLOCK | \
		__O_SYNC | O_DSYNC | O_CLOEXEC | \
		O_LARGEFILE | O_NOATIME )

extern const struct fsnotify_ops fanotify_fsnotify_ops;

struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;

#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
	sizeof(struct fanotify_event_info_pidfd)
#define FANOTIFY_ERROR_INFO_LEN \
	(sizeof(struct fanotify_event_info_error))

static int fanotify_fid_info_len(int fh_len, int name_len)
{
	int info_len = fh_len;

	if (name_len)
		info_len += name_len + 1;

	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
		       FANOTIFY_EVENT_ALIGN);
}
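
/*
 * For example (illustrative numbers, not derived from any particular
 * filesystem): an 8-byte file handle with the 4-byte name "data" yields
 * roundup(FANOTIFY_FID_INFO_HDR_LEN + 8 + 4 + 1, 4) - the fixed header,
 * the raw handle bytes, and the null-terminated name, padded to the
 * next multiple of FANOTIFY_EVENT_ALIGN.
 */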

/* FAN_RENAME may have one or two dir+name info records */
static int fanotify_dir_name_info_len(struct fanotify_event *event)
{
	struct fanotify_info *info = fanotify_event_info(event);
	int dir_fh_len = fanotify_event_dir_fh_len(event);
	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
	int info_len = 0;

	if (dir_fh_len)
		info_len += fanotify_fid_info_len(dir_fh_len,
						  info->name_len);
	if (dir2_fh_len)
		info_len += fanotify_fid_info_len(dir2_fh_len,
						  info->name2_len);

	return info_len;
}

static size_t fanotify_event_len(unsigned int info_mode,
				 struct fanotify_event *event)
{
	size_t event_len = FAN_EVENT_METADATA_LEN;
	int fh_len;
	int dot_len = 0;

	if (!info_mode)
		return event_len;

	if (fanotify_is_error_event(event->mask))
		event_len += FANOTIFY_ERROR_INFO_LEN;

	if (fanotify_event_has_any_dir_fh(event)) {
		event_len += fanotify_dir_name_info_len(event);
	} else if ((info_mode & FAN_REPORT_NAME) &&
		   (event->mask & FAN_ONDIR)) {
		/*
		 * With group flag FAN_REPORT_NAME, if name was not recorded in
		 * event on a directory, we will report the name ".".
		 */
		dot_len = 1;
	}

	if (info_mode & FAN_REPORT_PIDFD)
		event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;

	if (fanotify_event_has_object_fh(event)) {
		fh_len = fanotify_event_object_fh_len(event);
		event_len += fanotify_fid_info_len(fh_len, dot_len);
	}

	return event_len;
}

/*
 * Remove a hashed event from merge hash table.
 */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, fanotify_event_hash_bucket(group, event));

	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}

/*
 * Get a fanotify notification event if one exists and is small
 * enough to fit in "count". Return an error pointer if the count
 * is not large enough. When a permission event is dequeued, its state is
 * updated accordingly.
 */
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
					    size_t count)
{
	size_t event_size;
	struct fanotify_event *event = NULL;
	struct fsnotify_event *fsn_event;
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);

	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);

	spin_lock(&group->notification_lock);
	fsn_event = fsnotify_peek_first_event(group);
	if (!fsn_event)
		goto out;

	event = FANOTIFY_E(fsn_event);
	event_size = fanotify_event_len(info_mode, event);

	if (event_size > count) {
		event = ERR_PTR(-EINVAL);
		goto out;
	}

	/*
	 * Held the notification_lock the whole time, so this is the
	 * same event we peeked above.
	 */
	fsnotify_remove_first_event(group);
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
	if (fanotify_is_hashed_event(event->mask))
		fanotify_unhash_event(group, event);
out:
	spin_unlock(&group->notification_lock);
	return event;
}

static int create_fd(struct fsnotify_group *group, const struct path *path,
		     struct file **file)
{
	int client_fd;
	struct file *new_file;

	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
	if (client_fd < 0)
		return client_fd;

	/*
	 * We need a new file handle for the userspace program so it can read
	 * even if it was originally opened O_WRONLY.
	 */
	new_file = dentry_open(path,
			       group->fanotify_data.f_flags | __FMODE_NONOTIFY,
			       current_cred());
	if (IS_ERR(new_file)) {
		put_unused_fd(client_fd);
		client_fd = PTR_ERR(new_file);
	} else {
		*file = new_file;
	}

	return client_fd;
}

static int process_access_response_info(const char __user *info,
					size_t info_len,
					struct fanotify_response_info_audit_rule *friar)
{
	if (info_len != sizeof(*friar))
		return -EINVAL;

	if (copy_from_user(friar, info, sizeof(*friar)))
		return -EFAULT;

	if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE)
		return -EINVAL;
	if (friar->hdr.pad != 0)
		return -EINVAL;
	if (friar->hdr.len != sizeof(*friar))
		return -EINVAL;

	return info_len;
}

/*
 * Finish processing of permission event by setting it to ANSWERED state and
 * dropping group->notification_lock.
 */
static void finish_permission_event(struct fsnotify_group *group,
				    struct fanotify_perm_event *event, u32 response,
				    struct fanotify_response_info_audit_rule *friar)
				__releases(&group->notification_lock)
{
	bool destroy = false;

	assert_spin_locked(&group->notification_lock);
	event->response = response & ~FAN_INFO;
	if (response & FAN_INFO)
		memcpy(&event->audit_rule, friar, sizeof(*friar));

	if (event->state == FAN_EVENT_CANCELED)
		destroy = true;
	else
		event->state = FAN_EVENT_ANSWERED;
	spin_unlock(&group->notification_lock);
	if (destroy)
		fsnotify_destroy_event(group, &event->fae.fse);
}

static int process_access_response(struct fsnotify_group *group,
				   struct fanotify_response *response_struct,
				   const char __user *info,
				   size_t info_len)
{
	struct fanotify_perm_event *event;
	int fd = response_struct->fd;
	u32 response = response_struct->response;
	int ret = info_len;
	struct fanotify_response_info_audit_rule friar;

	pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__,
		 group, fd, response, info, info_len);
	/*
	 * make sure the response is valid, if invalid we do nothing and either
	 * userspace can send a valid response or we will clean it up after the
	 * timeout
	 */
	if (response & ~FANOTIFY_RESPONSE_VALID_MASK)
		return -EINVAL;

	switch (response & FANOTIFY_RESPONSE_ACCESS) {
	case FAN_ALLOW:
	case FAN_DENY:
		break;
	default:
		return -EINVAL;
	}

	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
		return -EINVAL;

	if (response & FAN_INFO) {
		ret = process_access_response_info(info, info_len, &friar);
		if (ret < 0)
			return ret;
		if (fd == FAN_NOFD)
			return ret;
	} else {
		ret = 0;
	}

	if (fd < 0)
		return -EINVAL;

	spin_lock(&group->notification_lock);
	list_for_each_entry(event, &group->fanotify_data.access_list,
			    fae.fse.list) {
		if (event->fd != fd)
			continue;

		list_del_init(&event->fae.fse.list);
		finish_permission_event(group, event, response, &friar);
		wake_up(&group->fanotify_data.access_waitq);
		return ret;
	}
	spin_unlock(&group->notification_lock);

	return -ENOENT;
}

static size_t copy_error_info_to_user(struct fanotify_event *event,
				      char __user *buf, int count)
{
	struct fanotify_event_info_error info = { };
	struct fanotify_error_event *fee = FANOTIFY_EE(event);

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;

	if (WARN_ON(count < info.hdr.len))
		return -EFAULT;

	info.error = fee->error;
	info.error_count = fee->err_count;

	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	return info.hdr.len;
}

static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
				 int info_type, const char *name,
				 size_t name_len,
				 char __user *buf, size_t count)
{
	struct fanotify_event_info_fid info = { };
	struct file_handle handle = { };
	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
	size_t fh_len = fh ? fh->len : 0;
	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
	size_t len = info_len;

	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
		 __func__, fh_len, name_len, info_len, count);

	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
		return -EFAULT;

	/*
	 * Copy event info fid header followed by variable sized file handle
	 * and optionally followed by variable sized filename.
	 */
	switch (info_type) {
	case FAN_EVENT_INFO_TYPE_FID:
	case FAN_EVENT_INFO_TYPE_DFID:
		if (WARN_ON_ONCE(name_len))
			return -EFAULT;
		break;
	case FAN_EVENT_INFO_TYPE_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
		if (WARN_ON_ONCE(!name || !name_len))
			return -EFAULT;
		break;
	default:
		return -EFAULT;
	}

	info.hdr.info_type = info_type;
	info.hdr.len = len;
	info.fsid = *fsid;
	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	buf += sizeof(info);
	len -= sizeof(info);
	if (WARN_ON_ONCE(len < sizeof(handle)))
		return -EFAULT;

	handle.handle_type = fh->type;
	handle.handle_bytes = fh_len;

	/* Mangle handle_type for bad file_handle */
	if (!fh_len)
		handle.handle_type = FILEID_INVALID;

	if (copy_to_user(buf, &handle, sizeof(handle)))
		return -EFAULT;

	buf += sizeof(handle);
	len -= sizeof(handle);
	if (WARN_ON_ONCE(len < fh_len))
		return -EFAULT;

	/*
	 * For an inline fh and inline file name, copy through stack to exclude
	 * the copy from usercopy hardening protections.
	 */
	fh_buf = fanotify_fh_buf(fh);
	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
		memcpy(bounce, fh_buf, fh_len);
		fh_buf = bounce;
	}
	if (copy_to_user(buf, fh_buf, fh_len))
		return -EFAULT;

	buf += fh_len;
	len -= fh_len;

	if (name_len) {
		/* Copy the filename with terminating null */
		name_len++;
		if (WARN_ON_ONCE(len < name_len))
			return -EFAULT;

		if (copy_to_user(buf, name, name_len))
			return -EFAULT;

		buf += name_len;
		len -= name_len;
	}

	/* Pad with 0's */
	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
	if (len > 0 && clear_user(buf, len))
		return -EFAULT;

	return info_len;
}
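
/*
 * For orientation, the record produced above lands in the user buffer as
 * (a sketch; field names follow the uapi structs):
 *
 *	struct fanotify_event_info_fid	info header (hdr + fsid)
 *	struct file_handle		handle_bytes + handle_type
 *	unsigned char f_handle[fh_len]	opaque fid bytes
 *	char name[name_len + 1]		optional, null-terminated
 *	zero padding			up to FANOTIFY_EVENT_ALIGN
 */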

static int copy_pidfd_info_to_user(int pidfd,
				   char __user *buf,
				   size_t count)
{
	struct fanotify_event_info_pidfd info = { };
	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;

	if (WARN_ON_ONCE(info_len > count))
		return -EFAULT;

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
	info.hdr.len = info_len;
	info.pidfd = pidfd;

	if (copy_to_user(buf, &info, info_len))
		return -EFAULT;

	return info_len;
}

static int copy_info_records_to_user(struct fanotify_event *event,
				     struct fanotify_info *info,
				     unsigned int info_mode, int pidfd,
				     char __user *buf, size_t count)
{
	int ret, total_bytes = 0, info_type = 0;
	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;

	/*
	 * Event info records order is as follows:
	 * 1. dir fid + name
	 * 2. (optional) new dir fid + new name
	 * 3. (optional) child fid
	 */
	if (fanotify_event_has_dir_fh(event)) {
		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
					     FAN_EVENT_INFO_TYPE_DFID;

		/* FAN_RENAME uses special info types */
		if (event->mask & FAN_RENAME)
			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir_fh(info),
					    info_type,
					    fanotify_info_name(info),
					    info->name_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* New dir fid+name may be reported in addition to old dir fid+name */
	if (fanotify_event_has_dir2_fh(event)) {
		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir2_fh(info),
					    info_type,
					    fanotify_info_name2(info),
					    info->name2_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_object_fh(event)) {
		const char *dot = NULL;
		int dot_len = 0;

		if (fid_mode == FAN_REPORT_FID || info_type) {
			/*
			 * With only group flag FAN_REPORT_FID only type FID is
			 * reported. Second info record type is always FID.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		} else if ((fid_mode & FAN_REPORT_NAME) &&
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_NAME, if name was not
			 * recorded in an event on a directory, report the name
			 * "." with info type DFID_NAME.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
			dot = ".";
			dot_len = 1;
		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_DIR_FID, a single info
			 * record has type DFID for directory entry modification
			 * event and for event on a directory.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID;
		} else {
			/*
			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
			 * a single info record has type FID for event on a
			 * non-directory, when there is no directory to report.
			 * For example, on FAN_DELETE_SELF event.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		}

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_event_object_fh(event),
					    info_type, dot, dot_len,
					    buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (pidfd_mode) {
		ret = copy_pidfd_info_to_user(pidfd, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_is_error_event(event->mask)) {
		ret = copy_error_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	return total_bytes;
}

static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fanotify_event *event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	const struct path *path = fanotify_event_path(event);
	struct fanotify_info *info = fanotify_event_info(event);
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
	struct file *f = NULL, *pidfd_file = NULL;
	int ret, pidfd = -ESRCH, fd = -EBADF;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	metadata.event_len = fanotify_event_len(info_mode, event);
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);
	/*
	 * For an unprivileged listener, event->pid can be used to identify the
	 * events generated by the listener process itself, without disclosing
	 * the pids of other processes.
	 */
	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    task_tgid(current) != event->pid)
		metadata.pid = 0;

	/*
	 * For now, fid mode is required for an unprivileged listener and
	 * fid mode does not report fd in events. Keep this check anyway
	 * for safety in case the fid mode requirement is relaxed in the future
	 * to allow an unprivileged listener to get events with no fd and no fid.
	 */
	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
		/*
		 * Opening an fd from dentry can fail for several reasons.
		 * For example, when tasks are gone and we try to open their
		 * /proc files or we try to open a WRONLY file like in sysfs
		 * or when trying to open a file that was deleted on the
		 * remote network server.
		 *
		 * For a group with FAN_REPORT_FD_ERROR, we will send the
		 * event with the error instead of the open fd, otherwise
		 * userspace may not get the error at all.
		 * In any case, userspace will not know which file failed to
		 * open, so add a debug print for further investigation.
		 */
		if (fd < 0) {
			pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
				 path->dentry, fd);
			if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
				/*
				 * Historically, we've handled EOPENSTALE in a
				 * special way and silently dropped such
				 * events. Now we have to keep it to maintain
				 * backward compatibility...
				 */
				if (fd == -EOPENSTALE)
					fd = 0;
				return fd;
			}
		}
	}
	if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
		metadata.fd = fd;
	else
		metadata.fd = fd >= 0 ? fd : FAN_NOFD;

	if (pidfd_mode) {
		/*
		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
		 * exclusion is ever lifted. At the time of incorporating pidfd
		 * support within fanotify, the pidfd API only supported the
		 * creation of pidfds for thread-group leaders.
		 */
		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));

		/*
		 * The PIDTYPE_TGID check for an event->pid is performed
		 * preemptively in an attempt to catch out cases where the event
		 * listener reads events after the event generating process has
		 * already terminated. Depending on flag FAN_REPORT_FD_ERROR,
		 * report either -ESRCH or FAN_NOPIDFD to the event listener in
		 * those cases with all other pidfd creation errors reported as
		 * the error code itself or as FAN_EPIDFD.
		 */
		if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);

		if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
			pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
	}

	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	buf += FAN_EVENT_METADATA_LEN;
	count -= FAN_EVENT_METADATA_LEN;

	if (info_mode) {
		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
						buf, count);
		if (ret < 0)
			goto out_close_fd;
	}

	if (f)
		fd_install(fd, f);

	if (pidfd_file)
		fd_install(pidfd, pidfd_file);

	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->fd = fd;

	return metadata.event_len;

out_close_fd:
	if (f) {
		put_unused_fd(fd);
		fput(f);
	}

	if (pidfd_file) {
		put_unused_fd(pidfd);
		fput(pidfd_file);
	}

	return ret;
}
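
/*
 * A minimal userspace sketch (not kernel code; buffer size and error
 * handling are illustrative) of consuming the metadata and info records
 * produced above, using the uapi iteration macros:
 *
 *	char buf[8192];
 *	ssize_t len = read(fanotify_fd, buf, sizeof(buf));
 *	struct fanotify_event_metadata *md = (void *)buf;
 *
 *	while (FAN_EVENT_OK(md, len)) {
 *		if (md->vers != FANOTIFY_METADATA_VERSION)
 *			break;	// ABI mismatch
 *		// md->event_len covers the metadata plus any info
 *		// records appended by copy_info_records_to_user().
 *		if (md->fd >= 0)
 *			close(md->fd);
 *		md = FAN_EVENT_NEXT(md, len);
 *	}
 */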

/* fanotify userspace file descriptor functions */
static __poll_t fanotify_poll(struct file *file, poll_table *wait)
{
	struct fsnotify_group *group = file->private_data;
	__poll_t ret = 0;

	poll_wait(file, &group->notification_waitq, wait);
	spin_lock(&group->notification_lock);
	if (!fsnotify_notify_queue_is_empty(group))
		ret = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&group->notification_lock);

	return ret;
}

static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);

		/*
		 * Permission events get queued to wait for response. Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY, NULL);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					      &group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}

static ssize_t fanotify_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *pos)
{
	struct fanotify_response response;
	struct fsnotify_group *group;
	int ret;
	const char __user *info_buf = buf + sizeof(struct fanotify_response);
	size_t info_len;

	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		return -EINVAL;

	group = file->private_data;

	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);

	if (count < sizeof(response))
		return -EINVAL;

	if (copy_from_user(&response, buf, sizeof(response)))
		return -EFAULT;

	info_len = count - sizeof(response);

	ret = process_access_response(group, &response, info_buf, info_len);
	if (ret < 0)
		count = ret;
	else
		count = sizeof(response) + ret;

	return count;
}
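
/*
 * A minimal userspace sketch (not kernel code) of answering a permission
 * event via the write() above; the fd comes from a previously read
 * event's metadata and the FAN_ALLOW choice is illustrative:
 *
 *	struct fanotify_response rsp = {
 *		.fd = md->fd,
 *		.response = FAN_ALLOW,	// or FAN_DENY, optionally | FAN_AUDIT
 *	};
 *	write(fanotify_fd, &rsp, sizeof(rsp));
 */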

static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. Since
	 * userspace cannot use the fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		finish_permission_event(group, event, FAN_ALLOW, NULL);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW, NULL);
		}
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wake up waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}

static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct fsnotify_group *group;
	struct fsnotify_event *fsn_event;
	void __user *p;
	int ret = -ENOTTY;
	size_t send_len = 0;

	group = file->private_data;

	p = (void __user *) arg;

	switch (cmd) {
	case FIONREAD:
		spin_lock(&group->notification_lock);
		list_for_each_entry(fsn_event, &group->notification_list, list)
			send_len += FAN_EVENT_METADATA_LEN;
		spin_unlock(&group->notification_lock);
		ret = put_user(send_len, (int __user *) p);
		break;
	}

	return ret;
}

static const struct file_operations fanotify_fops = {
	.show_fdinfo	= fanotify_show_fdinfo,
	.poll		= fanotify_poll,
	.read		= fanotify_read,
	.write		= fanotify_write,
	.fasync		= NULL,
	.release	= fanotify_release,
	.unlocked_ioctl	= fanotify_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static int fanotify_find_path(int dfd, const char __user *filename,
			      struct path *path, unsigned int flags, __u64 mask,
			      unsigned int obj_type)
{
	int ret;

	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
		 dfd, filename, flags);

	if (filename == NULL) {
		struct fd f = fdget(dfd);

		ret = -EBADF;
		if (!fd_file(f))
			goto out;

		ret = -ENOTDIR;
		if ((flags & FAN_MARK_ONLYDIR) &&
		    !(S_ISDIR(file_inode(fd_file(f))->i_mode))) {
			fdput(f);
			goto out;
		}

		*path = fd_file(f)->f_path;
		path_get(path);
		fdput(f);
	} else {
		unsigned int lookup_flags = 0;

		if (!(flags & FAN_MARK_DONT_FOLLOW))
			lookup_flags |= LOOKUP_FOLLOW;
		if (flags & FAN_MARK_ONLYDIR)
			lookup_flags |= LOOKUP_DIRECTORY;

		ret = user_path_at(dfd, filename, lookup_flags, path);
		if (ret)
			goto out;
	}

	/* you can only watch an inode if you have read permissions on it */
	ret = path_permission(path, MAY_READ);
	if (ret) {
		path_put(path);
		goto out;
	}

	ret = security_path_notify(path, mask, obj_type);
	if (ret)
		path_put(path);

out:
	return ret;
}

static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
					    __u32 mask, unsigned int flags,
					    __u32 umask, int *destroy)
{
	__u32 oldmask, newmask;

	/* umask bits cannot be removed by user */
	mask &= ~umask;
	spin_lock(&fsn_mark->lock);
	oldmask = fsnotify_calc_mask(fsn_mark);
	if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
		fsn_mark->mask &= ~mask;
	} else {
		fsn_mark->ignore_mask &= ~mask;
	}
	newmask = fsnotify_calc_mask(fsn_mark);
	/*
	 * We need to keep the mark around even if remaining mask cannot
	 * result in any events (e.g. mask == FAN_ONDIR) to support incremental
	 * changes to the mask.
	 * Destroy mark when only umask bits remain.
	 */
	*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
	spin_unlock(&fsn_mark->lock);

	return oldmask & ~newmask;
}

static int fanotify_remove_mark(struct fsnotify_group *group,
				void *obj, unsigned int obj_type, __u32 mask,
				unsigned int flags, __u32 umask)
{
	struct fsnotify_mark *fsn_mark = NULL;
	__u32 removed;
	int destroy_mark;

	fsnotify_group_lock(group);
	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
	if (!fsn_mark) {
		fsnotify_group_unlock(group);
		return -ENOENT;
	}

	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
						 umask, &destroy_mark);
	if (removed & fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	if (destroy_mark)
		fsnotify_detach_mark(fsn_mark);
	fsnotify_group_unlock(group);
	if (destroy_mark)
		fsnotify_free_mark(fsn_mark);

	/* matches the fsnotify_find_mark() */
	fsnotify_put_mark(fsn_mark);
	return 0;
}

static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
				       unsigned int fan_flags)
{
	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
	unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
	bool recalc = false;

	/*
	 * When using FAN_MARK_IGNORE for the first time, mark starts using
	 * independent event flags in ignore mask. After that, trying to
	 * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
	 * will result in EEXIST error.
	 */
	if (ignore == FAN_MARK_IGNORE)
		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;

	/*
	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
	 * the removal of the FS_MODIFY bit in calculated mask if it was set
	 * because of an ignore mask that is now going to survive FS_MODIFY.
	 */
	if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
		if (!(fsn_mark->mask & FS_MODIFY))
			recalc = true;
	}

	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
		return recalc;

	/*
	 * NO_IREF may be removed from a mark, but not added.
	 * When removed, fsnotify_recalc_mask() will take the inode ref.
	 */
	WARN_ON_ONCE(!want_iref);
	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;

	return true;
}

static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
				      __u32 mask, unsigned int fan_flags)
{
	bool recalc;

	spin_lock(&fsn_mark->lock);
	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
		fsn_mark->mask |= mask;
	else
		fsn_mark->ignore_mask |= mask;

	recalc = fsnotify_calc_mask(fsn_mark) &
		~fsnotify_conn_mask(fsn_mark->connector);

	recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
	spin_unlock(&fsn_mark->lock);

	return recalc;
}

struct fan_fsid {
	struct super_block *sb;
	__kernel_fsid_t id;
	bool weak;
};

static int fanotify_set_mark_fsid(struct fsnotify_group *group,
				  struct fsnotify_mark *mark,
				  struct fan_fsid *fsid)
{
	struct fsnotify_mark_connector *conn;
	struct fsnotify_mark *old;
	struct super_block *old_sb = NULL;

	FANOTIFY_MARK(mark)->fsid = fsid->id;
	mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
	if (fsid->weak)
		mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;

	/* First mark added will determine if group is single or multi fsid */
	if (list_empty(&group->marks_list))
		return 0;

	/* Find sb of an existing mark */
	list_for_each_entry(old, &group->marks_list, g_list) {
		conn = READ_ONCE(old->connector);
		if (!conn)
			continue;
		old_sb = fsnotify_connector_sb(conn);
		if (old_sb)
			break;
	}

	/* Only detached marks left? */
	if (!old_sb)
		return 0;

	/* Do not allow mixing of marks with weak and strong fsid */
	if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
		return -EXDEV;

	/* Allow mixing of marks with strong fsid from different fs */
	if (!fsid->weak)
		return 0;

	/* Do not allow mixing marks with weak fsid from different fs */
	if (old_sb != fsid->sb)
		return -EXDEV;

	/* Do not allow mixing marks from different btrfs sub-volumes */
	if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
				 &FANOTIFY_MARK(mark)->fsid))
		return -EXDEV;

	return 0;
}

static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
						   void *obj,
						   unsigned int obj_type,
						   unsigned int fan_flags,
						   struct fan_fsid *fsid)
{
	struct ucounts *ucounts = group->fanotify_data.ucounts;
	struct fanotify_mark *fan_mark;
	struct fsnotify_mark *mark;
	int ret;

	/*
	 * Enforce the per-user mark limit in all containing user namespaces.
	 * A group with FAN_UNLIMITED_MARKS does not contribute to the mark
	 * count in the limited groups account.
	 */
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
		return ERR_PTR(-ENOSPC);

	fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
	if (!fan_mark) {
		ret = -ENOMEM;
		goto out_dec_ucounts;
	}

	mark = &fan_mark->fsn_mark;
	fsnotify_init_mark(mark, group);
	if (fan_flags & FAN_MARK_EVICTABLE)
		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;

	/* Cache fsid of filesystem containing the marked object */
	if (fsid) {
		ret = fanotify_set_mark_fsid(group, mark, fsid);
		if (ret)
			goto out_put_mark;
	} else {
		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
	}

	ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
	if (ret)
		goto out_put_mark;

	return mark;

out_put_mark:
	fsnotify_put_mark(mark);
out_dec_ucounts:
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
	return ERR_PTR(ret);
}

static int fanotify_group_init_error_pool(struct fsnotify_group *group)
{
	if (mempool_initialized(&group->fanotify_data.error_events_pool))
		return 0;

	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
					 sizeof(struct fanotify_error_event));
}

static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
					     unsigned int fan_flags)
{
	/*
	 * Non evictable mark cannot be downgraded to evictable mark.
	 */
	if (fan_flags & FAN_MARK_EVICTABLE &&
	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
		return -EEXIST;

	/*
	 * New ignore mask semantics cannot be downgraded to old semantics.
	 */
	if (fan_flags & FAN_MARK_IGNORED_MASK &&
	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
		return -EEXIST;

	/*
	 * An ignore mask that survives modify could never be downgraded to not
	 * survive modify. With new FAN_MARK_IGNORE semantics we make that rule
	 * explicit and return an error when trying to update the ignore mask
	 * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
	 */
	if (fan_flags & FAN_MARK_IGNORE &&
	    !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
		return -EEXIST;

	return 0;
}

static int fanotify_add_mark(struct fsnotify_group *group,
			     void *obj, unsigned int obj_type,
			     __u32 mask, unsigned int fan_flags,
			     struct fan_fsid *fsid)
{
	struct fsnotify_mark *fsn_mark;
	bool recalc;
	int ret = 0;

	fsnotify_group_lock(group);
	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
	if (!fsn_mark) {
		fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
						 fan_flags, fsid);
		if (IS_ERR(fsn_mark)) {
			fsnotify_group_unlock(group);
			return PTR_ERR(fsn_mark);
		}
	}

	/*
	 * Check if requested mark flags conflict with an existing mark flags.
	 */
	ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
	if (ret)
		goto out;

	/*
	 * Error events are pre-allocated per group, only if strictly
	 * needed (i.e. FAN_FS_ERROR was requested).
	 */
	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
	    (mask & FAN_FS_ERROR)) {
		ret = fanotify_group_init_error_pool(group);
		if (ret)
			goto out;
	}

	recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
	if (recalc)
		fsnotify_recalc_mask(fsn_mark->connector);

out:
	fsnotify_group_unlock(group);

	fsnotify_put_mark(fsn_mark);
	return ret;
}

static struct fsnotify_event *fanotify_alloc_overflow_event(void)
{
	struct fanotify_event *oevent;

	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
	if (!oevent)
		return NULL;

	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;

	return &oevent->fse;
}

static struct hlist_head *fanotify_alloc_merge_hash(void)
{
	struct hlist_head *hash;

	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
		       GFP_KERNEL_ACCOUNT);
	if (!hash)
		return NULL;

	__hash_init(hash, FANOTIFY_HTABLE_SIZE);

	return hash;
}

/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can set up a fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles and it cannot use
		 * unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	f_flags = O_RDWR | __FMODE_NONOTIFY;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
	group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
				     FSNOTIFY_GROUP_USER);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
						  current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FSNOTIFY_PRIO_NORMAL;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FSNOTIFY_PRIO_CONTENT;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	if (flags & FAN_UNLIMITED_QUEUE) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_UNLIMITED_MARKS) {
		fd = -EPERM;
		if (!capable(CAP_SYS_ADMIN))
			goto out_destroy_group;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
	if (fd < 0)
		goto out_destroy_group;

	return fd;

out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
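
/*
 * Example userspace usage of the syscall above (a sketch, not kernel
 * code; the flag combination is illustrative): create a notification
 * class group that reports directory fids and names, which is also the
 * mode an unprivileged listener is required to use.
 *
 *	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_CLOEXEC |
 *			       FAN_REPORT_DIR_FID | FAN_REPORT_NAME,
 *			       O_RDONLY);
 *	if (fd < 0)
 *		perror("fanotify_init");
 */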

static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
			      struct fan_fsid *fsid)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	__kernel_fsid_t root_fsid;
	int err;

	/*
	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
	 */
	err = vfs_get_fsid(dentry, &fsid->id);
	if (err)
		return err;

	fsid->sb = dentry->d_sb;
	if (!fsid->id.val[0] && !fsid->id.val[1]) {
		err = -ENODEV;
		goto weak;
	}

	/*
	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
	 * which uses a different fsid than sb root.
	 */
	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
	if (err)
		return err;

	if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
		err = -EXDEV;
		goto weak;
	}

	fsid->weak = false;
	return 0;

weak:
	/* Allow weak fsid when marking inodes */
	fsid->weak = true;
	return (mark_type == FAN_MARK_INODE) ? 0 : err;
}

/* Check if filesystem can encode a unique fid */
static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	const struct export_operations *nop = dentry->d_sb->s_export_op;

	/*
	 * We need to make sure that the filesystem supports encoding of
	 * file handles so user can use name_to_handle_at() to compare fids
	 * reported with events to the file handle of watched objects.
	 */
	if (!exportfs_can_encode_fid(nop))
		return -EOPNOTSUPP;

	/*
	 * For sb/mount mark, we also need to make sure that the filesystem
	 * supports decoding file handles, so user has a way to map back the
	 * reported fids to filesystem objects.
	 */
	if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
		return -EOPNOTSUPP;

	return 0;
}

static int fanotify_events_supported(struct fsnotify_group *group,
				     const struct path *path, __u64 mask,
				     unsigned int flags)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
				 (mask & FAN_RENAME) ||
				 (flags & FAN_MARK_IGNORE);

	/*
	 * Some filesystems such as 'proc' acquire unusual locks when opening
	 * files. For them fanotify permission events have high chances of
	 * deadlocking the system - open done when reporting fanotify event
	 * blocks on this "unusual" lock while another process holding the lock
	 * waits for fanotify permission event to be answered. Just disallow
	 * permission events for such filesystems.
	 */
	if (mask & FANOTIFY_PERM_EVENTS &&
	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
		return -EINVAL;

	/*
	 * mount and sb marks are not allowed on kernel internal pseudo fs,
	 * like pipe_mnt, because that would subscribe to events on all the
	 * anonymous pipes in the system.
	 *
	 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
	 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
	 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
	 * mark is questionable. For now we leave them alone.
	 */
	if (mark_type != FAN_MARK_INODE &&
	    path->mnt->mnt_sb->s_flags & SB_NOUSER)
		return -EINVAL;

	/*
	 * We shouldn't have allowed setting dirent events and the directory
	 * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
	 * but because we always allowed it, error only when using new APIs.
	 */
	if (strict_dir_events && mark_type == FAN_MARK_INODE &&
	    !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
		return -ENOTDIR;

	return 0;
}

static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct fd f;
	struct path path;
	struct fan_fsid __fsid, *fsid = NULL;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
	unsigned int obj_type, fid_mode;
	void *obj;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	default:
		return -EINVAL;
	}

	switch (mark_cmd) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
	if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
		return -EINVAL;

	/*
	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
	 * FAN_MARK_IGNORED_MASK.
	 */
	if (ignore == FAN_MARK_IGNORED_MASK) {
		mask &= ~FANOTIFY_EVENT_FLAGS;
		umask = FANOTIFY_EVENT_FLAGS;
	}

	f = fdget(fanotify_fd);
	if (unlikely(!fd_file(f)))
		return -EBADF;

	/* verify that this is indeed a fanotify instance */
	ret = -EINVAL;
	if (unlikely(fd_file(f)->f_op != &fanotify_fops))
		goto fput_and_out;
	group = fd_file(f)->private_data;

	/*
	 * An unprivileged user is not allowed to set up mount nor filesystem
	 * marks. This also includes setting up such marks by a group that
	 * was initialized by an unprivileged user.
	 */
	ret = -EPERM;
	if ((!capable(CAP_SYS_ADMIN) ||
	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * Permission events require minimum priority FAN_CLASS_CONTENT.
	 */
	ret = -EINVAL;
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority < FSNOTIFY_PRIO_CONTENT)
		goto fput_and_out;

	if (mask & FAN_FS_ERROR &&
	    mark_type != FAN_MARK_FILESYSTEM)
		goto fput_and_out;

	/*
	 * Evictable is only relevant for inode marks, because only an inode
	 * object can be evicted on memory pressure.
	 */
	if (flags & FAN_MARK_EVICTABLE &&
	    mark_type != FAN_MARK_INODE)
		goto fput_and_out;

	/*
	 * Events that do not carry enough information to report
	 * event->fd require a group that supports reporting fid. Those
	 * events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		goto fput_and_out;

	/*
	 * FAN_RENAME uses special info type records to report the old and
	 * new parent+name. Reporting only old and new parent id is less
	 * useful and was not implemented.
	 */
	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
		goto fput_and_out;

	if (mark_cmd == FAN_MARK_FLUSH) {
		ret = 0;
		if (mark_type == FAN_MARK_MOUNT)
			fsnotify_clear_vfsmount_marks_by_group(group);
		else if (mark_type == FAN_MARK_FILESYSTEM)
			fsnotify_clear_sb_marks_by_group(group);
		else
			fsnotify_clear_inode_marks_by_group(group);
		goto fput_and_out;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags,
				 (mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		goto fput_and_out;

	if (mark_cmd == FAN_MARK_ADD) {
		ret = fanotify_events_supported(group, &path, mask, flags);
		if (ret)
			goto path_put_and_out;
	}

	if (fid_mode) {
		ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
		if (ret)
			goto path_put_and_out;

		ret = fanotify_test_fid(path.dentry, flags);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE) {
		inode = path.dentry->d_inode;
		obj = inode;
	} else {
		mnt = path.mnt;
		if (mark_type == FAN_MARK_MOUNT)
			obj = mnt;
		else
			obj = mnt->mnt_sb;
	}

	/*
	 * If some other task has this inode open for write we should not add
	 * an ignore mask, unless that ignore mask is supposed to survive
	 * modification changes anyway.
	 */
	if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
		ret = mnt ? -EINVAL : -EISDIR;
		/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
		if (ignore == FAN_MARK_IGNORE &&
		    (mnt || S_ISDIR(inode->i_mode)))
			goto path_put_and_out;

		ret = 0;
		if (inode && inode_is_open_for_write(inode))
			goto path_put_and_out;
	}

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignore)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update an inode mark */
	switch (mark_cmd) {
	case FAN_MARK_ADD:
		ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
					fsid);
		break;
	case FAN_MARK_REMOVE:
		ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
					   umask);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
fput_and_out:
	fdput(f);
	return ret;
}
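
/*
 * Example userspace usage of the mark syscall implemented above (a
 * sketch, not kernel code; the path is illustrative): with a fid mode
 * group, watch a directory for creates and deletes of its children.
 *
 *	int ret = fanotify_mark(fd, FAN_MARK_ADD,
 *				FAN_CREATE | FAN_DELETE | FAN_ONDIR,
 *				AT_FDCWD, "/tmp/watched");
 *	if (ret < 0)
 *		perror("fanotify_mark");
 */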

#ifndef CONFIG_ARCH_SPLIT_ARG64
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
		__u64, mask, int, dfd,
		const char __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif

#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
SYSCALL32_DEFINE6(fanotify_mark,
		  int, fanotify_fd, unsigned int, flags,
		  SC_ARG64(mask), int, dfd,
		  const char __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif

/*
 * fanotify_user_setup - Our initialization function. Note that we cannot
 * return an error because we have compiled-in VFS hooks. So an (unlikely)
 * failure here must result in panic().
 */
static int __init fanotify_user_setup(void)
{
	struct sysinfo si;
	int max_marks;

	si_meminfo(&si);
	/*
	 * Allow up to 1% of addressable memory to be accounted for per user
	 * marks limited to the range [8192, 1048576]. mount and sb marks are
	 * a lot cheaper than inode marks, but there is no reason for a user
	 * to have many of those, so calculate by the cost of inode marks.
	 */
	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
		    INODE_MARK_COST;
	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
			  FANOTIFY_DEFAULT_MAX_USER_MARKS);
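
	/*
	 * For example (illustrative numbers): with 16GiB of addressable RAM
	 * and a struct inode of roughly 600 bytes, 1% of memory is ~171MB
	 * and INODE_MARK_COST is ~1200 bytes, so max_marks comes out around
	 * 143k, inside the [8192, 1048576] clamp above.
	 */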

	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);

	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
					 SLAB_PANIC|SLAB_ACCOUNT);
	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
					       SLAB_PANIC);
	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
						SLAB_PANIC);
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}

	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
		FANOTIFY_DEFAULT_MAX_GROUPS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
	fanotify_sysctls_init();

	return 0;
}
device_initcall(fanotify_user_setup);