        The text below describes the locking rules for VFS-related methods.
It is (believed to be) up-to-date. *Please*, if you change anything in
prototypes or locking protocols - update this file. And update the relevant
instances in the tree, don't leave that to maintainers of filesystems/devices/
etc. At the very least, put the list of dubious cases at the end of this file.
Don't turn it into a log - maintainers of out-of-tree code are supposed to
be able to use diff(1).
        Thing currently missing here: socket operations. Alexey?

--------------------------- dentry_operations --------------------------
prototypes:
        int (*d_revalidate)(struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *, const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
        struct vfsmount *(*d_automount)(struct path *path);
        int (*d_manage)(struct dentry *, bool);

locking rules:
                        rename_lock     ->d_lock        may block       rcu-walk
d_revalidate:           no              no              yes (ref-walk)  maybe
d_weak_revalidate:      no              no              yes             no
d_hash:                 no              no              no              maybe
d_compare:              yes             no              no              maybe
d_delete:               no              yes             no              no
d_release:              no              no              yes             no
d_prune:                no              yes             no              no
d_iput:                 no              no              yes             no
d_dname:                no              no              no              no
d_automount:            no              no              yes             no
d_manage:               no              no              yes (ref-walk)  maybe
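
The rcu-walk column deserves a note: a method marked "maybe" can be called in
rcu-walk mode, where it must not sleep (no blocking locks, no I/O); returning
-ECHILD tells the VFS to retry the operation in ref-walk mode.  A rough sketch
(not taken from any in-tree filesystem; the example_*() helpers are
hypothetical) of a ->d_revalidate() instance honouring this:

        static int example_d_revalidate(struct dentry *dentry, unsigned int flags)
        {
                if (flags & LOOKUP_RCU) {
                        /*
                         * rcu-walk mode: we must not block.  If validating
                         * this dentry would require I/O or sleeping locks,
                         * return -ECHILD and we will be called again in
                         * ref-walk mode.
                         */
                        if (example_needs_io_to_validate(dentry))       /* hypothetical */
                                return -ECHILD;
                }

                /* ref-walk mode (or a lockless check sufficed): may block */
                return example_dentry_still_valid(dentry) ? 1 : 0;      /* hypothetical */
        }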

--------------------------- inode_operations ---------------------------
prototypes:
        int (*create) (struct inode *,struct dentry *,umode_t, bool);
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct inode *,struct dentry *,const char *);
        int (*mkdir) (struct inode *,struct dentry *,umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
        int (*rename) (struct inode *, struct dentry *,
                        struct inode *, struct dentry *);
        int (*rename2) (struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        int (*readlink) (struct dentry *, char __user *,int);
        void * (*follow_link) (struct dentry *, struct nameidata *);
        void (*put_link) (struct dentry *, struct nameidata *, void *);
        int (*permission) (struct inode *, int);
        int (*get_acl)(struct inode *, int);
        int (*setattr) (struct dentry *, struct iattr *);
        int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
        int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
        ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*removexattr) (struct dentry *, const char *);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
        void (*update_time)(struct inode *, struct timespec *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                                struct file *, unsigned open_flag,
                                umode_t create_mode, int *opened);
        int (*tmpfile) (struct inode *, struct dentry *, umode_t);
        int (*dentry_open)(struct dentry *, struct file *, const struct cred *);

locking rules:
        all may block
                        i_mutex(inode)
lookup:                 yes
create:                 yes
link:                   yes (both)
mknod:                  yes
symlink:                yes
mkdir:                  yes
unlink:                 yes (both)
rmdir:                  yes (both)      (see below)
rename:                 yes (all)       (see below)
rename2:                yes (all)       (see below)
readlink:               no
follow_link:            no
put_link:               no
setattr:                yes
permission:             no (may not block if called in rcu-walk mode)
get_acl:                no
getattr:                no
setxattr:               yes
getxattr:               no
listxattr:              no
removexattr:            yes
fiemap:                 no
update_time:            no
atomic_open:            yes
tmpfile:                no
dentry_open:            no

        Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
the victim.
        Cross-directory ->rename() and ->rename2() have (per-superblock)
->s_vfs_rename_mutex.

See Documentation/filesystems/directory-locking for more detailed discussion
of the locking scheme for directory operations.
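
The ->permission() rule above is worth spelling out: when called in rcu-walk
mode the VFS passes MAY_NOT_BLOCK in the mask, and the method must either
complete without blocking or return -ECHILD so that it is retried in ref-walk
mode.  A minimal sketch (example_needs_io_to_check() is a hypothetical helper;
many filesystems simply rely on generic_permission()):

        static int example_permission(struct inode *inode, int mask)
        {
                if (mask & MAY_NOT_BLOCK) {
                        /*
                         * rcu-walk mode: no blocking allowed.  If the check
                         * would need I/O (e.g. ACLs not yet cached), bail
                         * out and let the VFS retry in ref-walk mode.
                         */
                        if (example_needs_io_to_check(inode))   /* hypothetical */
                                return -ECHILD;
                }
                /* the common cases are handled by the generic helper */
                return generic_permission(inode, mask);
        }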

--------------------------- super_operations ---------------------------
prototypes:
        struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
        void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        int (*freeze_fs) (struct super_block *);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);
        int (*show_options)(struct seq_file *, struct dentry *);
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        int (*bdev_try_to_free_page)(struct super_block *, struct page *, gfp_t);

locking rules:
        All may block [not true, see below]
                        s_umount
alloc_inode:
destroy_inode:
dirty_inode:
write_inode:
drop_inode:                             !!!inode->i_lock!!!
evict_inode:
put_super:              write
sync_fs:                read
freeze_fs:              write
unfreeze_fs:            write
statfs:                 maybe(read)     (see below)
remount_fs:             write
umount_begin:           no
show_options:           no              (namespace_sem)
quota_read:             no              (see below)
quota_write:            no              (see below)
bdev_try_to_free_page:  no              (see below)

->statfs() has s_umount (shared) when called by ustat(2) (native or
compat), but that's an accident of bad API; s_umount is used to pin
the superblock down when we only have the dev_t given to us by userland to
identify the superblock.  Everything else (statfs(), fstatfs(), etc.)
doesn't hold it when calling ->statfs() - the superblock is pinned down
by resolving the pathname passed to the syscall.
->quota_read() and ->quota_write() functions are both guaranteed to
be the only ones operating on the quota file by the quota code (via
dqio_sem) (unless an admin really wants to screw up something and
writes to quota files with quotas on).  For other details about locking
see also the dquot_operations section.
->bdev_try_to_free_page is called from the ->releasepage handler of
the block device inode.  See there for more details.

--------------------------- file_system_type ---------------------------
prototypes:
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
locking rules:
                may block
mount           yes
kill_sb         yes

->mount() returns ERR_PTR or the root dentry; its superblock should be locked
on return.
->kill_sb() takes a write-locked superblock, does all shutdown work on it,
unlocks and drops the reference.

--------------------------- address_space_operations --------------------------
prototypes:
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        int (*readpage)(struct file *, struct page *);
        int (*sync_page)(struct page *);
        int (*writepages)(struct address_space *, struct writeback_control *);
        int (*set_page_dirty)(struct page *page);
        int (*readpages)(struct file *filp, struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages);
        int (*write_begin)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata);
        int (*write_end)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);
        sector_t (*bmap)(struct address_space *, sector_t);
        void (*invalidatepage) (struct page *, unsigned int, unsigned int);
        int (*releasepage) (struct page *, gfp_t);
        void (*freepage)(struct page *);
        ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t offset);
        int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
                                unsigned long *);
        int (*migratepage)(struct address_space *, struct page *, struct page *);
        int (*launder_page)(struct page *);
        int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
        int (*error_remove_page)(struct address_space *, struct page *);
        int (*swap_activate)(struct file *);
        int (*swap_deactivate)(struct file *);

locking rules:
        All except set_page_dirty and freepage may block

                        PageLocked(page)        i_mutex
writepage:              yes, unlocks (see below)
readpage:               yes, unlocks
sync_page:              maybe
writepages:
set_page_dirty:         no
readpages:
write_begin:            locks the page          yes
write_end:              yes, unlocks            yes
bmap:
invalidatepage:         yes
releasepage:            yes
freepage:               yes
direct_IO:
get_xip_mem:                                    maybe
migratepage:            yes (both)
launder_page:           yes
is_partially_uptodate:  yes
error_remove_page:      yes
swap_activate:          no
swap_deactivate:        no

        ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).

        ->readpage() unlocks the page, either synchronously or via I/O
completion.

        ->readpages() populates the pagecache with the passed pages and starts
I/O against them.  They come unlocked upon I/O completion.
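
To illustrate the ->readpage() rule, the simplest possible instance is one
that generates the data in place (much like simple_readpage() in fs/libfs.c)
and can therefore unlock the page synchronously; a filesystem doing real I/O
would instead submit a bio and unlock the page from its completion handler.
A sketch:

        static int example_readpage(struct file *file, struct page *page)
        {
                /* the page arrives locked and not uptodate; fill it in place */
                clear_highpage(page);
                flush_dcache_page(page);
                SetPageUptodate(page);
                /* we must unlock it ourselves since no I/O was submitted */
                unlock_page(page);
                return 0;
        }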

        ->writepage() is used for two purposes: for "memory cleansing" and for
"sync".  These are quite different operations and the behaviour may differ
depending upon the mode.

If writepage is called for sync (wbc->sync_mode != WB_SYNC_NONE) then
it *must* start I/O against the page, even if that would involve
blocking on in-progress I/O.

If writepage is called for memory cleansing (sync_mode ==
WB_SYNC_NONE) then its role is to get as much writeout underway as
possible.  So writepage should try to avoid blocking against
currently-in-progress I/O.

If the filesystem is not called for "sync" and it determines that it
would need to block against in-progress I/O to be able to start new I/O
against the page, the filesystem should redirty the page with
redirty_page_for_writepage(), then unlock the page and return zero.
This may also be done to avoid internal deadlocks, but rarely.

If the filesystem is called for sync then it must wait on any
in-progress I/O and then start new I/O.

The filesystem should unlock the page synchronously, before returning to the
caller, unless ->writepage() returns the special AOP_WRITEPAGE_ACTIVATE
value.  AOP_WRITEPAGE_ACTIVATE means that the page cannot really be written
out currently, and the VM should stop calling ->writepage() on this page for
some time.  The VM does this by moving the page to the head of the active
list, hence the name.

Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
and return zero, writepage *must* run set_page_writeback() against the page,
followed by unlocking it.  Once set_page_writeback() has been run against the
page, write I/O can be submitted and the write I/O completion handler must run
end_page_writeback() once the I/O is complete.  If no I/O is submitted, the
filesystem must run end_page_writeback() against the page before returning from
writepage.

That is: after 2.5.12, pages which are under writeout are *not* locked.  Note,
if the filesystem needs the page to be locked during writeout, that is ok, too,
the page is allowed to be unlocked at any point in time between the calls to
set_page_writeback() and end_page_writeback().

Note, failure to run either redirty_page_for_writepage() or the combination of
set_page_writeback()/end_page_writeback() on a page submitted to writepage
will leave the page itself marked clean but it will be tagged as dirty in the
radix tree.  This incoherency can lead to all sorts of hard-to-debug problems
in the filesystem like having dirty inodes at umount and losing written data.
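
Putting those rules together, a skeleton ->writepage() could be structured as
follows.  This is only a sketch - example_io_in_progress() and
example_submit_write() are hypothetical helpers standing in for whatever the
filesystem uses to track and submit I/O:

        static int example_writepage(struct page *page, struct writeback_control *wbc)
        {
                /* page arrives locked; its dirty bit was cleared by the caller */

                if (example_io_in_progress(page)) {             /* hypothetical */
                        if (wbc->sync_mode == WB_SYNC_NONE) {
                                /* memory cleansing: do not block, try later */
                                redirty_page_for_writepage(wbc, page);
                                unlock_page(page);
                                return 0;
                        }
                        /* sync: we must wait for the old I/O to finish */
                        wait_on_page_writeback(page);
                }

                set_page_writeback(page);
                unlock_page(page);      /* pages under writeout are not locked */

                if (example_submit_write(page) < 0) {           /* hypothetical */
                        /* no I/O was submitted: end writeback ourselves */
                        end_page_writeback(page);
                        return -EIO;
                }
                /* the I/O completion handler calls end_page_writeback() */
                return 0;
        }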

        ->sync_page() locking rules are not well-defined - usually it is called
with the page locked, but that is not guaranteed.  Considering the currently
existing instances of this method, ->sync_page() itself doesn't look
well-defined...

        ->writepages() is used for periodic writeback and for syscall-initiated
sync operations.  The address_space should start I/O against at least
*nr_to_write pages.  *nr_to_write must be decremented for each page which is
written.  The address_space implementation may write more (or less) pages
than *nr_to_write asks for, but it should try to be reasonably close.  If
nr_to_write is NULL, all dirty pages must be written.

writepages should _only_ write pages which are present on
mapping->io_pages.

        ->set_page_dirty() is called from various places in the kernel
when the target page is marked as needing writeback.  It may be called
under spinlock (it cannot block) and is sometimes called with the page
not locked.

        ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
filesystems and by the swapper.  The latter will eventually go away.  Please,
keep it that way and don't breed new callers.

        ->invalidatepage() is called when the filesystem must attempt to drop
some or all of the buffers from the page when it is being truncated.  If
->invalidatepage is zero, the kernel uses block_invalidatepage() instead.

        ->releasepage() is called when the kernel is about to try to drop the
buffers from the page in preparation for freeing it.  It returns non-zero if
the buffers were (or may now be) freed, and zero if the filesystem could not
release them.  If ->releasepage is zero, the kernel assumes that the fs has
no private interest in the buffers.

        ->freepage() is called when the kernel is done dropping the page
from the page cache.

        ->launder_page() may be called prior to releasing a page if
it is still found to be dirty.  It returns zero if the page was successfully
cleaned, or an error value if not.  Note that in order to prevent the page
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.

        ->swap_activate will be called with a non-zero argument on
files backing (non block device backed) swapfiles.  A return value
of zero indicates success, in which case this file can be used for
backing swapspace.  The swapspace operations will be proxied to the
address space operations.

        ->swap_deactivate() will be called in the sys_swapoff()
path after ->swap_activate() returned success.

----------------------- file_lock_operations ------------------------------
prototypes:
        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
        void (*fl_release_private)(struct file_lock *);

locking rules:
                        inode->i_lock   may block
fl_copy_lock:           yes             no
fl_release_private:     maybe           maybe[1]

[1]:    ->fl_release_private for flock or POSIX locks is currently allowed
to block.  Leases however can still be freed while the i_lock is held and
so fl_release_private called on a lease should not block.

----------------------- lock_manager_operations ---------------------------
prototypes:
        int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
        unsigned long (*lm_owner_key)(struct file_lock *);
        void (*lm_notify)(struct file_lock *);  /* unblock callback */
        int (*lm_grant)(struct file_lock *, struct file_lock *, int);
        void (*lm_break)(struct file_lock *);   /* break_lease callback */
        int (*lm_change)(struct file_lock **, int);

locking rules:

                        inode->i_lock   blocked_lock_lock       may block
lm_compare_owner:       yes[1]          maybe                   no
lm_owner_key:           yes[1]          yes                     no
lm_notify:              yes             yes                     no
lm_grant:               no              no                      no
lm_break:               yes             no                      no
lm_change:              yes             no                      no

[1]:    ->lm_compare_owner and ->lm_owner_key are generally called with
*an* inode->i_lock held.  It may not be the i_lock of the inode
associated with either file_lock argument!  This is the case with deadlock
detection, since the code has to chase down the owners of locks that may
be entirely unrelated to the one on which the lock is being acquired.
For deadlock detection however, the blocked_lock_lock is also held.  The
fact that these locks are held ensures that the file_locks do not
disappear out from under you while doing the comparison or generating an
owner key.

--------------------------- buffer_head -----------------------------------
prototypes:
        void (*b_end_io)(struct buffer_head *bh, int uptodate);

locking rules:
        called from interrupts.  In other words, extreme care is needed here.
The bh is locked, but that is the only guarantee we have here.  Currently only
RAID1, highmem, fs/buffer.c, and fs/ntfs/aops.c provide these.  Block devices
call this method upon I/O completion.

--------------------------- block_device_operations -----------------------
prototypes:
        int (*open) (struct block_device *, fmode_t);
        int (*release) (struct gendisk *, fmode_t);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
        int (*media_changed) (struct gendisk *);
        void (*unlock_native_capacity) (struct gendisk *);
        int (*revalidate_disk) (struct gendisk *);
        int (*getgeo)(struct block_device *, struct hd_geometry *);
        void (*swap_slot_free_notify) (struct block_device *, unsigned long);

locking rules:
                        bd_mutex
open:                   yes
release:                yes
ioctl:                  no
compat_ioctl:           no
direct_access:          no
media_changed:          no
unlock_native_capacity: no
revalidate_disk:        no
getgeo:                 no
swap_slot_free_notify:  no      (see below)

media_changed, unlock_native_capacity and revalidate_disk are called only from
check_disk_change().

swap_slot_free_notify is called with swap_lock and sometimes the page lock
held.

--------------------------- file_operations -------------------------------
prototypes:
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
        ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iterate) (struct file *, struct dir_context *);
        unsigned int (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
        int (*aio_fsync) (struct kiocb *, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
                        loff_t *);
        ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
                        loff_t *);
        ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
                        void __user *);
        ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
                        loff_t *, int);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long,
                        unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
                        size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
                        size_t, unsigned int);
        int (*setlease)(struct file *, long, struct file_lock **, void **);
        long (*fallocate)(struct file *, int, loff_t, loff_t);

locking rules:
        All may block.

->llseek() locking has moved from llseek to the individual llseek
implementations.  If your fs is not using generic_file_llseek, you
need to acquire and release the appropriate locks in your ->llseek().
For many filesystems, it is probably safe to acquire the inode
mutex or just to use i_size_read() instead.
Note: this does not protect the file->f_pos against concurrent modifications
since this is something userspace has to take care of.
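
For instance, a filesystem that cannot use generic_file_llseek() might do
something along these lines (a sketch only; SEEK_HOLE/SEEK_DATA handling is
omitted and example_llseek() is a made-up name):

        static loff_t example_llseek(struct file *file, loff_t offset, int whence)
        {
                struct inode *inode = file_inode(file);
                loff_t ret;

                mutex_lock(&inode->i_mutex);    /* stabilizes i_size for SEEK_END */
                switch (whence) {
                case SEEK_END:
                        offset += i_size_read(inode);
                        break;
                case SEEK_CUR:
                        offset += file->f_pos;
                        break;
                }
                /* vfs_setpos() validates the offset and updates file->f_pos */
                ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
                mutex_unlock(&inode->i_mutex);
                return ret;
        }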

->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
Most instances call fasync_helper(), which does that maintenance, so it's
not normally something one needs to worry about.  Return values > 0 will be
mapped to zero in the VFS layer.

->readdir() and ->ioctl() on directories must be changed.  Ideally we would
move ->readdir() to inode_operations and use a separate method for directory
->ioctl() or kill the latter completely.  One of the problems is that for
anything that resembles union-mount we won't have a struct file for all
components.  And there are other reasons why the current interface is a mess...

->read on directories probably must go away - we should just enforce -EISDIR
in sys_read() and friends.

->setlease operations should call generic_setlease() before or after setting
the lease within the individual filesystem to record the result of the
operation.

--------------------------- dquot_operations -------------------------------
prototypes:
        int (*write_dquot) (struct dquot *);
        int (*acquire_dquot) (struct dquot *);
        int (*release_dquot) (struct dquot *);
        int (*mark_dirty) (struct dquot *);
        int (*write_info) (struct super_block *, int);

These operations are intended to be more or less wrapping functions that ensure
proper locking with respect to the filesystem and call the generic quota
operations.

What filesystems should expect from the generic quota functions:

                FS recursion    Held locks when called
write_dquot:    yes             dqonoff_sem or dqptr_sem
acquire_dquot:  yes             dqonoff_sem or dqptr_sem
release_dquot:  yes             dqonoff_sem or dqptr_sem
mark_dirty:     no              -
write_info:     yes             dqonoff_sem

FS recursion means calling ->quota_read() and ->quota_write() from superblock
operations.

More details about quota locking can be found in fs/quota/dquot.c.

--------------------------- vm_operations_struct -----------------------------
prototypes:
        void (*open)(struct vm_area_struct *);
        void (*close)(struct vm_area_struct *);
        int (*fault)(struct vm_area_struct *, struct vm_fault *);
        void (*map_pages)(struct vm_area_struct *, struct vm_fault *);
        int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
        int (*access)(struct vm_area_struct *, unsigned long, void *, int, int);

locking rules:
                mmap_sem        PageLocked(page)
open:           yes
close:          yes
fault:          yes             can return with page locked
map_pages:      yes
page_mkwrite:   yes             can return with page locked
access:         yes

        ->fault() is called when a previously not present pte is about
to be faulted in.  The filesystem must find and return the page associated
with the passed in "pgoff" in the vm_fault structure.  If it is possible that
the page may be truncated and/or invalidated, then the filesystem must lock
the page, then ensure it is not already truncated (the page lock will block
subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
locked.  The VM will unlock the page.
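
A rough sketch of such a ->fault() instance (example_get_page() is a
hypothetical helper that finds, or reads in, the pagecache page for
vmf->pgoff and returns it with a reference held):

        static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        {
                struct address_space *mapping = file_inode(vma->vm_file)->i_mapping;
                struct page *page;

        retry:
                page = example_get_page(mapping, vmf->pgoff);   /* hypothetical */
                if (!page)
                        return VM_FAULT_SIGBUS;

                lock_page(page);
                if (page->mapping != mapping) {
                        /* lost a race with truncate; drop the page and retry */
                        unlock_page(page);
                        page_cache_release(page);
                        goto retry;
                }

                vmf->page = page;
                return VM_FAULT_LOCKED;         /* the VM will unlock the page */
        }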

        ->map_pages() is called when the VM asks to map easily accessible
pages.  The filesystem should find and map pages associated with offsets from
"pgoff" to "max_pgoff".  ->map_pages() is called with the page table lock held
and must not block.  If it's not possible to reach a page without blocking,
the filesystem should skip it.  The filesystem should use do_set_pte() to set
up the page table entry.  The pointer to the entry associated with the offset
"pgoff" is passed in the "pte" field of the vm_fault structure.  Pointers to
entries for other offsets should be calculated relative to "pte".

        ->page_mkwrite() is called when a previously read-only pte is
about to become writeable.  The filesystem again must ensure that there are
no truncate/invalidate races, and then return with the page locked.  If
the page has been truncated, the filesystem should not look up a new page
like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
will cause the VM to retry the fault.

        ->access() is called when get_user_pages() fails in
access_process_vm(), typically used to debug a process through
/proc/pid/mem or ptrace.  This method is needed only for
VM_IO | VM_PFNMAP VMAs.

================================================================================
                        Dubious stuff

(if you break something or notice that it is broken and do not fix it yourself
- at least put it here)