1======= 2Locking 3======= 4 5The text below describes the locking rules for VFS-related methods. 6It is (believed to be) up-to-date. *Please*, if you change anything in 7prototypes or locking protocols - update this file. And update the relevant 8instances in the tree, don't leave that to maintainers of filesystems/devices/ 9etc. At the very least, put the list of dubious cases at the end of this file. 10Don't turn it into log - maintainers of out-of-the-tree code are supposed to 11be able to use diff(1). 12 13Thing currently missing here: socket operations. Alexey? 14 15dentry_operations 16================= 17 18prototypes:: 19 20 int (*d_revalidate)(struct dentry *, unsigned int); 21 int (*d_weak_revalidate)(struct dentry *, unsigned int); 22 int (*d_hash)(const struct dentry *, struct qstr *); 23 int (*d_compare)(const struct dentry *, 24 unsigned int, const char *, const struct qstr *); 25 int (*d_delete)(struct dentry *); 26 int (*d_init)(struct dentry *); 27 void (*d_release)(struct dentry *); 28 void (*d_iput)(struct dentry *, struct inode *); 29 char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen); 30 struct vfsmount *(*d_automount)(struct path *path); 31 int (*d_manage)(const struct path *, bool); 32 struct dentry *(*d_real)(struct dentry *, const struct inode *); 33 34locking rules: 35 36================== =========== ======== ============== ======== 37ops rename_lock ->d_lock may block rcu-walk 38================== =========== ======== ============== ======== 39d_revalidate: no no yes (ref-walk) maybe 40d_weak_revalidate: no no yes no 41d_hash no no no maybe 42d_compare: yes no no maybe 43d_delete: no yes no no 44d_init: no no yes no 45d_release: no no yes no 46d_prune: no yes no no 47d_iput: no no yes no 48d_dname: no no no no 49d_automount: no no yes no 50d_manage: no no yes (ref-walk) maybe 51d_real no no yes no 52================== =========== ======== ============== ======== 53 54inode_operations 55================ 56 57prototypes::
58 59 int (*create) (struct inode *,struct dentry *,umode_t, bool); 60 struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); 61 int (*link) (struct dentry *,struct inode *,struct dentry *); 62 int (*unlink) (struct inode *,struct dentry *); 63 int (*symlink) (struct inode *,struct dentry *,const char *); 64 int (*mkdir) (struct inode *,struct dentry *,umode_t); 65 int (*rmdir) (struct inode *,struct dentry *); 66 int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); 67 int (*rename) (struct inode *, struct dentry *, 68 struct inode *, struct dentry *, unsigned int); 69 int (*readlink) (struct dentry *, char __user *,int); 70 const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *); 71 void (*truncate) (struct inode *); 72 int (*permission) (struct inode *, int, unsigned int); 73 struct posix_acl * (*get_acl)(struct inode *, int, bool); 74 int (*setattr) (struct dentry *, struct iattr *); 75 int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); 76 ssize_t (*listxattr) (struct dentry *, char *, size_t); 77 int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); 78 void (*update_time)(struct inode *, struct timespec *, int); 79 int (*atomic_open)(struct inode *, struct dentry *, 80 struct file *, unsigned open_flag, 81 umode_t create_mode); 82 int (*tmpfile) (struct inode *, struct dentry *, umode_t); 83 int (*fileattr_set)(struct user_namespace *mnt_userns, 84 struct dentry *dentry, struct fileattr *fa); 85 int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); 86 87locking rules: 88 all may block 89 90============= ============================================= 91ops i_rwsem(inode) 92============= ============================================= 93lookup: shared 94create: exclusive 95link: exclusive (both) 96mknod: exclusive 97symlink: exclusive 98mkdir: exclusive 99unlink: exclusive (both) 100rmdir: exclusive (both)(see below) 101rename: exclusive (both parents, some 
children) (see below) 102readlink: no 103get_link: no 104setattr: exclusive 105permission: no (may not block if called in rcu-walk mode) 106get_acl: no 107getattr: no 108listxattr: no 109fiemap: no 110update_time: no 111atomic_open: shared (exclusive if O_CREAT is set in open flags) 112tmpfile: no 113fileattr_get: no or exclusive 114fileattr_set: exclusive 115============= ============================================= 116 117 118 Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem 119 exclusive on victim. 120 cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. 121 ->unlink() and ->rename() have ->i_rwsem exclusive on all non-directories 122 involved. 123 ->rename() has ->i_rwsem exclusive on any subdirectory that changes parent. 124 125See Documentation/filesystems/directory-locking.rst for more detailed discussion 126of the locking scheme for directory operations. 127 128xattr_handler operations 129======================== 130 131prototypes:: 132 133 bool (*list)(struct dentry *dentry); 134 int (*get)(const struct xattr_handler *handler, struct dentry *dentry, 135 struct inode *inode, const char *name, void *buffer, 136 size_t size); 137 int (*set)(const struct xattr_handler *handler, 138 struct user_namespace *mnt_userns, 139 struct dentry *dentry, struct inode *inode, const char *name, 140 const void *buffer, size_t size, int flags); 141 142locking rules: 143 all may block 144 145===== ============== 146ops i_rwsem(inode) 147===== ============== 148list: no 149get: no 150set: exclusive 151===== ============== 152 153super_operations 154================ 155 156prototypes:: 157 158 struct inode *(*alloc_inode)(struct super_block *sb); 159 void (*free_inode)(struct inode *); 160 void (*destroy_inode)(struct inode *); 161 void (*dirty_inode) (struct inode *, int flags); 162 int (*write_inode) (struct inode *, struct writeback_control *wbc); 163 int (*drop_inode) (struct inode *); 164 void (*evict_inode) (struct inode *); 165 void 
(*put_super) (struct super_block *); 166 int (*sync_fs)(struct super_block *sb, int wait); 167 int (*freeze_fs) (struct super_block *); 168 int (*unfreeze_fs) (struct super_block *); 169 int (*statfs) (struct dentry *, struct kstatfs *); 170 int (*remount_fs) (struct super_block *, int *, char *); 171 void (*umount_begin) (struct super_block *); 172 int (*show_options)(struct seq_file *, struct dentry *); 173 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 174 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 175 int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); 176 177locking rules: 178 All may block [not true, see below] 179 180====================== ============ ======================== 181ops s_umount note 182====================== ============ ======================== 183alloc_inode: 184free_inode: called from RCU callback 185destroy_inode: 186dirty_inode: 187write_inode: 188drop_inode: !!!inode->i_lock!!! 189evict_inode: 190put_super: write 191sync_fs: read 192freeze_fs: write 193unfreeze_fs: write 194statfs: maybe(read) (see below) 195remount_fs: write 196umount_begin: no 197show_options: no (namespace_sem) 198quota_read: no (see below) 199quota_write: no (see below) 200bdev_try_to_free_page: no (see below) 201====================== ============ ======================== 202 203->statfs() has s_umount (shared) when called by ustat(2) (native or 204compat), but that's an accident of bad API; s_umount is used to pin 205the superblock down when we only have dev_t given us by userland to 206identify the superblock. Everything else (statfs(), fstatfs(), etc.) 207doesn't hold it when calling ->statfs() - superblock is pinned down 208by resolving the pathname passed to syscall. 
209 210->quota_read() and ->quota_write() functions are both guaranteed to 211be the only ones operating on the quota file by the quota code (via 212dqio_sem) (unless an admin really wants to screw up something and 213writes to quota files with quotas on). For other details about locking 214see also dquot_operations section. 215 216->bdev_try_to_free_page is called from the ->releasepage handler of 217the block device inode. See there for more details. 218 219file_system_type 220================ 221 222prototypes:: 223 224 struct dentry *(*mount) (struct file_system_type *, int, 225 const char *, void *); 226 void (*kill_sb) (struct super_block *); 227 228locking rules: 229 230======= ========= 231ops may block 232======= ========= 233mount yes 234kill_sb yes 235======= ========= 236 237->mount() returns ERR_PTR or the root dentry; its superblock should be locked 238on return. 239 240->kill_sb() takes a write-locked superblock, does all shutdown work on it, 241unlocks and drops the reference. 
242 243address_space_operations 244======================== 245prototypes:: 246 247 int (*writepage)(struct page *page, struct writeback_control *wbc); 248 int (*readpage)(struct file *, struct page *); 249 int (*writepages)(struct address_space *, struct writeback_control *); 250 int (*set_page_dirty)(struct page *page); 251 void (*readahead)(struct readahead_control *); 252 int (*readpages)(struct file *filp, struct address_space *mapping, 253 struct list_head *pages, unsigned nr_pages); 254 int (*write_begin)(struct file *, struct address_space *mapping, 255 loff_t pos, unsigned len, unsigned flags, 256 struct page **pagep, void **fsdata); 257 int (*write_end)(struct file *, struct address_space *mapping, 258 loff_t pos, unsigned len, unsigned copied, 259 struct page *page, void *fsdata); 260 sector_t (*bmap)(struct address_space *, sector_t); 261 void (*invalidatepage) (struct page *, unsigned int, unsigned int); 262 int (*releasepage) (struct page *, int); 263 void (*freepage)(struct page *); 264 int (*direct_IO)(struct kiocb *, struct iov_iter *iter); 265 bool (*isolate_page) (struct page *, isolate_mode_t); 266 int (*migratepage)(struct address_space *, struct page *, struct page *); 267 void (*putback_page) (struct page *); 268 int (*launder_page)(struct page *); 269 int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); 270 int (*error_remove_page)(struct address_space *, struct page *); 271 int (*swap_activate)(struct file *); 272 int (*swap_deactivate)(struct file *); 273 274locking rules: 275 All except set_page_dirty and freepage may block 276 277====================== ======================== ========= =============== 278ops PageLocked(page) i_rwsem invalidate_lock 279====================== ======================== ========= =============== 280writepage: yes, unlocks (see below) 281readpage: yes, unlocks shared 282writepages: 283set_page_dirty no 284readahead: yes, unlocks shared 285readpages: no shared 286write_begin: locks the 
page exclusive 287write_end: yes, unlocks exclusive 288bmap: 289invalidatepage: yes exclusive 290releasepage: yes 291freepage: yes 292direct_IO: 293isolate_page: yes 294migratepage: yes (both) 295putback_page: yes 296launder_page: yes 297is_partially_uptodate: yes 298error_remove_page: yes 299swap_activate: no 300swap_deactivate: no 301====================== ======================== ========= =============== 302 303->write_begin(), ->write_end() and ->readpage() may be called from 304the request handler (/dev/loop). 305 306->readpage() unlocks the page, either synchronously or via I/O 307completion. 308 309->readahead() unlocks the pages that I/O is attempted on like ->readpage(). 310 311->readpages() populates the pagecache with the passed pages and starts 312I/O against them. They come unlocked upon I/O completion. 313 314->writepage() is used for two purposes: for "memory cleansing" and for 315"sync". These are quite different operations and the behaviour may differ 316depending upon the mode. 317 318If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then 319it *must* start I/O against the page, even if that would involve 320blocking on in-progress I/O. 321 322If writepage is called for memory cleansing (sync_mode == 323WBC_SYNC_NONE) then its role is to get as much writeout underway as 324possible. So writepage should try to avoid blocking against 325currently-in-progress I/O. 326 327If the filesystem is not called for "sync" and it determines that it 328would need to block against in-progress I/O to be able to start new I/O 329against the page the filesystem should redirty the page with 330redirty_page_for_writepage(), then unlock the page and return zero. 331This may also be done to avoid internal deadlocks, but rarely. 332 333If the filesystem is called for sync then it must wait on any 334in-progress I/O and then start new I/O. 
335 336The filesystem should unlock the page synchronously, before returning to the 337caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE 338value. WRITEPAGE_ACTIVATE means that page cannot really be written out 339currently, and VM should stop calling ->writepage() on this page for some 340time. VM does this by moving page to the head of the active list, hence the 341name. 342 343Unless the filesystem is going to redirty_page_for_writepage(), unlock the page 344and return zero, writepage *must* run set_page_writeback() against the page, 345followed by unlocking it. Once set_page_writeback() has been run against the 346page, write I/O can be submitted and the write I/O completion handler must run 347end_page_writeback() once the I/O is complete. If no I/O is submitted, the 348filesystem must run end_page_writeback() against the page before returning from 349writepage. 350 351That is: after 2.5.12, pages which are under writeout are *not* locked. Note, 352if the filesystem needs the page to be locked during writeout, that is ok, too, 353the page is allowed to be unlocked at any point in time between the calls to 354set_page_writeback() and end_page_writeback(). 355 356Note, failure to run either redirty_page_for_writepage() or the combination of 357set_page_writeback()/end_page_writeback() on a page submitted to writepage 358will leave the page itself marked clean but it will be tagged as dirty in the 359radix tree. This incoherency can lead to all sorts of hard-to-debug problems 360in the filesystem like having dirty inodes at umount and losing written data. 361 362->writepages() is used for periodic writeback and for syscall-initiated 363sync operations. The address_space should start I/O against at least 364``*nr_to_write`` pages. ``*nr_to_write`` must be decremented for each page 365which is written. The address_space implementation may write more (or less) 366pages than ``*nr_to_write`` asks for, but it should try to be reasonably close. 
367If nr_to_write is NULL, all dirty pages must be written. 368 369writepages should _only_ write pages which are present on 370mapping->io_pages. 371 372->set_page_dirty() is called from various places in the kernel 373when the target page is marked as needing writeback. It may be called 374under spinlock (it cannot block) and is sometimes called with the page 375not locked. 376 377->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some 378filesystems and by the swapper. The latter will eventually go away. Please, 379keep it that way and don't breed new callers. 380 381->invalidatepage() is called when the filesystem must attempt to drop 382some or all of the buffers from the page when it is being truncated. It 383returns zero on success. If ->invalidatepage is zero, the kernel uses 384block_invalidatepage() instead. The filesystem must exclusively acquire 385invalidate_lock before invalidating page cache in truncate / hole punch path 386(and thus calling into ->invalidatepage) to block races between page cache 387invalidation and page cache filling functions (fault, read, ...). 388 389->releasepage() is called when the kernel is about to try to drop the 390buffers from the page in preparation for freeing it. It returns zero to 391indicate that the buffers are (or may be) freeable. If ->releasepage is zero, 392the kernel assumes that the fs has no private interest in the buffers. 393 394->freepage() is called when the kernel is done dropping the page 395from the page cache. 396 397->launder_page() may be called prior to releasing a page if 398it is still found to be dirty. It returns zero if the page was successfully 399cleaned, or an error value if not. Note that in order to prevent the page 400getting mapped back in and redirtied, it needs to be kept locked 401across the entire operation. 402 403->swap_activate will be called with a non-zero argument on 404files backing (non block device backed) swapfiles. 
A return value 405of zero indicates success, in which case this file can be used for 406backing swapspace. The swapspace operations will be proxied to the 407address space operations. 408 409->swap_deactivate() will be called in the sys_swapoff() 410path after ->swap_activate() returned success. 411 412file_lock_operations 413==================== 414 415prototypes:: 416 417 void (*fl_copy_lock)(struct file_lock *, struct file_lock *); 418 void (*fl_release_private)(struct file_lock *); 419 420 421locking rules: 422 423=================== ============= ========= 424ops inode->i_lock may block 425=================== ============= ========= 426fl_copy_lock: yes no 427fl_release_private: maybe maybe[1]_ 428=================== ============= ========= 429 430.. [1]: 431 ->fl_release_private for flock or POSIX locks is currently allowed 432 to block. Leases however can still be freed while the i_lock is held and 433 so fl_release_private called on a lease should not block. 434 435lock_manager_operations 436======================= 437 438prototypes:: 439 440 void (*lm_notify)(struct file_lock *); /* unblock callback */ 441 int (*lm_grant)(struct file_lock *, struct file_lock *, int); 442 void (*lm_break)(struct file_lock *); /* break_lease callback */ 443 int (*lm_change)(struct file_lock **, int); 444 bool (*lm_breaker_owns_lease)(struct file_lock *); 445 446locking rules: 447 448====================== ============= ================= ========= 449ops inode->i_lock blocked_lock_lock may block 450====================== ============= ================= ========= 451lm_notify: yes yes no 452lm_grant: no no no 453lm_break: yes no no 454lm_change yes no no 455lm_breaker_owns_lease: no no no 456====================== ============= ================= ========= 457 458buffer_head 459=========== 460 461prototypes:: 462 463 void (*b_end_io)(struct buffer_head *bh, int uptodate); 464 465locking rules: 466 467called from interrupts. In other words, extreme care is needed here. 
468bh is locked, but that's all the guarantees we have here. Currently only RAID1, 469highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices 470call this method upon I/O completion. 471 472block_device_operations 473======================= 474prototypes:: 475 476 int (*open) (struct block_device *, fmode_t); 477 int (*release) (struct gendisk *, fmode_t); 478 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 479 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 480 int (*direct_access) (struct block_device *, sector_t, void **, 481 unsigned long *); 482 void (*unlock_native_capacity) (struct gendisk *); 483 int (*getgeo)(struct block_device *, struct hd_geometry *); 484 void (*swap_slot_free_notify) (struct block_device *, unsigned long); 485 486locking rules: 487 488======================= =================== 489ops open_mutex 490======================= =================== 491open: yes 492release: yes 493ioctl: no 494compat_ioctl: no 495direct_access: no 496unlock_native_capacity: no 497getgeo: no 498swap_slot_free_notify: no (see below) 499======================= =================== 500 501swap_slot_free_notify is called with swap_lock and sometimes the page lock 502held.
503 504 505file_operations 506=============== 507 508prototypes:: 509 510 loff_t (*llseek) (struct file *, loff_t, int); 511 ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); 512 ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); 513 ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); 514 ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); 515 int (*iopoll) (struct kiocb *kiocb, bool spin); 516 int (*iterate) (struct file *, struct dir_context *); 517 int (*iterate_shared) (struct file *, struct dir_context *); 518 __poll_t (*poll) (struct file *, struct poll_table_struct *); 519 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 520 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 521 int (*mmap) (struct file *, struct vm_area_struct *); 522 int (*open) (struct inode *, struct file *); 523 int (*flush) (struct file *); 524 int (*release) (struct inode *, struct file *); 525 int (*fsync) (struct file *, loff_t start, loff_t end, int datasync); 526 int (*fasync) (int, struct file *, int); 527 int (*lock) (struct file *, int, struct file_lock *); 528 ssize_t (*sendpage) (struct file *, struct page *, int, size_t, 529 loff_t *, int); 530 unsigned long (*get_unmapped_area)(struct file *, unsigned long, 531 unsigned long, unsigned long, unsigned long); 532 int (*check_flags)(int); 533 int (*flock) (struct file *, int, struct file_lock *); 534 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, 535 size_t, unsigned int); 536 ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, 537 size_t, unsigned int); 538 int (*setlease)(struct file *, long, struct file_lock **, void **); 539 long (*fallocate)(struct file *, int, loff_t, loff_t); 540 void (*show_fdinfo)(struct seq_file *m, struct file *f); 541 unsigned (*mmap_capabilities)(struct file *); 542 ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 543 loff_t, size_t, unsigned int); 
544 loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, 545 struct file *file_out, loff_t pos_out, 546 loff_t len, unsigned int remap_flags); 547 int (*fadvise)(struct file *, loff_t, loff_t, int); 548 549locking rules: 550 All may block. 551 552->llseek() locking has moved from llseek to the individual llseek 553implementations. If your fs is not using generic_file_llseek, you 554need to acquire and release the appropriate locks in your ->llseek(). 555For many filesystems, it is probably safe to acquire the inode 556mutex or just to use i_size_read() instead. 557Note: this does not protect the file->f_pos against concurrent modifications 558since this is something the userspace has to take care about. 559 560->iterate() is called with i_rwsem exclusive. 561 562->iterate_shared() is called with i_rwsem at least shared. 563 564->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. 565Most instances call fasync_helper(), which does that maintenance, so it's 566not normally something one needs to worry about. Return values > 0 will be 567mapped to zero in the VFS layer. 568 569->readdir() and ->ioctl() on directories must be changed. Ideally we would 570move ->readdir() to inode_operations and use a separate method for directory 571->ioctl() or kill the latter completely. One of the problems is that for 572anything that resembles union-mount we won't have a struct file for all 573components. And there are other reasons why the current interface is a mess... 574 575->read on directories probably must go away - we should just enforce -EISDIR 576in sys_read() and friends. 577 578->setlease operations should call generic_setlease() before or after setting 579the lease within the individual filesystem to record the result of the 580operation 581 582->fallocate implementation must be really careful to maintain page cache 583consistency when punching holes or performing other operations that invalidate 584page cache contents. 
Usually the filesystem needs to call 585truncate_inode_pages_range() to invalidate relevant range of the page cache. 586However the filesystem usually also needs to update its internal (and on disk) 587view of file offset -> disk block mapping. Until this update is finished, the 588filesystem needs to block page faults and reads from reloading now-stale page 589cache contents from the disk. Since VFS acquires mapping->invalidate_lock in 590shared mode when loading pages from disk (filemap_fault(), filemap_read(), 591readahead paths), the fallocate implementation must take the invalidate_lock to 592prevent reloading. 593 594->copy_file_range and ->remap_file_range implementations need to serialize 595against modifications of file data while the operation is running. For 596blocking changes through write(2) and similar operations inode->i_rwsem can be 597used. To block changes to file contents via a memory mapping during the 598operation, the filesystem must take mapping->invalidate_lock to coordinate 599with ->page_mkwrite. 600 601dquot_operations 602================ 603 604prototypes:: 605 606 int (*write_dquot) (struct dquot *); 607 int (*acquire_dquot) (struct dquot *); 608 int (*release_dquot) (struct dquot *); 609 int (*mark_dirty) (struct dquot *); 610 int (*write_info) (struct super_block *, int); 611 612These operations are intended to be more or less wrapping functions that ensure 613a proper locking wrt the filesystem and call the generic quota operations. 
614 615What filesystem should expect from the generic quota functions: 616 617============== ============ ========================= 618ops FS recursion Held locks when called 619============== ============ ========================= 620write_dquot: yes dqonoff_sem or dqptr_sem 621acquire_dquot: yes dqonoff_sem or dqptr_sem 622release_dquot: yes dqonoff_sem or dqptr_sem 623mark_dirty: no - 624write_info: yes dqonoff_sem 625============== ============ ========================= 626 627FS recursion means calling ->quota_read() and ->quota_write() from superblock 628operations. 629 630More details about quota locking can be found in fs/dquot.c. 631 632vm_operations_struct 633==================== 634 635prototypes:: 636 637 void (*open)(struct vm_area_struct*); 638 void (*close)(struct vm_area_struct*); 639 vm_fault_t (*fault)(struct vm_area_struct*, struct vm_fault *); 640 vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); 641 vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); 642 int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); 643 644locking rules: 645 646============= ========= =========================== 647ops mmap_lock PageLocked(page) 648============= ========= =========================== 649open: yes 650close: yes 651fault: yes can return with page locked 652map_pages: yes 653page_mkwrite: yes can return with page locked 654pfn_mkwrite: yes 655access: yes 656============= ========= =========================== 657 658->fault() is called when a previously not present pte is about to be faulted 659in. The filesystem must find and return the page associated with the passed in 660"pgoff" in the vm_fault structure. If it is possible that the page may be 661truncated and/or invalidated, then the filesystem must lock invalidate_lock, 662then ensure the page is not already truncated (invalidate_lock will block 663subsequent truncate), and then return with VM_FAULT_LOCKED, and the page 664locked. 
The VM will unlock the page. 665 666->map_pages() is called when VM asks to map easy accessible pages. 667Filesystem should find and map pages associated with offsets from "start_pgoff" 668till "end_pgoff". ->map_pages() is called with page table locked and must 669not block. If it's not possible to reach a page without blocking, 670filesystem should skip it. Filesystem should use do_set_pte() to setup 671page table entry. Pointer to entry associated with the page is passed in 672"pte" field in vm_fault structure. Pointers to entries for other offsets 673should be calculated relative to "pte". 674 675->page_mkwrite() is called when a previously read-only pte is about to become 676writeable. The filesystem again must ensure that there are no 677truncate/invalidate races or races with operations such as ->remap_file_range 678or ->copy_file_range, and then return with the page locked. Usually 679mapping->invalidate_lock is suitable for proper serialization. If the page has 680been truncated, the filesystem should not look up a new page like the ->fault() 681handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to 682retry the fault. 683 684->pfn_mkwrite() is the same as page_mkwrite but when the pte is 685VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is 686VM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior 687after this call is to make the pte read-write, unless pfn_mkwrite returns 688an error. 689 690->access() is called when get_user_pages() fails in 691access_process_vm(), typically used to debug a process through 692/proc/pid/mem or ptrace. This function is needed only for 693VM_IO | VM_PFNMAP VMAs. 694 695-------------------------------------------------------------------------------- 696 697 Dubious stuff 698 699(if you break something or notice that it is broken and do not fix it yourself 700- at least put it here) 701