// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2024 Google LLC.

//! This module has utilities for managing a page range where unused pages may be reclaimed by a
//! vma shrinker.

// To avoid deadlocks, locks are taken in the order:
//
//  1. mmap lock
//  2. spinlock
//  3. lru spinlock
//
// The shrinker will use trylock methods because it locks them in a different order.

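// A minimal sketch of what that ordering means in practice (illustrative only,
// not code from this module):
//
//     let mmap = mm.mmap_read_lock();   // 1. mmap lock
//     let inner = range.lock.lock();    // 2. spinlock
//     /* 3. the lru spinlock is taken internally by the list_lru_* helpers */
//
// `rust_shrink_free_page` below is entered with the lru spinlock already held,
// i.e. in the reverse order, which is why it only uses trylock variants.
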
use core::{
    alloc::Layout,
    marker::PhantomPinned,
    mem::{size_of, size_of_val, MaybeUninit},
    ptr::{self, NonNull},
};

use kernel::{
    alloc::allocator::KVmalloc,
    alloc::Allocator,
    bindings,
    error::Result,
    ffi::{c_ulong, c_void},
    mm::{virt, Mm, MmWithUser},
    new_mutex, new_spinlock,
    page::{Page, PAGE_SHIFT, PAGE_SIZE},
    prelude::*,
    str::CStr,
    sync::{Mutex, SpinLock},
    task::Pid,
    types::ARef,
    types::{FromBytes, Opaque},
    uaccess::UserSliceReader,
};

/// Represents a shrinker that can be registered with the kernel.
///
/// Each shrinker can be used by many `ShrinkablePageRange` objects.
#[repr(C)]
pub(crate) struct Shrinker {
    inner: Opaque<*mut bindings::shrinker>,
    list_lru: Opaque<bindings::list_lru>,
}

unsafe impl Send for Shrinker {}
unsafe impl Sync for Shrinker {}

impl Shrinker {
    /// Create a new shrinker.
    ///
    /// # Safety
    ///
    /// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have
    /// been called exactly once, and it must not have returned an error.
    pub(crate) const unsafe fn new() -> Self {
        Self {
            inner: Opaque::uninit(),
            list_lru: Opaque::uninit(),
        }
    }

    /// Register this shrinker with the kernel.
    pub(crate) fn register(&'static self, name: &CStr) -> Result<()> {
        // SAFETY: These fields are not yet used, so it's okay to zero them.
        unsafe {
            self.inner.get().write(ptr::null_mut());
            self.list_lru.get().write_bytes(0, 1);
        }

        // SAFETY: The field is not yet used, so we can initialize it.
        let ret = unsafe {
            bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut(), ptr::null_mut())
        };
        if ret != 0 {
            return Err(Error::from_errno(ret));
        }

        // SAFETY: The `name` points at a valid C string.
        let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) };
        if shrinker.is_null() {
            // SAFETY: We initialized it, so it's okay to destroy it.
            unsafe { bindings::list_lru_destroy(self.list_lru.get()) };
            return Err(ENOMEM);
        }

        // SAFETY: We're about to register the shrinker, and these are the fields we need to
        // initialize. (All other fields are already zeroed.)
        unsafe {
            ptr::addr_of_mut!((*shrinker).count_objects).write(Some(rust_shrink_count));
            ptr::addr_of_mut!((*shrinker).scan_objects).write(Some(rust_shrink_scan));
            ptr::addr_of_mut!((*shrinker).private_data).write(self.list_lru.get().cast());
        }

        // SAFETY: The new shrinker has been fully initialized, so we can register it.
        unsafe { bindings::shrinker_register(shrinker) };

        // SAFETY: This initializes the pointer to the shrinker so that we can use it.
        unsafe { self.inner.get().write(shrinker) };

        Ok(())
    }
}
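
// Example (sketch): a user of this type would typically declare one static
// `Shrinker` and register it once during module initialisation. The name
// `MY_SHRINKER` and the init context below are illustrative assumptions, not
// part of this file:
//
//     static MY_SHRINKER: Shrinker = unsafe { Shrinker::new() };
//
//     fn init() -> Result<()> {
//         // Satisfies the safety contract of `new`: register exactly once.
//         MY_SHRINKER.register(kernel::c_str!("my_driver_pages"))?;
//         Ok(())
//     }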

/// A container that manages a page range in a vma.
///
/// The pages can be thought of as an array of booleans indicating whether each page is usable.
/// The methods `use_range` and `stop_using_range` set all booleans in a range to true or false
/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed
/// immediately. Instead, it is made available to the memory shrinker to free it if the device is
/// under memory pressure.
///
/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no
/// way to know whether an index ends up with true or false if a call to `use_range` races with
/// another call to `stop_using_range` on a given index.
///
/// It's also okay for the two methods to race with themselves, e.g. if two threads call
/// `use_range` on the same index, then that's fine and neither call will return until the page is
/// allocated and mapped.
///
/// The methods that read or write to a range require that the page is marked as in use. So it is
/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or
/// write to the page.
#[pin_data(PinnedDrop)]
pub(crate) struct ShrinkablePageRange {
    /// Shrinker object registered with the kernel.
    shrinker: &'static Shrinker,
    /// Pid using this page range. Only used as debugging information.
    pid: Pid,
    /// The mm for the relevant process.
    mm: ARef<Mm>,
    /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`.
    #[pin]
    mm_lock: Mutex<()>,
    /// Spinlock protecting changes to pages.
    #[pin]
    lock: SpinLock<Inner>,

    /// Must not move, since page info has pointers back.
    #[pin]
    _pin: PhantomPinned,
}
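
// Example (sketch) of the intended lifecycle, with hypothetical call sites;
// the surrounding driver code and `MY_SHRINKER` are assumptions made for
// illustration only:
//
//     // At open time, tie the range to the current process and a shrinker.
//     let range = KBox::pin_init(ShrinkablePageRange::new(&MY_SHRINKER), GFP_KERNEL)?;
//     // In the mmap handler, hand the vma to the range.
//     let num_pages = range.register_with_vma(vma)?;
//     // Mark pages in use before touching them...
//     range.use_range(0, 1)?;
//     // SAFETY: pages 0..1 are in use for the duration of the call.
//     unsafe { range.write(0, &42u32)? };
//     // ...and give them back to the shrinker when done.
//     range.stop_using_range(0, 1);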

struct Inner {
    /// Array of pages.
    ///
    /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive
    /// ownership. To deal with that, we manage it using raw pointers.
    pages: *mut PageInfo,
    /// Length of the `pages` array.
    size: usize,
    /// The address of the vma to insert the pages into.
    vma_addr: usize,
}

unsafe impl Send for ShrinkablePageRange {}
unsafe impl Sync for ShrinkablePageRange {}

type StableMmGuard =
    kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>;

/// An array element that describes the current state of a page.
///
/// There are three states:
///
///  * Free. The page is None. The `lru` element is not queued.
///  * Available. The page is Some. The `lru` element is queued to the shrinker's lru.
///  * Used. The page is Some. The `lru` element is not queued.
///
/// When an element is available, the shrinker is able to free the page.
#[repr(C)]
struct PageInfo {
    lru: bindings::list_head,
    page: Option<Page>,
    range: *const ShrinkablePageRange,
}
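
// State transitions (sketch, derived from the methods below): `use_page_slow`
// takes an entry Free -> Used, `stop_using_range` moves it Used -> Available
// by queueing it on the lru, `use_range` moves it Available -> Used by
// dequeueing it, and the shrinker callback moves it Available -> Free by
// taking the page.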

impl PageInfo {
    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok.
    unsafe fn has_page(me: *const PageInfo) -> bool {
        // SAFETY: This pointer offset is in bounds.
        let page = unsafe { ptr::addr_of!((*me).page) };

        // SAFETY: The pointer is valid for reading.
        unsafe { (*page).is_some() }
    }

    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set.
    unsafe fn set_page(me: *mut PageInfo, page: Page) {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of_mut!((*me).page) };

        // SAFETY: The pointer is valid for writing, so also valid for reading.
        if unsafe { (*ptr).is_some() } {
            pr_err!("set_page called when there is already a page");
            // SAFETY: We will initialize the page again below.
            unsafe { ptr::drop_in_place(ptr) };
        }

        // SAFETY: The pointer is valid for writing.
        unsafe { ptr::write(ptr, Some(page)) };
    }

    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok for the duration of 'a.
    unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of!((*me).page) };

        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).as_ref() }
    }

    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok.
    unsafe fn take_page(me: *mut PageInfo) -> Option<Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { ptr::addr_of_mut!((*me).page) };

        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).take() }
    }

    /// Add this page to the lru list, if not already in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker.
    unsafe fn list_lru_add(me: *mut PageInfo, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { ptr::addr_of_mut!((*me).lru) };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_add_obj(shrinker.list_lru.get(), lru_ptr) };
    }

    /// Remove this page from the lru list, if it is in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker.
    unsafe fn list_lru_del(me: *mut PageInfo, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { ptr::addr_of_mut!((*me).lru) };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_del_obj(shrinker.list_lru.get(), lru_ptr) };
    }
}

impl ShrinkablePageRange {
    /// Create a new `ShrinkablePageRange` using the given shrinker.
    pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit<Self, Error> {
        try_pin_init!(Self {
            shrinker,
            pid: kernel::current!().pid(),
            mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?),
            mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"),
            lock <- new_spinlock!(Inner {
                pages: ptr::null_mut(),
                size: 0,
                vma_addr: 0,
            }, "ShrinkablePageRange"),
            _pin: PhantomPinned,
        })
    }

    pub(crate) fn stable_trylock_mm(&self) -> Option<StableMmGuard> {
        // SAFETY: This extends the duration of the reference. Since this call happens before
        // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block
        // until the returned guard is dropped. This ensures that the guard is valid until dropped.
        let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) };

        mm_lock.try_lock()
    }

    /// Register a vma with this page range. Returns the size of the region.
    pub(crate) fn register_with_vma(&self, vma: &virt::VmaNew) -> Result<usize> {
        let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize);
        let num_pages = num_bytes >> PAGE_SHIFT;

        if !ptr::eq::<Mm>(&*self.mm, &**vma.mm()) {
            pr_debug!("Failed to register with vma: invalid vma->vm_mm");
            return Err(EINVAL);
        }
        if num_pages == 0 {
            pr_debug!("Failed to register with vma: size zero");
            return Err(EINVAL);
        }

        let layout = Layout::array::<PageInfo>(num_pages).map_err(|_| ENOMEM)?;
        // SAFETY: The layout has non-zero size.
        let pages = KVmalloc::alloc(layout, GFP_KERNEL)?.cast::<PageInfo>();

        // SAFETY: This just initializes the pages array.
        unsafe {
            let self_ptr = self as *const ShrinkablePageRange;
            for i in 0..num_pages {
                let info = pages.add(i).as_ptr();
                ptr::addr_of_mut!((*info).range).write(self_ptr);
                ptr::addr_of_mut!((*info).page).write(None);
                let lru = ptr::addr_of_mut!((*info).lru);
                ptr::addr_of_mut!((*lru).next).write(lru);
                ptr::addr_of_mut!((*lru).prev).write(lru);
            }
        }

        let mut inner = self.lock.lock();
        if inner.size > 0 {
            pr_debug!("Failed to register with vma: already registered");
            drop(inner);
            // SAFETY: The `pages` array was allocated with the same layout.
            unsafe { KVmalloc::free(pages.cast(), layout) };
            return Err(EBUSY);
        }

        inner.pages = pages.as_ptr();
        inner.size = num_pages;
        inner.vma_addr = vma.start();

        Ok(num_pages)
    }

    /// Make sure that the given pages are allocated and mapped.
    ///
    /// Must not be called from an atomic context.
    pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> {
        crate::trace::trace_update_page_range(self.pid, true, start, end);

        if start >= end {
            return Ok(());
        }
        let mut inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in start..end {
            // SAFETY: This pointer offset is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
            if unsafe { PageInfo::has_page(page_info) } {
                crate::trace::trace_alloc_lru_start(self.pid, i);

                // Since we're going to use the page, we should remove it from the lru list so that
                // the shrinker will not free it.
                //
                // SAFETY: The pointer is valid, and this is the right shrinker.
                //
                // The shrinker can't free the page between the check and this call to
                // `list_lru_del` because we hold the lock.
                unsafe { PageInfo::list_lru_del(page_info, self.shrinker) };

                crate::trace::trace_alloc_lru_end(self.pid, i);
            } else {
                // We have to allocate a new page. Use the slow path.
                drop(inner);
                crate::trace::trace_alloc_page_start(self.pid, i);
                match self.use_page_slow(i) {
                    Ok(()) => {}
                    Err(err) => {
                        pr_warn!("Error in use_page_slow: {:?}", err);
                        return Err(err);
                    }
                }
                crate::trace::trace_alloc_page_end(self.pid, i);
                inner = self.lock.lock();
            }
        }
        Ok(())
    }

    /// Mark the given page as in use, slow path.
    ///
    /// Must not be called from an atomic context.
    ///
    /// # Safety
    ///
    /// Assumes that `i` is in bounds.
    #[cold]
    fn use_page_slow(&self, i: usize) -> Result<()> {
        let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?;

        let mm_mutex = self.mm_lock.lock();
        let inner = self.lock.lock();

        // SAFETY: This pointer offset is in bounds.
        let page_info = unsafe { inner.pages.add(i) };

        // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
        if unsafe { PageInfo::has_page(page_info) } {
            // The page was already there, or someone else added the page while we didn't hold the
            // spinlock.
            //
            // SAFETY: The pointer is valid, and this is the right shrinker.
            //
            // The shrinker can't free the page between the check and this call to
            // `list_lru_del` because we hold the lock.
            unsafe { PageInfo::list_lru_del(page_info, self.shrinker) };
            return Ok(());
        }

        let vma_addr = inner.vma_addr;
        // Release the spinlock while we insert the page into the vma.
        drop(inner);

        // No overflow since we stay in bounds of the vma.
        let user_page_addr = vma_addr + (i << PAGE_SHIFT);

        // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from
        // a remote process. If the call to `mmput` races with the process shutting down, then the
        // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't
        // happen until it returns to userspace. However, the caller might instead go to sleep and
        // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the
        // middle of a shutdown process that won't complete until the `mm` is dropped. This can
        // amount to a deadlock.
        //
        // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a
        // workqueue.
        MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?)
            .mmap_read_lock()
            .vma_lookup(vma_addr)
            .ok_or(ESRCH)?
            .as_mixedmap_vma()
            .ok_or(ESRCH)?
            .vm_insert_page(user_page_addr, &new_page)
            .inspect_err(|err| {
                pr_warn!(
                    "Failed to vm_insert_page({}): vma_addr:{} i:{} err:{:?}",
                    user_page_addr,
                    vma_addr,
                    i,
                    err
                )
            })?;

        let inner = self.lock.lock();

        // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page
        // can be written to since we hold the lock.
        //
        // We released and reacquired the spinlock since we checked that the page is null, but we
        // always hold the mm_lock mutex when setting the page to a non-null value, so it's not
        // possible for someone else to have changed it since our check.
        unsafe { PageInfo::set_page(page_info, new_page) };

        drop(inner);
        drop(mm_mutex);

        Ok(())
    }

    /// If the given page is in use, then mark it as available so that the shrinker can free it.
    ///
    /// May be called from an atomic context.
    pub(crate) fn stop_using_range(&self, start: usize, end: usize) {
        crate::trace::trace_update_page_range(self.pid, false, start, end);

        if start >= end {
            return;
        }
        let inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in (start..end).rev() {
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: Okay for reading since we have the lock.
            if unsafe { PageInfo::has_page(page_info) } {
                crate::trace::trace_free_lru_start(self.pid, i);

                // SAFETY: The pointer is valid, and it's the right shrinker.
                unsafe { PageInfo::list_lru_add(page_info, self.shrinker) };

                crate::trace::trace_free_lru_end(self.pid, i);
            }
        }
    }

    /// Helper for reading or writing to a range of bytes that may overlap with several pages.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    unsafe fn iterate<T>(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result
    where
        T: FnMut(&Page, usize, usize) -> Result,
    {
        if size == 0 {
            return Ok(());
        }

        // SAFETY: The caller promises that the pages touched by this call are in use. It's only
        // possible for a page to be in use if we have already been registered with a vma, and we
        // only change the `pages` and `size` fields during registration with a vma, so the values
        // read here remain valid after the lock is dropped.
        let (pages, num_pages) = {
            let inner = self.lock.lock();
            (inner.pages, inner.size)
        };
        let num_bytes = num_pages << PAGE_SHIFT;

        // Check that the request is within the buffer.
        if offset.checked_add(size).ok_or(EFAULT)? > num_bytes {
            return Err(EFAULT);
        }

        let mut page_index = offset >> PAGE_SHIFT;
        offset &= PAGE_SIZE - 1;
        while size > 0 {
            let available = usize::min(size, PAGE_SIZE - offset);
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { pages.add(page_index) };
            // SAFETY: The caller guarantees that this page is in the "in use" state for the
            // duration of this call to `iterate`, so nobody will change the page.
            let page = unsafe { PageInfo::get_page(page_info) };
            if page.is_none() {
                pr_warn!("Page is null!");
            }
            let page = page.ok_or(EFAULT)?;
            cb(page, offset, available)?;
            size -= available;
            page_index += 1;
            offset = 0;
        }
        Ok(())
    }
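
    // Worked example for `iterate` (assuming PAGE_SIZE == 4096): a call with
    // offset = 4000 and size = 200 starts at page_index 0 with an in-page
    // offset of 4000, so the first callback invocation gets
    // (page 0, offset 4000, len 96); the remaining 104 bytes are taken from
    // (page 1, offset 0, len 104).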

    /// Copy from userspace into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn copy_from_user_slice(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`.
        unsafe {
            self.iterate(offset, size, |page, offset, to_copy| {
                page.copy_from_user_slice_raw(reader, offset, to_copy)
            })
        }
    }

    /// Copy from this page range into kernel space.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        let mut out = MaybeUninit::<T>::uninit();
        let mut out_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `read`.
        unsafe {
            self.iterate(offset, size_of::<T>(), |page, offset, to_copy| {
                // SAFETY: The sum of `out_offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset);
                // SAFETY: The pointer is in bounds of the `out` variable, so it is valid.
                page.read_raw(obj_ptr, offset, to_copy)?;
                out_offset += to_copy;
                Ok(())
            })?;
        }
        // SAFETY: We just initialised the data.
        Ok(unsafe { out.assume_init() })
    }

    /// Copy from kernel space into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
        let mut obj_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `write`.
        unsafe {
            self.iterate(offset, size_of_val(obj), |page, offset, to_copy| {
                // SAFETY: The sum of `obj_offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (obj as *const T as *const u8).add(obj_offset);
                // SAFETY: We have a reference to the object, so the pointer is valid.
                page.write_raw(obj_ptr, offset, to_copy)?;
                obj_offset += to_copy;
                Ok(())
            })
        }
    }

    /// Write zeroes to the given range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `fill_zero`.
        unsafe {
            self.iterate(offset, size, |page, offset, len| {
                page.fill_zero_raw(offset, len)
            })
        }
    }
}

#[pinned_drop]
impl PinnedDrop for ShrinkablePageRange {
    fn drop(self: Pin<&mut Self>) {
        let (pages, size) = {
            let lock = self.lock.lock();
            (lock.pages, lock.size)
        };

        if size == 0 {
            return;
        }

        // This is the destructor, so unlike the other methods, we only need to worry about races
        // with the shrinker here.
        for i in 0..size {
            // SAFETY: The pointer is valid and it's the right shrinker.
            unsafe { PageInfo::list_lru_del(pages.add(i), self.shrinker) };
            // SAFETY: If the shrinker was going to free this page, then it would have taken it
            // from the PageInfo before releasing the lru lock. Thus, the call to `list_lru_del`
            // will either remove it before the shrinker can access it, or the shrinker will
            // already have taken the page at this point.
            unsafe { drop(PageInfo::take_page(pages.add(i))) };
        }

        // Wait for users of the mutex to go away. This call is necessary for the safety of
        // `stable_trylock_mm`.
        drop(self.mm_lock.lock());

        let Some(pages) = NonNull::new(pages) else {
            return;
        };

        // SAFETY: This computation did not overflow when allocating the pages array, so it will
        // not overflow this time.
        let layout = unsafe { Layout::array::<PageInfo>(size).unwrap_unchecked() };

        // SAFETY: The `pages` array was allocated with the same layout.
        unsafe { KVmalloc::free(pages.cast(), layout) };
    }
}

#[no_mangle]
unsafe extern "C" fn rust_shrink_count(
    shrink: *mut bindings::shrinker,
    _sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: We can access our own private data.
    let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe { bindings::list_lru_count(list_lru) }
}

#[no_mangle]
unsafe extern "C" fn rust_shrink_scan(
    shrink: *mut bindings::shrinker,
    sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: We can access our own private data.
    let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
    // SAFETY: Caller guarantees that it is safe to read this field.
    let nr_to_scan = unsafe { (*sc).nr_to_scan };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe {
        bindings::list_lru_walk(
            list_lru,
            Some(bindings::rust_shrink_free_page_wrap),
            ptr::null_mut(),
            nr_to_scan,
        )
    }
}

const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP;
const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY;

#[no_mangle]
unsafe extern "C" fn rust_shrink_free_page(
    item: *mut bindings::list_head,
    lru: *mut bindings::list_lru_one,
    lru_lock: *mut bindings::spinlock_t,
    _cb_arg: *mut c_void,
) -> bindings::lru_status {
    // Fields that should survive after unlocking the lru lock.
    let pid;
    let page;
    let page_index;
    let mm;
    let mmap_read;
    let mm_mutex;
    let vma_addr;

    {
        // SAFETY: The `list_head` field is first in `PageInfo`.
        let info = item as *mut PageInfo;
        let range = unsafe { &*((*info).range) };

        mm = match range.mm.mmget_not_zero() {
            Some(mm) => MmWithUser::into_mmput_async(mm),
            None => return LRU_SKIP,
        };

        mm_mutex = match range.stable_trylock_mm() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        mmap_read = match mm.mmap_read_trylock() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        // We can't lock it normally here, since we hold the lru lock.
        let inner = match range.lock.try_lock() {
            Some(inner) => inner,
            None => return LRU_SKIP,
        };

        // SAFETY: The item is in this lru list, so it's okay to remove it.
        unsafe { bindings::list_lru_isolate(lru, item) };

        // SAFETY: Both pointers are in bounds of the same allocation.
        page_index = unsafe { info.offset_from(inner.pages) } as usize;
        pid = range.pid;

        crate::trace::trace_unmap_kernel_start(pid, page_index);

        // SAFETY: We hold the spinlock, so we can take the page.
        //
        // This sets the page pointer to zero before we unmap it from the vma. However, we call
        // `zap_page_range_single` before we release the mmap lock, so `use_page_slow` will not be
        // able to insert a new page until after our call to `zap_page_range_single`.
        page = unsafe { PageInfo::take_page(info) };
        vma_addr = inner.vma_addr;

        crate::trace::trace_unmap_kernel_end(pid, page_index);

        // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because
        // they can be freed at any point after we unlock `lru_lock`. This is with the exception of
        // `mm_mutex`, which is kept alive by holding the lock.
    }

    // SAFETY: The lru lock is locked when this method is called.
    unsafe { bindings::spin_unlock(lru_lock) };

    if let Some(vma) = mmap_read.vma_lookup(vma_addr) {
        let user_page_addr = vma_addr + (page_index << PAGE_SHIFT);
        crate::trace::trace_unmap_user_start(pid, page_index);
        vma.zap_page_range_single(user_page_addr, PAGE_SIZE);
        crate::trace::trace_unmap_user_end(pid, page_index);
    }

    drop(mmap_read);
    drop(mm_mutex);
    drop(mm);
    drop(page);

    // SAFETY: We just unlocked the lru lock, but it should be locked when we return.
    unsafe { bindings::spin_lock(lru_lock) };

    LRU_REMOVED_ENTRY
}
774