// Copyright 2022 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
//! that provides fine-grained pinning for VFIO PCI-passthrough devices so
//! that the hypervisor doesn't need to pin the entire VM's memory, which
//! improves memory utilization. CoIOMMU doesn't provide intra-guest
//! protection, so it can only be used with TRUSTED passthrough devices.
//!
//! CoIOMMU was presented at KVM Forum 2020:
//! https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative
//! -dma-buffer-tracking-yu-zhang-intel
//!
//! It was also presented at USENIX ATC '20:
//! https://www.usenix.org/conference/atc20/presentation/tian

use std::collections::VecDeque;
use std::convert::TryInto;
use std::default::Default;
use std::panic;
use std::str::FromStr;
use std::sync::atomic::{fence, AtomicU32, Ordering};
use std::sync::Arc;
use std::time::Duration;
use std::{fmt, mem, thread};

use anyhow::{anyhow, bail, ensure, Context, Result};
use base::{
    error, info, AsRawDescriptor, Event, MemoryMapping, MemoryMappingBuilder, PollToken,
    RawDescriptor, SafeDescriptor, SharedMemory, Timer, Tube, TubeError, WaitContext,
};
use data_model::DataInit;
use hypervisor::Datamatch;
use resources::{Alloc, MmioType, SystemAllocator};
use serde::{Deserialize, Serialize};
use sync::Mutex;
use thiserror::Error as ThisError;

use vm_control::{VmMemoryDestination, VmMemoryRequest, VmMemoryResponse, VmMemorySource};
use vm_memory::{GuestAddress, GuestMemory};

use crate::pci::pci_configuration::{
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciClassCode, PciConfiguration,
    PciHeaderType, PciOtherSubclass, COMMAND_REG, COMMAND_REG_MEMORY_SPACE_MASK,
};
use crate::pci::pci_device::{BarRange, PciDevice, Result as PciResult};
use crate::pci::{PciAddress, PciDeviceError};
use crate::vfio::VfioContainer;
use crate::{UnpinRequest, UnpinResponse};

const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
const COIOMMU_CMD_DEACTIVATE: u64 = 0;
const COIOMMU_CMD_ACTIVATE: u64 = 1;
const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
const COIOMMU_REVISION_ID: u8 = 0x10;
const COIOMMU_MMIO_BAR: u8 = 0;
const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
const COIOMMU_NOTIFYMAP_BAR: u8 = 2;
const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
const PAGE_SIZE_4K: u64 = 4096;
const PAGE_SHIFT_4K: u64 = 12;
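// Bit 63 of a notify-register write distinguishes a batched pin request (the
// remaining bits then hold the GPA of a PinPageInfo descriptor) from a
// single-page request; see PinWorker::pin_pages below.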
const PIN_PAGES_IN_BATCH: u64 = 1 << 63;

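// Layout of a 32-bit DTT leaf entry: bit 31 marks the page as pinned on the
// host and bit 30 marks it as accessed by the guest since the last unpin
// pass. Judging from unpin_page below, the remaining low bits appear to hold
// the guest's DMA map count for the page.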
const DTTE_PINNED_FLAG: u32 = 1 << 31;
const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
const DTT_ENTRY_PRESENT: u64 = 1;
const DTT_ENTRY_PFN_SHIFT: u64 = 12;

#[derive(ThisError, Debug)]
enum Error {
    #[error("CoIommu failed to create shared memory")]
    CreateSharedMemory,
    #[error("Failed to get DTT entry")]
    GetDTTEntry,
    #[error("Tube error")]
    TubeError,
}

// The default unpin interval is 60 seconds.
const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
/// Holds the CoIommu unpin policy
#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)]
pub enum CoIommuUnpinPolicy {
    Off,
    Lru,
}

impl FromStr for CoIommuUnpinPolicy {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        match s {
            "off" => Ok(CoIommuUnpinPolicy::Off),
            "lru" => Ok(CoIommuUnpinPolicy::Lru),
            _ => Err(anyhow!("CoIommu doesn't have such an unpin policy: {}", s)),
        }
    }
}

impl fmt::Display for CoIommuUnpinPolicy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::CoIommuUnpinPolicy::*;

        match self {
            Off => write!(f, "off"),
            Lru => write!(f, "lru"),
        }
    }
}

/// Holds the parameters for a CoIommu device
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct CoIommuParameters {
    pub unpin_policy: CoIommuUnpinPolicy,
    pub unpin_interval: Duration,
    pub unpin_limit: Option<u64>,
    // Number of unpin intervals a pinned page must be busy for to be aged into the
    // older, less frequently checked generation.
    pub unpin_gen_threshold: u64,
}

impl Default for CoIommuParameters {
    fn default() -> Self {
        Self {
            unpin_policy: CoIommuUnpinPolicy::Off,
            unpin_interval: UNPIN_DEFAULT_INTERVAL,
            unpin_limit: None,
            unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
        }
    }
}

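// Guest-visible MMIO registers backing BAR0. The guest programs dtt_root and
// dtt_level once (they are write-once, see write_mmio) and then drives the
// device through the cmd register.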
#[derive(Default, Debug, Copy, Clone)]
struct CoIommuReg {
    dtt_root: u64,
    cmd: u64,
    dtt_level: u64,
}

#[derive(Default, Debug, Copy, Clone, PartialEq)]
struct PinnedPageInfo {
    gfn: u64,
    unpin_busy_cnt: u64,
}

impl PinnedPageInfo {
    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
        PinnedPageInfo {
            gfn,
            unpin_busy_cnt,
        }
    }
}

#[derive(PartialEq, Debug)]
enum UnpinThreadState {
    Unparked,
    Parked,
}

struct CoIommuPinState {
    new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
    old_gen_pinned_pages: VecDeque<u64>,
    unpin_thread_state: UnpinThreadState,
    unpin_park_count: u64,
}

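// Safety: callers are expected to ensure that `user_addr` points to at least
// `size` bytes of host memory that remains valid for the lifetime of the DMA
// mapping, since the VFIO container hands this range to the device for DMA.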
unsafe fn vfio_map(
    vfio_container: &Arc<Mutex<VfioContainer>>,
    iova: u64,
    size: u64,
    user_addr: u64,
) -> bool {
    match vfio_container
        .lock()
        .vfio_dma_map(iova, size, user_addr, true)
    {
        Ok(_) => true,
        Err(e) => {
            if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
                if errno == libc::EEXIST {
                    // Already pinned; set the PINNED flag.
                    error!("CoIommu: iova 0x{:x} already pinned", iova);
                    return true;
                }
            }
            error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
    match vfio_container.lock().vfio_dma_unmap(iova, size) {
        Ok(_) => true,
        Err(e) => {
            error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

#[derive(Default, Debug, Copy, Clone)]
#[repr(C)]
struct PinPageInfo {
    bdf: u16,
    pad: [u16; 3],
    nr_pages: u64,
}
// Safe because the PinPageInfo structure is raw data
unsafe impl DataInit for PinPageInfo {}

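// DTT geometry: with 4KiB pages, a non-leaf level holds 512 8-byte entries
// (a stride of 9 address bits), while the leaf level holds 1024 4-byte
// entries (a stride of 10 address bits).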
const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;

fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
    if level == 1 {
        return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
    }

    if level == 0 {
        bail!("Invalid level for gfn 0x{:x}", gfn);
    }

    let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;

    Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
}

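// Caches the most recently resolved DTT leaf entry pointer together with its
// GFN so that a subsequent lookup within the same leaf page can be computed
// with pointer arithmetic instead of a full table walk.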
struct DTTIter {
    ptr: *const u8,
    gfn: u64,
}

impl Default for DTTIter {
    fn default() -> Self {
        DTTIter {
            ptr: std::ptr::null(),
            gfn: 0,
        }
    }
}

// Get the DMA Tracking Table (DTT) entry associated with the given gfn.
//
// There are two ways to get the entry:
// #1. Walk the DMA Tracking Table (DTT) by the GFN to reach the
// corresponding entry. The DTT is shared between the frontend and the
// backend. It is a page-table-like structure whose entries are indexed
// by GFN. The argument dtt_root holds the GPA of the root page and
// dtt_level holds the maximum page table level.
//
// #2. Calculate the entry address via the argument dtt_iter. dtt_iter
// stores an entry address and the associated gfn. If the target gfn lies
// in the same page table page as the gfn in dtt_iter, then the target
// entry address can be calculated from the entry address stored in
// dtt_iter.
//
// As the DTT entry is shared between the frontend and the backend, accesses
// to it must be atomic. The returned value is therefore converted to an
// AtomicU32 pointer.
fn gfn_to_dtt_pte(
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<*const AtomicU32> {
    let ptr = if dtt_iter.ptr.is_null()
        || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
    {
        // Slow path: walk the DTT to get the pte entry.
        let mut level = dtt_level;
        let mut pt_gpa = dtt_root;
        let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;

        while level != 1 {
            let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
            let parent_pt = mem
                .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
                .context(Error::GetDTTEntry)?;

            if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
                bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
            }

            pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
            level -= 1;
        }

        let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;

        mem.get_host_address(GuestAddress(pt_gpa + index))
            .context(Error::GetDTTEntry)?
    } else {
        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
        // means the calculated ptr will point to the same page as dtt_iter.ptr.
        if gfn > dtt_iter.gfn {
            unsafe {
                dtt_iter
                    .ptr
                    .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
            }
        } else {
            unsafe {
                dtt_iter
                    .ptr
                    .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
            }
        }
    };

    dtt_iter.ptr = ptr;
    dtt_iter.gfn = gfn;

    Ok(ptr as *const AtomicU32)
}


fn pin_page(
    pinstate: &mut CoIommuPinState,
    policy: CoIommuUnpinPolicy,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<()> {
    let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;

    let gpa = (gfn << PAGE_SHIFT_4K) as u64;
    let host_addr = mem
        .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
        .context("failed to get host address")? as u64;

    // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
    // Test the PINNED flag.
    if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
        info!("CoIommu: gfn 0x{:x} already pinned", gfn);
        return Ok(());
    }

    // Safe because the gpa is valid per gfn_to_dtt_pte and the host_addr
    // is guaranteed by the MemoryMapping interface.
    if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
        // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
        // Set the PINNED flag.
        unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
        if policy == CoIommuUnpinPolicy::Lru {
            pinstate
                .new_gen_pinned_pages
                .push_back(PinnedPageInfo::new(gfn, 0));
        }
    }

    Ok(())
}

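// Outcome of a single unpin attempt. The LRU loop uses this to decide whether
// to count the page as unpinned, requeue it, or park the unpin thread.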
#[derive(PartialEq, Debug)]
enum UnpinResult {
    UnpinlistEmpty,
    Unpinned,
    NotPinned,
    NotUnpinned,
    FailedUnpin,
    UnpinParked,
}

fn unpin_page(
    pinstate: &mut CoIommuPinState,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
    force: bool,
) -> UnpinResult {
    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
        return UnpinResult::UnpinParked;
    }

    let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
        Ok(v) => v,
        Err(_) => {
            // The force == true case may try to unpin a page which is not
            // mapped in the DTT. For such a page the pte doesn't exist yet,
            // so there is no need to log an error.
            // The force == false case is used by CoIommu to periodically
            // unpin pages which have been mapped in the DTT, so the pte for
            // such a page does exist. However, with an unpin request from
            // virtio-balloon such pages can already be unpinned and the DTT
            // pages might have been reclaimed by the guest OS kernel as well,
            // so it is also possible to end up here. Don't log an error.
            return UnpinResult::NotPinned;
        }
    };

    if force {
        // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
        // This case is for the balloon to evict pages, so these pages should
        // already be locked by the balloon and no device driver in the VM is
        // able to access them. Just clear the ACCESSED flag first to make
        // sure the following unpin succeeds.
        unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
    }

    // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
    if let Err(entry) = unsafe {
        (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
    } {
        // The compare_exchange failed because the original leaf entry was
        // not exactly DTTE_PINNED_FLAG, so the unpin cannot be done.
        if entry == 0 {
            // The GFN is already unpinned. This is very similar to the
            // gfn_to_dtt_pte error case, with the only difference being
            // that the dtt_pte happens to be on a present page table.
            UnpinResult::NotPinned
        } else {
            if !force {
                // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
                // The ACCESSED flag is set by the guest if the guest requires a DMA
                // mapping for this page, i.e. it records whether the page has been
                // touched by the guest. By clearing the flag after an unpin pass, we
                // can detect whether the page was touched again before the next pass.
                // If the ACCESSED flag is set in the next round, unpinning the page
                // will fail and we will end up here again to clear the flag. If the
                // flag is not set in the next round, unpinning will probably succeed.
                unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
            } else {
                // If we're here, then the guest is trying to release a page via the
                // balloon that it still has pinned, which most likely means something
                // is wrong in the guest kernel. Just leave the page pinned and log
                // an error.
                // This failure blocks the balloon from removing the page, which ensures
                // that the guest's view of memory will remain consistent with device
                // DMA's view of memory. Also note that the host kernel maintains an
                // elevated refcount for pinned pages, which is a second guarantee that
                // pages accessible by device DMA won't be freed until after they are
                // unpinned.
                error!(
                    "CoIommu: force case cannot unpin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
            }
            // The GFN cannot be unpinned, either because its map count
            // is non-zero or because it has the ACCESSED flag set.
            UnpinResult::NotUnpinned
        }
    } else {
        // The compare_exchange succeeded: the original leaf entry was
        // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the
        // page.
        let gpa = (gfn << PAGE_SHIFT_4K) as u64;
        if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
            UnpinResult::Unpinned
        } else {
            // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
            // Make sure the PINNED flag stays set.
            unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
            // This gfn needs to be put back into the pinned vector.
            UnpinResult::FailedUnpin
        }
    }
}

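// Worker thread that services pin requests. Each per-vCPU notify register in
// the notify map is backed by an ioevent; when the guest writes one, this
// worker reads the request and pins the referenced page(s).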
struct PinWorker {
    mem: GuestMemory,
    endpoints: Vec<u16>,
    notifymap_mmap: Arc<MemoryMapping>,
    dtt_level: u64,
    dtt_root: u64,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl PinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuPinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(PollToken)]
        enum Token {
            Kill,
            Pin { index: usize },
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        for (index, event) in self.ioevents.iter().enumerate() {
            match wait_ctx.add(event, Token::Pin { index }) {
                Ok(_) => {}
                Err(e) => {
                    error!(
                        "{}: failed to add ioevent for index {}: {}",
                        self.debug_label(),
                        index,
                        e
                    );
                    return;
                }
            }
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => break 'wait,
                    Token::Pin { index } => {
                        let offset = index * mem::size_of::<u64>();
                        if let Some(event) = self.ioevents.get(index) {
                            if let Err(e) = event.read() {
                                error!(
                                    "{}: failed reading event {}: {}",
                                    self.debug_label(),
                                    index,
                                    e
                                );
                                self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                                break 'wait;
                            }
                        }
                        if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
                            if let Err(e) = self.pin_pages(data) {
                                error!("{}: {}", self.debug_label(), e);
                            }
                        }
                        fence(Ordering::SeqCst);
                        self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                    }
                }
            }
        }
    }

    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
        let pin_page_info = self
            .mem
            .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
            .context("failed to get pin page info")?;

        let bdf = pin_page_info.bdf;
        ensure!(
            self.endpoints.iter().any(|&x| x == bdf),
            "pin page for unexpected bdf 0x{:x}",
            bdf
        );

        let mut nr_pages = pin_page_info.nr_pages;
        let mut offset = mem::size_of::<PinPageInfo>() as u64;
        let mut dtt_iter: DTTIter = Default::default();
        let mut pinstate = self.pinstate.lock();
        while nr_pages > 0 {
            let gfn = self
                .mem
                .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
                .context("failed to get pin page gfn")?;

            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )?;

            offset += mem::size_of::<u64>() as u64;
            nr_pages -= 1;
        }

        Ok(())
    }

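    // A notify value either has PIN_PAGES_IN_BATCH (bit 63) set, with the
    // remaining bits holding the GPA of a PinPageInfo batch descriptor, or it
    // encodes a single request with the device BDF in the low 16 bits and the
    // GFN in the bits above them.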
    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
        if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
            let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
            self.pin_pages_in_batch(gpa)
        } else {
            let bdf = (gfn_bdf & 0xffff) as u16;
            let gfn = gfn_bdf >> 16;
            let mut dtt_iter: DTTIter = Default::default();
            ensure!(
                self.endpoints.iter().any(|&x| x == bdf),
                "pin page for unexpected bdf 0x{:x}",
                bdf
            );

            let mut pinstate = self.pinstate.lock();
            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )
        }
    }
}

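// Worker thread that unpins pages. It periodically ages and unpins idle pages
// when the LRU policy is enabled, and it services explicit unpin requests sent
// by the balloon device over unpin_tube.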
struct UnpinWorker {
    mem: GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    vfio_container: Arc<Mutex<VfioContainer>>,
    unpin_tube: Option<Tube>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
    unpin_gen_threshold: u64,
}

impl UnpinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuUnpinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(PollToken)]
        enum Token {
            UnpinTimer,
            UnpinReq,
            Kill,
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        if let Some(tube) = &self.unpin_tube {
            if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
                error!(
                    "{}: failed to add unpin tube to WaitContext: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        }

        let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
            && !self.params.unpin_interval.is_zero()
        {
            let duration = self.params.unpin_interval;
            let interval = Some(self.params.unpin_interval);
            let mut timer = match Timer::new() {
                Ok(t) => t,
                Err(e) => {
                    error!(
                        "{}: failed to create the unpin timer: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
            if let Err(e) = timer.reset(duration, interval) {
                error!(
                    "{}: failed to start the unpin timer: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
                error!(
                    "{}: failed to add timer to WaitContext: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            Some(timer)
        } else {
            None
        };

        let unpin_tube = self.unpin_tube.take();
        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::UnpinTimer => {
                        self.unpin_pages();
                        if let Some(timer) = &mut unpin_timer {
                            if let Err(e) = timer.wait() {
                                error!(
                                    "{}: failed to clear unpin timer: {}",
                                    self.debug_label(),
                                    e
                                );
                                break 'wait;
                            }
                        }
                    }
                    Token::UnpinReq => {
                        if let Some(tube) = &unpin_tube {
                            match tube.recv::<UnpinRequest>() {
                                Ok(req) => {
                                    let mut unpin_done = true;
                                    for range in req.ranges {
                                        // Locking with respect to pin_pages isn't necessary
                                        // for this case because the unpinned pages in the range
                                        // should all be in the balloon and so nothing will attempt
                                        // to pin them.
                                        if !self.unpin_pages_in_range(range.0, range.1) {
                                            unpin_done = false;
                                            break;
                                        }
                                    }
                                    let resp = if unpin_done {
                                        UnpinResponse::Success
                                    } else {
                                        UnpinResponse::Failed
                                    };
                                    if let Err(e) = tube.send(&resp) {
                                        error!(
                                            "{}: failed to send unpin response: {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                                Err(e) => {
                                    if let TubeError::Disconnected = e {
                                        if let Err(e) = wait_ctx.delete(tube) {
                                            error!(
                                                "{}: failed to remove unpin_tube: {}",
                                                self.debug_label(),
                                                e
                                            );
                                        }
                                    } else {
                                        error!(
                                            "{}: failed to recv unpin request: {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
        self.unpin_tube = unpin_tube;
    }

    fn unpin_pages(&mut self) {
        if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
            self.lru_unpin_pages();
        }
    }

    fn lru_unpin_page(
        &mut self,
        dtt_iter: &mut DTTIter,
        new_gen: bool,
    ) -> (UnpinResult, Option<PinnedPageInfo>) {
        let mut pinstate = self.pinstate.lock();
        let pageinfo = if new_gen {
            pinstate.new_gen_pinned_pages.pop_front()
        } else {
            pinstate
                .old_gen_pinned_pages
                .pop_front()
                .map(|gfn| PinnedPageInfo::new(gfn, 0))
        };

        pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
            (
                unpin_page(
                    &mut pinstate,
                    &self.vfio_container,
                    &self.mem,
                    self.dtt_level,
                    self.dtt_root,
                    dtt_iter,
                    pageinfo.gfn,
                    false,
                ),
                Some(pageinfo),
            )
        })
    }

    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
        let mut not_unpinned_new_gen_pages = VecDeque::new();
        let mut not_unpinned_old_gen_pages = VecDeque::new();
        let mut unpinned_count = 0;
        let has_limit = unpin_limit.is_some();
        let limit_count = unpin_limit.unwrap_or(0);
        let mut dtt_iter: DTTIter = Default::default();

        // If has_limit is true but limit_count is 0, no unpinning is done.
        while !has_limit || unpinned_count != limit_count {
            let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
            match result {
                UnpinResult::UnpinlistEmpty => break,
                UnpinResult::Unpinned => unpinned_count += 1,
                UnpinResult::NotPinned => {}
                UnpinResult::NotUnpinned => {
                    if let Some(mut page) = pinned_page {
                        if self.params.unpin_gen_threshold != 0 {
                            page.unpin_busy_cnt += 1;
                            // The page came off the new_gen queue but couldn't be
                            // unpinned, so check the unpin_gen threshold: once it
                            // is reached, move the page to the old_gen queue.
                            // Pages that didn't come from new_gen go straight back
                            // to the old_gen queue.
                            if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
                                not_unpinned_old_gen_pages.push_back(page.gfn);
                            } else {
                                not_unpinned_new_gen_pages.push_back(page);
                            }
                        }
                    }
                }
                UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Although UnpinParked means we didn't actually try to unpin
                    // the gfn, it's not worth handling specially since parking is
                    // expected to be relatively rare.
                    if let Some(page) = pinned_page {
                        if new_gen {
                            not_unpinned_new_gen_pages.push_back(page);
                        } else {
                            not_unpinned_old_gen_pages.push_back(page.gfn);
                        }
                    }
                    if result == UnpinResult::UnpinParked {
                        thread::park();
                    }
                }
            }
        }

        if !not_unpinned_new_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .new_gen_pinned_pages
                .append(&mut not_unpinned_new_gen_pages);
        }

        if !not_unpinned_old_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .old_gen_pinned_pages
                .append(&mut not_unpinned_old_gen_pages);
        }

        unpinned_count
    }

    fn lru_unpin_pages(&mut self) {
        let mut unpin_count = 0;
        if self.params.unpin_gen_threshold != 0 {
            self.unpin_gen_threshold += 1;
            if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
                self.unpin_gen_threshold = 0;
                // Try to unpin the inactive queue first once the threshold is reached.
                unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
            }
        }
        // Unpin the new_gen queue with whatever unpin_limit remains after
        // unpinning the old_gen queue.
        self.lru_unpin_pages_in_loop(
            self.params
                .unpin_limit
                .map(|limit| limit.saturating_sub(unpin_count)),
            true,
        );
    }

    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
        let mut dtt_iter: DTTIter = Default::default();
        let mut index = 0;
        while index != count {
            let mut pinstate = self.pinstate.lock();
            let result = unpin_page(
                &mut pinstate,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn + index,
                true,
            );
            drop(pinstate);

            match result {
                UnpinResult::Unpinned | UnpinResult::NotPinned => {}
                UnpinResult::UnpinParked => {
                    thread::park();
                    continue;
                }
                _ => {
                    error!("coiommu: force unpin failed: {:?}", result);
                    return false;
                }
            }
            index += 1;
        }
        true
    }
}

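// The CoIommu PCI device. It exposes three BARs: BAR0 for the CoIommuReg MMIO
// registers, BAR2 for the per-vCPU notify map, and BAR4 for the topology map
// listing the endpoint BDFs sitting behind CoIommu.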
pub struct CoIommuDev {
    config_regs: PciConfiguration,
    pci_address: Option<PciAddress>,
    mem: GuestMemory,
    coiommu_reg: CoIommuReg,
    endpoints: Vec<u16>,
    notifymap_mem: SafeDescriptor,
    notifymap_mmap: Arc<MemoryMapping>,
    notifymap_addr: Option<u64>,
    topologymap_mem: SafeDescriptor,
    topologymap_addr: Option<u64>,
    mmapped: bool,
    device_tube: Tube,
    pin_thread: Option<thread::JoinHandle<PinWorker>>,
    pin_kill_evt: Option<Event>,
    unpin_thread: Option<thread::JoinHandle<UnpinWorker>>,
    unpin_kill_evt: Option<Event>,
    unpin_tube: Option<Tube>,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl CoIommuDev {
    pub fn new(
        mem: GuestMemory,
        vfio_container: Arc<Mutex<VfioContainer>>,
        device_tube: Tube,
        unpin_tube: Tube,
        endpoints: Vec<u16>,
        vcpu_count: u64,
        params: CoIommuParameters,
    ) -> Result<Self> {
        let config_regs = PciConfiguration::new(
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            PciClassCode::Other,
            &PciOtherSubclass::Other,
            None, // No Programming interface.
            PciHeaderType::Device,
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            COIOMMU_REVISION_ID,
        );

        // notifymap_mem backs BAR2, which the guest uses to check whether a
        // request has been completed by CoIOMMU.
        let notifymap_mem = SharedMemory::named("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
            .context(Error::CreateSharedMemory)?;
        let notifymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
                .from_shared_memory(&notifymap_mem)
                .offset(0)
                .build()?,
        );

        // topologymap_mem backs BAR4, which the guest uses to discover which
        // devices sit on top of CoIOMMU.
        let topologymap_mem =
            SharedMemory::named("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
                .context(Error::CreateSharedMemory)?;
        let topologymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
                .from_shared_memory(&topologymap_mem)
                .offset(0)
                .build()?,
        );

        ensure!(
            (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
            "CoIommu: too many endpoints"
        );
        topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
        for (index, endpoint) in endpoints.iter().enumerate() {
            topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
        }

        let mut ioevents = Vec::new();
        for _ in 0..vcpu_count {
            ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
        }

        Ok(Self {
            config_regs,
            pci_address: None,
            mem,
            coiommu_reg: Default::default(),
            endpoints,
            notifymap_mem: notifymap_mem.into(),
            notifymap_mmap,
            notifymap_addr: None,
            topologymap_mem: topologymap_mem.into(),
            topologymap_addr: None,
            mmapped: false,
            device_tube,
            pin_thread: None,
            pin_kill_evt: None,
            unpin_thread: None,
            unpin_kill_evt: None,
            unpin_tube: Some(unpin_tube),
            ioevents,
            vfio_container,
            pinstate: Arc::new(Mutex::new(CoIommuPinState {
                new_gen_pinned_pages: VecDeque::new(),
                old_gen_pinned_pages: VecDeque::new(),
                unpin_thread_state: UnpinThreadState::Unparked,
                unpin_park_count: 0,
            })),
            params,
        })
    }

    fn send_msg(&self, msg: &VmMemoryRequest) -> Result<()> {
        self.device_tube.send(msg).context(Error::TubeError)?;
        let res = self.device_tube.recv().context(Error::TubeError)?;
        match res {
            VmMemoryResponse::RegisterMemory { .. } => Ok(()),
            VmMemoryResponse::Err(e) => Err(anyhow!("Receive msg err {}", e)),
            _ => Err(anyhow!("Msg cannot be handled")),
        }
    }

    fn register_mmap(
        &self,
        descriptor: SafeDescriptor,
        size: usize,
        offset: u64,
        gpa: u64,
        read_only: bool,
    ) -> Result<()> {
        let request = VmMemoryRequest::RegisterMemory {
            source: VmMemorySource::Descriptor {
                descriptor,
                offset,
                size: size as u64,
            },
            dest: VmMemoryDestination::GuestPhysicalAddress(gpa),
            read_only,
        };
        self.send_msg(&request)
    }

    fn mmap(&mut self) {
        if self.mmapped {
            return;
        }

        if let Some(gpa) = self.notifymap_addr {
            match self.register_mmap(
                self.notifymap_mem.try_clone().unwrap(),
                COIOMMU_NOTIFYMAP_SIZE,
                0,
                gpa,
                false,
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map notifymap failed: {}", self.debug_label(), e);
                }
            }
        }

        if let Some(gpa) = self.topologymap_addr {
            match self.register_mmap(
                self.topologymap_mem.try_clone().unwrap(),
                COIOMMU_TOPOLOGYMAP_SIZE,
                0,
                gpa,
                true,
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map topologymap failed: {}", self.debug_label(), e);
                }
            }
        }

        self.mmapped = true;
    }

    fn start_workers(&mut self) {
        if self.pin_thread.is_none() {
            self.start_pin_thread();
        }

        if self.unpin_thread.is_none() {
            self.start_unpin_thread();
        }
    }

    fn start_pin_thread(&mut self) {
        let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{}: failed creating kill Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };

        let mem = self.mem.clone();
        let endpoints = self.endpoints.to_vec();
        let notifymap_mmap = self.notifymap_mmap.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let ioevents = self
            .ioevents
            .iter()
            .map(|e| e.try_clone().unwrap())
            .collect();
        let vfio_container = self.vfio_container.clone();
        let pinstate = self.pinstate.clone();
        let params = self.params;

        let worker_result = thread::Builder::new()
            .name("coiommu_pin".to_string())
            .spawn(move || {
                let mut worker = PinWorker {
                    mem,
                    endpoints,
                    notifymap_mmap,
                    dtt_root,
                    dtt_level,
                    ioevents,
                    vfio_container,
                    pinstate,
                    params,
                };
                worker.run(kill_evt);
                worker
            });

        match worker_result {
            Err(e) => error!(
                "{}: failed to spawn coiommu pin worker: {}",
                self.debug_label(),
                e
            ),
            Ok(join_handle) => {
                self.pin_thread = Some(join_handle);
                self.pin_kill_evt = Some(self_kill_evt);
            }
        }
    }

    fn start_unpin_thread(&mut self) {
        let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{}: failed creating kill Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };

        let mem = self.mem.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let vfio_container = self.vfio_container.clone();
        let unpin_tube = self.unpin_tube.take();
        let pinstate = self.pinstate.clone();
        let params = self.params;
        let worker_result = thread::Builder::new()
            .name("coiommu_unpin".to_string())
            .spawn(move || {
                let mut worker = UnpinWorker {
                    mem,
                    dtt_level,
                    dtt_root,
                    vfio_container,
                    unpin_tube,
                    pinstate,
                    params,
                    unpin_gen_threshold: 0,
                };
                worker.run(kill_evt);
                worker
            });

        match worker_result {
            Err(e) => {
                error!(
                    "{}: failed to spawn coiommu unpin worker: {}",
                    self.debug_label(),
                    e
                );
            }
            Ok(join_handle) => {
                self.unpin_thread = Some(join_handle);
                self.unpin_kill_evt = Some(self_kill_evt);
            }
        }
    }

    fn allocate_bar_address(
        &mut self,
        resources: &mut SystemAllocator,
        address: PciAddress,
        size: u64,
        bar_num: u8,
        name: &str,
    ) -> PciResult<u64> {
        let addr = resources
            .mmio_allocator(MmioType::High)
            .allocate_with_align(
                size,
                Alloc::PciBar {
                    bus: address.bus,
                    dev: address.dev,
                    func: address.func,
                    bar: bar_num,
                },
                name.to_string(),
                size,
            )
            .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;

        let bar = PciBarConfiguration::new(
            bar_num as usize,
            size,
            PciBarRegionType::Memory64BitRegion,
            PciBarPrefetchable::Prefetchable,
        )
        .set_address(addr);

        self.config_regs
            .add_pci_bar(bar)
            .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;

        Ok(addr)
    }

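    // BAR0 register layout: offset 0x0 holds dtt_root, 0x8 holds cmd, and
    // 0x10 holds dtt_level. All register accesses must be 8 bytes wide and
    // 8-byte aligned.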
    fn read_mmio(&mut self, addr: u64, data: &mut [u8]) {
        let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let offset = addr - bar;
        if offset >= mem::size_of::<CoIommuReg>() as u64 {
            error!(
                "{}: read_mmio: invalid addr 0x{:x} bar 0x{:x} offset 0x{:x}",
                self.debug_label(),
                addr,
                bar,
                offset
            );
            return;
        }

        // Sanity check: the access must be 64-bit aligned.
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: read_mmio: unaligned access: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let v = match offset / 8 {
            0 => self.coiommu_reg.dtt_root,
            1 => self.coiommu_reg.cmd,
            2 => self.coiommu_reg.dtt_level,
            _ => return,
        };

        data.copy_from_slice(&v.to_ne_bytes());
    }

write_mmio(&mut self, addr: u64, data: &[u8])1321     fn write_mmio(&mut self, addr: u64, data: &[u8]) {
1322         let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
1323         let mmio_len = mem::size_of::<CoIommuReg>() as u64;
1324         let offset = addr - bar;
1325         if offset >= mmio_len {
1326             if data.len() != 1 {
1327                 error!(
1328                     "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
1329                     self.debug_label(),
1330                     offset,
1331                     data.len()
1332                 );
1333                 return;
1334             }
1335 
1336             // Usually will not be here as this is for the per-vcpu notify
1337             // register which is monitored by the ioevents. For the notify
1338             // register which is not covered by the ioevents, they are not
1339             // be used by the frontend driver. In case the frontend driver
1340             // went here, do a simple handle to make sure the frontend driver
1341             // will not be blocked, and through an error log.
1342             let index = (offset - mmio_len) as usize * mem::size_of::<u64>();
1343             self.notifymap_mmap.write_obj::<u64>(0, index).unwrap();
1344             error!(
1345                 "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
1346                 self.debug_label(),
1347                 offset
1348             );
1349             return;
1350         }
1351 
1352         // Sanity check, must be 64bit aligned accessing for CoIommuReg
1353         if offset % 8 != 0 || data.len() != 8 {
1354             error!(
1355                 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1356                 self.debug_label(),
1357                 offset,
1358                 data.len()
1359             );
1360             return;
1361         }
1362 
1363         let index = offset / 8;
1364         let v = u64::from_ne_bytes(data.try_into().unwrap());
1365         match index {
1366             0 => {
1367                 if self.coiommu_reg.dtt_root == 0 {
1368                     self.coiommu_reg.dtt_root = v;
1369                 }
1370             }
1371             1 => match v {
1372                 // Deactivate can happen if the frontend driver in the guest
1373                 // fails during probing or if the CoIommu device is removed
1374                 // by the guest. Neither of these cases is expected, and if
1375                 // either happens the guest will be non-functional due to
1376                 // pass-through devices which rely on CoIommu not working.
1377                 // So just fail hard and panic.
1378                 COIOMMU_CMD_DEACTIVATE => {
1379                     panic!("{}: Deactivate is not supported", self.debug_label())
1380                 }
1381                 COIOMMU_CMD_ACTIVATE => {
1382                     if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
1383                         self.start_workers();
1384                     }
1385                 }
                COIOMMU_CMD_PARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    pinstate.unpin_thread_state = UnpinThreadState::Parked;
                    if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
                        pinstate.unpin_park_count = v;
                    } else {
                        panic!("{}: Park request count overflowed", self.debug_label());
                    }
                }
                COIOMMU_CMD_UNPARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
                        if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
                            pinstate.unpin_park_count = v;
                            if pinstate.unpin_park_count == 0 {
                                if let Some(worker_thread) = &self.unpin_thread {
                                    worker_thread.thread().unpark();
                                }
                                pinstate.unpin_thread_state = UnpinThreadState::Unparked;
                            }
                        } else {
                            error!("{}: Park count is already 0", self.debug_label());
                        }
                    }
                }
                _ => {}
            },
            2 => {
                if self.coiommu_reg.dtt_level == 0 {
                    self.coiommu_reg.dtt_level = v;
                }
            }
            _ => {}
        }
    }
}

impl PciDevice for CoIommuDev {
    fn debug_label(&self) -> String {
        "CoIommu".to_owned()
    }

    fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
        if self.pci_address.is_none() {
            self.pci_address = match resources.allocate_pci(0, self.debug_label()) {
                Some(Alloc::PciBar {
                    bus,
                    dev,
                    func,
                    bar: _,
                }) => Some(PciAddress { bus, dev, func }),
                _ => None,
            }
        }
        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
    }

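    // BAR 0 is the control MMIO BAR: it holds the CoIommuReg register block,
    // immediately followed by the per-vcpu notify registers that the guest
    // driver writes to trigger pinning (see write_mmio and ioevents).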
    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_io_bars");

        let mut ranges: Vec<BarRange> = Vec::new();

        let mmio_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_MMIO_BAR_SIZE,
            COIOMMU_MMIO_BAR,
            "coiommu-mmiobar",
        )?;

        ranges.push(BarRange {
            addr: mmio_addr,
            size: COIOMMU_MMIO_BAR_SIZE,
            prefetchable: false,
        });

        Ok(ranges)
    }

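    // BARs 2 and 4 expose the notifymap and topologymap shared-memory
    // regions. They are mapped into the guest once the memory space bit is
    // enabled (see write_config_register), so while the device is activated
    // guest accesses to them normally do not vmexit.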
    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let mut ranges: Vec<BarRange> = Vec::new();

        let topologymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_TOPOLOGYMAP_SIZE as u64,
            COIOMMU_TOPOLOGYMAP_BAR,
            "coiommu-topology",
        )?;
        self.topologymap_addr = Some(topologymap_addr);
        ranges.push(BarRange {
            addr: topologymap_addr,
            size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
            prefetchable: false,
        });

        let notifymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_NOTIFYMAP_SIZE as u64,
            COIOMMU_NOTIFYMAP_BAR,
            "coiommu-notifymap",
        )?;
        self.notifymap_addr = Some(notifymap_addr);
        ranges.push(BarRange {
            addr: notifymap_addr,
            size: COIOMMU_NOTIFYMAP_SIZE as u64,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        self.config_regs.read_reg(reg_idx)
    }

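    // Lazily map the shared-memory BARs into the guest the first time the
    // guest sets the memory space bit in the command register, i.e. once the
    // BAR addresses have been programmed and may be accessed.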
    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        if reg_idx == COMMAND_REG
            && data.len() == 2
            && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
            && !self.mmapped
        {
            self.mmap();
        }

        self.config_regs.write_reg(reg_idx, offset, data);
    }

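    // Descriptors returned here are kept open across the fork into the
    // sandboxed device process; any descriptor not listed would be closed by
    // the jail and become unusable by the worker threads.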
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = vec![
            self.vfio_container.lock().as_raw_descriptor(),
            self.device_tube.as_raw_descriptor(),
            self.notifymap_mem.as_raw_descriptor(),
            self.topologymap_mem.as_raw_descriptor(),
        ];
        if let Some(unpin_tube) = &self.unpin_tube {
            rds.push(unpin_tube.as_raw_descriptor());
        }
        rds
    }

    fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
        let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notifymap = self
            .config_regs
            .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
        match addr {
            o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE => {
                self.read_mmio(addr, data);
            }
            o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // While the coiommu device is activated, accesses to the
                // notifymap bar do not cause a vmexit. Reaching this point
                // means the device is deactivated and is not doing any
                // pin/unpin work, so there is no need to handle this
                // notifymap read.
            }
            _ => {}
        }
    }

    fn write_bar(&mut self, addr: u64, data: &[u8]) {
        let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notifymap = self
            .config_regs
            .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
        match addr {
            o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE => {
                self.write_mmio(addr, data);
            }
            o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // While the coiommu device is activated, accesses to the
                // notifymap bar do not cause a vmexit. Reaching this point
                // means the device is deactivated and is not doing any
                // pin/unpin work, so there is no need to handle this
                // notifymap write.
            }
            _ => {}
        }
    }

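    // Register one ioevent per vcpu notify register (one byte each, placed
    // directly after CoIommuReg in the MMIO BAR). Datamatch::AnyLength means
    // any guest write fires the paired event and triggers pinning without a
    // trip through write_bar.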
    fn ioevents(&self) -> Vec<(&Event, u64, Datamatch)> {
        let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
        self.ioevents
            .iter()
            .enumerate()
            .map(|(i, event)| (event, notify_base + i as u64, Datamatch::AnyLength))
            .collect()
    }

    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        self.config_regs.get_bar_configuration(bar_num)
    }
}

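// Signal each worker's kill event and join its thread so the pin and unpin
// workers shut down before the device's shared state is dropped.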
impl Drop for CoIommuDev {
    fn drop(&mut self) {
        if let Some(kill_evt) = self.pin_kill_evt.take() {
            // Ignore the result because there is nothing we can do about it.
            if kill_evt.write(1).is_ok() {
                if let Some(worker_thread) = self.pin_thread.take() {
                    let _ = worker_thread.join();
                }
            } else {
                error!("CoIOMMU: failed to write to kill_evt to stop pin_thread");
            }
        }

        if let Some(kill_evt) = self.unpin_kill_evt.take() {
            // Ignore the result because there is nothing we can do about it.
            if kill_evt.write(1).is_ok() {
                if let Some(worker_thread) = self.unpin_thread.take() {
                    let _ = worker_thread.join();
                }
            } else {
                error!("CoIOMMU: failed to write to kill_evt to stop unpin_thread");
            }
        }
    }
}