// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
//! that provides fine-grained pinning for VFIO PCI-passthrough devices so
//! that the hypervisor doesn't need to pin the entire VM's memory, which
//! improves memory utilization. CoIOMMU doesn't provide intra-guest
//! protection, so it can only be used for TRUSTED passthrough devices.
//!
//! CoIOMMU was presented at KVM Forum 2020:
//! <https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative-dma-buffer-tracking-yu-zhang-intel>
//!
//! It was also presented at USENIX ATC '20:
//! <https://www.usenix.org/conference/atc20/presentation/tian>

use std::collections::VecDeque;
use std::convert::TryInto;
use std::default::Default;
use std::fmt;
use std::mem;
use std::panic;
use std::sync::atomic::fence;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::ensure;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::AsRawDescriptor;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::MemoryMappingBuilder;
use base::Protection;
use base::RawDescriptor;
use base::SafeDescriptor;
use base::SharedMemory;
use base::Timer;
use base::Tube;
use base::TubeError;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::Datamatch;
use resources::Alloc;
use resources::AllocOptions;
use resources::SystemAllocator;
use serde::Deserialize;
use serde::Deserializer;
use serde::Serialize;
use serde_keyvalue::FromKeyValues;
use sync::Mutex;
use thiserror::Error as ThisError;
use vm_control::VmMemoryDestination;
use vm_control::VmMemoryRequest;
use vm_control::VmMemoryResponse;
use vm_control::VmMemorySource;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use zerocopy::AsBytes;
use zerocopy::FromBytes;

use crate::pci::pci_configuration::PciBarConfiguration;
use crate::pci::pci_configuration::PciBarPrefetchable;
use crate::pci::pci_configuration::PciBarRegionType;
use crate::pci::pci_configuration::PciClassCode;
use crate::pci::pci_configuration::PciConfiguration;
use crate::pci::pci_configuration::PciHeaderType;
use crate::pci::pci_configuration::PciOtherSubclass;
use crate::pci::pci_configuration::COMMAND_REG;
use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::Result as PciResult;
use crate::pci::PciAddress;
use crate::pci::PciDeviceError;
use crate::vfio::VfioContainer;
use crate::Suspendable;
use crate::UnpinRequest;
use crate::UnpinResponse;

const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
const COIOMMU_CMD_DEACTIVATE: u64 = 0;
const COIOMMU_CMD_ACTIVATE: u64 = 1;
const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
const COIOMMU_REVISION_ID: u8 = 0x10;
const COIOMMU_MMIO_BAR: u8 = 0;
const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
const COIOMMU_NOTIFYMAP_BAR: u8 = 2;
const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
const PAGE_SIZE_4K: u64 = 4096;
const PAGE_SHIFT_4K: u64 = 12;
const PIN_PAGES_IN_BATCH: u64 = 1 << 63;

const DTTE_PINNED_FLAG: u32 = 1 << 31;
const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
const DTT_ENTRY_PRESENT: u64 = 1;
const DTT_ENTRY_PFN_SHIFT: u64 = 12;
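// A note on the DTT leaf entry layout, as inferred from the flag constants
// above and the compare_exchange in unpin_page below: bit 31 marks the page
// as pinned and bit 30 records whether the guest has touched the page since
// the last unpin pass. The remaining low bits appear to carry the guest's
// map count for the page, so a leaf can only be unpinned once it reads
// exactly DTTE_PINNED_FLAG.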

#[derive(ThisError, Debug)]
enum Error {
    #[error("CoIommu failed to create shared memory")]
    CreateSharedMemory,
    #[error("Failed to get DTT entry")]
    GetDTTEntry,
    #[error("Tube error")]
    TubeError,
}

// Default unpin interval is 60 seconds.
const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
/// Holds the coiommu unpin policy
#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum CoIommuUnpinPolicy {
    #[default]
    Off,
    Lru,
}

impl fmt::Display for CoIommuUnpinPolicy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::CoIommuUnpinPolicy::*;

        match self {
            Off => write!(f, "off"),
            Lru => write!(f, "lru"),
        }
    }
}

fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Duration, D::Error> {
    let secs = u64::deserialize(deserializer)?;

    Ok(Duration::from_secs(secs))
}

fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Option<u64>, D::Error> {
    let limit = u64::deserialize(deserializer)?;

    match limit {
        0 => Err(serde::de::Error::custom(
            "Please use non-zero unpin_limit value",
        )),
        limit => Ok(Some(limit)),
    }
}

fn unpin_interval_default() -> Duration {
    UNPIN_DEFAULT_INTERVAL
}

fn unpin_gen_threshold_default() -> u64 {
    UNPIN_GEN_DEFAULT_THRES
}

/// Holds the parameters for a coiommu device
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields)]
pub struct CoIommuParameters {
    #[serde(default)]
    pub unpin_policy: CoIommuUnpinPolicy,
    #[serde(
        deserialize_with = "deserialize_unpin_interval",
        default = "unpin_interval_default"
    )]
    pub unpin_interval: Duration,
    #[serde(deserialize_with = "deserialize_unpin_limit", default)]
    pub unpin_limit: Option<u64>,
    // Number of unpin intervals a pinned page must be busy for to be aged into the
    // older, less frequently checked generation.
    #[serde(default = "unpin_gen_threshold_default")]
    pub unpin_gen_threshold: u64,
}
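
// Since CoIommuParameters derives FromKeyValues, it can be parsed from a
// key=value option string. Below is a minimal sketch of what that looks
// like, assuming serde_keyvalue's from_key_values helper; the key names
// follow the field names above, the interval is in seconds and the limit
// in pages. This is illustrative only and not exercised by this file.
#[cfg(test)]
mod coiommu_params_example {
    use super::*;

    #[test]
    fn parse_from_key_values() {
        let params: CoIommuParameters = serde_keyvalue::from_key_values(
            "unpin_policy=lru,unpin_interval=60,unpin_limit=4096,unpin_gen_threshold=10",
        )
        .unwrap();
        assert_eq!(params.unpin_policy, CoIommuUnpinPolicy::Lru);
        assert_eq!(params.unpin_interval, Duration::from_secs(60));
        assert_eq!(params.unpin_limit, Some(4096));
        assert_eq!(params.unpin_gen_threshold, 10);
    }
}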

impl Default for CoIommuParameters {
    fn default() -> Self {
        Self {
            unpin_policy: CoIommuUnpinPolicy::Off,
            unpin_interval: UNPIN_DEFAULT_INTERVAL,
            unpin_limit: None,
            unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
        }
    }
}

#[derive(Default, Debug, Copy, Clone)]
struct CoIommuReg {
    dtt_root: u64,
    cmd: u64,
    dtt_level: u64,
}
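// CoIommuReg is exposed through the MMIO BAR as three consecutive 64-bit
// registers (see read_mmio/write_mmio below): dtt_root at offset 0x0, cmd at
// offset 0x8 and dtt_level at offset 0x10. dtt_root and dtt_level are only
// writable while still zero, i.e. effectively write-once.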

#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
struct PinnedPageInfo {
    gfn: u64,
    unpin_busy_cnt: u64,
}

impl PinnedPageInfo {
    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
        PinnedPageInfo {
            gfn,
            unpin_busy_cnt,
        }
    }
}

#[derive(PartialEq, Debug, Eq)]
enum UnpinThreadState {
    Unparked,
    Parked,
}

struct CoIommuPinState {
    new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
    old_gen_pinned_pages: VecDeque<u64>,
    unpin_thread_state: UnpinThreadState,
    unpin_park_count: u64,
}

unsafe fn vfio_map(
    vfio_container: &Arc<Mutex<VfioContainer>>,
    iova: u64,
    size: u64,
    user_addr: u64,
) -> bool {
    match vfio_container
        .lock()
        .vfio_dma_map(iova, size, user_addr, true)
    {
        Ok(_) => true,
        Err(e) => {
            if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
                if errno == libc::EEXIST {
                    // Already pinned, so report success and let the caller
                    // set the PINNED flag.
                    error!("CoIommu: iova 0x{:x} already pinned", iova);
                    return true;
                }
            }
            error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
    match vfio_container.lock().vfio_dma_unmap(iova, size) {
        Ok(_) => true,
        Err(e) => {
            error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

#[derive(Default, Debug, Copy, Clone, FromBytes, AsBytes)]
#[repr(C)]
struct PinPageInfo {
    bdf: u16,
    pad: [u16; 3],
    nr_pages: u64,
}

const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;

fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
    if level == 1 {
        return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
    }

    if level == 0 {
        bail!("Invalid level for gfn 0x{:x}", gfn);
    }

    let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;

    Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
}
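// A worked example of level_to_offset, assuming a 3-level DTT and
// gfn 0x12345:
//   level 3: (0x12345 >> 19) & 0x1ff = 0x0   (root table index)
//   level 2: (0x12345 >> 10) & 0x1ff = 0x48  (intermediate table index)
//   level 1:  0x12345        & 0x3ff = 0x345 (leaf table index)
// Leaf tables hold 1024 32-bit entries while upper-level tables hold 512
// 64-bit entries, so each table occupies exactly one 4 KiB page.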

struct DTTIter {
    ptr: *const u8,
    gfn: u64,
}

impl Default for DTTIter {
    fn default() -> Self {
        DTTIter {
            ptr: std::ptr::null(),
            gfn: 0,
        }
    }
}

// Get the DMA Tracking Table (DTT) entry associated with a gfn.
//
// There are two ways to get the entry:
// #1. Walk the DMA Tracking Table (DTT) by the gfn to find the
// corresponding entry. The DTT is shared between the frontend and the
// backend. It is a page-table-like structure whose entries are indexed
// by gfn. The argument dtt_root holds the gpa of the root page and
// dtt_level holds the maximum page table level.
//
// #2. Calculate the entry address from the argument dtt_iter. dtt_iter
// caches an entry address and its associated gfn. If the target gfn lies
// in the same page table page as the gfn in dtt_iter, the target entry
// address can be calculated from the entry address cached in dtt_iter.
//
// As DTT entries are shared between the frontend and the backend, accesses
// must be atomic, so the returned value is converted to an AtomicU32
// pointer.
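//
// As an example of the fast path (#2): if dtt_iter caches the pte address
// for gfn 0x1000 and the next lookup targets gfn 0x1002, both gfns share
// the bits above COIOMMU_PT_LEVEL_STRIDE, so the new pte address is simply
// dtt_iter.ptr + 2 * size_of::<AtomicU32>() with no table walk needed.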
fn gfn_to_dtt_pte(
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<*const AtomicU32> {
    let ptr = if dtt_iter.ptr.is_null()
        || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
    {
        // Slow path to walk the DTT to get the pte entry
        let mut level = dtt_level;
        let mut pt_gpa = dtt_root;
        let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;

        while level != 1 {
            let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
            let parent_pt = mem
                .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
                .context(Error::GetDTTEntry)?;

            if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
                bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
            }

            pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
            level -= 1;
        }

        let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;

        mem.get_host_address(GuestAddress(pt_gpa + index))
            .context(Error::GetDTTEntry)?
    } else {
        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
        // means the calculated ptr will point to the same page as dtt_iter.ptr
        if gfn > dtt_iter.gfn {
            unsafe {
                dtt_iter
                    .ptr
                    .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
            }
        } else {
            unsafe {
                dtt_iter
                    .ptr
                    .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
            }
        }
    };

    dtt_iter.ptr = ptr;
    dtt_iter.gfn = gfn;

    Ok(ptr as *const AtomicU32)
}

fn pin_page(
    pinstate: &mut CoIommuPinState,
    policy: CoIommuUnpinPolicy,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<()> {
    let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;

    let gpa = (gfn << PAGE_SHIFT_4K) as u64;
    let host_addr = mem
        .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
        .context("failed to get host address")? as u64;

    // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
    // Test the PINNED flag.
    if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
        info!("CoIommu: gfn 0x{:x} already pinned", gfn);
        return Ok(());
    }

    // Safe because the gpa is valid per gfn_to_dtt_pte and the host_addr is
    // guaranteed by the MemoryMapping interface.
    if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
        // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
        // Set the PINNED flag.
        unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
        if policy == CoIommuUnpinPolicy::Lru {
            pinstate
                .new_gen_pinned_pages
                .push_back(PinnedPageInfo::new(gfn, 0));
        }
    }

    Ok(())
}

#[derive(PartialEq, Debug, Eq)]
enum UnpinResult {
    UnpinlistEmpty,
    Unpinned,
    NotPinned,
    NotUnpinned,
    FailedUnpin,
    UnpinParked,
}

fn unpin_page(
    pinstate: &mut CoIommuPinState,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
    force: bool,
) -> UnpinResult {
    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
        return UnpinResult::UnpinParked;
    }

    let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
        Ok(v) => v,
        Err(_) => {
            // The force == true case may try to unpin a page which is not
            // mapped in the dtt. For such a page the pte doesn't exist yet,
            // so there is no need to log an error.
            // The force == false case is used by coiommu to periodically
            // unpin pages which have been mapped in the dtt, so the pte for
            // such a page does exist. However, with an unpin request from
            // the virtio balloon such pages may already be unpinned, and the
            // DTT pages might have been reclaimed by the guest OS kernel as
            // well, so it is also possible to get here. Don't log an error.
            return UnpinResult::NotPinned;
        }
    };

    if force {
        // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
        // This case is for the balloon to evict pages, so those pages should
        // already be locked by the balloon and no device driver in the VM is
        // able to access them. Just clear the ACCESSED flag first to make
        // sure the following unpin succeeds.
        unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
    }

    // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
    if let Err(entry) = unsafe {
        (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
    } {
        // The compare_exchange failed because the original leaf entry was
        // not exactly DTTE_PINNED_FLAG, so the unpin cannot be done.
        if entry == 0 {
            // The GFN is already unpinned. This is very similar to the
            // gfn_to_dtt_pte error case, with the only difference being
            // that the dtt_pte happens to be on a present page table.
            UnpinResult::NotPinned
        } else {
            if !force {
                // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
                // The ACCESSED flag is set by the guest when it requires a DMA map
                // for this page; it records whether the page has been touched by
                // the guest. By clearing the flag after an unpin pass we can detect
                // whether the page was touched again before the next pass. If the
                // ACCESSED flag is set in the next round, unpinning the page will
                // fail and we end up here again to clear the flag. If the flag is
                // not set in the next round, unpinning the page will probably
                // succeed.
                unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
            } else {
                // If we're here, then the guest is trying to release a page via the
                // balloon that it still has pinned, which most likely means that
                // something is wrong in the guest kernel. Just leave the page pinned
                // and log an error.
                // This failure blocks the balloon from removing the page, which ensures
                // that the guest's view of memory will remain consistent with device
                // DMA's view of memory. Also note that the host kernel maintains an
                // elevated refcount for pinned pages, which is a second guarantee that
                // pages accessible by device DMA won't be freed until after they are
                // unpinned.
                error!(
                    "CoIommu: force case cannot unpin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
            }
            // The GFN cannot be unpinned either because its map count is
            // non-zero or because it has the ACCESSED flag set.
            UnpinResult::NotUnpinned
        }
    } else {
        // The compare_exchange succeeded: the original leaf entry was
        // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the
        // page.
        let gpa = (gfn << PAGE_SHIFT_4K) as u64;
        if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
            UnpinResult::Unpinned
        } else {
            // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
            // Make sure the PINNED flag stays set.
            unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
            // The gfn needs to be put back into the pinned vector.
            UnpinResult::FailedUnpin
        }
    }
}

struct PinWorker {
    mem: GuestMemory,
    endpoints: Vec<u16>,
    notifymap_mmap: Arc<MemoryMapping>,
    dtt_level: u64,
    dtt_root: u64,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl PinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuPinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            Kill,
            Pin { index: usize },
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        for (index, event) in self.ioevents.iter().enumerate() {
            match wait_ctx.add(event, Token::Pin { index }) {
                Ok(_) => {}
                Err(e) => {
                    error!(
                        "{}: failed to add ioevent for index {}: {}",
                        self.debug_label(),
                        index,
                        e
                    );
                    return;
                }
            }
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => break 'wait,
                    Token::Pin { index } => {
                        let offset = index * mem::size_of::<u64>() as usize;
                        if let Some(event) = self.ioevents.get(index) {
                            if let Err(e) = event.wait() {
                                error!(
                                    "{}: failed reading event {}: {}",
                                    self.debug_label(),
                                    index,
                                    e
                                );
                                self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                                break 'wait;
                            }
                        }
                        if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
                            if let Err(e) = self.pin_pages(data) {
                                error!("{}: {}", self.debug_label(), e);
                            }
                        }
                        fence(Ordering::SeqCst);
                        self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                    }
                }
            }
        }
    }

    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
        let pin_page_info = self
            .mem
            .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
            .context("failed to get pin page info")?;

        let bdf = pin_page_info.bdf;
        ensure!(
            self.endpoints.iter().any(|&x| x == bdf),
            "pin page for unexpected bdf 0x{:x}",
            bdf
        );

        let mut nr_pages = pin_page_info.nr_pages;
        let mut offset = mem::size_of::<PinPageInfo>() as u64;
        let mut dtt_iter: DTTIter = Default::default();
        let mut pinstate = self.pinstate.lock();
        while nr_pages > 0 {
            let gfn = self
                .mem
                .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
                .context("failed to get pin page gfn")?;

            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )?;

            offset += mem::size_of::<u64>() as u64;
            nr_pages -= 1;
        }

        Ok(())
    }

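    // The value written to a per-vcpu notify register encodes the pin
    // request, as decoded below: if bit 63 (PIN_PAGES_IN_BATCH) is set, the
    // remaining bits are the gpa of a PinPageInfo header followed by an
    // array of gfns; otherwise bits 0..16 carry the endpoint bdf and the
    // remaining bits carry a single gfn.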
    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
        if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
            let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
            self.pin_pages_in_batch(gpa)
        } else {
            let bdf = (gfn_bdf & 0xffff) as u16;
            let gfn = gfn_bdf >> 16;
            let mut dtt_iter: DTTIter = Default::default();
            ensure!(
                self.endpoints.iter().any(|&x| x == bdf),
                "pin page for unexpected bdf 0x{:x}",
                bdf
            );

            let mut pinstate = self.pinstate.lock();
            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )
        }
    }
}

struct UnpinWorker {
    mem: GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    vfio_container: Arc<Mutex<VfioContainer>>,
    unpin_tube: Option<Tube>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
    unpin_gen_threshold: u64,
}

impl UnpinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuUnpinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            UnpinTimer,
            UnpinReq,
            Kill,
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        if let Some(tube) = &self.unpin_tube {
            if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
        }

        let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
            && !self.params.unpin_interval.is_zero()
        {
            let duration = self.params.unpin_interval;
            let interval = Some(self.params.unpin_interval);
            let mut timer = match Timer::new() {
                Ok(t) => t,
                Err(e) => {
                    error!(
                        "{}: failed to create the unpin timer: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
            if let Err(e) = timer.reset(duration, interval) {
                error!(
                    "{}: failed to start the unpin timer: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
            Some(timer)
        } else {
            None
        };

        let unpin_tube = self.unpin_tube.take();
        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::UnpinTimer => {
                        self.unpin_pages();
                        if let Some(timer) = &mut unpin_timer {
                            if let Err(e) = timer.mark_waited() {
                                error!(
                                    "{}: failed to clear unpin timer: {}",
                                    self.debug_label(),
                                    e
                                );
                                break 'wait;
                            }
                        }
                    }
                    Token::UnpinReq => {
                        if let Some(tube) = &unpin_tube {
                            match tube.recv::<UnpinRequest>() {
                                Ok(req) => {
                                    let mut unpin_done = true;
                                    for range in req.ranges {
                                        // Locking with respect to pin_pages isn't necessary
                                        // for this case because the unpinned pages in the range
                                        // should all be in the balloon and so nothing will attempt
                                        // to pin them.
                                        if !self.unpin_pages_in_range(range.0, range.1) {
                                            unpin_done = false;
                                            break;
                                        }
                                    }
                                    let resp = if unpin_done {
                                        UnpinResponse::Success
                                    } else {
                                        UnpinResponse::Failed
                                    };
                                    if let Err(e) = tube.send(&resp) {
                                        error!(
                                            "{}: failed to send unpin response {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                                Err(e) => {
                                    if let TubeError::Disconnected = e {
                                        if let Err(e) = wait_ctx.delete(tube) {
                                            error!(
                                                "{}: failed to remove unpin_tube: {}",
                                                self.debug_label(),
                                                e
                                            );
                                        }
                                    } else {
                                        error!(
                                            "{}: failed to recv Unpin Request: {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
        self.unpin_tube = unpin_tube;
    }

    fn unpin_pages(&mut self) {
        if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
            self.lru_unpin_pages();
        }
    }

    fn lru_unpin_page(
        &mut self,
        dtt_iter: &mut DTTIter,
        new_gen: bool,
    ) -> (UnpinResult, Option<PinnedPageInfo>) {
        let mut pinstate = self.pinstate.lock();
        let pageinfo = if new_gen {
            pinstate.new_gen_pinned_pages.pop_front()
        } else {
            pinstate
                .old_gen_pinned_pages
                .pop_front()
                .map(|gfn| PinnedPageInfo::new(gfn, 0))
        };

        pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
            (
                unpin_page(
                    &mut pinstate,
                    &self.vfio_container,
                    &self.mem,
                    self.dtt_level,
                    self.dtt_root,
                    dtt_iter,
                    pageinfo.gfn,
                    false,
                ),
                Some(pageinfo),
            )
        })
    }

    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
        let mut not_unpinned_new_gen_pages = VecDeque::new();
        let mut not_unpinned_old_gen_pages = VecDeque::new();
        let mut unpinned_count = 0;
        let has_limit = unpin_limit.is_some();
        let limit_count = unpin_limit.unwrap_or(0);
        let mut dtt_iter: DTTIter = Default::default();

        // If has_limit is true but limit_count is 0, no unpinning will be done.
        while !has_limit || unpinned_count != limit_count {
            let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
            match result {
                UnpinResult::UnpinlistEmpty => break,
                UnpinResult::Unpinned => unpinned_count += 1,
                UnpinResult::NotPinned => {}
                UnpinResult::NotUnpinned => {
                    if let Some(mut page) = pinned_page {
                        if self.params.unpin_gen_threshold != 0 {
                            page.unpin_busy_cnt += 1;
                            // The page came off the new_gen queue but could
                            // not be unpinned, so check it against the
                            // unpin_gen threshold and move it to the old_gen
                            // queue if the threshold is reached. If the page
                            // did not come from the new_gen queue, put it
                            // straight back on the old_gen queue.
                            if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
                                not_unpinned_old_gen_pages.push_back(page.gfn);
                            } else {
                                not_unpinned_new_gen_pages.push_back(page);
                            }
                        }
                    }
                }
                UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Although UnpinParked means we didn't actually try to unpin
                    // the gfn, it's not worth handling specially since parking is
                    // expected to be relatively rare.
                    if let Some(page) = pinned_page {
                        if new_gen {
                            not_unpinned_new_gen_pages.push_back(page);
                        } else {
                            not_unpinned_old_gen_pages.push_back(page.gfn);
                        }
                    }
                    if result == UnpinResult::UnpinParked {
                        thread::park();
                    }
                }
            }
        }

        if !not_unpinned_new_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .new_gen_pinned_pages
                .append(&mut not_unpinned_new_gen_pages);
        }

        if !not_unpinned_old_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .old_gen_pinned_pages
                .append(&mut not_unpinned_old_gen_pages);
        }

        unpinned_count
    }

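    // A sketch of the two-generation LRU behavior with the default
    // parameters (unpin_interval = 60s, unpin_gen_threshold = 10): the
    // new_gen queue is scanned on every timer tick, a page that stays busy
    // for 10 consecutive scans ages into the old_gen queue, and the old_gen
    // queue itself is only scanned once every 10 ticks (roughly every 600s).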
    fn lru_unpin_pages(&mut self) {
        let mut unpin_count = 0;
        if self.params.unpin_gen_threshold != 0 {
            self.unpin_gen_threshold += 1;
            if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
                self.unpin_gen_threshold = 0;
                // Try to unpin the inactive queue first if the threshold is reached
                unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
            }
        }
        // Unpin the new_gen queue with whatever is left of the unpin_limit
        // after unpinning the old_gen queue.
        self.lru_unpin_pages_in_loop(
            self.params
                .unpin_limit
                .map(|limit| limit.saturating_sub(unpin_count)),
            true,
        );
    }

    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
        let mut dtt_iter: DTTIter = Default::default();
        let mut index = 0;
        while index != count {
            let mut pinstate = self.pinstate.lock();
            let result = unpin_page(
                &mut pinstate,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn + index,
                true,
            );
            drop(pinstate);

            match result {
                UnpinResult::Unpinned | UnpinResult::NotPinned => {}
                UnpinResult::UnpinParked => {
                    thread::park();
                    continue;
                }
                _ => {
                    error!("coiommu: force unpin failed by {:?}", result);
                    return false;
                }
            }
            index += 1;
        }
        true
    }
}

pub struct CoIommuDev {
    config_regs: PciConfiguration,
    pci_address: Option<PciAddress>,
    mem: GuestMemory,
    coiommu_reg: CoIommuReg,
    endpoints: Vec<u16>,
    notifymap_mem: SafeDescriptor,
    notifymap_mmap: Arc<MemoryMapping>,
    notifymap_addr: Option<u64>,
    topologymap_mem: SafeDescriptor,
    topologymap_addr: Option<u64>,
    mmapped: bool,
    device_tube: Tube,
    pin_thread: Option<WorkerThread<PinWorker>>,
    unpin_thread: Option<WorkerThread<UnpinWorker>>,
    unpin_tube: Option<Tube>,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl CoIommuDev {
    pub fn new(
        mem: GuestMemory,
        vfio_container: Arc<Mutex<VfioContainer>>,
        device_tube: Tube,
        unpin_tube: Option<Tube>,
        endpoints: Vec<u16>,
        vcpu_count: u64,
        params: CoIommuParameters,
    ) -> Result<Self> {
        let config_regs = PciConfiguration::new(
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            PciClassCode::Other,
            &PciOtherSubclass::Other,
            None, // No Programming interface.
            PciHeaderType::Device,
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            COIOMMU_REVISION_ID,
        );

        // notifymap_mem is used as BAR2, for the guest to check whether a
        // request has been completed by CoIOMMU.
        let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
            .context(Error::CreateSharedMemory)?;
        let notifymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
                .from_shared_memory(&notifymap_mem)
                .offset(0)
                .build()?,
        );

        // topologymap_mem is used as BAR4, for the guest to check which
        // devices sit on top of CoIOMMU.
        let topologymap_mem =
            SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
                .context(Error::CreateSharedMemory)?;
        let topologymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
                .from_shared_memory(&topologymap_mem)
                .offset(0)
                .build()?,
        );

        ensure!(
            (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
            "Coiommu: too many endpoints"
        );
        topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
        for (index, endpoint) in endpoints.iter().enumerate() {
            topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
        }

        let mut ioevents = Vec::new();
        for _ in 0..vcpu_count {
            ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
        }

        Ok(Self {
            config_regs,
            pci_address: None,
            mem,
            coiommu_reg: Default::default(),
            endpoints,
            notifymap_mem: notifymap_mem.into(),
            notifymap_mmap,
            notifymap_addr: None,
            topologymap_mem: topologymap_mem.into(),
            topologymap_addr: None,
            mmapped: false,
            device_tube,
            pin_thread: None,
            unpin_thread: None,
            unpin_tube,
            ioevents,
            vfio_container,
            pinstate: Arc::new(Mutex::new(CoIommuPinState {
                new_gen_pinned_pages: VecDeque::new(),
                old_gen_pinned_pages: VecDeque::new(),
                unpin_thread_state: UnpinThreadState::Unparked,
                unpin_park_count: 0,
            })),
            params,
        })
    }

    fn send_msg(&self, msg: &VmMemoryRequest) -> Result<()> {
        self.device_tube.send(msg).context(Error::TubeError)?;
        let res = self.device_tube.recv().context(Error::TubeError)?;
        match res {
            VmMemoryResponse::RegisterMemory { .. } => Ok(()),
            VmMemoryResponse::Err(e) => Err(anyhow!("Receive msg err {}", e)),
            _ => Err(anyhow!("Msg cannot be handled")),
        }
    }

    fn register_mmap(
        &self,
        descriptor: SafeDescriptor,
        size: usize,
        offset: u64,
        gpa: u64,
        prot: Protection,
    ) -> Result<()> {
        let request = VmMemoryRequest::RegisterMemory {
            source: VmMemorySource::Descriptor {
                descriptor,
                offset,
                size: size as u64,
            },
            dest: VmMemoryDestination::GuestPhysicalAddress(gpa),
            prot,
        };
        self.send_msg(&request)
    }

    fn mmap(&mut self) {
        if self.mmapped {
            return;
        }

        if let Some(gpa) = self.notifymap_addr {
            match self.register_mmap(
                self.notifymap_mem.try_clone().unwrap(),
                COIOMMU_NOTIFYMAP_SIZE,
                0,
                gpa,
                Protection::read_write(),
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map notifymap failed: {}", self.debug_label(), e);
                }
            }
        }

        if let Some(gpa) = self.topologymap_addr {
            match self.register_mmap(
                self.topologymap_mem.try_clone().unwrap(),
                COIOMMU_TOPOLOGYMAP_SIZE,
                0,
                gpa,
                Protection::read(),
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map topologymap failed: {}", self.debug_label(), e);
                }
            }
        }

        self.mmapped = true;
    }

    fn start_workers(&mut self) {
        if self.pin_thread.is_none() {
            self.start_pin_thread();
        }

        if self.unpin_thread.is_none() {
            self.start_unpin_thread();
        }
    }

    fn start_pin_thread(&mut self) {
        let mem = self.mem.clone();
        let endpoints = self.endpoints.to_vec();
        let notifymap_mmap = self.notifymap_mmap.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let ioevents = self
            .ioevents
            .iter()
            .map(|e| e.try_clone().unwrap())
            .collect();
        let vfio_container = self.vfio_container.clone();
        let pinstate = self.pinstate.clone();
        let params = self.params;

        self.pin_thread = Some(WorkerThread::start("coiommu_pin", move |kill_evt| {
            let mut worker = PinWorker {
                mem,
                endpoints,
                notifymap_mmap,
                dtt_root,
                dtt_level,
                ioevents,
                vfio_container,
                pinstate,
                params,
            };
            worker.run(kill_evt);
            worker
        }));
    }

    fn start_unpin_thread(&mut self) {
        let mem = self.mem.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let vfio_container = self.vfio_container.clone();
        let unpin_tube = self.unpin_tube.take();
        let pinstate = self.pinstate.clone();
        let params = self.params;
        self.unpin_thread = Some(WorkerThread::start("coiommu_unpin", move |kill_evt| {
            let mut worker = UnpinWorker {
                mem,
                dtt_level,
                dtt_root,
                vfio_container,
                unpin_tube,
                pinstate,
                params,
                unpin_gen_threshold: 0,
            };
            worker.run(kill_evt);
            worker
        }));
    }

    fn allocate_bar_address(
        &mut self,
        resources: &mut SystemAllocator,
        address: PciAddress,
        size: u64,
        bar_num: u8,
        name: &str,
    ) -> PciResult<u64> {
        let addr = resources
            .allocate_mmio(
                size,
                Alloc::PciBar {
                    bus: address.bus,
                    dev: address.dev,
                    func: address.func,
                    bar: bar_num,
                },
                name.to_string(),
                AllocOptions::new().prefetchable(true).align(size),
            )
            .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;

        let bar = PciBarConfiguration::new(
            bar_num as usize,
            size,
            PciBarRegionType::Memory64BitRegion,
            PciBarPrefetchable::Prefetchable,
        )
        .set_address(addr);

        self.config_regs
            .add_pci_bar(bar)
            .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;

        Ok(addr)
    }

    fn read_mmio(&mut self, addr: u64, data: &mut [u8]) {
        let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let offset = addr - bar;
        if offset >= mem::size_of::<CoIommuReg>() as u64 {
            error!(
                "{}: read_mmio: invalid addr 0x{:x} bar 0x{:x} offset 0x{:x}",
                self.debug_label(),
                addr,
                bar,
                offset
            );
            return;
        }

        // Sanity check: the access must be 64-bit aligned.
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: read_mmio: unaligned access: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let v = match offset / 8 {
            0 => self.coiommu_reg.dtt_root,
            1 => self.coiommu_reg.cmd,
            2 => self.coiommu_reg.dtt_level,
            _ => return,
        };

        data.copy_from_slice(&v.to_ne_bytes());
    }

write_mmio(&mut self, addr: u64, data: &[u8])1321     fn write_mmio(&mut self, addr: u64, data: &[u8]) {
1322         let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
1323         let mmio_len = mem::size_of::<CoIommuReg>() as u64;
1324         let offset = addr - bar;
1325         if offset >= mmio_len {
1326             if data.len() != 1 {
1327                 error!(
1328                     "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
1329                     self.debug_label(),
1330                     offset,
1331                     data.len()
1332                 );
1333                 return;
1334             }
1335 
1336             // Usually will not be here as this is for the per-vcpu notify
1337             // register which is monitored by the ioevents. For the notify
1338             // register which is not covered by the ioevents, they are not
1339             // be used by the frontend driver. In case the frontend driver
1340             // went here, do a simple handle to make sure the frontend driver
1341             // will not be blocked, and through an error log.
1342             let index = (offset - mmio_len) as usize * mem::size_of::<u64>();
1343             self.notifymap_mmap.write_obj::<u64>(0, index).unwrap();
1344             error!(
1345                 "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
1346                 self.debug_label(),
1347                 offset
1348             );
1349             return;
1350         }
1351 
1352         // Sanity check, must be 64bit aligned accessing for CoIommuReg
1353         if offset % 8 != 0 || data.len() != 8 {
1354             error!(
1355                 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1356                 self.debug_label(),
1357                 offset,
1358                 data.len()
1359             );
1360             return;
1361         }
1362 
1363         let index = offset / 8;
1364         let v = u64::from_ne_bytes(data.try_into().unwrap());
1365         match index {
1366             0 => {
1367                 if self.coiommu_reg.dtt_root == 0 {
1368                     self.coiommu_reg.dtt_root = v;
1369                 }
1370             }
1371             1 => match v {
1372                 // Deactivate can happen if the frontend driver in the guest
1373                 // fails during probing or if the CoIommu device is removed
1374                 // by the guest. Neither of these cases is expected, and if
1375                 // either happens the guest will be non-functional due to
1376                 // pass-through devices which rely on CoIommu not working.
1377                 // So just fail hard and panic.
1378                 COIOMMU_CMD_DEACTIVATE => {
1379                     panic!("{}: Deactivate is not supported", self.debug_label())
1380                 }
1381                 COIOMMU_CMD_ACTIVATE => {
1382                     if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
1383                         self.start_workers();
1384                     }
1385                 }
1386                 COIOMMU_CMD_PARK_UNPIN => {
1387                     let mut pinstate = self.pinstate.lock();
1388                     pinstate.unpin_thread_state = UnpinThreadState::Parked;
1389                     if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
1390                         pinstate.unpin_park_count = v;
1391                     } else {
1392                         panic!("{}: Park request overflowing", self.debug_label());
1393                     }
1394                 }
                COIOMMU_CMD_UNPARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
                        if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
                            pinstate.unpin_park_count = v;
                            if pinstate.unpin_park_count == 0 {
                                if let Some(worker_thread) = &self.unpin_thread {
                                    worker_thread.thread().unpark();
                                }
                                pinstate.unpin_thread_state = UnpinThreadState::Unparked;
                            }
                        } else {
                            error!("{}: Park count already reached 0", self.debug_label());
                        }
                    }
                }
                _ => {}
            },
            2 => {
                if self.coiommu_reg.dtt_level == 0 {
                    self.coiommu_reg.dtt_level = v;
                }
            }
            _ => {}
        }
    }
}

impl PciDevice for CoIommuDev {
    fn debug_label(&self) -> String {
        "CoIommu".to_owned()
    }

    fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
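        // Reuse any previously assigned PCI address; otherwise request a free
        // slot from the resource allocator.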
        if self.pci_address.is_none() {
            self.pci_address = match resources.allocate_pci(0, self.debug_label()) {
                Some(Alloc::PciBar {
                    bus,
                    dev,
                    func,
                    bar: _,
                }) => Some(PciAddress { bus, dev, func }),
                _ => None,
            }
        }
        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
    }

    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_io_bars");

        // Allocate the MMIO BAR, which holds the CoIommuReg registers followed
        // by the per-vcpu notify trigger registers.
        let mut ranges: Vec<BarRange> = Vec::new();

        let mmio_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_MMIO_BAR_SIZE,
            COIOMMU_MMIO_BAR,
            "coiommu-mmiobar",
        )?;

        ranges.push(BarRange {
            addr: mmio_addr,
            size: COIOMMU_MMIO_BAR_SIZE,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let mut ranges: Vec<BarRange> = Vec::new();

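        // Allocate the two shared-memory device BARs: the topology map and
        // the per-vcpu notify map.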
        let topologymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_TOPOLOGYMAP_SIZE as u64,
            COIOMMU_TOPOLOGYMAP_BAR,
            "coiommu-topology",
        )?;
        self.topologymap_addr = Some(topologymap_addr);
        ranges.push(BarRange {
            addr: topologymap_addr,
            size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
            prefetchable: false,
        });

        let notifymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_NOTIFYMAP_SIZE as u64,
            COIOMMU_NOTIFYMAP_BAR,
            "coiommu-notifymap",
        )?;
        self.notifymap_addr = Some(notifymap_addr);
        ranges.push(BarRange {
            addr: notifymap_addr,
            size: COIOMMU_NOTIFYMAP_SIZE as u64,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        self.config_regs.read_reg(reg_idx)
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
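        // The first command-register write that enables PCI memory space
        // decoding triggers mmap() exactly once (guarded by self.mmapped);
        // by then the guest has programmed the BAR addresses.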
        if reg_idx == COMMAND_REG
            && data.len() == 2
            && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
            && !self.mmapped
        {
            self.mmap();
        }

        self.config_regs.write_reg(reg_idx, offset, data);
    }

    fn keep_rds(&self) -> Vec<RawDescriptor> {
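        // Descriptors listed here are kept open for the device after the
        // process is jailed.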
        let mut rds = vec![
            self.vfio_container.lock().as_raw_descriptor(),
            self.device_tube.as_raw_descriptor(),
            self.notifymap_mem.as_raw_descriptor(),
            self.topologymap_mem.as_raw_descriptor(),
        ];
        if let Some(unpin_tube) = &self.unpin_tube {
            rds.push(unpin_tube.as_raw_descriptor());
        }
        rds
    }

    fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
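        // `addr` is an absolute guest address; dispatch on which BAR range it
        // falls into.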
        let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notifymap = self
            .config_regs
            .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
        match addr {
            o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE => {
                self.read_mmio(addr, data);
            }
            o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // While the coiommu device is activated, accesses to the
                // notifymap bar do not cause a vmexit. Reaching this point
                // means the device is deactivated and is not doing any
                // pin/unpin work, so this notifymap read needs no handling.
            }
            _ => {}
        }
    }

    fn write_bar(&mut self, addr: u64, data: &[u8]) {
        let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notifymap = self
            .config_regs
            .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
        match addr {
            o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE => {
                self.write_mmio(addr, data);
            }
            o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // While the coiommu device is activated, accesses to the
                // notifymap bar do not cause a vmexit. Reaching this point
                // means the device is deactivated and is not doing any
                // pin/unpin work, so this notifymap write needs no handling.
            }
            _ => {}
        }
    }

    fn ioevents(&self) -> Vec<(&Event, u64, Datamatch)> {
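        // Register one ioevent per per-vcpu notify trigger. The triggers are
        // packed one byte apart immediately after the CoIommuReg region, and
        // Datamatch::AnyLength fires on a write of any width or value.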
        let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
        self.ioevents
            .iter()
            .enumerate()
            .map(|(i, event)| (event, notify_base + i as u64, Datamatch::AnyLength))
            .collect()
    }

    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        self.config_regs.get_bar_configuration(bar_num)
    }
}

impl Suspendable for CoIommuDev {}