// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
//! which provides fine-grained pinning for VFIO PCI passthrough devices so
//! that the hypervisor doesn't need to pin the entire VM's memory, improving
//! memory utilization. CoIOMMU doesn't provide intra-guest protection, so it
//! can only be used for TRUSTED passthrough devices.
//!
//! CoIOMMU was presented at KVM Forum 2020:
//! <https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative-dma-buffer-tracking-yu-zhang-intel>
//!
//! It was also presented at USENIX ATC '20:
//! <https://www.usenix.org/conference/atc20/presentation/tian>

use std::collections::VecDeque;
use std::convert::TryInto;
use std::default::Default;
use std::fmt;
use std::mem;
use std::panic;
use std::sync::atomic::fence;
use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread;
use std::time::Duration;

use anyhow::bail;
use anyhow::ensure;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::AsRawDescriptor;
use base::Event;
use base::EventToken;
use base::MemoryMapping;
use base::MemoryMappingBuilder;
use base::Protection;
use base::RawDescriptor;
use base::SafeDescriptor;
use base::SharedMemory;
use base::Timer;
use base::TimerTrait;
use base::Tube;
use base::TubeError;
use base::WaitContext;
use base::WorkerThread;
use hypervisor::Datamatch;
use hypervisor::MemCacheType;
use resources::Alloc;
use resources::AllocOptions;
use resources::SystemAllocator;
use serde::Deserialize;
use serde::Deserializer;
use serde::Serialize;
use serde_keyvalue::FromKeyValues;
use sync::Mutex;
use thiserror::Error as ThisError;
use vm_control::api::VmMemoryClient;
use vm_control::VmMemoryDestination;
use vm_control::VmMemorySource;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use zerocopy::FromBytes;
use zerocopy::IntoBytes;

use crate::pci::pci_configuration::PciBarConfiguration;
use crate::pci::pci_configuration::PciBarPrefetchable;
use crate::pci::pci_configuration::PciBarRegionType;
use crate::pci::pci_configuration::PciClassCode;
use crate::pci::pci_configuration::PciConfiguration;
use crate::pci::pci_configuration::PciHeaderType;
use crate::pci::pci_configuration::PciOtherSubclass;
use crate::pci::pci_configuration::COMMAND_REG;
use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
use crate::pci::pci_device::BarRange;
use crate::pci::pci_device::PciDevice;
use crate::pci::pci_device::Result as PciResult;
use crate::pci::PciAddress;
use crate::pci::PciBarIndex;
use crate::pci::PciDeviceError;
use crate::vfio::VfioContainer;
use crate::Suspendable;
use crate::UnpinRequest;
use crate::UnpinResponse;

const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
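
// Commands written to the cmd register in BAR0. PARK_UNPIN and UNPARK_UNPIN
// pause and resume the unpin worker; park requests are counted, so nested
// parks only unpark once the count drops back to zero (see write_mmio below).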
const COIOMMU_CMD_DEACTIVATE: u64 = 0;
const COIOMMU_CMD_ACTIVATE: u64 = 1;
const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
const COIOMMU_REVISION_ID: u8 = 0x10;
const COIOMMU_MMIO_BAR: PciBarIndex = 0;
const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
const COIOMMU_NOTIFYMAP_BAR: PciBarIndex = 2;
const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
const PAGE_SIZE_4K: u64 = 4096;
const PAGE_SHIFT_4K: u64 = 12;
const PIN_PAGES_IN_BATCH: u64 = 1 << 63;

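// Layout of a leaf DTT entry, as inferred from the pin/unpin logic below:
// bit 31 is the PINNED flag and bit 30 the ACCESSED flag, with the
// remaining bits apparently holding the guest's DMA map count; an entry
// only transitions from exactly DTTE_PINNED_FLAG back to 0, i.e. when no
// other bits are set.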
const DTTE_PINNED_FLAG: u32 = 1 << 31;
const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
const DTT_ENTRY_PRESENT: u64 = 1;
const DTT_ENTRY_PFN_SHIFT: u64 = 12;

#[derive(ThisError, Debug)]
enum Error {
    #[error("CoIommu failed to create shared memory")]
    CreateSharedMemory,
    #[error("Failed to get DTT entry")]
    GetDTTEntry,
}

// The default unpin interval is 60 seconds.
const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
/// Holds the CoIOMMU unpin policy.
#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum CoIommuUnpinPolicy {
    #[default]
    Off,
    Lru,
}

impl fmt::Display for CoIommuUnpinPolicy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::CoIommuUnpinPolicy::*;

        match self {
            Off => write!(f, "off"),
            Lru => write!(f, "lru"),
        }
    }
}

fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Duration, D::Error> {
    let secs = u64::deserialize(deserializer)?;

    Ok(Duration::from_secs(secs))
}

fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
    deserializer: D,
) -> Result<Option<u64>, D::Error> {
    let limit = u64::deserialize(deserializer)?;

    match limit {
        0 => Err(serde::de::Error::custom(
            "Please use non-zero unpin_limit value",
        )),
        limit => Ok(Some(limit)),
    }
}

fn unpin_interval_default() -> Duration {
    UNPIN_DEFAULT_INTERVAL
}

fn unpin_gen_threshold_default() -> u64 {
    UNPIN_GEN_DEFAULT_THRES
}

/// Holds the parameters for a CoIOMMU device.
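///
/// The fields correspond to key=value options parsed via serde_keyvalue,
/// e.g. a string like `unpin_policy=lru,unpin_interval=60,unpin_limit=256,
/// unpin_gen_threshold=10` (illustrative values only; the exact flag syntax
/// is defined by the crosvm command line, not by this module).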
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
#[serde(deny_unknown_fields)]
pub struct CoIommuParameters {
    #[serde(default)]
    pub unpin_policy: CoIommuUnpinPolicy,
    #[serde(
        deserialize_with = "deserialize_unpin_interval",
        default = "unpin_interval_default"
    )]
    pub unpin_interval: Duration,
    #[serde(deserialize_with = "deserialize_unpin_limit", default)]
    pub unpin_limit: Option<u64>,
    // Number of unpin intervals a pinned page must be busy for to be aged into
    // the older, less frequently checked generation.
    #[serde(default = "unpin_gen_threshold_default")]
    pub unpin_gen_threshold: u64,
}

impl Default for CoIommuParameters {
    fn default() -> Self {
        Self {
            unpin_policy: CoIommuUnpinPolicy::Off,
            unpin_interval: UNPIN_DEFAULT_INTERVAL,
            unpin_limit: None,
            unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
        }
    }
}

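// Register block exposed at the start of BAR0, as decoded by
// read_mmio/write_mmio below: offset 0x0 holds dtt_root, 0x8 cmd, and
// 0x10 dtt_level, followed by the byte-wide per-vcpu notify registers
// starting at offset 0x18.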
#[derive(Default, Debug, Copy, Clone)]
struct CoIommuReg {
    dtt_root: u64,
    cmd: u64,
    dtt_level: u64,
}

#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
struct PinnedPageInfo {
    gfn: u64,
    unpin_busy_cnt: u64,
}

impl PinnedPageInfo {
    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
        PinnedPageInfo {
            gfn,
            unpin_busy_cnt,
        }
    }
}

#[derive(PartialEq, Debug, Eq)]
enum UnpinThreadState {
    Unparked,
    Parked,
}

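// State shared with the pin/unpin workers for the LRU unpin policy. Newly
// pinned pages enter new_gen_pinned_pages; pages that stay busy for
// unpin_gen_threshold rounds are aged into old_gen_pinned_pages, which is
// scanned less frequently (see UnpinWorker::lru_unpin_pages).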
struct CoIommuPinState {
    new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
    old_gen_pinned_pages: VecDeque<u64>,
    unpin_thread_state: UnpinThreadState,
    unpin_park_count: u64,
}

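// SAFETY note (inferred from the callers' SAFETY comments): user_addr must
// reference a valid host mapping of at least `size` bytes that stays mapped
// for as long as the VFIO DMA mapping exists.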
unsafe fn vfio_map(
    vfio_container: &Arc<Mutex<VfioContainer>>,
    iova: u64,
    size: u64,
    user_addr: u64,
) -> bool {
    match vfio_container
        .lock()
        .vfio_dma_map(iova, size, user_addr, true)
    {
        Ok(_) => true,
        Err(e) => {
            if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
                if errno == libc::EEXIST {
                    // Already mapped and pinned by an earlier request; treat
                    // this as success so the caller still sets the PINNED flag.
                    error!("CoIommu: iova 0x{:x} already pinned", iova);
                    return true;
                }
            }
            error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
    match vfio_container.lock().vfio_dma_unmap(iova, size) {
        Ok(_) => true,
        Err(e) => {
            error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

#[derive(Default, Debug, Copy, Clone, FromBytes, IntoBytes)]
#[repr(C)]
struct PinPageInfo {
    bdf: u16,
    pad: [u16; 3],
    nr_pages: u64,
}

const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;

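// Worked example for level_to_offset below, assuming a 3-level DTT and
// gfn = 0x12345: the level-3 index is (0x12345 >> (10 + 9)) & 0x1ff = 0,
// the level-2 index is (0x12345 >> 10) & 0x1ff = 0x48, and the level-1
// (leaf) index is 0x12345 & 0x3ff = 0x345.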
fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
    if level == 1 {
        return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
    }

    if level == 0 {
        bail!("Invalid level for gfn 0x{:x}", gfn);
    }

    let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;

    Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
}

struct DTTIter {
    ptr: *const u8,
    gfn: u64,
}

impl Default for DTTIter {
    fn default() -> Self {
        DTTIter {
            ptr: std::ptr::null(),
            gfn: 0,
        }
    }
}

// Get the DMA Tracking Table (DTT) entry associated with the gfn.
//
// There are two ways to get the entry:
// #1. Walk the DMA Tracking Table (DTT) by the GFN to get the
// corresponding entry. The DTT is shared between frontend and
// backend. It is a page-table-like structure whose entries are
// indexed by GFN. The argument dtt_root holds the GPA of the root
// page and dtt_level the maximum page table level.
//
// #2. Calculate the entry address via the argument dtt_iter. dtt_iter
// stores an entry address and the associated gfn. If the target gfn
// lies in the same page table page as the gfn in dtt_iter, then the
// target entry address can be calculated from the entry address
// stored in dtt_iter.
//
// As the DTT entry is shared between frontend and backend, accesses
// to it must be atomic. So the returned value is converted to an
// AtomicU32 pointer.
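//
// For example, since the 10-bit leaf stride means one leaf page covers 1024
// consecutive gfns, if dtt_iter.gfn is 0x400 and the target gfn is 0x402,
// the fast path just advances dtt_iter.ptr by two 4-byte entries instead of
// re-walking the table.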
fn gfn_to_dtt_pte(
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<*const AtomicU32> {
    let ptr = if dtt_iter.ptr.is_null()
        || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
    {
        // Slow path to walk the DTT to get the pte entry
        let mut level = dtt_level;
        let mut pt_gpa = dtt_root;
        let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;

        while level != 1 {
            let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
            let parent_pt = mem
                .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
                .context(Error::GetDTTEntry)?;

            if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
                bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
            }

            pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
            level -= 1;
        }

        let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;

        mem.get_host_address(GuestAddress(pt_gpa + index))
            .context(Error::GetDTTEntry)?
    } else if gfn > dtt_iter.gfn {
        // SAFETY:
        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
        // means the calculated ptr will point to the same page as dtt_iter.ptr
        unsafe {
            dtt_iter
                .ptr
                .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
        }
    } else {
        // SAFETY:
        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
        // means the calculated ptr will point to the same page as dtt_iter.ptr
        unsafe {
            dtt_iter
                .ptr
                .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
        }
    };

    dtt_iter.ptr = ptr;
    dtt_iter.gfn = gfn;

    Ok(ptr as *const AtomicU32)
}

fn pin_page(
    pinstate: &mut CoIommuPinState,
    policy: CoIommuUnpinPolicy,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<()> {
    let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;

    let gpa = gfn << PAGE_SHIFT_4K;
    let host_addr = mem
        .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
        .context("failed to get host address")? as u64;

    // SAFETY:
    // Safe because the pointer is valid, guaranteed by gfn_to_dtt_pte.
    // Test the PINNED flag.
    if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
        info!("CoIommu: gfn 0x{:x} already pinned", gfn);
        return Ok(());
    }

    // SAFETY:
    // Safe because the gpa is valid from gfn_to_dtt_pte and the host_addr
    // is guaranteed by the MemoryMapping interface.
    if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
        // SAFETY:
        // Safe because the pointer is valid, guaranteed by gfn_to_dtt_pte.
        // Set the PINNED flag.
        unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
        if policy == CoIommuUnpinPolicy::Lru {
            pinstate
                .new_gen_pinned_pages
                .push_back(PinnedPageInfo::new(gfn, 0));
        }
    }

    Ok(())
}

#[derive(PartialEq, Debug, Eq)]
enum UnpinResult {
    UnpinlistEmpty,
    Unpinned,
    NotPinned,
    NotUnpinned,
    FailedUnpin,
    UnpinParked,
}

fn unpin_page(
    pinstate: &mut CoIommuPinState,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
    force: bool,
) -> UnpinResult {
    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
        return UnpinResult::UnpinParked;
    }

    let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
        Ok(v) => v,
        Err(_) => {
            // The force == true case may try to unpin a page which is not
            // mapped in the dtt. For such a page the pte doesn't exist yet,
            // so there is no need to log an error.
            // The force == false case is used by coiommu to periodically
            // unpin pages which have been mapped in the dtt, so the pte for
            // such a page does exist. However, with an unpin request from
            // the virtio balloon, such pages may already be unpinned and the
            // DTT pages might have been reclaimed by the guest OS kernel as
            // well, so it is also possible to end up here. Don't log an
            // error in that case either.
            return UnpinResult::NotPinned;
        }
    };

    if force {
        // SAFETY:
        // Safe because leaf_entry is valid, guaranteed by gfn_to_dtt_pte.
        // This case is for the balloon to evict pages, so these pages have
        // already been locked by the balloon and no device driver in the VM
        // is able to access them. Just clear the ACCESSED flag first to make
        // sure the following unpin can succeed.
        unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
    }

    // SAFETY:
    // Safe because leaf_entry is valid, guaranteed by gfn_to_dtt_pte.
    if let Err(entry) = unsafe {
        (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
    } {
        // The compare_exchange failed because the original leaf entry was
        // not exactly DTTE_PINNED_FLAG, so the unpin cannot be done.
        if entry == 0 {
            // The GFN is already unpinned. This is very similar to the
            // gfn_to_dtt_pte error case, with the only difference being
            // that the dtt_pte happens to be on a present page table.
            UnpinResult::NotPinned
        } else {
            if !force {
                // SAFETY:
                // Safe because leaf_entry is valid, guaranteed by gfn_to_dtt_pte.
                // The ACCESSED flag is set by the guest when it requires a DMA
                // mapping for this page; it represents whether the page has
                // been touched by the guest. By clearing this flag after an
                // unpin attempt, we can detect whether the page has been
                // touched by the guest before the next round of unpin work.
                // If the ACCESSED flag is set again at the next round,
                // unpinning this page will fail and we will be here again to
                // clear the flag. If the flag is not set at the next round,
                // unpinning this page will probably succeed.
                unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
            } else {
                // If we're here, then the guest is trying to release a page via
                // the balloon that it still has pinned, which most likely means
                // something is wrong in the guest kernel. Just leave the page
                // pinned and log an error.
                // This failure blocks the balloon from removing the page, which
                // ensures that the guest's view of memory remains consistent
                // with device DMA's view of memory. Also note that the host
                // kernel maintains an elevated refcount for pinned pages, which
                // is a second guarantee that pages accessible by device DMA
                // won't be freed until after they are unpinned.
                error!(
                    "CoIommu: force case cannot unpin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
            }
            // The GFN cannot be unpinned, either because the entry's map
            // count is non-zero or because it has the ACCESSED flag set.
            UnpinResult::NotUnpinned
        }
    } else {
        // The compare_exchange succeeded: the original leaf entry was
        // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the page.
        let gpa = gfn << PAGE_SHIFT_4K;
        if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
            UnpinResult::Unpinned
        } else {
            // SAFETY:
            // Safe because leaf_entry is valid, guaranteed by gfn_to_dtt_pte.
            // Make sure the PINNED flag stays set.
            unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
            // This gfn needs to be put back into the pinned vector.
            UnpinResult::FailedUnpin
        }
    }
}

struct PinWorker {
    mem: GuestMemory,
    endpoints: Vec<u16>,
    notifymap_mmap: Arc<MemoryMapping>,
    dtt_level: u64,
    dtt_root: u64,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl PinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuPinWorker"
    }

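    // Pin request flow, as implemented in run() below: the guest stores its
    // request in the per-vcpu notifymap slot (BAR2), then writes the matching
    // notify register (BAR0), firing the ioevent. The worker pins the
    // requested pages and writes 0 back to the slot to signal completion.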
    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            Kill,
            Pin { index: usize },
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        for (index, event) in self.ioevents.iter().enumerate() {
            match wait_ctx.add(event, Token::Pin { index }) {
                Ok(_) => {}
                Err(e) => {
                    error!(
                        "{}: failed to add ioevent for index {}: {}",
                        self.debug_label(),
                        index,
                        e
                    );
                    return;
                }
            }
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => break 'wait,
                    Token::Pin { index } => {
                        let offset = index * mem::size_of::<u64>();
                        if let Some(event) = self.ioevents.get(index) {
                            if let Err(e) = event.wait() {
                                error!(
                                    "{}: failed reading event {}: {}",
                                    self.debug_label(),
                                    index,
                                    e
                                );
                                self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                                break 'wait;
                            }
                        }
                        if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
                            if let Err(e) = self.pin_pages(data) {
                                error!("{}: {}", self.debug_label(), e);
                            }
                        }
                        fence(Ordering::SeqCst);
                        self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                    }
                }
            }
        }
    }

    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
        let pin_page_info = self
            .mem
            .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
            .context("failed to get pin page info")?;

        let bdf = pin_page_info.bdf;
        ensure!(
            self.endpoints.iter().any(|&x| x == bdf),
            "pin page for unexpected bdf 0x{:x}",
            bdf
        );

        let mut nr_pages = pin_page_info.nr_pages;
        let mut offset = mem::size_of::<PinPageInfo>() as u64;
        let mut dtt_iter: DTTIter = Default::default();
        let mut pinstate = self.pinstate.lock();
        while nr_pages > 0 {
            let gfn = self
                .mem
                .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
                .context("failed to get pin page gfn")?;

            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )?;

            offset += mem::size_of::<u64>() as u64;
            nr_pages -= 1;
        }

        Ok(())
    }

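    // A non-batch notify value packs the requesting device's bdf into the
    // low 16 bits and the gfn into the upper bits. If bit 63
    // (PIN_PAGES_IN_BATCH) is set, the rest of the value is instead the gpa
    // of a PinPageInfo structure followed by the list of gfns to pin.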
    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
        if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
            let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
            self.pin_pages_in_batch(gpa)
        } else {
            let bdf = (gfn_bdf & 0xffff) as u16;
            let gfn = gfn_bdf >> 16;
            let mut dtt_iter: DTTIter = Default::default();
            ensure!(
                self.endpoints.iter().any(|&x| x == bdf),
                "pin page for unexpected bdf 0x{:x}",
                bdf
            );

            let mut pinstate = self.pinstate.lock();
            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )
        }
    }
}

struct UnpinWorker {
    mem: GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    vfio_container: Arc<Mutex<VfioContainer>>,
    unpin_tube: Option<Tube>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
    unpin_gen_threshold: u64,
}

impl UnpinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuUnpinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(EventToken)]
        enum Token {
            UnpinTimer,
            UnpinReq,
            Kill,
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        if let Some(tube) = &self.unpin_tube {
            if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
        }

        let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
            && !self.params.unpin_interval.is_zero()
        {
            let mut timer = match Timer::new() {
                Ok(t) => t,
                Err(e) => {
                    error!(
                        "{}: failed to create the unpin timer: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
            if let Err(e) = timer.reset_repeating(self.params.unpin_interval) {
                error!(
                    "{}: failed to start the unpin timer: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
            Some(timer)
        } else {
            None
        };

        let unpin_tube = self.unpin_tube.take();
        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::UnpinTimer => {
                        self.unpin_pages();
                        if let Some(timer) = &mut unpin_timer {
                            if let Err(e) = timer.mark_waited() {
                                error!(
                                    "{}: failed to clear unpin timer: {}",
                                    self.debug_label(),
                                    e
                                );
                                break 'wait;
                            }
                        }
                    }
                    Token::UnpinReq => {
                        if let Some(tube) = &unpin_tube {
                            match tube.recv::<UnpinRequest>() {
                                Ok(req) => {
                                    let mut unpin_done = true;
                                    for range in req.ranges {
                                        // Locking with respect to pin_pages isn't necessary
                                        // for this case because the unpinned pages in the range
                                        // should all be in the balloon and so nothing will attempt
                                        // to pin them.
                                        if !self.unpin_pages_in_range(range.0, range.1) {
                                            unpin_done = false;
                                            break;
                                        }
                                    }
                                    let resp = if unpin_done {
                                        UnpinResponse::Success
                                    } else {
                                        UnpinResponse::Failed
                                    };
                                    if let Err(e) = tube.send(&resp) {
                                        error!(
                                            "{}: failed to send unpin response {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                                Err(e) => {
                                    if let TubeError::Disconnected = e {
                                        if let Err(e) = wait_ctx.delete(tube) {
                                            error!(
                                                "{}: failed to remove unpin_tube: {}",
                                                self.debug_label(),
                                                e
                                            );
                                        }
                                    } else {
                                        error!(
                                            "{}: failed to recv Unpin Request: {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
        self.unpin_tube = unpin_tube;
    }

    fn unpin_pages(&mut self) {
        if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
            self.lru_unpin_pages();
        }
    }

    fn lru_unpin_page(
        &mut self,
        dtt_iter: &mut DTTIter,
        new_gen: bool,
    ) -> (UnpinResult, Option<PinnedPageInfo>) {
        let mut pinstate = self.pinstate.lock();
        let pageinfo = if new_gen {
            pinstate.new_gen_pinned_pages.pop_front()
        } else {
            pinstate
                .old_gen_pinned_pages
                .pop_front()
                .map(|gfn| PinnedPageInfo::new(gfn, 0))
        };

        pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
            (
                unpin_page(
                    &mut pinstate,
                    &self.vfio_container,
                    &self.mem,
                    self.dtt_level,
                    self.dtt_root,
                    dtt_iter,
                    pageinfo.gfn,
                    false,
                ),
                Some(pageinfo),
            )
        })
    }

    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
        let mut not_unpinned_new_gen_pages = VecDeque::new();
        let mut not_unpinned_old_gen_pages = VecDeque::new();
        let mut unpinned_count = 0;
        let has_limit = unpin_limit.is_some();
        let limit_count = unpin_limit.unwrap_or(0);
        let mut dtt_iter: DTTIter = Default::default();

        // If has_limit is true but limit_count is 0, no unpinning is done.
        while !has_limit || unpinned_count != limit_count {
            let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
            match result {
                UnpinResult::UnpinlistEmpty => break,
                UnpinResult::Unpinned => unpinned_count += 1,
                UnpinResult::NotPinned => {}
                UnpinResult::NotUnpinned => {
                    if let Some(mut page) = pinned_page {
                        if self.params.unpin_gen_threshold != 0 {
                            page.unpin_busy_cnt += 1;
                            // The page came from the new_gen queue but wasn't
                            // successfully unpinned, so check the unpin_gen
                            // threshold and move it to the old_gen queue once
                            // the threshold is reached. Pages that didn't
                            // come from the new_gen queue go directly back to
                            // the old_gen queue.
                            if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
                                not_unpinned_old_gen_pages.push_back(page.gfn);
                            } else {
                                not_unpinned_new_gen_pages.push_back(page);
                            }
                        }
                    }
                }
                UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Although UnpinParked means we didn't actually try to unpin
                    // the gfn, it's not worth handling specifically since parking
                    // is expected to be relatively rare.
                    if let Some(page) = pinned_page {
                        if new_gen {
                            not_unpinned_new_gen_pages.push_back(page);
                        } else {
                            not_unpinned_old_gen_pages.push_back(page.gfn);
                        }
                    }
                    if result == UnpinResult::UnpinParked {
                        thread::park();
                    }
                }
            }
        }

        if !not_unpinned_new_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .new_gen_pinned_pages
                .append(&mut not_unpinned_new_gen_pages);
        }

        if !not_unpinned_old_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .old_gen_pinned_pages
                .append(&mut not_unpinned_old_gen_pages);
        }

        unpinned_count
    }

    fn lru_unpin_pages(&mut self) {
        let mut unpin_count = 0;
        if self.params.unpin_gen_threshold != 0 {
            self.unpin_gen_threshold += 1;
            if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
                self.unpin_gen_threshold = 0;
                // Try to unpin the inactive (old_gen) queue first once the
                // threshold is reached.
                unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
            }
        }
        // Unpin the new_gen queue with the unpin_limit reduced by however many
        // pages were just unpinned from the old_gen queue.
        self.lru_unpin_pages_in_loop(
            self.params
                .unpin_limit
                .map(|limit| limit.saturating_sub(unpin_count)),
            true,
        );
    }

    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
        let mut dtt_iter: DTTIter = Default::default();
        let mut index = 0;
        while index != count {
            let mut pinstate = self.pinstate.lock();
            let result = unpin_page(
                &mut pinstate,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn + index,
                true,
            );
            drop(pinstate);

            match result {
                UnpinResult::Unpinned | UnpinResult::NotPinned => {}
                UnpinResult::UnpinParked => {
                    thread::park();
                    continue;
                }
                _ => {
                    error!("coiommu: force unpin failed by {:?}", result);
                    return false;
                }
            }
            index += 1;
        }
        true
    }
}

pub struct CoIommuDev {
    config_regs: PciConfiguration,
    pci_address: Option<PciAddress>,
    mem: GuestMemory,
    coiommu_reg: CoIommuReg,
    endpoints: Vec<u16>,
    notifymap_mem: SafeDescriptor,
    notifymap_mmap: Arc<MemoryMapping>,
    notifymap_addr: Option<u64>,
    topologymap_mem: SafeDescriptor,
    topologymap_addr: Option<u64>,
    mmapped: bool,
    vm_memory_client: VmMemoryClient,
    pin_thread: Option<WorkerThread<PinWorker>>,
    unpin_thread: Option<WorkerThread<UnpinWorker>>,
    unpin_tube: Option<Tube>,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl CoIommuDev {
    pub fn new(
        mem: GuestMemory,
        vfio_container: Arc<Mutex<VfioContainer>>,
        vm_memory_client: VmMemoryClient,
        unpin_tube: Option<Tube>,
        endpoints: Vec<u16>,
        vcpu_count: u64,
        params: CoIommuParameters,
    ) -> Result<Self> {
        let config_regs = PciConfiguration::new(
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            PciClassCode::Other,
            &PciOtherSubclass::Other,
            None, // No Programming interface.
            PciHeaderType::Device,
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            COIOMMU_REVISION_ID,
        );

        // notifymap_mem is used as BAR2, which the guest reads to check
        // whether a request has been completed by CoIOMMU.
        let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
            .context(Error::CreateSharedMemory)?;
        let notifymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
                .from_shared_memory(&notifymap_mem)
                .offset(0)
                .build()?,
        );

        // topologymap_mem is used as BAR4, which the guest reads to discover
        // which devices sit behind CoIOMMU.
        let topologymap_mem =
            SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
                .context(Error::CreateSharedMemory)?;
        let topologymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
                .from_shared_memory(&topologymap_mem)
                .offset(0)
                .build()?,
        );

        ensure!(
            (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
            "Coiommu: too many endpoints"
        );
        topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
        for (index, endpoint) in endpoints.iter().enumerate() {
            topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
        }

        let mut ioevents = Vec::new();
        for _ in 0..vcpu_count {
            ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
        }

        Ok(Self {
            config_regs,
            pci_address: None,
            mem,
            coiommu_reg: Default::default(),
            endpoints,
            notifymap_mem: notifymap_mem.into(),
            notifymap_mmap,
            notifymap_addr: None,
            topologymap_mem: topologymap_mem.into(),
            topologymap_addr: None,
            mmapped: false,
            vm_memory_client,
            pin_thread: None,
            unpin_thread: None,
            unpin_tube,
            ioevents,
            vfio_container,
            pinstate: Arc::new(Mutex::new(CoIommuPinState {
                new_gen_pinned_pages: VecDeque::new(),
                old_gen_pinned_pages: VecDeque::new(),
                unpin_thread_state: UnpinThreadState::Unparked,
                unpin_park_count: 0,
            })),
            params,
        })
    }

    fn register_mmap(
        &self,
        descriptor: SafeDescriptor,
        size: usize,
        offset: u64,
        gpa: u64,
        prot: Protection,
    ) -> Result<()> {
        let _region = self
            .vm_memory_client
            .register_memory(
                VmMemorySource::Descriptor {
                    descriptor,
                    offset,
                    size: size as u64,
                },
                VmMemoryDestination::GuestPhysicalAddress(gpa),
                prot,
                MemCacheType::CacheCoherent,
            )
            .context("register_mmap register_memory failed")?;
        Ok(())
    }

    fn mmap(&mut self) {
        if self.mmapped {
            return;
        }

        if let Some(gpa) = self.notifymap_addr {
            match self.register_mmap(
                self.notifymap_mem.try_clone().unwrap(),
                COIOMMU_NOTIFYMAP_SIZE,
                0,
                gpa,
                Protection::read_write(),
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map notifymap failed: {}", self.debug_label(), e);
                }
            }
        }

        if let Some(gpa) = self.topologymap_addr {
            match self.register_mmap(
                self.topologymap_mem.try_clone().unwrap(),
                COIOMMU_TOPOLOGYMAP_SIZE,
                0,
                gpa,
                Protection::read(),
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map topologymap failed: {}", self.debug_label(), e);
                }
            }
        }

        self.mmapped = true;
    }

    fn start_workers(&mut self) {
        if self.pin_thread.is_none() {
            self.start_pin_thread();
        }

        if self.unpin_thread.is_none() {
            self.start_unpin_thread();
        }
    }

    fn start_pin_thread(&mut self) {
        let mem = self.mem.clone();
        let endpoints = self.endpoints.to_vec();
        let notifymap_mmap = self.notifymap_mmap.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let ioevents: Vec<Event> = self
            .ioevents
            .iter()
            .map(|e| e.try_clone().unwrap())
            .collect();

        let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR);
        let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
        for (i, evt) in self.ioevents.iter().enumerate() {
            self.vm_memory_client
                .register_io_event(
                    evt.try_clone().expect("failed to clone event"),
                    notify_base + i as u64,
                    Datamatch::AnyLength,
                )
                .expect("failed to register ioevent");
        }

        let vfio_container = self.vfio_container.clone();
        let pinstate = self.pinstate.clone();
        let params = self.params;

        self.pin_thread = Some(WorkerThread::start("coiommu_pin", move |kill_evt| {
            let mut worker = PinWorker {
                mem,
                endpoints,
                notifymap_mmap,
                dtt_root,
                dtt_level,
                ioevents,
                vfio_container,
                pinstate,
                params,
            };
            worker.run(kill_evt);
            worker
        }));
    }

    fn start_unpin_thread(&mut self) {
        let mem = self.mem.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let vfio_container = self.vfio_container.clone();
        let unpin_tube = self.unpin_tube.take();
        let pinstate = self.pinstate.clone();
        let params = self.params;
        self.unpin_thread = Some(WorkerThread::start("coiommu_unpin", move |kill_evt| {
            let mut worker = UnpinWorker {
                mem,
                dtt_level,
                dtt_root,
                vfio_container,
                unpin_tube,
                pinstate,
                params,
                unpin_gen_threshold: 0,
            };
            worker.run(kill_evt);
            worker
        }));
    }

    fn allocate_bar_address(
        &mut self,
        resources: &mut SystemAllocator,
        address: PciAddress,
        size: u64,
        bar_num: u8,
        name: &str,
    ) -> PciResult<u64> {
        let addr = resources
            .allocate_mmio(
                size,
                Alloc::PciBar {
                    bus: address.bus,
                    dev: address.dev,
                    func: address.func,
                    bar: bar_num,
                },
                name.to_string(),
                AllocOptions::new().prefetchable(true).align(size),
            )
            .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;

        let bar = PciBarConfiguration::new(
            bar_num as usize,
            size,
            PciBarRegionType::Memory64BitRegion,
            PciBarPrefetchable::Prefetchable,
        )
        .set_address(addr);

        self.config_regs
            .add_pci_bar(bar)
            .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;

        Ok(addr)
    }

    fn read_mmio(&mut self, offset: u64, data: &mut [u8]) {
        if offset >= mem::size_of::<CoIommuReg>() as u64 {
            error!(
                "{}: read_mmio: invalid offset 0x{:x}",
                self.debug_label(),
                offset
            );
            return;
        }

        // Sanity check: accesses must be 64-bit aligned.
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: read_mmio: unaligned access: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let v = match offset / 8 {
            0 => self.coiommu_reg.dtt_root,
            1 => self.coiommu_reg.cmd,
            2 => self.coiommu_reg.dtt_level,
            _ => return,
        };

        data.copy_from_slice(&v.to_ne_bytes());
    }

    fn write_mmio(&mut self, offset: u64, data: &[u8]) {
        let mmio_len = mem::size_of::<CoIommuReg>() as u64;
        if offset >= mmio_len {
            if data.len() != 1 {
                error!(
                    "{}: write_mmio: unaligned access: offset 0x{:x} actual len {} expect len 1",
                    self.debug_label(),
                    offset,
                    data.len()
                );
                return;
            }

            // Execution usually won't reach here, because writes to the
            // per-vcpu notify registers are monitored by the ioevents. Notify
            // registers not covered by the ioevents are not used by the
            // frontend driver. In case the frontend driver does end up here,
            // handle the write simply so that the driver won't be blocked,
            // and emit an error log.
            let index = (offset - mmio_len) as usize;
            if let Some(event) = self.ioevents.get(index) {
                let _ = event.signal();
            } else {
                self.notifymap_mmap
                    .write_obj::<u64>(0, index * mem::size_of::<u64>())
                    .unwrap();
                error!(
                    "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
                    self.debug_label(),
                    offset
                );
            }
            return;
        }

        // Sanity check: CoIommuReg accesses must be 64-bit aligned.
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: write_mmio: unaligned access: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let index = offset / 8;
        let v = u64::from_ne_bytes(data.try_into().unwrap());
        match index {
            0 => {
                if self.coiommu_reg.dtt_root == 0 {
                    self.coiommu_reg.dtt_root = v;
                }
            }
            1 => match v {
                // Deactivate can happen if the frontend driver in the guest
                // fails during probing or if the CoIommu device is removed
                // by the guest. Neither of these cases is expected, and if
                // either happens the guest will be non-functional due to
                // pass-through devices which rely on CoIommu not working.
                // So just fail hard and panic.
                COIOMMU_CMD_DEACTIVATE => {
                    panic!("{}: Deactivate is not supported", self.debug_label())
                }
                COIOMMU_CMD_ACTIVATE => {
                    if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
                        self.start_workers();
                    }
                }
                COIOMMU_CMD_PARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    pinstate.unpin_thread_state = UnpinThreadState::Parked;
                    if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
                        pinstate.unpin_park_count = v;
                    } else {
                        panic!("{}: Park request count overflowed", self.debug_label());
                    }
                }
                COIOMMU_CMD_UNPARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
                        if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
                            pinstate.unpin_park_count = v;
                            if pinstate.unpin_park_count == 0 {
                                if let Some(worker_thread) = &self.unpin_thread {
                                    worker_thread.thread().unpark();
                                }
                                pinstate.unpin_thread_state = UnpinThreadState::Unparked;
                            }
                        } else {
                            error!("{}: Park count has already reached 0", self.debug_label());
                        }
                    }
                }
                _ => {}
            },
            2 => {
                if self.coiommu_reg.dtt_level == 0 {
                    self.coiommu_reg.dtt_level = v;
                }
            }
            _ => {}
        }
    }
}

1437 impl PciDevice for CoIommuDev {
debug_label(&self) -> String1438     fn debug_label(&self) -> String {
1439         "CoIommu".to_owned()
1440     }
1441 
allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress>1442     fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
1443         if self.pci_address.is_none() {
1444             self.pci_address = resources.allocate_pci(0, self.debug_label());
1445         }
1446         self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1447     }
1448 
    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_io_bars");

        // Allocate the MMIO bar, which holds the CoIommuReg registers
        // followed by the per-vcpu notify trigger registers.
        let mut ranges: Vec<BarRange> = Vec::new();

        let mmio_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_MMIO_BAR_SIZE,
            COIOMMU_MMIO_BAR as u8,
            "coiommu-mmiobar",
        )?;

        ranges.push(BarRange {
            addr: mmio_addr,
            size: COIOMMU_MMIO_BAR_SIZE,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> PciResult<Vec<BarRange>> {
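        // The topologymap and notifymap bars back shared memory regions
        // rather than trapped MMIO, so they are allocated here as device
        // bars, separate from the register MMIO bar.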
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let mut ranges: Vec<BarRange> = Vec::new();

        let topologymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_TOPOLOGYMAP_SIZE as u64,
            COIOMMU_TOPOLOGYMAP_BAR,
            "coiommu-topology",
        )?;
        self.topologymap_addr = Some(topologymap_addr);
        ranges.push(BarRange {
            addr: topologymap_addr,
            size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
            prefetchable: false,
        });

        let notifymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_NOTIFYMAP_SIZE as u64,
            COIOMMU_NOTIFYMAP_BAR as u8,
            "coiommu-notifymap",
        )?;
        self.notifymap_addr = Some(notifymap_addr);
        ranges.push(BarRange {
            addr: notifymap_addr,
            size: COIOMMU_NOTIFYMAP_SIZE as u64,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        self.config_regs.read_reg(reg_idx)
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
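        // The first time the guest enables memory-space decoding in the PCI
        // command register, map the shared memory regions into the guest
        // (see mmap()); by this point the BAR addresses have been assigned.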
        if reg_idx == COMMAND_REG
            && data.len() == 2
            && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
            && !self.mmapped
        {
            self.mmap();
        }

        self.config_regs.write_reg(reg_idx, offset, data);
    }

    fn keep_rds(&self) -> Vec<RawDescriptor> {
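        // Descriptors that must stay open when the device is moved into its
        // sandboxed process.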
        let mut rds = vec![
            self.vfio_container.lock().as_raw_descriptor(),
            self.vm_memory_client.as_raw_descriptor(),
            self.notifymap_mem.as_raw_descriptor(),
            self.topologymap_mem.as_raw_descriptor(),
        ];
        if let Some(unpin_tube) = &self.unpin_tube {
            rds.push(unpin_tube.as_raw_descriptor());
        }
        rds.extend(self.ioevents.iter().map(Event::as_raw_descriptor));
        rds
    }

    fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
        match bar_index {
            COIOMMU_MMIO_BAR => self.read_mmio(offset, data),
            COIOMMU_NOTIFYMAP_BAR => {
                // While the coiommu device is activated, accesses to the
                // notifymap bar do not cause a vmexit. Reaching this point
                // means the device is deactivated and not doing any
                // pin/unpin work, so this notifymap read does not need to
                // be handled.
            }
            _ => {}
        }
    }

    fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
        match bar_index {
            COIOMMU_MMIO_BAR => self.write_mmio(offset, data),
            COIOMMU_NOTIFYMAP_BAR => {
                // While the coiommu device is activated, accesses to the
                // notifymap bar do not cause a vmexit. Reaching this point
                // means the device is deactivated and not doing any
                // pin/unpin work, so this notifymap write does not need to
                // be handled.
            }
            _ => {}
        }
    }

    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        self.config_regs.get_bar_configuration(bar_num)
    }
}

impl Suspendable for CoIommuDev {}