1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 //! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
6 //! which provides fine-grained pinning for VFIO PCI passthrough devices
7 //! so that the hypervisor doesn't need to pin the entire VM's memory,
8 //! improving memory utilization. CoIOMMU doesn't provide intra-guest
9 //! protection, so it can only be used for TRUSTED passthrough devices.
10 //!
11 //! CoIOMMU is presented at KVM forum 2020:
12 //! <https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative-dma-buffer-tracking-yu-zhang-intel>
13 //!
14 //! Also presented at usenix ATC20:
15 //! <https://www.usenix.org/conference/atc20/presentation/tian>
16
17 use std::collections::VecDeque;
18 use std::convert::TryInto;
19 use std::default::Default;
20 use std::fmt;
21 use std::mem;
22 use std::panic;
23 use std::sync::atomic::fence;
24 use std::sync::atomic::AtomicU32;
25 use std::sync::atomic::Ordering;
26 use std::sync::Arc;
27 use std::thread;
28 use std::time::Duration;
29
30 use anyhow::bail;
31 use anyhow::ensure;
32 use anyhow::Context;
33 use anyhow::Result;
34 use base::error;
35 use base::info;
36 use base::AsRawDescriptor;
37 use base::Event;
38 use base::EventToken;
39 use base::MemoryMapping;
40 use base::MemoryMappingBuilder;
41 use base::Protection;
42 use base::RawDescriptor;
43 use base::SafeDescriptor;
44 use base::SharedMemory;
45 use base::Timer;
46 use base::TimerTrait;
47 use base::Tube;
48 use base::TubeError;
49 use base::WaitContext;
50 use base::WorkerThread;
51 use hypervisor::Datamatch;
52 use hypervisor::MemCacheType;
53 use resources::Alloc;
54 use resources::AllocOptions;
55 use resources::SystemAllocator;
56 use serde::Deserialize;
57 use serde::Deserializer;
58 use serde::Serialize;
59 use serde_keyvalue::FromKeyValues;
60 use sync::Mutex;
61 use thiserror::Error as ThisError;
62 use vm_control::api::VmMemoryClient;
63 use vm_control::VmMemoryDestination;
64 use vm_control::VmMemorySource;
65 use vm_memory::GuestAddress;
66 use vm_memory::GuestMemory;
67 use zerocopy::FromBytes;
68 use zerocopy::IntoBytes;
69
70 use crate::pci::pci_configuration::PciBarConfiguration;
71 use crate::pci::pci_configuration::PciBarPrefetchable;
72 use crate::pci::pci_configuration::PciBarRegionType;
73 use crate::pci::pci_configuration::PciClassCode;
74 use crate::pci::pci_configuration::PciConfiguration;
75 use crate::pci::pci_configuration::PciHeaderType;
76 use crate::pci::pci_configuration::PciOtherSubclass;
77 use crate::pci::pci_configuration::COMMAND_REG;
78 use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
79 use crate::pci::pci_device::BarRange;
80 use crate::pci::pci_device::PciDevice;
81 use crate::pci::pci_device::Result as PciResult;
82 use crate::pci::PciAddress;
83 use crate::pci::PciBarIndex;
84 use crate::pci::PciDeviceError;
85 use crate::vfio::VfioContainer;
86 use crate::Suspendable;
87 use crate::UnpinRequest;
88 use crate::UnpinResponse;
89
90 const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
91 const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
92 const COIOMMU_CMD_DEACTIVATE: u64 = 0;
93 const COIOMMU_CMD_ACTIVATE: u64 = 1;
94 const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
95 const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
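// Command register protocol (see write_mmio below): the guest first programs
// dtt_root and dtt_level (each is only writable while still zero), then writes
// COIOMMU_CMD_ACTIVATE to start the pin/unpin workers. PARK_UNPIN/UNPARK_UNPIN
// maintain a nested park count used to pause the unpin worker; DEACTIVATE is
// not supported and panics.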
96 const COIOMMU_REVISION_ID: u8 = 0x10;
97 const COIOMMU_MMIO_BAR: PciBarIndex = 0;
98 const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
99 const COIOMMU_NOTIFYMAP_BAR: PciBarIndex = 2;
100 const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
101 const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
102 const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
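// BAR layout: BAR0 (MMIO) carries the CoIommuReg registers plus the per-vcpu
// notify registers; BAR2 (notifymap) is shared memory the guest uses to post
// pin requests and poll for their completion; BAR4 (topologymap) publishes the
// endpoint count followed by each endpoint's BDF as u16 values.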
103 const PAGE_SIZE_4K: u64 = 4096;
104 const PAGE_SHIFT_4K: u64 = 12;
105 const PIN_PAGES_IN_BATCH: u64 = 1 << 63;
106
107 const DTTE_PINNED_FLAG: u32 = 1 << 31;
108 const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
109 const DTT_ENTRY_PRESENT: u64 = 1;
110 const DTT_ENTRY_PFN_SHIFT: u64 = 12;
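// Entry layout implied by the walker in gfn_to_dtt_pte: non-leaf DTT entries
// are u64 values with bit 0 as the present bit and the PFN of the next-level
// table stored from bit 12 upward; leaf entries are u32 values carrying the
// PINNED (bit 31) and ACCESSED (bit 30) flags.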
111
112 #[derive(ThisError, Debug)]
113 enum Error {
114 #[error("CoIommu failed to create shared memory")]
115 CreateSharedMemory,
116 #[error("Failed to get DTT entry")]
117 GetDTTEntry,
118 }
119
120 // Default interval is 60s.
121 const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
122 const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
123 /// Holds the coiommu unpin policy
124 #[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
125 #[serde(rename_all = "kebab-case")]
126 pub enum CoIommuUnpinPolicy {
127 #[default]
128 Off,
129 Lru,
130 }
131
132 impl fmt::Display for CoIommuUnpinPolicy {
133 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
134 use self::CoIommuUnpinPolicy::*;
135
136 match self {
137 Off => write!(f, "off"),
138 Lru => write!(f, "lru"),
139 }
140 }
141 }
142
143 fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
144 deserializer: D,
145 ) -> Result<Duration, D::Error> {
146 let secs = u64::deserialize(deserializer)?;
147
148 Ok(Duration::from_secs(secs))
149 }
150
151 fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
152 deserializer: D,
153 ) -> Result<Option<u64>, D::Error> {
154 let limit = u64::deserialize(deserializer)?;
155
156 match limit {
157 0 => Err(serde::de::Error::custom(
158 "Please use non-zero unpin_limit value",
159 )),
160 limit => Ok(Some(limit)),
161 }
162 }
163
164 fn unpin_interval_default() -> Duration {
165 UNPIN_DEFAULT_INTERVAL
166 }
167
168 fn unpin_gen_threshold_default() -> u64 {
169 UNPIN_GEN_DEFAULT_THRES
170 }
171
172 /// Holds the parameters for a coiommu device
173 #[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
174 #[serde(deny_unknown_fields)]
175 pub struct CoIommuParameters {
176 #[serde(default)]
177 pub unpin_policy: CoIommuUnpinPolicy,
178 #[serde(
179 deserialize_with = "deserialize_unpin_interval",
180 default = "unpin_interval_default"
181 )]
182 pub unpin_interval: Duration,
183 #[serde(deserialize_with = "deserialize_unpin_limit", default)]
184 pub unpin_limit: Option<u64>,
185 // Number of unpin intervals a pinned page must be busy for to be aged into the
186 // older, less frequently checked generation.
187 #[serde(default = "unpin_gen_threshold_default")]
188 pub unpin_gen_threshold: u64,
189 }
190
191 impl Default for CoIommuParameters {
192 fn default() -> Self {
193 Self {
194 unpin_policy: CoIommuUnpinPolicy::Off,
195 unpin_interval: UNPIN_DEFAULT_INTERVAL,
196 unpin_limit: None,
197 unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
198 }
199 }
200 }
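// A minimal sketch (not part of the original file) showing how the key=value
// syntax maps onto CoIommuParameters. It assumes the serde_keyvalue crate's
// from_key_values() entry point; the field names and defaults come from the
// struct above.
#[cfg(test)]
mod coiommu_params_sketch {
    use std::time::Duration;

    use super::*;

    #[test]
    fn parse_key_values() {
        let params: CoIommuParameters = serde_keyvalue::from_key_values(
            "unpin_policy=lru,unpin_interval=120,unpin_limit=8,unpin_gen_threshold=4",
        )
        .expect("failed to parse coiommu parameters");
        assert_eq!(params.unpin_policy, CoIommuUnpinPolicy::Lru);
        assert_eq!(params.unpin_interval, Duration::from_secs(120));
        assert_eq!(params.unpin_limit, Some(8));
        assert_eq!(params.unpin_gen_threshold, 4);
    }
}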
201
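// MMIO layout of BAR0 as implemented by read_mmio/write_mmio: offset 0x0 holds
// dtt_root, 0x8 holds cmd and 0x10 holds dtt_level, each a 64-bit register.
// The per-vcpu notify registers monitored via ioevents start immediately after
// this structure (see start_pin_thread).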
202 #[derive(Default, Debug, Copy, Clone)]
203 struct CoIommuReg {
204 dtt_root: u64,
205 cmd: u64,
206 dtt_level: u64,
207 }
208
209 #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
210 struct PinnedPageInfo {
211 gfn: u64,
212 unpin_busy_cnt: u64,
213 }
214
215 impl PinnedPageInfo {
216 fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
217 PinnedPageInfo {
218 gfn,
219 unpin_busy_cnt,
220 }
221 }
222 }
223
224 #[derive(PartialEq, Debug, Eq)]
225 enum UnpinThreadState {
226 Unparked,
227 Parked,
228 }
229
230 struct CoIommuPinState {
231 new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
232 old_gen_pinned_pages: VecDeque<u64>,
233 unpin_thread_state: UnpinThreadState,
234 unpin_park_count: u64,
235 }
236
237 unsafe fn vfio_map(
238 vfio_container: &Arc<Mutex<VfioContainer>>,
239 iova: u64,
240 size: u64,
241 user_addr: u64,
242 ) -> bool {
243 match vfio_container
244 .lock()
245 .vfio_dma_map(iova, size, user_addr, true)
246 {
247 Ok(_) => true,
248 Err(e) => {
249 if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
250 if errno == libc::EEXIST {
251 // Already mapped; report success so the caller sets the PINNED flag.
252 error!("CoIommu: iova 0x{:x} already pinned", iova);
253 return true;
254 }
255 }
256 error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
257 false
258 }
259 }
260 }
261
262 fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
263 match vfio_container.lock().vfio_dma_unmap(iova, size) {
264 Ok(_) => true,
265 Err(e) => {
266 error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
267 false
268 }
269 }
270 }
271
272 #[derive(Default, Debug, Copy, Clone, FromBytes, IntoBytes)]
273 #[repr(C)]
274 struct PinPageInfo {
275 bdf: u16,
276 pad: [u16; 3],
277 nr_pages: u64,
278 }
279
280 const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
281 const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
282 const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
283 const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;
284
285 fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
286 if level == 1 {
287 return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
288 }
289
290 if level == 0 {
291 bail!("Invalid level for gfn 0x{:x}", gfn);
292 }
293
294 let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;
295
296 Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
297 }
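// Worked example (illustration only): with dtt_level == 3, level_to_offset
// decomposes gfn 0x12345 as
//   level 3: (0x12345 >> 19) & 0x1ff = 0x0
//   level 2: (0x12345 >> 10) & 0x1ff = 0x48
//   level 1:  0x12345        & 0x3ff = 0x345
// so the leaf level indexes 1024 u32 entries per 4KiB page while the upper
// levels index 512 u64 entries per page.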
298
299 struct DTTIter {
300 ptr: *const u8,
301 gfn: u64,
302 }
303
304 impl Default for DTTIter {
305 fn default() -> Self {
306 DTTIter {
307 ptr: std::ptr::null(),
308 gfn: 0,
309 }
310 }
311 }
312
313 // Get a DMA Tracking Table (DTT) entry associated with the gfn.
314 //
315 // There are two ways to get the entry:
316 // #1. Walk the DMA Tracking Table (DTT) by the GFN to get the
317 // corresponding entry. The DTT is shared between frontend and
318 // backend. It is a page-table-like structure whose entries are
319 // indexed by GFN. The argument dtt_root represents the root page
320 // gpa and dtt_level represents the maximum page table level.
321 //
322 // #2. Calculate the entry address via the argument dtt_iter. dtt_iter
323 // stores an entry address and the associated gfn. If the target gfn is
324 // in the same page table page as the gfn in dtt_iter, then the target
325 // entry address can be calculated from the entry address stored in
326 // dtt_iter.
327 //
328 // As the DTT entry is shared between frontend and backend, accesses to
329 // it must be atomic. So the returned value is converted to an AtomicU32
330 // pointer.
331 fn gfn_to_dtt_pte(
332 mem: &GuestMemory,
333 dtt_level: u64,
334 dtt_root: u64,
335 dtt_iter: &mut DTTIter,
336 gfn: u64,
337 ) -> Result<*const AtomicU32> {
338 let ptr = if dtt_iter.ptr.is_null()
339 || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
340 {
341 // Slow path to walk the DTT to get the pte entry
342 let mut level = dtt_level;
343 let mut pt_gpa = dtt_root;
344 let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;
345
346 while level != 1 {
347 let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
348 let parent_pt = mem
349 .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
350 .context(Error::GetDTTEntry)?;
351
352 if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
353 bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
354 }
355
356 pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
357 level -= 1;
358 }
359
360 let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;
361
362 mem.get_host_address(GuestAddress(pt_gpa + index))
363 .context(Error::GetDTTEntry)?
364 } else if gfn > dtt_iter.gfn {
365 // SAFETY:
366 // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
367 // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
368 // means the calculated ptr will point to the same page as dtt_iter.ptr
369 unsafe {
370 dtt_iter
371 .ptr
372 .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
373 }
374 } else {
375 // SAFETY:
376 // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
377 // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
378 // means the calculated ptr will point to the same page as dtt_iter.ptr
379 unsafe {
380 dtt_iter
381 .ptr
382 .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
383 }
384 };
385
386 dtt_iter.ptr = ptr;
387 dtt_iter.gfn = gfn;
388
389 Ok(ptr as *const AtomicU32)
390 }
391
392 fn pin_page(
393 pinstate: &mut CoIommuPinState,
394 policy: CoIommuUnpinPolicy,
395 vfio_container: &Arc<Mutex<VfioContainer>>,
396 mem: &GuestMemory,
397 dtt_level: u64,
398 dtt_root: u64,
399 dtt_iter: &mut DTTIter,
400 gfn: u64,
401 ) -> Result<()> {
402 let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;
403
404 let gpa = gfn << PAGE_SHIFT_4K;
405 let host_addr = mem
406 .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
407 .context("failed to get host address")? as u64;
408
409 // SAFETY:
410 // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
411 // Test PINNED flag
412 if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
413 info!("CoIommu: gfn 0x{:x} already pinned", gfn);
414 return Ok(());
415 }
416
417 // SAFETY:
418 // Safe because the gpa is valid from the gfn_to_dtt_pte and the host_addr
419 // is guaranteed by MemoryMapping interface.
420 if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
421 // SAFETY:
422 // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
423 // set PINNED flag
424 unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
425 if policy == CoIommuUnpinPolicy::Lru {
426 pinstate
427 .new_gen_pinned_pages
428 .push_back(PinnedPageInfo::new(gfn, 0));
429 }
430 }
431
432 Ok(())
433 }
434
435 #[derive(PartialEq, Debug, Eq)]
436 enum UnpinResult {
437 UnpinlistEmpty,
438 Unpinned,
439 NotPinned,
440 NotUnpinned,
441 FailedUnpin,
442 UnpinParked,
443 }
444
445 fn unpin_page(
446 pinstate: &mut CoIommuPinState,
447 vfio_container: &Arc<Mutex<VfioContainer>>,
448 mem: &GuestMemory,
449 dtt_level: u64,
450 dtt_root: u64,
451 dtt_iter: &mut DTTIter,
452 gfn: u64,
453 force: bool,
454 ) -> UnpinResult {
455 if pinstate.unpin_thread_state == UnpinThreadState::Parked {
456 return UnpinResult::UnpinParked;
457 }
458
459 let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
460 Ok(v) => v,
461 Err(_) => {
462 // The force == true case may try to unpin a page which is not
463 // mapped in the dtt. For such a page the pte doesn't exist yet,
464 // so there is no need to log an error.
465 // The force == false case is used by coiommu to periodically
466 // unpin pages which have been mapped in the dtt, so the pte for
467 // such a page does exist. However, with the unpin request from
468 // virtio balloon, such pages may already have been unpinned and the
469 // DTT pages might have been reclaimed by the guest OS kernel as
470 // well, so it is also possible to end up here. Don't log an error.
471 return UnpinResult::NotPinned;
472 }
473 };
474
475 if force {
476 // SAFETY:
477 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
478 // This case is for the balloon to evict pages, so these pages should
479 // already be locked by the balloon and no device driver in the VM is
480 // able to access them. Just clear the ACCESSED flag first to make
481 // sure the following unpin can succeed.
482 unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
483 }
484
485 // SAFETY:
486 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
487 if let Err(entry) = unsafe {
488 (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
489 } {
490 // The compare_exchange failed because the original leaf entry is
491 // not DTTE_PINNED_FLAG, so the unpin cannot be done.
492 if entry == 0 {
493 // The GFN is already unpinned. This is very similar to the
494 // gfn_to_dtt_pte error case, with the only difference being
495 // that the dtt_pte happens to be on a present page table.
496 UnpinResult::NotPinned
497 } else {
498 if !force {
499 // SAFETY:
500 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
501 // The ACCESSED_FLAG is set by the guest if the guest requires a DMA
502 // map for this page. It represents whether or not this page has been
503 // touched by the guest. By clearing this flag after an unpin attempt,
504 // we can detect whether this page has been touched by the guest before
505 // the next round of unpin work. If the ACCESSED_FLAG is set at the
506 // next round, unpinning this page will fail and we will be here again
507 // to clear this flag. If the flag is not set at the next round,
508 // unpinning this page will probably succeed.
509 unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
510 } else {
511 // If we're here, then the guest is trying to release a page via the
512 // balloon that it still has pinned. This most likely means that
513 // something is wrong in the guest kernel. Just leave the page pinned
514 // and log an error.
515 // This failure blocks the balloon from removing the page, which ensures
516 // that the guest's view of memory will remain consistent with device
517 // DMA's view of memory. Also note that the host kernel maintains an
518 // elevated refcount for pinned pages, which is a second guarantee that
519 // pages accessible by device DMA won't be freed until after they are
520 // unpinned.
521 error!(
522 "CoIommu: force case cannot pin gfn 0x{:x} entry 0x{:x}",
523 gfn, entry
524 );
525 }
526 // The GFN cannot be unpinned either because the unmap count
527 // is non-zero or because it has the accessed flag set.
528 UnpinResult::NotUnpinned
529 }
530 } else {
531 // The compare_exchange succeeded: the original leaf entry was
532 // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the
533 // page.
534 let gpa = gfn << PAGE_SHIFT_4K;
535 if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
536 UnpinResult::Unpinned
537 } else {
538 // SAFETY:
539 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
540 // make sure the pinned flag is set
541 unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
542 // need to put this gfn back to pinned vector
543 UnpinResult::FailedUnpin
544 }
545 }
546 }
547
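// Pin request flow (as implemented in run() below): the guest writes the
// request word into its vcpu's notifymap slot, then kicks the per-vcpu notify
// register in BAR0. The resulting ioevent wakes this worker, which reads the
// request from the notifymap, pins the page(s), and writes 0 back to the slot
// to signal completion.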
548 struct PinWorker {
549 mem: GuestMemory,
550 endpoints: Vec<u16>,
551 notifymap_mmap: Arc<MemoryMapping>,
552 dtt_level: u64,
553 dtt_root: u64,
554 ioevents: Vec<Event>,
555 vfio_container: Arc<Mutex<VfioContainer>>,
556 pinstate: Arc<Mutex<CoIommuPinState>>,
557 params: CoIommuParameters,
558 }
559
560 impl PinWorker {
561 fn debug_label(&self) -> &'static str {
562 "CoIommuPinWorker"
563 }
564
565 fn run(&mut self, kill_evt: Event) {
566 #[derive(EventToken)]
567 enum Token {
568 Kill,
569 Pin { index: usize },
570 }
571
572 let wait_ctx: WaitContext<Token> =
573 match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
574 Ok(pc) => pc,
575 Err(e) => {
576 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
577 return;
578 }
579 };
580
581 for (index, event) in self.ioevents.iter().enumerate() {
582 match wait_ctx.add(event, Token::Pin { index }) {
583 Ok(_) => {}
584 Err(e) => {
585 error!(
586 "{}: failed to add ioevent for index {}: {}",
587 self.debug_label(),
588 index,
589 e
590 );
591 return;
592 }
593 }
594 }
595
596 'wait: loop {
597 let events = match wait_ctx.wait() {
598 Ok(v) => v,
599 Err(e) => {
600 error!("{}: failed polling for events: {}", self.debug_label(), e);
601 break;
602 }
603 };
604
605 for event in events.iter().filter(|e| e.is_readable) {
606 match event.token {
607 Token::Kill => break 'wait,
608 Token::Pin { index } => {
609 let offset = index * mem::size_of::<u64>();
610 if let Some(event) = self.ioevents.get(index) {
611 if let Err(e) = event.wait() {
612 error!(
613 "{}: failed reading event {}: {}",
614 self.debug_label(),
615 index,
616 e
617 );
618 self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
619 break 'wait;
620 }
621 }
622 if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
623 if let Err(e) = self.pin_pages(data) {
624 error!("{}: {}", self.debug_label(), e);
625 }
626 }
627 fence(Ordering::SeqCst);
628 self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
629 }
630 }
631 }
632 }
633 }
634
635 fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
636 let pin_page_info = self
637 .mem
638 .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
639 .context("failed to get pin page info")?;
640
641 let bdf = pin_page_info.bdf;
642 ensure!(
643 self.endpoints.iter().any(|&x| x == bdf),
644 "pin page for unexpected bdf 0x{:x}",
645 bdf
646 );
647
648 let mut nr_pages = pin_page_info.nr_pages;
649 let mut offset = mem::size_of::<PinPageInfo>() as u64;
650 let mut dtt_iter: DTTIter = Default::default();
651 let mut pinstate = self.pinstate.lock();
652 while nr_pages > 0 {
653 let gfn = self
654 .mem
655 .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
656 .context("failed to get pin page gfn")?;
657
658 pin_page(
659 &mut pinstate,
660 self.params.unpin_policy,
661 &self.vfio_container,
662 &self.mem,
663 self.dtt_level,
664 self.dtt_root,
665 &mut dtt_iter,
666 gfn,
667 )?;
668
669 offset += mem::size_of::<u64>() as u64;
670 nr_pages -= 1;
671 }
672
673 Ok(())
674 }
675
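// The request word encodes either a single pin (bdf in the low 16 bits, gfn in
// the remaining bits) or, when PIN_PAGES_IN_BATCH (bit 63) is set, the guest
// physical address of a PinPageInfo structure followed by the gfns to pin.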
676 fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
677 if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
678 let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
679 self.pin_pages_in_batch(gpa)
680 } else {
681 let bdf = (gfn_bdf & 0xffff) as u16;
682 let gfn = gfn_bdf >> 16;
683 let mut dtt_iter: DTTIter = Default::default();
684 ensure!(
685 self.endpoints.iter().any(|&x| x == bdf),
686 "pin page for unexpected bdf 0x{:x}",
687 bdf
688 );
689
690 let mut pinstate = self.pinstate.lock();
691 pin_page(
692 &mut pinstate,
693 self.params.unpin_policy,
694 &self.vfio_container,
695 &self.mem,
696 self.dtt_level,
697 self.dtt_root,
698 &mut dtt_iter,
699 gfn,
700 )
701 }
702 }
703 }
704
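// The unpin worker runs two flows (see run() below): a periodic LRU unpin
// driven by the unpin_interval timer, and forced unpins requested by the
// balloon device over unpin_tube for pages being released by the guest.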
705 struct UnpinWorker {
706 mem: GuestMemory,
707 dtt_level: u64,
708 dtt_root: u64,
709 vfio_container: Arc<Mutex<VfioContainer>>,
710 unpin_tube: Option<Tube>,
711 pinstate: Arc<Mutex<CoIommuPinState>>,
712 params: CoIommuParameters,
713 unpin_gen_threshold: u64,
714 }
715
716 impl UnpinWorker {
717 fn debug_label(&self) -> &'static str {
718 "CoIommuUnpinWorker"
719 }
720
721 fn run(&mut self, kill_evt: Event) {
722 #[derive(EventToken)]
723 enum Token {
724 UnpinTimer,
725 UnpinReq,
726 Kill,
727 }
728
729 let wait_ctx: WaitContext<Token> =
730 match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
731 Ok(pc) => pc,
732 Err(e) => {
733 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
734 return;
735 }
736 };
737
738 if let Some(tube) = &self.unpin_tube {
739 if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
740 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
741 return;
742 }
743 }
744
745 let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
746 && !self.params.unpin_interval.is_zero()
747 {
748 let mut timer = match Timer::new() {
749 Ok(t) => t,
750 Err(e) => {
751 error!(
752 "{}: failed to create the unpin timer: {}",
753 self.debug_label(),
754 e
755 );
756 return;
757 }
758 };
759 if let Err(e) = timer.reset_repeating(self.params.unpin_interval) {
760 error!(
761 "{}: failed to start the unpin timer: {}",
762 self.debug_label(),
763 e
764 );
765 return;
766 }
767 if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
768 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
769 return;
770 }
771 Some(timer)
772 } else {
773 None
774 };
775
776 let unpin_tube = self.unpin_tube.take();
777 'wait: loop {
778 let events = match wait_ctx.wait() {
779 Ok(v) => v,
780 Err(e) => {
781 error!("{}: failed polling for events: {}", self.debug_label(), e);
782 break;
783 }
784 };
785
786 for event in events.iter().filter(|e| e.is_readable) {
787 match event.token {
788 Token::UnpinTimer => {
789 self.unpin_pages();
790 if let Some(timer) = &mut unpin_timer {
791 if let Err(e) = timer.mark_waited() {
792 error!(
793 "{}: failed to clear unpin timer: {}",
794 self.debug_label(),
795 e
796 );
797 break 'wait;
798 }
799 }
800 }
801 Token::UnpinReq => {
802 if let Some(tube) = &unpin_tube {
803 match tube.recv::<UnpinRequest>() {
804 Ok(req) => {
805 let mut unpin_done = true;
806 for range in req.ranges {
807 // Locking with respect to pin_pages isn't necessary
808 // for this case because the unpinned pages in the range
809 // should all be in the balloon and so nothing will attempt
810 // to pin them.
811 if !self.unpin_pages_in_range(range.0, range.1) {
812 unpin_done = false;
813 break;
814 }
815 }
816 let resp = if unpin_done {
817 UnpinResponse::Success
818 } else {
819 UnpinResponse::Failed
820 };
821 if let Err(e) = tube.send(&resp) {
822 error!(
823 "{}: failed to send unpin response {}",
824 self.debug_label(),
825 e
826 );
827 }
828 }
829 Err(e) => {
830 if let TubeError::Disconnected = e {
831 if let Err(e) = wait_ctx.delete(tube) {
832 error!(
833 "{}: failed to remove unpin_tube: {}",
834 self.debug_label(),
835 e
836 );
837 }
838 } else {
839 error!(
840 "{}: failed to recv Unpin Request: {}",
841 self.debug_label(),
842 e
843 );
844 }
845 }
846 }
847 }
848 }
849 Token::Kill => break 'wait,
850 }
851 }
852 }
853 self.unpin_tube = unpin_tube;
854 }
855
856 fn unpin_pages(&mut self) {
857 if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
858 self.lru_unpin_pages();
859 }
860 }
861
862 fn lru_unpin_page(
863 &mut self,
864 dtt_iter: &mut DTTIter,
865 new_gen: bool,
866 ) -> (UnpinResult, Option<PinnedPageInfo>) {
867 let mut pinstate = self.pinstate.lock();
868 let pageinfo = if new_gen {
869 pinstate.new_gen_pinned_pages.pop_front()
870 } else {
871 pinstate
872 .old_gen_pinned_pages
873 .pop_front()
874 .map(|gfn| PinnedPageInfo::new(gfn, 0))
875 };
876
877 pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
878 (
879 unpin_page(
880 &mut pinstate,
881 &self.vfio_container,
882 &self.mem,
883 self.dtt_level,
884 self.dtt_root,
885 dtt_iter,
886 pageinfo.gfn,
887 false,
888 ),
889 Some(pageinfo),
890 )
891 })
892 }
893
894 fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
895 let mut not_unpinned_new_gen_pages = VecDeque::new();
896 let mut not_unpinned_old_gen_pages = VecDeque::new();
897 let mut unpinned_count = 0;
898 let has_limit = unpin_limit.is_some();
899 let limit_count = unpin_limit.unwrap_or(0);
900 let mut dtt_iter: DTTIter = Default::default();
901
902 // If has_limit is true but limit_count is 0, no unpinning will be done.
903 while !has_limit || unpinned_count != limit_count {
904 let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
905 match result {
906 UnpinResult::UnpinlistEmpty => break,
907 UnpinResult::Unpinned => unpinned_count += 1,
908 UnpinResult::NotPinned => {}
909 UnpinResult::NotUnpinned => {
910 if let Some(mut page) = pinned_page {
911 if self.params.unpin_gen_threshold != 0 {
912 page.unpin_busy_cnt += 1;
913 // The page came from the new_gen queue but
914 // was not successfully unpinned. Check the
915 // unpin_gen threshold; if it is reached, move
916 // the page to the old_gen queue.
917 // If the page did not come from new_gen, put
918 // it directly into the old_gen queue.
919 if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
920 not_unpinned_old_gen_pages.push_back(page.gfn);
921 } else {
922 not_unpinned_new_gen_pages.push_back(page);
923 }
924 }
925 }
926 }
927 UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
928 // Although UnpinParked means we didn't actually try to unpin
929 // gfn, it's not worth handling specifically since parking is
930 // expected to be relatively rare.
931 if let Some(page) = pinned_page {
932 if new_gen {
933 not_unpinned_new_gen_pages.push_back(page);
934 } else {
935 not_unpinned_old_gen_pages.push_back(page.gfn);
936 }
937 }
938 if result == UnpinResult::UnpinParked {
939 thread::park();
940 }
941 }
942 }
943 }
944
945 if !not_unpinned_new_gen_pages.is_empty() {
946 let mut pinstate = self.pinstate.lock();
947 pinstate
948 .new_gen_pinned_pages
949 .append(&mut not_unpinned_new_gen_pages);
950 }
951
952 if !not_unpinned_old_gen_pages.is_empty() {
953 let mut pinstate = self.pinstate.lock();
954 pinstate
955 .old_gen_pinned_pages
956 .append(&mut not_unpinned_old_gen_pages);
957 }
958
959 unpinned_count
960 }
961
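// Ageing policy implemented by the two functions above and below: every timer
// tick unpins from the new_gen queue; pages that stay busy for
// unpin_gen_threshold attempts are demoted to the old_gen queue, which is only
// rescanned once every unpin_gen_threshold ticks.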
962 fn lru_unpin_pages(&mut self) {
963 let mut unpin_count = 0;
964 if self.params.unpin_gen_threshold != 0 {
965 self.unpin_gen_threshold += 1;
966 if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
967 self.unpin_gen_threshold = 0;
968 // Try to unpin the inactive (old_gen) queue first if the threshold is reached.
969 unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
970 }
971 }
972 // Unpin the new_gen queue with the remaining unpin_limit after unpinning the old_gen queue.
973 self.lru_unpin_pages_in_loop(
974 self.params
975 .unpin_limit
976 .map(|limit| limit.saturating_sub(unpin_count)),
977 true,
978 );
979 }
980
981 fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
982 let mut dtt_iter: DTTIter = Default::default();
983 let mut index = 0;
984 while index != count {
985 let mut pinstate = self.pinstate.lock();
986 let result = unpin_page(
987 &mut pinstate,
988 &self.vfio_container,
989 &self.mem,
990 self.dtt_level,
991 self.dtt_root,
992 &mut dtt_iter,
993 gfn + index,
994 true,
995 );
996 drop(pinstate);
997
998 match result {
999 UnpinResult::Unpinned | UnpinResult::NotPinned => {}
1000 UnpinResult::UnpinParked => {
1001 thread::park();
1002 continue;
1003 }
1004 _ => {
1005 error!("coiommu: force unpin failed by {:?}", result);
1006 return false;
1007 }
1008 }
1009 index += 1;
1010 }
1011 true
1012 }
1013 }
1014
1015 pub struct CoIommuDev {
1016 config_regs: PciConfiguration,
1017 pci_address: Option<PciAddress>,
1018 mem: GuestMemory,
1019 coiommu_reg: CoIommuReg,
1020 endpoints: Vec<u16>,
1021 notifymap_mem: SafeDescriptor,
1022 notifymap_mmap: Arc<MemoryMapping>,
1023 notifymap_addr: Option<u64>,
1024 topologymap_mem: SafeDescriptor,
1025 topologymap_addr: Option<u64>,
1026 mmapped: bool,
1027 vm_memory_client: VmMemoryClient,
1028 pin_thread: Option<WorkerThread<PinWorker>>,
1029 unpin_thread: Option<WorkerThread<UnpinWorker>>,
1030 unpin_tube: Option<Tube>,
1031 ioevents: Vec<Event>,
1032 vfio_container: Arc<Mutex<VfioContainer>>,
1033 pinstate: Arc<Mutex<CoIommuPinState>>,
1034 params: CoIommuParameters,
1035 }
1036
1037 impl CoIommuDev {
1038 pub fn new(
1039 mem: GuestMemory,
1040 vfio_container: Arc<Mutex<VfioContainer>>,
1041 vm_memory_client: VmMemoryClient,
1042 unpin_tube: Option<Tube>,
1043 endpoints: Vec<u16>,
1044 vcpu_count: u64,
1045 params: CoIommuParameters,
1046 ) -> Result<Self> {
1047 let config_regs = PciConfiguration::new(
1048 PCI_VENDOR_ID_COIOMMU,
1049 PCI_DEVICE_ID_COIOMMU,
1050 PciClassCode::Other,
1051 &PciOtherSubclass::Other,
1052 None, // No Programming interface.
1053 PciHeaderType::Device,
1054 PCI_VENDOR_ID_COIOMMU,
1055 PCI_DEVICE_ID_COIOMMU,
1056 COIOMMU_REVISION_ID,
1057 );
1058
1059 // notifymap_mem is used as BAR2 for the guest to check whether a request has been completed by coIOMMU.
1060 let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
1061 .context(Error::CreateSharedMemory)?;
1062 let notifymap_mmap = Arc::new(
1063 MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
1064 .from_shared_memory(¬ifymap_mem)
1065 .offset(0)
1066 .build()?,
1067 );
1068
1069 // topologymap_mem is used as BAR4 for the guest to check which devices sit on top of coIOMMU.
1070 let topologymap_mem =
1071 SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
1072 .context(Error::CreateSharedMemory)?;
1073 let topologymap_mmap = Arc::new(
1074 MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
1075 .from_shared_memory(&topologymap_mem)
1076 .offset(0)
1077 .build()?,
1078 );
1079
1080 ensure!(
1081 (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
1082 "Coiommu: too many endpoints"
1083 );
1084 topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
1085 for (index, endpoint) in endpoints.iter().enumerate() {
1086 topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
1087 }
1088
1089 let mut ioevents = Vec::new();
1090 for _ in 0..vcpu_count {
1091 ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
1092 }
1093
1094 Ok(Self {
1095 config_regs,
1096 pci_address: None,
1097 mem,
1098 coiommu_reg: Default::default(),
1099 endpoints,
1100 notifymap_mem: notifymap_mem.into(),
1101 notifymap_mmap,
1102 notifymap_addr: None,
1103 topologymap_mem: topologymap_mem.into(),
1104 topologymap_addr: None,
1105 mmapped: false,
1106 vm_memory_client,
1107 pin_thread: None,
1108 unpin_thread: None,
1109 unpin_tube,
1110 ioevents,
1111 vfio_container,
1112 pinstate: Arc::new(Mutex::new(CoIommuPinState {
1113 new_gen_pinned_pages: VecDeque::new(),
1114 old_gen_pinned_pages: VecDeque::new(),
1115 unpin_thread_state: UnpinThreadState::Unparked,
1116 unpin_park_count: 0,
1117 })),
1118 params,
1119 })
1120 }
1121
1122 fn register_mmap(
1123 &self,
1124 descriptor: SafeDescriptor,
1125 size: usize,
1126 offset: u64,
1127 gpa: u64,
1128 prot: Protection,
1129 ) -> Result<()> {
1130 let _region = self
1131 .vm_memory_client
1132 .register_memory(
1133 VmMemorySource::Descriptor {
1134 descriptor,
1135 offset,
1136 size: size as u64,
1137 },
1138 VmMemoryDestination::GuestPhysicalAddress(gpa),
1139 prot,
1140 MemCacheType::CacheCoherent,
1141 )
1142 .context("register_mmap register_memory failed")?;
1143 Ok(())
1144 }
1145
1146 fn mmap(&mut self) {
1147 if self.mmapped {
1148 return;
1149 }
1150
1151 if let Some(gpa) = self.notifymap_addr {
1152 match self.register_mmap(
1153 self.notifymap_mem.try_clone().unwrap(),
1154 COIOMMU_NOTIFYMAP_SIZE,
1155 0,
1156 gpa,
1157 Protection::read_write(),
1158 ) {
1159 Ok(_) => {}
1160 Err(e) => {
1161 panic!("{}: map notifymap failed: {}", self.debug_label(), e);
1162 }
1163 }
1164 }
1165
1166 if let Some(gpa) = self.topologymap_addr {
1167 match self.register_mmap(
1168 self.topologymap_mem.try_clone().unwrap(),
1169 COIOMMU_TOPOLOGYMAP_SIZE,
1170 0,
1171 gpa,
1172 Protection::read(),
1173 ) {
1174 Ok(_) => {}
1175 Err(e) => {
1176 panic!("{}: map topologymap failed: {}", self.debug_label(), e);
1177 }
1178 }
1179 }
1180
1181 self.mmapped = true;
1182 }
1183
1184 fn start_workers(&mut self) {
1185 if self.pin_thread.is_none() {
1186 self.start_pin_thread();
1187 }
1188
1189 if self.unpin_thread.is_none() {
1190 self.start_unpin_thread();
1191 }
1192 }
1193
1194 fn start_pin_thread(&mut self) {
1195 let mem = self.mem.clone();
1196 let endpoints = self.endpoints.to_vec();
1197 let notifymap_mmap = self.notifymap_mmap.clone();
1198 let dtt_root = self.coiommu_reg.dtt_root;
1199 let dtt_level = self.coiommu_reg.dtt_level;
1200 let ioevents: Vec<Event> = self
1201 .ioevents
1202 .iter()
1203 .map(|e| e.try_clone().unwrap())
1204 .collect();
1205
1206 let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR);
1207 let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
1208 for (i, evt) in self.ioevents.iter().enumerate() {
1209 self.vm_memory_client
1210 .register_io_event(
1211 evt.try_clone().expect("failed to clone event"),
1212 notify_base + i as u64,
1213 Datamatch::AnyLength,
1214 )
1215 .expect("failed to register ioevent");
1216 }
1217
1218 let vfio_container = self.vfio_container.clone();
1219 let pinstate = self.pinstate.clone();
1220 let params = self.params;
1221
1222 self.pin_thread = Some(WorkerThread::start("coiommu_pin", move |kill_evt| {
1223 let mut worker = PinWorker {
1224 mem,
1225 endpoints,
1226 notifymap_mmap,
1227 dtt_root,
1228 dtt_level,
1229 ioevents,
1230 vfio_container,
1231 pinstate,
1232 params,
1233 };
1234 worker.run(kill_evt);
1235 worker
1236 }));
1237 }
1238
1239 fn start_unpin_thread(&mut self) {
1240 let mem = self.mem.clone();
1241 let dtt_root = self.coiommu_reg.dtt_root;
1242 let dtt_level = self.coiommu_reg.dtt_level;
1243 let vfio_container = self.vfio_container.clone();
1244 let unpin_tube = self.unpin_tube.take();
1245 let pinstate = self.pinstate.clone();
1246 let params = self.params;
1247 self.unpin_thread = Some(WorkerThread::start("coiommu_unpin", move |kill_evt| {
1248 let mut worker = UnpinWorker {
1249 mem,
1250 dtt_level,
1251 dtt_root,
1252 vfio_container,
1253 unpin_tube,
1254 pinstate,
1255 params,
1256 unpin_gen_threshold: 0,
1257 };
1258 worker.run(kill_evt);
1259 worker
1260 }));
1261 }
1262
1263 fn allocate_bar_address(
1264 &mut self,
1265 resources: &mut SystemAllocator,
1266 address: PciAddress,
1267 size: u64,
1268 bar_num: u8,
1269 name: &str,
1270 ) -> PciResult<u64> {
1271 let addr = resources
1272 .allocate_mmio(
1273 size,
1274 Alloc::PciBar {
1275 bus: address.bus,
1276 dev: address.dev,
1277 func: address.func,
1278 bar: bar_num,
1279 },
1280 name.to_string(),
1281 AllocOptions::new().prefetchable(true).align(size),
1282 )
1283 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1284
1285 let bar = PciBarConfiguration::new(
1286 bar_num as usize,
1287 size,
1288 PciBarRegionType::Memory64BitRegion,
1289 PciBarPrefetchable::Prefetchable,
1290 )
1291 .set_address(addr);
1292
1293 self.config_regs
1294 .add_pci_bar(bar)
1295 .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;
1296
1297 Ok(addr)
1298 }
1299
1300 fn read_mmio(&mut self, offset: u64, data: &mut [u8]) {
1301 if offset >= mem::size_of::<CoIommuReg>() as u64 {
1302 error!(
1303 "{}: read_mmio: invalid offset 0x{:x}",
1304 self.debug_label(),
1305 offset
1306 );
1307 return;
1308 }
1309
1310 // Sanity check: the access must be 64-bit aligned.
1311 if offset % 8 != 0 || data.len() != 8 {
1312 error!(
1313 "{}: read_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1314 self.debug_label(),
1315 offset,
1316 data.len()
1317 );
1318 return;
1319 }
1320
1321 let v = match offset / 8 {
1322 0 => self.coiommu_reg.dtt_root,
1323 1 => self.coiommu_reg.cmd,
1324 2 => self.coiommu_reg.dtt_level,
1325 _ => return,
1326 };
1327
1328 data.copy_from_slice(&v.to_ne_bytes());
1329 }
1330
1331 fn write_mmio(&mut self, offset: u64, data: &[u8]) {
1332 let mmio_len = mem::size_of::<CoIommuReg>() as u64;
1333 if offset >= mmio_len {
1334 if data.len() != 1 {
1335 error!(
1336 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
1337 self.debug_label(),
1338 offset,
1339 data.len()
1340 );
1341 return;
1342 }
1343
1344 // Usually we will not get here, as this range is for the per-vcpu
1345 // notify registers which are monitored by the ioevents. Notify registers
1346 // not covered by the ioevents are not used by the frontend driver. In
1347 // case the frontend driver does get here, handle it simply so the
1348 // frontend driver will not be blocked, and log an error.
1350 let index = (offset - mmio_len) as usize;
1351 if let Some(event) = self.ioevents.get(index) {
1352 let _ = event.signal();
1353 } else {
1354 self.notifymap_mmap
1355 .write_obj::<u64>(0, index * mem::size_of::<u64>())
1356 .unwrap();
1357 error!(
1358 "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
1359 self.debug_label(),
1360 offset
1361 );
1362 }
1363 return;
1364 }
1365
1366 // Sanity check: CoIommuReg accesses must be 64-bit aligned.
1367 if offset % 8 != 0 || data.len() != 8 {
1368 error!(
1369 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1370 self.debug_label(),
1371 offset,
1372 data.len()
1373 );
1374 return;
1375 }
1376
1377 let index = offset / 8;
1378 let v = u64::from_ne_bytes(data.try_into().unwrap());
1379 match index {
1380 0 => {
1381 if self.coiommu_reg.dtt_root == 0 {
1382 self.coiommu_reg.dtt_root = v;
1383 }
1384 }
1385 1 => match v {
1386 // Deactivate can happen if the frontend driver in the guest
1387 // fails during probing or if the CoIommu device is removed
1388 // by the guest. Neither of these cases is expected, and if
1389 // either happens the guest will be non-functional because the
1390 // pass-through devices which rely on CoIommu will not work.
1391 // So just fail hard and panic.
1392 COIOMMU_CMD_DEACTIVATE => {
1393 panic!("{}: Deactivate is not supported", self.debug_label())
1394 }
1395 COIOMMU_CMD_ACTIVATE => {
1396 if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
1397 self.start_workers();
1398 }
1399 }
1400 COIOMMU_CMD_PARK_UNPIN => {
1401 let mut pinstate = self.pinstate.lock();
1402 pinstate.unpin_thread_state = UnpinThreadState::Parked;
1403 if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
1404 pinstate.unpin_park_count = v;
1405 } else {
1406 panic!("{}: Park request overflowing", self.debug_label());
1407 }
1408 }
1409 COIOMMU_CMD_UNPARK_UNPIN => {
1410 let mut pinstate = self.pinstate.lock();
1411 if pinstate.unpin_thread_state == UnpinThreadState::Parked {
1412 if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
1413 pinstate.unpin_park_count = v;
1414 if pinstate.unpin_park_count == 0 {
1415 if let Some(worker_thread) = &self.unpin_thread {
1416 worker_thread.thread().unpark();
1417 }
1418 pinstate.unpin_thread_state = UnpinThreadState::Unparked;
1419 }
1420 } else {
1421 error!("{}: Park count is already reached to 0", self.debug_label());
1422 }
1423 }
1424 }
1425 _ => {}
1426 },
1427 2 => {
1428 if self.coiommu_reg.dtt_level == 0 {
1429 self.coiommu_reg.dtt_level = v;
1430 }
1431 }
1432 _ => {}
1433 }
1434 }
1435 }
1436
1437 impl PciDevice for CoIommuDev {
1438 fn debug_label(&self) -> String {
1439 "CoIommu".to_owned()
1440 }
1441
1442 fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
1443 if self.pci_address.is_none() {
1444 self.pci_address = resources.allocate_pci(0, self.debug_label());
1445 }
1446 self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1447 }
1448
1449 fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
1450 let address = self
1451 .pci_address
1452 .expect("allocate_address must be called prior to allocate_io_bars");
1453
1454 // Allocate one bar for the structures pointed to by the capability structures.
1455 let mut ranges: Vec<BarRange> = Vec::new();
1456
1457 let mmio_addr = self.allocate_bar_address(
1458 resources,
1459 address,
1460 COIOMMU_MMIO_BAR_SIZE,
1461 COIOMMU_MMIO_BAR as u8,
1462 "coiommu-mmiobar",
1463 )?;
1464
1465 ranges.push(BarRange {
1466 addr: mmio_addr,
1467 size: COIOMMU_MMIO_BAR_SIZE,
1468 prefetchable: false,
1469 });
1470
1471 Ok(ranges)
1472 }
1473
1474 fn allocate_device_bars(
1475 &mut self,
1476 resources: &mut SystemAllocator,
1477 ) -> PciResult<Vec<BarRange>> {
1478 let address = self
1479 .pci_address
1480 .expect("allocate_address must be called prior to allocate_device_bars");
1481
1482 let mut ranges: Vec<BarRange> = Vec::new();
1483
1484 let topologymap_addr = self.allocate_bar_address(
1485 resources,
1486 address,
1487 COIOMMU_TOPOLOGYMAP_SIZE as u64,
1488 COIOMMU_TOPOLOGYMAP_BAR,
1489 "coiommu-topology",
1490 )?;
1491 self.topologymap_addr = Some(topologymap_addr);
1492 ranges.push(BarRange {
1493 addr: topologymap_addr,
1494 size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
1495 prefetchable: false,
1496 });
1497
1498 let notifymap_addr = self.allocate_bar_address(
1499 resources,
1500 address,
1501 COIOMMU_NOTIFYMAP_SIZE as u64,
1502 COIOMMU_NOTIFYMAP_BAR as u8,
1503 "coiommu-notifymap",
1504 )?;
1505 self.notifymap_addr = Some(notifymap_addr);
1506 ranges.push(BarRange {
1507 addr: notifymap_addr,
1508 size: COIOMMU_NOTIFYMAP_SIZE as u64,
1509 prefetchable: false,
1510 });
1511
1512 Ok(ranges)
1513 }
1514
1515 fn read_config_register(&self, reg_idx: usize) -> u32 {
1516 self.config_regs.read_reg(reg_idx)
1517 }
1518
1519 fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1520 if reg_idx == COMMAND_REG
1521 && data.len() == 2
1522 && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
1523 && !self.mmapped
1524 {
1525 self.mmap();
1526 }
1527
1528 self.config_regs.write_reg(reg_idx, offset, data);
1529 }
1530
1531 fn keep_rds(&self) -> Vec<RawDescriptor> {
1532 let mut rds = vec![
1533 self.vfio_container.lock().as_raw_descriptor(),
1534 self.vm_memory_client.as_raw_descriptor(),
1535 self.notifymap_mem.as_raw_descriptor(),
1536 self.topologymap_mem.as_raw_descriptor(),
1537 ];
1538 if let Some(unpin_tube) = &self.unpin_tube {
1539 rds.push(unpin_tube.as_raw_descriptor());
1540 }
1541 rds.extend(self.ioevents.iter().map(Event::as_raw_descriptor));
1542 rds
1543 }
1544
1545 fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
1546 match bar_index {
1547 COIOMMU_MMIO_BAR => self.read_mmio(offset, data),
1548 COIOMMU_NOTIFYMAP_BAR => {
1549 // With the coiommu device activated, accessing the notifymap bar
1550 // won't cause a vmexit. If we get here, the coiommu device is
1551 // deactivated and will not do the pin/unpin work, so there is no
1552 // need to handle this notifymap read.
1553 }
1554 _ => {}
1555 }
1556 }
1557
1558 fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
1559 match bar_index {
1560 COIOMMU_MMIO_BAR => self.write_mmio(offset, data),
1561 COIOMMU_NOTIFYMAP_BAR => {
1562 // With the coiommu device activated, accessing the notifymap bar
1563 // won't cause a vmexit. If we get here, the coiommu device is
1564 // deactivated and will not do the pin/unpin work, so there is no
1565 // need to handle this notifymap write.
1566 }
1567 _ => {}
1568 }
1569 }
1570
1571 fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1572 self.config_regs.get_bar_configuration(bar_num)
1573 }
1574 }
1575
1576 impl Suspendable for CoIommuDev {}
1577