1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
//! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
//! which provides fine-grained pinning for VFIO PCI passthrough devices
//! so that the hypervisor doesn't need to pin the entire VM's memory,
//! improving memory utilization. CoIOMMU doesn't provide intra-guest
//! protection, so it can only be used for TRUSTED passthrough devices.
//!
//! CoIOMMU was presented at KVM Forum 2020:
//! <https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative-dma-buffer-tracking-yu-zhang-intel>
//!
//! It was also presented at USENIX ATC '20:
//! <https://www.usenix.org/conference/atc20/presentation/tian>
16
17 use std::collections::VecDeque;
18 use std::convert::TryInto;
19 use std::default::Default;
20 use std::fmt;
21 use std::mem;
22 use std::panic;
23 use std::sync::atomic::fence;
24 use std::sync::atomic::AtomicU32;
25 use std::sync::atomic::Ordering;
26 use std::sync::Arc;
27 use std::thread;
28 use std::time::Duration;
29
30 use anyhow::anyhow;
31 use anyhow::bail;
32 use anyhow::ensure;
33 use anyhow::Context;
34 use anyhow::Result;
35 use base::error;
36 use base::info;
37 use base::AsRawDescriptor;
38 use base::Event;
39 use base::EventToken;
40 use base::MemoryMapping;
41 use base::MemoryMappingBuilder;
42 use base::Protection;
43 use base::RawDescriptor;
44 use base::SafeDescriptor;
45 use base::SharedMemory;
46 use base::Timer;
47 use base::Tube;
48 use base::TubeError;
49 use base::WaitContext;
50 use base::WorkerThread;
51 use hypervisor::Datamatch;
52 use resources::Alloc;
53 use resources::AllocOptions;
54 use resources::SystemAllocator;
55 use serde::Deserialize;
56 use serde::Deserializer;
57 use serde::Serialize;
58 use serde_keyvalue::FromKeyValues;
59 use sync::Mutex;
60 use thiserror::Error as ThisError;
61 use vm_control::VmMemoryDestination;
62 use vm_control::VmMemoryRequest;
63 use vm_control::VmMemoryResponse;
64 use vm_control::VmMemorySource;
65 use vm_memory::GuestAddress;
66 use vm_memory::GuestMemory;
67 use zerocopy::AsBytes;
68 use zerocopy::FromBytes;
69
70 use crate::pci::pci_configuration::PciBarConfiguration;
71 use crate::pci::pci_configuration::PciBarPrefetchable;
72 use crate::pci::pci_configuration::PciBarRegionType;
73 use crate::pci::pci_configuration::PciClassCode;
74 use crate::pci::pci_configuration::PciConfiguration;
75 use crate::pci::pci_configuration::PciHeaderType;
76 use crate::pci::pci_configuration::PciOtherSubclass;
77 use crate::pci::pci_configuration::COMMAND_REG;
78 use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
79 use crate::pci::pci_device::BarRange;
80 use crate::pci::pci_device::PciDevice;
81 use crate::pci::pci_device::Result as PciResult;
82 use crate::pci::PciAddress;
83 use crate::pci::PciDeviceError;
84 use crate::vfio::VfioContainer;
85 use crate::Suspendable;
86 use crate::UnpinRequest;
87 use crate::UnpinResponse;
88
89 const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
90 const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
91 const COIOMMU_CMD_DEACTIVATE: u64 = 0;
92 const COIOMMU_CMD_ACTIVATE: u64 = 1;
93 const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
94 const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
95 const COIOMMU_REVISION_ID: u8 = 0x10;
96 const COIOMMU_MMIO_BAR: u8 = 0;
97 const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
98 const COIOMMU_NOTIFYMAP_BAR: u8 = 2;
99 const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
100 const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
101 const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
102 const PAGE_SIZE_4K: u64 = 4096;
103 const PAGE_SHIFT_4K: u64 = 12;
104 const PIN_PAGES_IN_BATCH: u64 = 1 << 63;
105
106 const DTTE_PINNED_FLAG: u32 = 1 << 31;
107 const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
108 const DTT_ENTRY_PRESENT: u64 = 1;
109 const DTT_ENTRY_PFN_SHIFT: u64 = 12;
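
// Illustrative DTT entry layouts implied by the flags above (the values are
// made-up examples, not taken from a real guest):
//  - A non-leaf entry is a u64 where bit 0 is the present bit and bits 12 and
//    up hold the PFN of the next-level table page. For example, 0xabcd_e001 is
//    present and points to the table page at GPA 0xabcd_e000.
//  - A leaf entry is a u32 where bit 31 is the PINNED flag and bit 30 is the
//    ACCESSED flag.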
110
111 #[derive(ThisError, Debug)]
112 enum Error {
113 #[error("CoIommu failed to create shared memory")]
114 CreateSharedMemory,
115 #[error("Failed to get DTT entry")]
116 GetDTTEntry,
117 #[error("Tube error")]
118 TubeError,
119 }
120
// default interval is 60s
122 const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
123 const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
124 /// Holds the coiommu unpin policy
125 #[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
126 #[serde(rename_all = "kebab-case")]
127 pub enum CoIommuUnpinPolicy {
128 #[default]
129 Off,
130 Lru,
131 }
132
133 impl fmt::Display for CoIommuUnpinPolicy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
135 use self::CoIommuUnpinPolicy::*;
136
137 match self {
138 Off => write!(f, "off"),
139 Lru => write!(f, "lru"),
140 }
141 }
142 }
143
fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
145 deserializer: D,
146 ) -> Result<Duration, D::Error> {
147 let secs = u64::deserialize(deserializer)?;
148
149 Ok(Duration::from_secs(secs))
150 }
151
fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
153 deserializer: D,
154 ) -> Result<Option<u64>, D::Error> {
155 let limit = u64::deserialize(deserializer)?;
156
157 match limit {
158 0 => Err(serde::de::Error::custom(
159 "Please use non-zero unpin_limit value",
160 )),
161 limit => Ok(Some(limit)),
162 }
163 }
164
fn unpin_interval_default() -> Duration {
166 UNPIN_DEFAULT_INTERVAL
167 }
168
fn unpin_gen_threshold_default() -> u64 {
170 UNPIN_GEN_DEFAULT_THRES
171 }
172
173 /// Holds the parameters for a coiommu device
174 #[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
175 #[serde(deny_unknown_fields)]
176 pub struct CoIommuParameters {
177 #[serde(default)]
178 pub unpin_policy: CoIommuUnpinPolicy,
179 #[serde(
180 deserialize_with = "deserialize_unpin_interval",
181 default = "unpin_interval_default"
182 )]
183 pub unpin_interval: Duration,
184 #[serde(deserialize_with = "deserialize_unpin_limit", default)]
185 pub unpin_limit: Option<u64>,
186 // Number of unpin intervals a pinned page must be busy for to be aged into the
187 // older, less frequently checked generation.
188 #[serde(default = "unpin_gen_threshold_default")]
189 pub unpin_gen_threshold: u64,
190 }
191
192 impl Default for CoIommuParameters {
    fn default() -> Self {
194 Self {
195 unpin_policy: CoIommuUnpinPolicy::Off,
196 unpin_interval: UNPIN_DEFAULT_INTERVAL,
197 unpin_limit: None,
198 unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
199 }
200 }
201 }
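
// As a rough illustration of how these parameters are specified (the field
// names follow the struct above; the values here are made up), a key-value
// option string such as
//   "unpin_policy=lru,unpin_interval=60,unpin_limit=4096,unpin_gen_threshold=10"
// would deserialize into:
//   CoIommuParameters {
//       unpin_policy: CoIommuUnpinPolicy::Lru,
//       unpin_interval: Duration::from_secs(60),
//       unpin_limit: Some(4096),
//       unpin_gen_threshold: 10,
//   }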
202
203 #[derive(Default, Debug, Copy, Clone)]
204 struct CoIommuReg {
205 dtt_root: u64,
206 cmd: u64,
207 dtt_level: u64,
208 }
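
// Layout of CoIommuReg within COIOMMU_MMIO_BAR, as decoded by read_mmio() and
// write_mmio() below (a summary, not an additional register definition):
//   offset 0x00: dtt_root
//   offset 0x08: cmd
//   offset 0x10: dtt_level
// The per-vCPU notify registers exposed through ioevents() start right after
// CoIommuReg, at offset 0x18.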
209
210 #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
211 struct PinnedPageInfo {
212 gfn: u64,
213 unpin_busy_cnt: u64,
214 }
215
216 impl PinnedPageInfo {
    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
218 PinnedPageInfo {
219 gfn,
220 unpin_busy_cnt,
221 }
222 }
223 }
224
225 #[derive(PartialEq, Debug, Eq)]
226 enum UnpinThreadState {
227 Unparked,
228 Parked,
229 }
230
231 struct CoIommuPinState {
232 new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
233 old_gen_pinned_pages: VecDeque<u64>,
234 unpin_thread_state: UnpinThreadState,
235 unpin_park_count: u64,
236 }
237
unsafe fn vfio_map(
239 vfio_container: &Arc<Mutex<VfioContainer>>,
240 iova: u64,
241 size: u64,
242 user_addr: u64,
243 ) -> bool {
244 match vfio_container
245 .lock()
246 .vfio_dma_map(iova, size, user_addr, true)
247 {
248 Ok(_) => true,
249 Err(e) => {
250 if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
251 if errno == libc::EEXIST {
252 // Already pinned. set PINNED flag
253 error!("CoIommu: iova 0x{:x} already pinned", iova);
254 return true;
255 }
256 }
257 error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
258 false
259 }
260 }
261 }
262
fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
264 match vfio_container.lock().vfio_dma_unmap(iova, size) {
265 Ok(_) => true,
266 Err(e) => {
267 error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
268 false
269 }
270 }
271 }
272
273 #[derive(Default, Debug, Copy, Clone, FromBytes, AsBytes)]
274 #[repr(C)]
275 struct PinPageInfo {
276 bdf: u16,
277 pad: [u16; 3],
278 nr_pages: u64,
279 }
280
281 const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
282 const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
283 const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
284 const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;
285
fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
287 if level == 1 {
288 return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
289 }
290
291 if level == 0 {
292 bail!("Invalid level for gfn 0x{:x}", gfn);
293 }
294
295 let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;
296
297 Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
298 }
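
// A minimal test sketch of the index calculation above, using made-up values
// (a 2-level walk of gfn 0x12345); it is only meant to illustrate how the
// strides and masks combine, not to exercise the device.
#[cfg(test)]
mod level_to_offset_example {
    use super::*;

    #[test]
    fn worked_example() {
        // Level 2 (non-leaf) index: (0x12345 >> 10) & 0x1ff = 0x48.
        assert_eq!(level_to_offset(0x12345, 2).unwrap(), 0x48);
        // Level 1 (leaf) index: 0x12345 & 0x3ff = 0x345.
        assert_eq!(level_to_offset(0x12345, 1).unwrap(), 0x345);
    }
}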
299
300 struct DTTIter {
301 ptr: *const u8,
302 gfn: u64,
303 }
304
305 impl Default for DTTIter {
    fn default() -> Self {
307 DTTIter {
308 ptr: std::ptr::null(),
309 gfn: 0,
310 }
311 }
312 }
313
// Get a DMA Tracking Table (DTT) entry associated with the gfn.
//
// There are two ways to get the entry:
// #1. Walk the DMA Tracking Table (DTT) by the GFN to get the
// corresponding entry. The DTT is shared between frontend and
// backend. It is a page-table-like structure and the entries are
// indexed by GFN. The argument dtt_root represents the root page
// GPA and dtt_level represents the maximum page table level.
//
// #2. Calculate the entry address via the argument dtt_iter. dtt_iter
// stores an entry address and the associated gfn. If the target gfn is
// in the same page table page as the gfn in dtt_iter, then the target
// entry address can be calculated from the entry address in dtt_iter.
//
// As the DTT entries are shared between frontend and backend, accesses
// must be atomic. So the returned value is converted to an AtomicU32
// pointer.
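//
// For example (with made-up numbers): if dtt_iter currently holds the entry
// address for gfn 0x400 and the target gfn is 0x402, both entries live in the
// same leaf table page (0x400 >> 10 == 0x402 >> 10), so the target entry is
// simply dtt_iter.ptr advanced by 2 * size_of::<AtomicU32>() bytes.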
fn gfn_to_dtt_pte(
333 mem: &GuestMemory,
334 dtt_level: u64,
335 dtt_root: u64,
336 dtt_iter: &mut DTTIter,
337 gfn: u64,
338 ) -> Result<*const AtomicU32> {
339 let ptr = if dtt_iter.ptr.is_null()
340 || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
341 {
342 // Slow path to walk the DTT to get the pte entry
343 let mut level = dtt_level;
344 let mut pt_gpa = dtt_root;
345 let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;
346
347 while level != 1 {
348 let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
349 let parent_pt = mem
350 .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
351 .context(Error::GetDTTEntry)?;
352
353 if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
354 bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
355 }
356
357 pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
358 level -= 1;
359 }
360
361 let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;
362
363 mem.get_host_address(GuestAddress(pt_gpa + index))
364 .context(Error::GetDTTEntry)?
365 } else {
366 // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
367 // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
368 // means the calculated ptr will point to the same page as dtt_iter.ptr
369 if gfn > dtt_iter.gfn {
370 unsafe {
371 dtt_iter
372 .ptr
373 .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
374 }
375 } else {
376 unsafe {
377 dtt_iter
378 .ptr
379 .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
380 }
381 }
382 };
383
384 dtt_iter.ptr = ptr;
385 dtt_iter.gfn = gfn;
386
387 Ok(ptr as *const AtomicU32)
388 }
389
fn pin_page(
391 pinstate: &mut CoIommuPinState,
392 policy: CoIommuUnpinPolicy,
393 vfio_container: &Arc<Mutex<VfioContainer>>,
394 mem: &GuestMemory,
395 dtt_level: u64,
396 dtt_root: u64,
397 dtt_iter: &mut DTTIter,
398 gfn: u64,
399 ) -> Result<()> {
400 let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;
401
402 let gpa = (gfn << PAGE_SHIFT_4K) as u64;
403 let host_addr = mem
404 .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
405 .context("failed to get host address")? as u64;
406
407 // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
408 // Test PINNED flag
409 if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
410 info!("CoIommu: gfn 0x{:x} already pinned", gfn);
411 return Ok(());
412 }
413
414 // Safe because the gpa is valid from the gfn_to_dtt_pte and the host_addr
415 // is guaranteed by MemoryMapping interface.
416 if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
417 // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
418 // set PINNED flag
419 unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
420 if policy == CoIommuUnpinPolicy::Lru {
421 pinstate
422 .new_gen_pinned_pages
423 .push_back(PinnedPageInfo::new(gfn, 0));
424 }
425 }
426
427 Ok(())
428 }
429
430 #[derive(PartialEq, Debug, Eq)]
431 enum UnpinResult {
432 UnpinlistEmpty,
433 Unpinned,
434 NotPinned,
435 NotUnpinned,
436 FailedUnpin,
437 UnpinParked,
438 }
439
fn unpin_page(
441 pinstate: &mut CoIommuPinState,
442 vfio_container: &Arc<Mutex<VfioContainer>>,
443 mem: &GuestMemory,
444 dtt_level: u64,
445 dtt_root: u64,
446 dtt_iter: &mut DTTIter,
447 gfn: u64,
448 force: bool,
449 ) -> UnpinResult {
450 if pinstate.unpin_thread_state == UnpinThreadState::Parked {
451 return UnpinResult::UnpinParked;
452 }
453
454 let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
455 Ok(v) => v,
456 Err(_) => {
            // The force == true case may try to unpin a page which is not
            // mapped in the DTT. For such a page the pte doesn't exist yet,
            // so there is no need to log an error.
            // The force == false case is used by coiommu to periodically
            // unpin pages which have been mapped in the DTT, so the pte for
            // such a page does exist. However, with unpin requests from the
            // virtio balloon, such pages may already be unpinned and the DTT
            // pages might be reclaimed by the guest OS kernel as well, so it
            // is also possible to end up here. Don't log an error.
466 return UnpinResult::NotPinned;
467 }
468 };
469
470 if force {
        // Safe because leaf_entry is valid and guaranteed by gfn_to_dtt_pte.
        // This case is for the balloon to evict pages, so these pages should
        // already be locked by the balloon and no device driver in the VM is
        // able to access them. Just clear the ACCESSED flag first to make
        // sure the following unpin can succeed.
476 unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
477 }
478
479 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
480 if let Err(entry) = unsafe {
481 (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
482 } {
        // The compare_exchange failed because the original leaf entry is
        // not DTTE_PINNED_FLAG, so the unpin cannot be done.
485 if entry == 0 {
486 // The GFN is already unpinned. This is very similar to the
487 // gfn_to_dtt_pte error case, with the only difference being
488 // that the dtt_pte happens to be on a present page table.
489 UnpinResult::NotPinned
490 } else {
491 if !force {
                // Safe because leaf_entry is valid and guaranteed by gfn_to_dtt_pte.
                // The ACCESSED_FLAG is set by the guest when it requires a DMA
                // mapping for this page; it indicates whether the page has been
                // touched by the guest. By clearing the flag after an unpin
                // attempt, we can detect whether the page was touched again
                // before the next round of unpin work. If the ACCESSED_FLAG is
                // set at the next round, unpinning the page will fail and we
                // will end up here again to clear the flag. If it is not set,
                // unpinning the page will probably succeed.
501 unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
502 } else {
                // If we're here, then the guest is trying to release, via the
                // balloon, a page that it still has pinned. This most likely
                // means that something is wrong in the guest kernel. Just
                // leave the page pinned and log an error.
                // This failure blocks the balloon from removing the page,
                // which ensures that the guest's view of memory will remain
                // consistent with device DMA's view of memory. Also note that
                // the host kernel maintains an elevated refcount for pinned
                // pages, which is a second guarantee that pages accessible by
                // device DMA won't be freed until after they are unpinned.
                error!(
                    "CoIommu: force case cannot unpin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
517 }
            // The GFN cannot be unpinned either because the unmap count
            // is non-zero or because it has the ACCESSED flag set.
520 UnpinResult::NotUnpinned
521 }
522 } else {
        // The compare_exchange succeeded: the original leaf entry was
        // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the
        // page.
526 let gpa = (gfn << PAGE_SHIFT_4K) as u64;
527 if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
528 UnpinResult::Unpinned
529 } else {
530 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
531 // make sure the pinned flag is set
532 unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
533 // need to put this gfn back to pinned vector
534 UnpinResult::FailedUnpin
535 }
536 }
537 }
538
539 struct PinWorker {
540 mem: GuestMemory,
541 endpoints: Vec<u16>,
542 notifymap_mmap: Arc<MemoryMapping>,
543 dtt_level: u64,
544 dtt_root: u64,
545 ioevents: Vec<Event>,
546 vfio_container: Arc<Mutex<VfioContainer>>,
547 pinstate: Arc<Mutex<CoIommuPinState>>,
548 params: CoIommuParameters,
549 }
550
551 impl PinWorker {
    fn debug_label(&self) -> &'static str {
553 "CoIommuPinWorker"
554 }
555
    fn run(&mut self, kill_evt: Event) {
557 #[derive(EventToken)]
558 enum Token {
559 Kill,
560 Pin { index: usize },
561 }
562
563 let wait_ctx: WaitContext<Token> =
564 match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
565 Ok(pc) => pc,
566 Err(e) => {
567 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
568 return;
569 }
570 };
571
572 for (index, event) in self.ioevents.iter().enumerate() {
573 match wait_ctx.add(event, Token::Pin { index }) {
574 Ok(_) => {}
575 Err(e) => {
576 error!(
577 "{}: failed to add ioevent for index {}: {}",
578 self.debug_label(),
579 index,
580 e
581 );
582 return;
583 }
584 }
585 }
586
587 'wait: loop {
588 let events = match wait_ctx.wait() {
589 Ok(v) => v,
590 Err(e) => {
591 error!("{}: failed polling for events: {}", self.debug_label(), e);
592 break;
593 }
594 };
595
596 for event in events.iter().filter(|e| e.is_readable) {
597 match event.token {
598 Token::Kill => break 'wait,
599 Token::Pin { index } => {
600 let offset = index * mem::size_of::<u64>() as usize;
601 if let Some(event) = self.ioevents.get(index) {
602 if let Err(e) = event.wait() {
603 error!(
604 "{}: failed reading event {}: {}",
605 self.debug_label(),
606 index,
607 e
608 );
609 self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
610 break 'wait;
611 }
612 }
613 if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
614 if let Err(e) = self.pin_pages(data) {
615 error!("{}: {}", self.debug_label(), e);
616 }
617 }
618 fence(Ordering::SeqCst);
619 self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
620 }
621 }
622 }
623 }
624 }
625
    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
627 let pin_page_info = self
628 .mem
629 .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
630 .context("failed to get pin page info")?;
631
632 let bdf = pin_page_info.bdf;
633 ensure!(
634 self.endpoints.iter().any(|&x| x == bdf),
635 "pin page for unexpected bdf 0x{:x}",
636 bdf
637 );
638
639 let mut nr_pages = pin_page_info.nr_pages;
640 let mut offset = mem::size_of::<PinPageInfo>() as u64;
641 let mut dtt_iter: DTTIter = Default::default();
642 let mut pinstate = self.pinstate.lock();
643 while nr_pages > 0 {
644 let gfn = self
645 .mem
646 .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
647 .context("failed to get pin page gfn")?;
648
649 pin_page(
650 &mut pinstate,
651 self.params.unpin_policy,
652 &self.vfio_container,
653 &self.mem,
654 self.dtt_level,
655 self.dtt_root,
656 &mut dtt_iter,
657 gfn,
658 )?;
659
660 offset += mem::size_of::<u64>() as u64;
661 nr_pages -= 1;
662 }
663
664 Ok(())
665 }
666
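    // The notify register value decoded below packs either a single pin
    // request or a batch request. Two made-up examples:
    //   0x0000_0012_3450_00f8 -> bdf 0x00f8, gfn 0x12_3450 (single page)
    //   0x8000_0000_7654_3000 -> PIN_PAGES_IN_BATCH is set, so 0x7654_3000 is
    //                            the GPA of a PinPageInfo batch header.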
    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
668 if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
669 let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
670 self.pin_pages_in_batch(gpa)
671 } else {
672 let bdf = (gfn_bdf & 0xffff) as u16;
673 let gfn = gfn_bdf >> 16;
674 let mut dtt_iter: DTTIter = Default::default();
675 ensure!(
676 self.endpoints.iter().any(|&x| x == bdf),
677 "pin page for unexpected bdf 0x{:x}",
678 bdf
679 );
680
681 let mut pinstate = self.pinstate.lock();
682 pin_page(
683 &mut pinstate,
684 self.params.unpin_policy,
685 &self.vfio_container,
686 &self.mem,
687 self.dtt_level,
688 self.dtt_root,
689 &mut dtt_iter,
690 gfn,
691 )
692 }
693 }
694 }
695
696 struct UnpinWorker {
697 mem: GuestMemory,
698 dtt_level: u64,
699 dtt_root: u64,
700 vfio_container: Arc<Mutex<VfioContainer>>,
701 unpin_tube: Option<Tube>,
702 pinstate: Arc<Mutex<CoIommuPinState>>,
703 params: CoIommuParameters,
704 unpin_gen_threshold: u64,
705 }
706
707 impl UnpinWorker {
    fn debug_label(&self) -> &'static str {
709 "CoIommuUnpinWorker"
710 }
711
    fn run(&mut self, kill_evt: Event) {
713 #[derive(EventToken)]
714 enum Token {
715 UnpinTimer,
716 UnpinReq,
717 Kill,
718 }
719
720 let wait_ctx: WaitContext<Token> =
721 match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
722 Ok(pc) => pc,
723 Err(e) => {
724 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
725 return;
726 }
727 };
728
729 if let Some(tube) = &self.unpin_tube {
730 if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
731 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
732 return;
733 }
734 }
735
736 let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
737 && !self.params.unpin_interval.is_zero()
738 {
739 let duration = self.params.unpin_interval;
740 let interval = Some(self.params.unpin_interval);
741 let mut timer = match Timer::new() {
742 Ok(t) => t,
743 Err(e) => {
744 error!(
745 "{}: failed to create the unpin timer: {}",
746 self.debug_label(),
747 e
748 );
749 return;
750 }
751 };
752 if let Err(e) = timer.reset(duration, interval) {
753 error!(
754 "{}: failed to start the unpin timer: {}",
755 self.debug_label(),
756 e
757 );
758 return;
759 }
760 if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
761 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
762 return;
763 }
764 Some(timer)
765 } else {
766 None
767 };
768
769 let unpin_tube = self.unpin_tube.take();
770 'wait: loop {
771 let events = match wait_ctx.wait() {
772 Ok(v) => v,
773 Err(e) => {
774 error!("{}: failed polling for events: {}", self.debug_label(), e);
775 break;
776 }
777 };
778
779 for event in events.iter().filter(|e| e.is_readable) {
780 match event.token {
781 Token::UnpinTimer => {
782 self.unpin_pages();
783 if let Some(timer) = &mut unpin_timer {
784 if let Err(e) = timer.mark_waited() {
785 error!(
786 "{}: failed to clear unpin timer: {}",
787 self.debug_label(),
788 e
789 );
790 break 'wait;
791 }
792 }
793 }
794 Token::UnpinReq => {
795 if let Some(tube) = &unpin_tube {
796 match tube.recv::<UnpinRequest>() {
797 Ok(req) => {
798 let mut unpin_done = true;
799 for range in req.ranges {
800 // Locking with respect to pin_pages isn't necessary
801 // for this case because the unpinned pages in the range
802 // should all be in the balloon and so nothing will attempt
803 // to pin them.
804 if !self.unpin_pages_in_range(range.0, range.1) {
805 unpin_done = false;
806 break;
807 }
808 }
809 let resp = if unpin_done {
810 UnpinResponse::Success
811 } else {
812 UnpinResponse::Failed
813 };
814 if let Err(e) = tube.send(&resp) {
815 error!(
816 "{}: failed to send unpin response {}",
817 self.debug_label(),
818 e
819 );
820 }
821 }
822 Err(e) => {
823 if let TubeError::Disconnected = e {
824 if let Err(e) = wait_ctx.delete(tube) {
825 error!(
826 "{}: failed to remove unpin_tube: {}",
827 self.debug_label(),
828 e
829 );
830 }
831 } else {
832 error!(
833 "{}: failed to recv Unpin Request: {}",
834 self.debug_label(),
835 e
836 );
837 }
838 }
839 }
840 }
841 }
842 Token::Kill => break 'wait,
843 }
844 }
845 }
846 self.unpin_tube = unpin_tube;
847 }
848
    fn unpin_pages(&mut self) {
850 if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
851 self.lru_unpin_pages();
852 }
853 }
854
    fn lru_unpin_page(
856 &mut self,
857 dtt_iter: &mut DTTIter,
858 new_gen: bool,
859 ) -> (UnpinResult, Option<PinnedPageInfo>) {
860 let mut pinstate = self.pinstate.lock();
861 let pageinfo = if new_gen {
862 pinstate.new_gen_pinned_pages.pop_front()
863 } else {
864 pinstate
865 .old_gen_pinned_pages
866 .pop_front()
867 .map(|gfn| PinnedPageInfo::new(gfn, 0))
868 };
869
870 pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
871 (
872 unpin_page(
873 &mut pinstate,
874 &self.vfio_container,
875 &self.mem,
876 self.dtt_level,
877 self.dtt_root,
878 dtt_iter,
879 pageinfo.gfn,
880 false,
881 ),
882 Some(pageinfo),
883 )
884 })
885 }
886
    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
888 let mut not_unpinned_new_gen_pages = VecDeque::new();
889 let mut not_unpinned_old_gen_pages = VecDeque::new();
890 let mut unpinned_count = 0;
891 let has_limit = unpin_limit.is_some();
892 let limit_count = unpin_limit.unwrap_or(0);
893 let mut dtt_iter: DTTIter = Default::default();
894
        // If has_limit is true but limit_count is 0, no unpinning will be done.
896 while !has_limit || unpinned_count != limit_count {
897 let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
898 match result {
899 UnpinResult::UnpinlistEmpty => break,
900 UnpinResult::Unpinned => unpinned_count += 1,
901 UnpinResult::NotPinned => {}
902 UnpinResult::NotUnpinned => {
903 if let Some(mut page) = pinned_page {
904 if self.params.unpin_gen_threshold != 0 {
905 page.unpin_busy_cnt += 1;
                            // The page came from the new_gen queue but was
                            // not successfully unpinned, so check the
                            // unpin_gen threshold. If it has been reached,
                            // move the page to the old_gen queue. If the page
                            // did not come from new_gen, put it directly into
                            // the old_gen queue.
912 if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
913 not_unpinned_old_gen_pages.push_back(page.gfn);
914 } else {
915 not_unpinned_new_gen_pages.push_back(page);
916 }
917 }
918 }
919 }
920 UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Although UnpinParked means we didn't actually try to
                    // unpin the gfn, it's not worth handling specifically
                    // since parking is expected to be relatively rare.
924 if let Some(page) = pinned_page {
925 if new_gen {
926 not_unpinned_new_gen_pages.push_back(page);
927 } else {
928 not_unpinned_old_gen_pages.push_back(page.gfn);
929 }
930 }
931 if result == UnpinResult::UnpinParked {
932 thread::park();
933 }
934 }
935 }
936 }
937
938 if !not_unpinned_new_gen_pages.is_empty() {
939 let mut pinstate = self.pinstate.lock();
940 pinstate
941 .new_gen_pinned_pages
942 .append(&mut not_unpinned_new_gen_pages);
943 }
944
945 if !not_unpinned_old_gen_pages.is_empty() {
946 let mut pinstate = self.pinstate.lock();
947 pinstate
948 .old_gen_pinned_pages
949 .append(&mut not_unpinned_old_gen_pages);
950 }
951
952 unpinned_count
953 }
954
    fn lru_unpin_pages(&mut self) {
956 let mut unpin_count = 0;
957 if self.params.unpin_gen_threshold != 0 {
958 self.unpin_gen_threshold += 1;
959 if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
960 self.unpin_gen_threshold = 0;
                // Try to unpin the inactive (old_gen) queue first once the threshold is reached.
962 unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
963 }
964 }
        // Unpin the new_gen queue with the unpin_limit remaining after
        // unpinning the old_gen queue.
966 self.lru_unpin_pages_in_loop(
967 self.params
968 .unpin_limit
969 .map(|limit| limit.saturating_sub(unpin_count)),
970 true,
971 );
972 }
973
    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
975 let mut dtt_iter: DTTIter = Default::default();
976 let mut index = 0;
977 while index != count {
978 let mut pinstate = self.pinstate.lock();
979 let result = unpin_page(
980 &mut pinstate,
981 &self.vfio_container,
982 &self.mem,
983 self.dtt_level,
984 self.dtt_root,
985 &mut dtt_iter,
986 gfn + index,
987 true,
988 );
989 drop(pinstate);
990
991 match result {
992 UnpinResult::Unpinned | UnpinResult::NotPinned => {}
993 UnpinResult::UnpinParked => {
994 thread::park();
995 continue;
996 }
997 _ => {
998 error!("coiommu: force unpin failed by {:?}", result);
999 return false;
1000 }
1001 }
1002 index += 1;
1003 }
1004 true
1005 }
1006 }
1007
1008 pub struct CoIommuDev {
1009 config_regs: PciConfiguration,
1010 pci_address: Option<PciAddress>,
1011 mem: GuestMemory,
1012 coiommu_reg: CoIommuReg,
1013 endpoints: Vec<u16>,
1014 notifymap_mem: SafeDescriptor,
1015 notifymap_mmap: Arc<MemoryMapping>,
1016 notifymap_addr: Option<u64>,
1017 topologymap_mem: SafeDescriptor,
1018 topologymap_addr: Option<u64>,
1019 mmapped: bool,
1020 device_tube: Tube,
1021 pin_thread: Option<WorkerThread<PinWorker>>,
1022 unpin_thread: Option<WorkerThread<UnpinWorker>>,
1023 unpin_tube: Option<Tube>,
1024 ioevents: Vec<Event>,
1025 vfio_container: Arc<Mutex<VfioContainer>>,
1026 pinstate: Arc<Mutex<CoIommuPinState>>,
1027 params: CoIommuParameters,
1028 }
1029
1030 impl CoIommuDev {
    pub fn new(
1032 mem: GuestMemory,
1033 vfio_container: Arc<Mutex<VfioContainer>>,
1034 device_tube: Tube,
1035 unpin_tube: Option<Tube>,
1036 endpoints: Vec<u16>,
1037 vcpu_count: u64,
1038 params: CoIommuParameters,
1039 ) -> Result<Self> {
1040 let config_regs = PciConfiguration::new(
1041 PCI_VENDOR_ID_COIOMMU,
1042 PCI_DEVICE_ID_COIOMMU,
1043 PciClassCode::Other,
1044 &PciOtherSubclass::Other,
1045 None, // No Programming interface.
1046 PciHeaderType::Device,
1047 PCI_VENDOR_ID_COIOMMU,
1048 PCI_DEVICE_ID_COIOMMU,
1049 COIOMMU_REVISION_ID,
1050 );
1051
        // notifymap_mem is used as BAR2 for the guest to check whether a request has been
        // completed by coIOMMU.
1053 let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
1054 .context(Error::CreateSharedMemory)?;
1055 let notifymap_mmap = Arc::new(
1056 MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
1057 .from_shared_memory(¬ifymap_mem)
1058 .offset(0)
1059 .build()?,
1060 );
1061
        // topologymap_mem is used as BAR4 for the guest to check which devices sit on top of
        // coIOMMU.
1063 let topologymap_mem =
1064 SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
1065 .context(Error::CreateSharedMemory)?;
1066 let topologymap_mmap = Arc::new(
1067 MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
1068 .from_shared_memory(&topologymap_mem)
1069 .offset(0)
1070 .build()?,
1071 );
1072
1073 ensure!(
1074 (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
1075 "Coiommu: too many endpoints"
1076 );
1077 topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
1078 for (index, endpoint) in endpoints.iter().enumerate() {
1079 topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
1080 }
1081
1082 let mut ioevents = Vec::new();
1083 for _ in 0..vcpu_count {
1084 ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
1085 }
1086
1087 Ok(Self {
1088 config_regs,
1089 pci_address: None,
1090 mem,
1091 coiommu_reg: Default::default(),
1092 endpoints,
1093 notifymap_mem: notifymap_mem.into(),
1094 notifymap_mmap,
1095 notifymap_addr: None,
1096 topologymap_mem: topologymap_mem.into(),
1097 topologymap_addr: None,
1098 mmapped: false,
1099 device_tube,
1100 pin_thread: None,
1101 unpin_thread: None,
1102 unpin_tube,
1103 ioevents,
1104 vfio_container,
1105 pinstate: Arc::new(Mutex::new(CoIommuPinState {
1106 new_gen_pinned_pages: VecDeque::new(),
1107 old_gen_pinned_pages: VecDeque::new(),
1108 unpin_thread_state: UnpinThreadState::Unparked,
1109 unpin_park_count: 0,
1110 })),
1111 params,
1112 })
1113 }
1114
    fn send_msg(&self, msg: &VmMemoryRequest) -> Result<()> {
1116 self.device_tube.send(msg).context(Error::TubeError)?;
1117 let res = self.device_tube.recv().context(Error::TubeError)?;
1118 match res {
1119 VmMemoryResponse::RegisterMemory { .. } => Ok(()),
1120 VmMemoryResponse::Err(e) => Err(anyhow!("Receive msg err {}", e)),
1121 _ => Err(anyhow!("Msg cannot be handled")),
1122 }
1123 }
1124
    fn register_mmap(
1126 &self,
1127 descriptor: SafeDescriptor,
1128 size: usize,
1129 offset: u64,
1130 gpa: u64,
1131 prot: Protection,
1132 ) -> Result<()> {
1133 let request = VmMemoryRequest::RegisterMemory {
1134 source: VmMemorySource::Descriptor {
1135 descriptor,
1136 offset,
1137 size: size as u64,
1138 },
1139 dest: VmMemoryDestination::GuestPhysicalAddress(gpa),
1140 prot,
1141 };
1142 self.send_msg(&request)
1143 }
1144
    fn mmap(&mut self) {
1146 if self.mmapped {
1147 return;
1148 }
1149
1150 if let Some(gpa) = self.notifymap_addr {
1151 match self.register_mmap(
1152 self.notifymap_mem.try_clone().unwrap(),
1153 COIOMMU_NOTIFYMAP_SIZE,
1154 0,
1155 gpa,
1156 Protection::read_write(),
1157 ) {
1158 Ok(_) => {}
1159 Err(e) => {
1160 panic!("{}: map notifymap failed: {}", self.debug_label(), e);
1161 }
1162 }
1163 }
1164
1165 if let Some(gpa) = self.topologymap_addr {
1166 match self.register_mmap(
1167 self.topologymap_mem.try_clone().unwrap(),
1168 COIOMMU_TOPOLOGYMAP_SIZE,
1169 0,
1170 gpa,
1171 Protection::read(),
1172 ) {
1173 Ok(_) => {}
1174 Err(e) => {
1175 panic!("{}: map topologymap failed: {}", self.debug_label(), e);
1176 }
1177 }
1178 }
1179
1180 self.mmapped = true;
1181 }
1182
    fn start_workers(&mut self) {
1184 if self.pin_thread.is_none() {
1185 self.start_pin_thread();
1186 }
1187
1188 if self.unpin_thread.is_none() {
1189 self.start_unpin_thread();
1190 }
1191 }
1192
    fn start_pin_thread(&mut self) {
1194 let mem = self.mem.clone();
1195 let endpoints = self.endpoints.to_vec();
1196 let notifymap_mmap = self.notifymap_mmap.clone();
1197 let dtt_root = self.coiommu_reg.dtt_root;
1198 let dtt_level = self.coiommu_reg.dtt_level;
1199 let ioevents = self
1200 .ioevents
1201 .iter()
1202 .map(|e| e.try_clone().unwrap())
1203 .collect();
1204 let vfio_container = self.vfio_container.clone();
1205 let pinstate = self.pinstate.clone();
1206 let params = self.params;
1207
1208 self.pin_thread = Some(WorkerThread::start("coiommu_pin", move |kill_evt| {
1209 let mut worker = PinWorker {
1210 mem,
1211 endpoints,
1212 notifymap_mmap,
1213 dtt_root,
1214 dtt_level,
1215 ioevents,
1216 vfio_container,
1217 pinstate,
1218 params,
1219 };
1220 worker.run(kill_evt);
1221 worker
1222 }));
1223 }
1224
    fn start_unpin_thread(&mut self) {
1226 let mem = self.mem.clone();
1227 let dtt_root = self.coiommu_reg.dtt_root;
1228 let dtt_level = self.coiommu_reg.dtt_level;
1229 let vfio_container = self.vfio_container.clone();
1230 let unpin_tube = self.unpin_tube.take();
1231 let pinstate = self.pinstate.clone();
1232 let params = self.params;
1233 self.unpin_thread = Some(WorkerThread::start("coiommu_unpin", move |kill_evt| {
1234 let mut worker = UnpinWorker {
1235 mem,
1236 dtt_level,
1237 dtt_root,
1238 vfio_container,
1239 unpin_tube,
1240 pinstate,
1241 params,
1242 unpin_gen_threshold: 0,
1243 };
1244 worker.run(kill_evt);
1245 worker
1246 }));
1247 }
1248
    fn allocate_bar_address(
1250 &mut self,
1251 resources: &mut SystemAllocator,
1252 address: PciAddress,
1253 size: u64,
1254 bar_num: u8,
1255 name: &str,
1256 ) -> PciResult<u64> {
1257 let addr = resources
1258 .allocate_mmio(
1259 size,
1260 Alloc::PciBar {
1261 bus: address.bus,
1262 dev: address.dev,
1263 func: address.func,
1264 bar: bar_num,
1265 },
1266 name.to_string(),
1267 AllocOptions::new().prefetchable(true).align(size),
1268 )
1269 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1270
1271 let bar = PciBarConfiguration::new(
1272 bar_num as usize,
1273 size,
1274 PciBarRegionType::Memory64BitRegion,
1275 PciBarPrefetchable::Prefetchable,
1276 )
1277 .set_address(addr);
1278
1279 self.config_regs
1280 .add_pci_bar(bar)
1281 .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;
1282
1283 Ok(addr)
1284 }
1285
    fn read_mmio(&mut self, addr: u64, data: &mut [u8]) {
1287 let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
1288 let offset = addr - bar;
1289 if offset >= mem::size_of::<CoIommuReg>() as u64 {
1290 error!(
1291 "{}: read_mmio: invalid addr 0x{:x} bar 0x{:x} offset 0x{:x}",
1292 self.debug_label(),
1293 addr,
1294 bar,
1295 offset
1296 );
1297 return;
1298 }
1299
        // Sanity check: the access must be 64-bit aligned.
1301 if offset % 8 != 0 || data.len() != 8 {
1302 error!(
1303 "{}: read_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1304 self.debug_label(),
1305 offset,
1306 data.len()
1307 );
1308 return;
1309 }
1310
1311 let v = match offset / 8 {
1312 0 => self.coiommu_reg.dtt_root,
1313 1 => self.coiommu_reg.cmd,
1314 2 => self.coiommu_reg.dtt_level,
1315 _ => return,
1316 };
1317
1318 data.copy_from_slice(&v.to_ne_bytes());
1319 }
1320
    fn write_mmio(&mut self, addr: u64, data: &[u8]) {
1322 let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
1323 let mmio_len = mem::size_of::<CoIommuReg>() as u64;
1324 let offset = addr - bar;
1325 if offset >= mmio_len {
1326 if data.len() != 1 {
1327 error!(
1328 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
1329 self.debug_label(),
1330 offset,
1331 data.len()
1332 );
1333 return;
1334 }
1335
            // Usually we will not get here, as this range holds the per-vcpu
            // notify registers, which are monitored by the ioevents. Notify
            // registers not covered by the ioevents are not supposed to be
            // used by the frontend driver. In case the frontend driver does
            // get here, handle it simply so that the driver is not blocked,
            // and log an error.
1342 let index = (offset - mmio_len) as usize * mem::size_of::<u64>();
1343 self.notifymap_mmap.write_obj::<u64>(0, index).unwrap();
1344 error!(
1345 "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
1346 self.debug_label(),
1347 offset
1348 );
1349 return;
1350 }
1351
        // Sanity check: accesses to CoIommuReg must be 64-bit aligned.
1353 if offset % 8 != 0 || data.len() != 8 {
1354 error!(
1355 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1356 self.debug_label(),
1357 offset,
1358 data.len()
1359 );
1360 return;
1361 }
1362
1363 let index = offset / 8;
1364 let v = u64::from_ne_bytes(data.try_into().unwrap());
1365 match index {
1366 0 => {
1367 if self.coiommu_reg.dtt_root == 0 {
1368 self.coiommu_reg.dtt_root = v;
1369 }
1370 }
1371 1 => match v {
                // Deactivate can happen if the frontend driver in the guest
                // fails during probing or if the CoIommu device is removed
                // by the guest. Neither of these cases is expected, and if
                // either happens the guest will be non-functional because the
                // pass-through devices that rely on CoIommu will not work.
                // So just fail hard and panic.
1378 COIOMMU_CMD_DEACTIVATE => {
1379 panic!("{}: Deactivate is not supported", self.debug_label())
1380 }
1381 COIOMMU_CMD_ACTIVATE => {
1382 if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
1383 self.start_workers();
1384 }
1385 }
1386 COIOMMU_CMD_PARK_UNPIN => {
1387 let mut pinstate = self.pinstate.lock();
1388 pinstate.unpin_thread_state = UnpinThreadState::Parked;
1389 if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
1390 pinstate.unpin_park_count = v;
1391 } else {
1392 panic!("{}: Park request overflowing", self.debug_label());
1393 }
1394 }
1395 COIOMMU_CMD_UNPARK_UNPIN => {
1396 let mut pinstate = self.pinstate.lock();
1397 if pinstate.unpin_thread_state == UnpinThreadState::Parked {
1398 if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
1399 pinstate.unpin_park_count = v;
1400 if pinstate.unpin_park_count == 0 {
1401 if let Some(worker_thread) = &self.unpin_thread {
1402 worker_thread.thread().unpark();
1403 }
1404 pinstate.unpin_thread_state = UnpinThreadState::Unparked;
1405 }
1406 } else {
                            error!("{}: Park count has already reached 0", self.debug_label());
1408 }
1409 }
1410 }
1411 _ => {}
1412 },
1413 2 => {
1414 if self.coiommu_reg.dtt_level == 0 {
1415 self.coiommu_reg.dtt_level = v;
1416 }
1417 }
1418 _ => {}
1419 }
1420 }
1421 }
1422
1423 impl PciDevice for CoIommuDev {
    fn debug_label(&self) -> String {
1425 "CoIommu".to_owned()
1426 }
1427
    fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
1429 if self.pci_address.is_none() {
1430 self.pci_address = match resources.allocate_pci(0, self.debug_label()) {
1431 Some(Alloc::PciBar {
1432 bus,
1433 dev,
1434 func,
1435 bar: _,
1436 }) => Some(PciAddress { bus, dev, func }),
1437 _ => None,
1438 }
1439 }
1440 self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1441 }
1442
    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
1444 let address = self
1445 .pci_address
1446 .expect("allocate_address must be called prior to allocate_io_bars");
1447
1448 // Allocate one bar for the structures pointed to by the capability structures.
1449 let mut ranges: Vec<BarRange> = Vec::new();
1450
1451 let mmio_addr = self.allocate_bar_address(
1452 resources,
1453 address,
1454 COIOMMU_MMIO_BAR_SIZE as u64,
1455 COIOMMU_MMIO_BAR,
1456 "coiommu-mmiobar",
1457 )?;
1458
1459 ranges.push(BarRange {
1460 addr: mmio_addr,
1461 size: COIOMMU_MMIO_BAR_SIZE,
1462 prefetchable: false,
1463 });
1464
1465 Ok(ranges)
1466 }
1467
    fn allocate_device_bars(
1469 &mut self,
1470 resources: &mut SystemAllocator,
1471 ) -> PciResult<Vec<BarRange>> {
1472 let address = self
1473 .pci_address
1474 .expect("allocate_address must be called prior to allocate_device_bars");
1475
1476 let mut ranges: Vec<BarRange> = Vec::new();
1477
1478 let topologymap_addr = self.allocate_bar_address(
1479 resources,
1480 address,
1481 COIOMMU_TOPOLOGYMAP_SIZE as u64,
1482 COIOMMU_TOPOLOGYMAP_BAR,
1483 "coiommu-topology",
1484 )?;
1485 self.topologymap_addr = Some(topologymap_addr);
1486 ranges.push(BarRange {
1487 addr: topologymap_addr,
1488 size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
1489 prefetchable: false,
1490 });
1491
1492 let notifymap_addr = self.allocate_bar_address(
1493 resources,
1494 address,
1495 COIOMMU_NOTIFYMAP_SIZE as u64,
1496 COIOMMU_NOTIFYMAP_BAR,
1497 "coiommu-notifymap",
1498 )?;
1499 self.notifymap_addr = Some(notifymap_addr);
1500 ranges.push(BarRange {
1501 addr: notifymap_addr,
1502 size: COIOMMU_NOTIFYMAP_SIZE as u64,
1503 prefetchable: false,
1504 });
1505
1506 Ok(ranges)
1507 }
1508
    fn read_config_register(&self, reg_idx: usize) -> u32 {
1510 self.config_regs.read_reg(reg_idx)
1511 }
1512
    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1514 if reg_idx == COMMAND_REG
1515 && data.len() == 2
1516 && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
1517 && !self.mmapped
1518 {
1519 self.mmap();
1520 }
1521
1522 self.config_regs.write_reg(reg_idx, offset, data);
1523 }
1524
    fn keep_rds(&self) -> Vec<RawDescriptor> {
1526 let mut rds = vec![
1527 self.vfio_container.lock().as_raw_descriptor(),
1528 self.device_tube.as_raw_descriptor(),
1529 self.notifymap_mem.as_raw_descriptor(),
1530 self.topologymap_mem.as_raw_descriptor(),
1531 ];
1532 if let Some(unpin_tube) = &self.unpin_tube {
1533 rds.push(unpin_tube.as_raw_descriptor());
1534 }
1535 rds
1536 }
1537
    fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
1539 let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
1540 let notifymap = self
1541 .config_regs
1542 .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
1543 match addr {
1544 o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE as u64 => {
1545 self.read_mmio(addr, data);
1546 }
1547 o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // With the coiommu device activated, accessing the notifymap
                // bar won't cause a vmexit. Getting here means the coiommu
                // device is deactivated and will not do the pin/unpin work,
                // so there is no need to handle this notifymap read.
1552 }
1553 _ => {}
1554 }
1555 }
1556
    fn write_bar(&mut self, addr: u64, data: &[u8]) {
1558 let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
1559 let notifymap = self
1560 .config_regs
1561 .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
1562 match addr {
1563 o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE as u64 => {
1564 self.write_mmio(addr, data);
1565 }
1566 o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // With the coiommu device activated, accessing the notifymap
                // bar won't cause a vmexit. Getting here means the coiommu
                // device is deactivated and will not do the pin/unpin work,
                // so there is no need to handle this notifymap write.
1571 }
1572 _ => {}
1573 }
1574 }
1575
    fn ioevents(&self) -> Vec<(&Event, u64, Datamatch)> {
1577 let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
1578 let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
1579 self.ioevents
1580 .iter()
1581 .enumerate()
1582 .map(|(i, event)| (event, notify_base + i as u64, Datamatch::AnyLength))
1583 .collect()
1584 }
1585
    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1587 self.config_regs.get_bar_configuration(bar_num)
1588 }
1589 }
1590
1591 impl Suspendable for CoIommuDev {}
1592