1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 //! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
6 //! which provides fine-grained pinning for VFIO pci-passthrough devices
7 //! so that the hypervisor doesn't need to pin the entire VM's memory,
8 //! improving memory utilization. CoIOMMU doesn't provide intra-guest
9 //! protection, so it can only be used for TRUSTED passthrough devices.
10 //!
11 //! CoIOMMU is presented at KVM forum 2020:
12 //! <https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative-dma-buffer-tracking-yu-zhang-intel>
13 //!
14 //! Also presented at USENIX ATC '20:
15 //! <https://www.usenix.org/conference/atc20/presentation/tian>
16
17 use std::collections::VecDeque;
18 use std::convert::TryInto;
19 use std::default::Default;
20 use std::fmt;
21 use std::mem;
22 use std::panic;
23 use std::sync::atomic::fence;
24 use std::sync::atomic::AtomicU32;
25 use std::sync::atomic::Ordering;
26 use std::sync::Arc;
27 use std::thread;
28 use std::time::Duration;
29
30 use anyhow::bail;
31 use anyhow::ensure;
32 use anyhow::Context;
33 use anyhow::Result;
34 use base::error;
35 use base::info;
36 use base::AsRawDescriptor;
37 use base::Event;
38 use base::EventToken;
39 use base::MemoryMapping;
40 use base::MemoryMappingBuilder;
41 use base::Protection;
42 use base::RawDescriptor;
43 use base::SafeDescriptor;
44 use base::SharedMemory;
45 use base::Timer;
46 use base::TimerTrait;
47 use base::Tube;
48 use base::TubeError;
49 use base::WaitContext;
50 use base::WorkerThread;
51 use hypervisor::Datamatch;
52 use hypervisor::MemCacheType;
53 use resources::Alloc;
54 use resources::AllocOptions;
55 use resources::SystemAllocator;
56 use serde::Deserialize;
57 use serde::Deserializer;
58 use serde::Serialize;
59 use serde_keyvalue::FromKeyValues;
60 use sync::Mutex;
61 use thiserror::Error as ThisError;
62 use vm_control::api::VmMemoryClient;
63 use vm_control::VmMemoryDestination;
64 use vm_control::VmMemorySource;
65 use vm_memory::GuestAddress;
66 use vm_memory::GuestMemory;
67 use zerocopy::AsBytes;
68 use zerocopy::FromBytes;
69 use zerocopy::FromZeroes;
70
71 use crate::pci::pci_configuration::PciBarConfiguration;
72 use crate::pci::pci_configuration::PciBarPrefetchable;
73 use crate::pci::pci_configuration::PciBarRegionType;
74 use crate::pci::pci_configuration::PciClassCode;
75 use crate::pci::pci_configuration::PciConfiguration;
76 use crate::pci::pci_configuration::PciHeaderType;
77 use crate::pci::pci_configuration::PciOtherSubclass;
78 use crate::pci::pci_configuration::COMMAND_REG;
79 use crate::pci::pci_configuration::COMMAND_REG_MEMORY_SPACE_MASK;
80 use crate::pci::pci_device::BarRange;
81 use crate::pci::pci_device::PciDevice;
82 use crate::pci::pci_device::Result as PciResult;
83 use crate::pci::PciAddress;
84 use crate::pci::PciBarIndex;
85 use crate::pci::PciDeviceError;
86 use crate::vfio::VfioContainer;
87 use crate::Suspendable;
88 use crate::UnpinRequest;
89 use crate::UnpinResponse;
90
91 const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
92 const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
93 const COIOMMU_CMD_DEACTIVATE: u64 = 0;
94 const COIOMMU_CMD_ACTIVATE: u64 = 1;
95 const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
96 const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
97 const COIOMMU_REVISION_ID: u8 = 0x10;
98 const COIOMMU_MMIO_BAR: PciBarIndex = 0;
99 const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
100 const COIOMMU_NOTIFYMAP_BAR: PciBarIndex = 2;
101 const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
102 const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
103 const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
104 const PAGE_SIZE_4K: u64 = 4096;
105 const PAGE_SHIFT_4K: u64 = 12;
106 const PIN_PAGES_IN_BATCH: u64 = 1 << 63;
107
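// A leaf DTT entry is a u32 shared with the guest driver: bit 31 is the
// PINNED flag and bit 30 is the ACCESSED flag. The remaining bits appear to
// hold a count maintained by the guest (see the comments in unpin_page();
// a non-zero value there also prevents unpinning), so the backend only
// updates the flag bits, and always atomically.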
108 const DTTE_PINNED_FLAG: u32 = 1 << 31;
109 const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
110 const DTT_ENTRY_PRESENT: u64 = 1;
111 const DTT_ENTRY_PFN_SHIFT: u64 = 12;
112
113 #[derive(ThisError, Debug)]
114 enum Error {
115 #[error("CoIommu failed to create shared memory")]
116 CreateSharedMemory,
117 #[error("Failed to get DTT entry")]
118 GetDTTEntry,
119 }
120
121 // The default unpin interval is 60 seconds.
122 const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
123 const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
124 /// Holds the coiommu unpin policy
125 #[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize)]
126 #[serde(rename_all = "kebab-case")]
127 pub enum CoIommuUnpinPolicy {
128 #[default]
129 Off,
130 Lru,
131 }
132
133 impl fmt::Display for CoIommuUnpinPolicy {
134 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
135 use self::CoIommuUnpinPolicy::*;
136
137 match self {
138 Off => write!(f, "off"),
139 Lru => write!(f, "lru"),
140 }
141 }
142 }
143
144 fn deserialize_unpin_interval<'de, D: Deserializer<'de>>(
145 deserializer: D,
146 ) -> Result<Duration, D::Error> {
147 let secs = u64::deserialize(deserializer)?;
148
149 Ok(Duration::from_secs(secs))
150 }
151
152 fn deserialize_unpin_limit<'de, D: Deserializer<'de>>(
153 deserializer: D,
154 ) -> Result<Option<u64>, D::Error> {
155 let limit = u64::deserialize(deserializer)?;
156
157 match limit {
158 0 => Err(serde::de::Error::custom(
159 "Please use non-zero unpin_limit value",
160 )),
161 limit => Ok(Some(limit)),
162 }
163 }
164
165 fn unpin_interval_default() -> Duration {
166 UNPIN_DEFAULT_INTERVAL
167 }
168
169 fn unpin_gen_threshold_default() -> u64 {
170 UNPIN_GEN_DEFAULT_THRES
171 }
172
173 /// Holds the parameters for a coiommu device
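///
/// The parameters are parsed from `serde_keyvalue` "key=value" syntax. A
/// minimal illustrative sketch (the field names come from this struct; the
/// values are examples only, with `unpin_interval` given in seconds):
///
/// ```text
/// unpin_policy=lru,unpin_interval=60,unpin_limit=4096,unpin_gen_threshold=10
/// ```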
174 #[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize, FromKeyValues)]
175 #[serde(deny_unknown_fields)]
176 pub struct CoIommuParameters {
177 #[serde(default)]
178 pub unpin_policy: CoIommuUnpinPolicy,
179 #[serde(
180 deserialize_with = "deserialize_unpin_interval",
181 default = "unpin_interval_default"
182 )]
183 pub unpin_interval: Duration,
184 #[serde(deserialize_with = "deserialize_unpin_limit", default)]
185 pub unpin_limit: Option<u64>,
186 // Number of unpin intervals a pinned page must be busy for to be aged into the
187 // older, less frequently checked generation.
188 #[serde(default = "unpin_gen_threshold_default")]
189 pub unpin_gen_threshold: u64,
190 }
191
192 impl Default for CoIommuParameters {
193 fn default() -> Self {
194 Self {
195 unpin_policy: CoIommuUnpinPolicy::Off,
196 unpin_interval: UNPIN_DEFAULT_INTERVAL,
197 unpin_limit: None,
198 unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
199 }
200 }
201 }
202
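// CoIommuReg mirrors the register block at the start of the MMIO BAR (BAR0),
// as decoded by read_mmio()/write_mmio() below: offset 0x0 is dtt_root,
// 0x8 is cmd and 0x10 is dtt_level. The per-vcpu notify registers follow
// immediately after this structure in the same BAR.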
203 #[derive(Default, Debug, Copy, Clone)]
204 struct CoIommuReg {
205 dtt_root: u64,
206 cmd: u64,
207 dtt_level: u64,
208 }
209
210 #[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
211 struct PinnedPageInfo {
212 gfn: u64,
213 unpin_busy_cnt: u64,
214 }
215
216 impl PinnedPageInfo {
217 fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
218 PinnedPageInfo {
219 gfn,
220 unpin_busy_cnt,
221 }
222 }
223 }
224
225 #[derive(PartialEq, Debug, Eq)]
226 enum UnpinThreadState {
227 Unparked,
228 Parked,
229 }
230
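// Shared pin state. Recently pinned pages start in the "new" generation and,
// once they have stayed busy for unpin_gen_threshold unpin rounds, are aged
// into the "old" generation, which is scanned less frequently (see
// UnpinWorker::lru_unpin_pages_in_loop()).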
231 struct CoIommuPinState {
232 new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
233 old_gen_pinned_pages: VecDeque<u64>,
234 unpin_thread_state: UnpinThreadState,
235 unpin_park_count: u64,
236 }
237
238 unsafe fn vfio_map(
239 vfio_container: &Arc<Mutex<VfioContainer>>,
240 iova: u64,
241 size: u64,
242 user_addr: u64,
243 ) -> bool {
244 match vfio_container
245 .lock()
246 .vfio_dma_map(iova, size, user_addr, true)
247 {
248 Ok(_) => true,
249 Err(e) => {
250 if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
251 if errno == libc::EEXIST {
252 // Already pinned. set PINNED flag
253 error!("CoIommu: iova 0x{:x} already pinned", iova);
254 return true;
255 }
256 }
257 error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
258 false
259 }
260 }
261 }
262
263 fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
264 match vfio_container.lock().vfio_dma_unmap(iova, size) {
265 Ok(_) => true,
266 Err(e) => {
267 error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
268 false
269 }
270 }
271 }
272
273 #[derive(Default, Debug, Copy, Clone, FromZeroes, FromBytes, AsBytes)]
274 #[repr(C)]
275 struct PinPageInfo {
276 bdf: u16,
277 pad: [u16; 3],
278 nr_pages: u64,
279 }
280
281 const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
282 const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
283 const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
284 const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;
285
286 fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
287 if level == 1 {
288 return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
289 }
290
291 if level == 0 {
292 bail!("Invalid level for gfn 0x{:x}", gfn);
293 }
294
295 let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;
296
297 Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
298 }
299
300 struct DTTIter {
301 ptr: *const u8,
302 gfn: u64,
303 }
304
305 impl Default for DTTIter {
306 fn default() -> Self {
307 DTTIter {
308 ptr: std::ptr::null(),
309 gfn: 0,
310 }
311 }
312 }
313
314 // Get a DMA Tracking Table (DTT) entry associated with the gfn.
315 //
316 // There are two ways to get the entry:
317 // #1. Walk the DMA Tracking Table (DTT) with the GFN to get the
318 // corresponding entry. The DTT is shared between frontend and
319 // backend. It is a page-table-like structure and the entries are
320 // indexed by GFN. The argument dtt_root represents the GPA of the
321 // root page and dtt_level represents the maximum page table level.
322 //
323 // #2. Calculate the entry address via the argument dtt_iter. dtt_iter
324 // stores an entry address and the associated gfn. If the target gfn is
325 // in the same page table page as the gfn in dtt_iter, then the target
326 // entry address can be calculated based on the entry address in
327 // dtt_iter.
328 //
329 // As the DTT entry is shared between frontend and backend, accesses to
330 // it must be atomic. So the returned value is converted to an AtomicU32
331 // pointer.
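// For example (illustrative numbers only): with the strides defined above, a
// 2-level DTT splits gfn 0x12345 into a level-2 index of
// (0x12345 >> 10) & 0x1ff = 0x48 (selecting a u64 non-leaf entry) and a
// level-1 index of 0x12345 & 0x3ff = 0x345 (selecting the u32 leaf entry).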
332 fn gfn_to_dtt_pte(
333 mem: &GuestMemory,
334 dtt_level: u64,
335 dtt_root: u64,
336 dtt_iter: &mut DTTIter,
337 gfn: u64,
338 ) -> Result<*const AtomicU32> {
339 let ptr = if dtt_iter.ptr.is_null()
340 || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
341 {
342 // Slow path to walk the DTT to get the pte entry
343 let mut level = dtt_level;
344 let mut pt_gpa = dtt_root;
345 let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;
346
347 while level != 1 {
348 let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
349 let parent_pt = mem
350 .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
351 .context(Error::GetDTTEntry)?;
352
353 if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
354 bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
355 }
356
357 pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
358 level -= 1;
359 }
360
361 let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;
362
363 mem.get_host_address(GuestAddress(pt_gpa + index))
364 .context(Error::GetDTTEntry)?
365 } else if gfn > dtt_iter.gfn {
366 // SAFETY:
367 // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
368 // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
369 // means the calculated ptr will point to the same page as dtt_iter.ptr
370 unsafe {
371 dtt_iter
372 .ptr
373 .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
374 }
375 } else {
376 // SAFETY:
377 // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
378 // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
379 // means the calculated ptr will point to the same page as dtt_iter.ptr
380 unsafe {
381 dtt_iter
382 .ptr
383 .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
384 }
385 };
386
387 dtt_iter.ptr = ptr;
388 dtt_iter.gfn = gfn;
389
390 Ok(ptr as *const AtomicU32)
391 }
392
393 fn pin_page(
394 pinstate: &mut CoIommuPinState,
395 policy: CoIommuUnpinPolicy,
396 vfio_container: &Arc<Mutex<VfioContainer>>,
397 mem: &GuestMemory,
398 dtt_level: u64,
399 dtt_root: u64,
400 dtt_iter: &mut DTTIter,
401 gfn: u64,
402 ) -> Result<()> {
403 let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;
404
405 let gpa = gfn << PAGE_SHIFT_4K;
406 let host_addr = mem
407 .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
408 .context("failed to get host address")? as u64;
409
410 // SAFETY:
411 // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
412 // Test PINNED flag
413 if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
414 info!("CoIommu: gfn 0x{:x} already pinned", gfn);
415 return Ok(());
416 }
417
418 // SAFETY:
419 // Safe because the gpa is validated by gfn_to_dtt_pte and the host_addr
420 // is guaranteed by the MemoryMapping interface.
421 if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
422 // SAFETY:
423 // Safe because ptr is valid and guaranteed by the gfn_to_dtt_pte.
424 // set PINNED flag
425 unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
426 if policy == CoIommuUnpinPolicy::Lru {
427 pinstate
428 .new_gen_pinned_pages
429 .push_back(PinnedPageInfo::new(gfn, 0));
430 }
431 }
432
433 Ok(())
434 }
435
436 #[derive(PartialEq, Debug, Eq)]
437 enum UnpinResult {
438 UnpinlistEmpty,
439 Unpinned,
440 NotPinned,
441 NotUnpinned,
442 FailedUnpin,
443 UnpinParked,
444 }
445
446 fn unpin_page(
447 pinstate: &mut CoIommuPinState,
448 vfio_container: &Arc<Mutex<VfioContainer>>,
449 mem: &GuestMemory,
450 dtt_level: u64,
451 dtt_root: u64,
452 dtt_iter: &mut DTTIter,
453 gfn: u64,
454 force: bool,
455 ) -> UnpinResult {
456 if pinstate.unpin_thread_state == UnpinThreadState::Parked {
457 return UnpinResult::UnpinParked;
458 }
459
460 let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
461 Ok(v) => v,
462 Err(_) => {
463 // The case force == true may try to unpin a page which is not
464 // mapped in the dtt. For such a page the pte doesn't exist yet,
465 // so there is no need to report an error.
466 // The case force == false is used by coiommu to periodically
467 // unpin pages which have been mapped in the dtt, so the pte for
468 // such a page does exist. However, due to unpin requests from the
469 // virtio balloon, such pages may already be unpinned and the DTT
470 // pages might have been reclaimed by the guest OS kernel as well,
471 // so it is also possible to end up here. Do not report an error.
472 return UnpinResult::NotPinned;
473 }
474 };
475
476 if force {
477 // SAFETY:
478 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
479 // This case is for the balloon to evict pages, so these pages should
480 // already have been locked by the balloon and no device driver in the VM
481 // is able to access them. Just clear the ACCESSED flag first to make
482 // sure the following unpin can succeed.
483 unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
484 }
485
486 // SAFETY:
487 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
488 if let Err(entry) = unsafe {
489 (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
490 } {
491 // The compare_exchange failed because the original leaf entry is
492 // not DTTE_PINNED_FLAG, so we cannot do the unpin.
493 if entry == 0 {
494 // The GFN is already unpinned. This is very similar to the
495 // gfn_to_dtt_pte error case, with the only difference being
496 // that the dtt_pte happens to be on a present page table.
497 UnpinResult::NotPinned
498 } else {
499 if !force {
500 // SAFETY:
501 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
502 // The ACCESSED_FLAG is set by the guest if the guest requires a DMA map
503 // for this page. It represents whether or not this page has been touched
504 // by the guest. By clearing this flag after unpin work, we can detect if
505 // this page has been touched by the guest in the next round of unpin
506 // work. If the ACCESSED_FLAG is set at the next round, unpinning this
507 // page will fail and we will be here again to clear this flag. If this
508 // flag is not set at the next round, unpinning this page will probably
509 // succeed.
510 unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
511 } else {
512 // If we're here, then the guest is trying to release a page via the
513 // balloon that it still has pinned. This most likely means that something
514 // is wrong in the guest kernel. Just leave the page pinned and log
515 // an error.
516 // This failure blocks the balloon from removing the page, which ensures
517 // that the guest's view of memory will remain consistent with the device
518 // DMA's view of memory. Also note that the host kernel maintains an
519 // elevated refcount for pinned pages, which is a second guarantee that
520 // pages accessible by device DMA won't be freed until after they are
521 // unpinned.
522 error!(
523 "CoIommu: force case cannot pin gfn 0x{:x} entry 0x{:x}",
524 gfn, entry
525 );
526 }
527 // The GFN cannot be unpinned, either because the unmap count
528 // is non-zero or because it has the ACCESSED flag set.
529 UnpinResult::NotUnpinned
530 }
531 } else {
532 // The compare_exchange succeeded: the original leaf entry was
533 // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the
534 // page.
535 let gpa = gfn << PAGE_SHIFT_4K;
536 if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
537 UnpinResult::Unpinned
538 } else {
539 // SAFETY:
540 // Safe because leaf_entry is valid and guaranteed by the gfn_to_dtt_pte.
541 // make sure the pinned flag is set
542 unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
543 // Need to put this gfn back into the pinned vector
544 UnpinResult::FailedUnpin
545 }
546 }
547 }
548
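// Worker servicing guest pin requests: it waits on the per-vcpu notify
// ioevents, reads the request word from the notifymap shared memory, pins the
// requested pages through VFIO and then clears the notifymap slot to signal
// completion to the guest.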
549 struct PinWorker {
550 mem: GuestMemory,
551 endpoints: Vec<u16>,
552 notifymap_mmap: Arc<MemoryMapping>,
553 dtt_level: u64,
554 dtt_root: u64,
555 ioevents: Vec<Event>,
556 vfio_container: Arc<Mutex<VfioContainer>>,
557 pinstate: Arc<Mutex<CoIommuPinState>>,
558 params: CoIommuParameters,
559 }
560
561 impl PinWorker {
562 fn debug_label(&self) -> &'static str {
563 "CoIommuPinWorker"
564 }
565
566 fn run(&mut self, kill_evt: Event) {
567 #[derive(EventToken)]
568 enum Token {
569 Kill,
570 Pin { index: usize },
571 }
572
573 let wait_ctx: WaitContext<Token> =
574 match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
575 Ok(pc) => pc,
576 Err(e) => {
577 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
578 return;
579 }
580 };
581
582 for (index, event) in self.ioevents.iter().enumerate() {
583 match wait_ctx.add(event, Token::Pin { index }) {
584 Ok(_) => {}
585 Err(e) => {
586 error!(
587 "{}: failed to add ioevent for index {}: {}",
588 self.debug_label(),
589 index,
590 e
591 );
592 return;
593 }
594 }
595 }
596
597 'wait: loop {
598 let events = match wait_ctx.wait() {
599 Ok(v) => v,
600 Err(e) => {
601 error!("{}: failed polling for events: {}", self.debug_label(), e);
602 break;
603 }
604 };
605
606 for event in events.iter().filter(|e| e.is_readable) {
607 match event.token {
608 Token::Kill => break 'wait,
609 Token::Pin { index } => {
610 let offset = index * mem::size_of::<u64>();
611 if let Some(event) = self.ioevents.get(index) {
612 if let Err(e) = event.wait() {
613 error!(
614 "{}: failed reading event {}: {}",
615 self.debug_label(),
616 index,
617 e
618 );
619 self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
620 break 'wait;
621 }
622 }
623 if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
624 if let Err(e) = self.pin_pages(data) {
625 error!("{}: {}", self.debug_label(), e);
626 }
627 }
628 fence(Ordering::SeqCst);
629 self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
630 }
631 }
632 }
633 }
634 }
635
636 fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
637 let pin_page_info = self
638 .mem
639 .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
640 .context("failed to get pin page info")?;
641
642 let bdf = pin_page_info.bdf;
643 ensure!(
644 self.endpoints.iter().any(|&x| x == bdf),
645 "pin page for unexpected bdf 0x{:x}",
646 bdf
647 );
648
649 let mut nr_pages = pin_page_info.nr_pages;
650 let mut offset = mem::size_of::<PinPageInfo>() as u64;
651 let mut dtt_iter: DTTIter = Default::default();
652 let mut pinstate = self.pinstate.lock();
653 while nr_pages > 0 {
654 let gfn = self
655 .mem
656 .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
657 .context("failed to get pin page gfn")?;
658
659 pin_page(
660 &mut pinstate,
661 self.params.unpin_policy,
662 &self.vfio_container,
663 &self.mem,
664 self.dtt_level,
665 self.dtt_root,
666 &mut dtt_iter,
667 gfn,
668 )?;
669
670 offset += mem::size_of::<u64>() as u64;
671 nr_pages -= 1;
672 }
673
674 Ok(())
675 }
676
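// A pin request is packed into a single u64: if bit 63 (PIN_PAGES_IN_BATCH)
// is set, the remaining bits are the GPA of a PinPageInfo structure followed
// by the gfns to pin; otherwise the low 16 bits are the endpoint BDF and the
// upper bits are a single gfn to pin.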
677 fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
678 if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
679 let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
680 self.pin_pages_in_batch(gpa)
681 } else {
682 let bdf = (gfn_bdf & 0xffff) as u16;
683 let gfn = gfn_bdf >> 16;
684 let mut dtt_iter: DTTIter = Default::default();
685 ensure!(
686 self.endpoints.iter().any(|&x| x == bdf),
687 "pin page for unexpected bdf 0x{:x}",
688 bdf
689 );
690
691 let mut pinstate = self.pinstate.lock();
692 pin_page(
693 &mut pinstate,
694 self.params.unpin_policy,
695 &self.vfio_container,
696 &self.mem,
697 self.dtt_level,
698 self.dtt_root,
699 &mut dtt_iter,
700 gfn,
701 )
702 }
703 }
704 }
705
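// Worker that unpins pages, either periodically through the LRU policy timer
// or on demand when an UnpinRequest arrives from the balloon over unpin_tube.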
706 struct UnpinWorker {
707 mem: GuestMemory,
708 dtt_level: u64,
709 dtt_root: u64,
710 vfio_container: Arc<Mutex<VfioContainer>>,
711 unpin_tube: Option<Tube>,
712 pinstate: Arc<Mutex<CoIommuPinState>>,
713 params: CoIommuParameters,
714 unpin_gen_threshold: u64,
715 }
716
717 impl UnpinWorker {
718 fn debug_label(&self) -> &'static str {
719 "CoIommuUnpinWorker"
720 }
721
722 fn run(&mut self, kill_evt: Event) {
723 #[derive(EventToken)]
724 enum Token {
725 UnpinTimer,
726 UnpinReq,
727 Kill,
728 }
729
730 let wait_ctx: WaitContext<Token> =
731 match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
732 Ok(pc) => pc,
733 Err(e) => {
734 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
735 return;
736 }
737 };
738
739 if let Some(tube) = &self.unpin_tube {
740 if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
741 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
742 return;
743 }
744 }
745
746 let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
747 && !self.params.unpin_interval.is_zero()
748 {
749 let duration = self.params.unpin_interval;
750 let interval = Some(self.params.unpin_interval);
751 let mut timer = match Timer::new() {
752 Ok(t) => t,
753 Err(e) => {
754 error!(
755 "{}: failed to create the unpin timer: {}",
756 self.debug_label(),
757 e
758 );
759 return;
760 }
761 };
762 if let Err(e) = timer.reset(duration, interval) {
763 error!(
764 "{}: failed to start the unpin timer: {}",
765 self.debug_label(),
766 e
767 );
768 return;
769 }
770 if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
771 error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
772 return;
773 }
774 Some(timer)
775 } else {
776 None
777 };
778
779 let unpin_tube = self.unpin_tube.take();
780 'wait: loop {
781 let events = match wait_ctx.wait() {
782 Ok(v) => v,
783 Err(e) => {
784 error!("{}: failed polling for events: {}", self.debug_label(), e);
785 break;
786 }
787 };
788
789 for event in events.iter().filter(|e| e.is_readable) {
790 match event.token {
791 Token::UnpinTimer => {
792 self.unpin_pages();
793 if let Some(timer) = &mut unpin_timer {
794 if let Err(e) = timer.mark_waited() {
795 error!(
796 "{}: failed to clear unpin timer: {}",
797 self.debug_label(),
798 e
799 );
800 break 'wait;
801 }
802 }
803 }
804 Token::UnpinReq => {
805 if let Some(tube) = &unpin_tube {
806 match tube.recv::<UnpinRequest>() {
807 Ok(req) => {
808 let mut unpin_done = true;
809 for range in req.ranges {
810 // Locking with respect to pin_pages isn't necessary
811 // for this case because the unpinned pages in the range
812 // should all be in the balloon and so nothing will attempt
813 // to pin them.
814 if !self.unpin_pages_in_range(range.0, range.1) {
815 unpin_done = false;
816 break;
817 }
818 }
819 let resp = if unpin_done {
820 UnpinResponse::Success
821 } else {
822 UnpinResponse::Failed
823 };
824 if let Err(e) = tube.send(&resp) {
825 error!(
826 "{}: failed to send unpin response {}",
827 self.debug_label(),
828 e
829 );
830 }
831 }
832 Err(e) => {
833 if let TubeError::Disconnected = e {
834 if let Err(e) = wait_ctx.delete(tube) {
835 error!(
836 "{}: failed to remove unpin_tube: {}",
837 self.debug_label(),
838 e
839 );
840 }
841 } else {
842 error!(
843 "{}: failed to recv Unpin Request: {}",
844 self.debug_label(),
845 e
846 );
847 }
848 }
849 }
850 }
851 }
852 Token::Kill => break 'wait,
853 }
854 }
855 }
856 self.unpin_tube = unpin_tube;
857 }
858
859 fn unpin_pages(&mut self) {
860 if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
861 self.lru_unpin_pages();
862 }
863 }
864
865 fn lru_unpin_page(
866 &mut self,
867 dtt_iter: &mut DTTIter,
868 new_gen: bool,
869 ) -> (UnpinResult, Option<PinnedPageInfo>) {
870 let mut pinstate = self.pinstate.lock();
871 let pageinfo = if new_gen {
872 pinstate.new_gen_pinned_pages.pop_front()
873 } else {
874 pinstate
875 .old_gen_pinned_pages
876 .pop_front()
877 .map(|gfn| PinnedPageInfo::new(gfn, 0))
878 };
879
880 pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
881 (
882 unpin_page(
883 &mut pinstate,
884 &self.vfio_container,
885 &self.mem,
886 self.dtt_level,
887 self.dtt_root,
888 dtt_iter,
889 pageinfo.gfn,
890 false,
891 ),
892 Some(pageinfo),
893 )
894 })
895 }
896
897 fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
898 let mut not_unpinned_new_gen_pages = VecDeque::new();
899 let mut not_unpinned_old_gen_pages = VecDeque::new();
900 let mut unpinned_count = 0;
901 let has_limit = unpin_limit.is_some();
902 let limit_count = unpin_limit.unwrap_or(0);
903 let mut dtt_iter: DTTIter = Default::default();
904
905 // If has_limit is true but limit_count is 0, no unpinning will be done
906 while !has_limit || unpinned_count != limit_count {
907 let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
908 match result {
909 UnpinResult::UnpinlistEmpty => break,
910 UnpinResult::Unpinned => unpinned_count += 1,
911 UnpinResult::NotPinned => {}
912 UnpinResult::NotUnpinned => {
913 if let Some(mut page) = pinned_page {
914 if self.params.unpin_gen_threshold != 0 {
915 page.unpin_busy_cnt += 1;
916 // The page came from the new_gen queue but was not
917 // successfully unpinned, so check the unpin_gen
918 // threshold: if it has been reached, move the page
919 // to the old_gen queue.
920 // If the page did not come from new_gen, put it
921 // directly back into the old_gen queue.
922 if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
923 not_unpinned_old_gen_pages.push_back(page.gfn);
924 } else {
925 not_unpinned_new_gen_pages.push_back(page);
926 }
927 }
928 }
929 }
930 UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
931 // Although UnpinParked means we didn't actually try to unpin
932 // gfn, it's not worth handling it specifically since parking is
933 // expected to be relatively rare.
934 if let Some(page) = pinned_page {
935 if new_gen {
936 not_unpinned_new_gen_pages.push_back(page);
937 } else {
938 not_unpinned_old_gen_pages.push_back(page.gfn);
939 }
940 }
941 if result == UnpinResult::UnpinParked {
942 thread::park();
943 }
944 }
945 }
946 }
947
948 if !not_unpinned_new_gen_pages.is_empty() {
949 let mut pinstate = self.pinstate.lock();
950 pinstate
951 .new_gen_pinned_pages
952 .append(&mut not_unpinned_new_gen_pages);
953 }
954
955 if !not_unpinned_old_gen_pages.is_empty() {
956 let mut pinstate = self.pinstate.lock();
957 pinstate
958 .old_gen_pinned_pages
959 .append(&mut not_unpinned_old_gen_pages);
960 }
961
962 unpinned_count
963 }
964
965 fn lru_unpin_pages(&mut self) {
966 let mut unpin_count = 0;
967 if self.params.unpin_gen_threshold != 0 {
968 self.unpin_gen_threshold += 1;
969 if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
970 self.unpin_gen_threshold = 0;
971 // Try to unpin the inactive (old_gen) queue first if the threshold is reached
972 unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
973 }
974 }
975 // Unpin the new_gen queue with the updated unpin_limit after unpinning the old_gen queue
976 self.lru_unpin_pages_in_loop(
977 self.params
978 .unpin_limit
979 .map(|limit| limit.saturating_sub(unpin_count)),
980 true,
981 );
982 }
983
984 fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
985 let mut dtt_iter: DTTIter = Default::default();
986 let mut index = 0;
987 while index != count {
988 let mut pinstate = self.pinstate.lock();
989 let result = unpin_page(
990 &mut pinstate,
991 &self.vfio_container,
992 &self.mem,
993 self.dtt_level,
994 self.dtt_root,
995 &mut dtt_iter,
996 gfn + index,
997 true,
998 );
999 drop(pinstate);
1000
1001 match result {
1002 UnpinResult::Unpinned | UnpinResult::NotPinned => {}
1003 UnpinResult::UnpinParked => {
1004 thread::park();
1005 continue;
1006 }
1007 _ => {
1008 error!("coiommu: force unpin failed by {:?}", result);
1009 return false;
1010 }
1011 }
1012 index += 1;
1013 }
1014 true
1015 }
1016 }
1017
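// The CoIOMMU PCI device: BAR0 exposes the CoIommuReg registers plus the
// per-vcpu notify registers, BAR2 (notifymap) is shared memory used to
// complete pin requests, and BAR4 (topologymap) is shared memory listing the
// endpoint BDFs that sit behind CoIOMMU.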
1018 pub struct CoIommuDev {
1019 config_regs: PciConfiguration,
1020 pci_address: Option<PciAddress>,
1021 mem: GuestMemory,
1022 coiommu_reg: CoIommuReg,
1023 endpoints: Vec<u16>,
1024 notifymap_mem: SafeDescriptor,
1025 notifymap_mmap: Arc<MemoryMapping>,
1026 notifymap_addr: Option<u64>,
1027 topologymap_mem: SafeDescriptor,
1028 topologymap_addr: Option<u64>,
1029 mmapped: bool,
1030 vm_memory_client: VmMemoryClient,
1031 pin_thread: Option<WorkerThread<PinWorker>>,
1032 unpin_thread: Option<WorkerThread<UnpinWorker>>,
1033 unpin_tube: Option<Tube>,
1034 ioevents: Vec<Event>,
1035 vfio_container: Arc<Mutex<VfioContainer>>,
1036 pinstate: Arc<Mutex<CoIommuPinState>>,
1037 params: CoIommuParameters,
1038 }
1039
1040 impl CoIommuDev {
1041 pub fn new(
1042 mem: GuestMemory,
1043 vfio_container: Arc<Mutex<VfioContainer>>,
1044 vm_memory_client: VmMemoryClient,
1045 unpin_tube: Option<Tube>,
1046 endpoints: Vec<u16>,
1047 vcpu_count: u64,
1048 params: CoIommuParameters,
1049 ) -> Result<Self> {
1050 let config_regs = PciConfiguration::new(
1051 PCI_VENDOR_ID_COIOMMU,
1052 PCI_DEVICE_ID_COIOMMU,
1053 PciClassCode::Other,
1054 &PciOtherSubclass::Other,
1055 None, // No Programming interface.
1056 PciHeaderType::Device,
1057 PCI_VENDOR_ID_COIOMMU,
1058 PCI_DEVICE_ID_COIOMMU,
1059 COIOMMU_REVISION_ID,
1060 );
1061
1062 // notifymap_mem is used as BAR2 for the guest to check whether a request has been completed by CoIOMMU.
1063 let notifymap_mem = SharedMemory::new("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
1064 .context(Error::CreateSharedMemory)?;
1065 let notifymap_mmap = Arc::new(
1066 MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
1067 .from_shared_memory(¬ifymap_mem)
1068 .offset(0)
1069 .build()?,
1070 );
1071
1072 // topologymap_mem is used as BAR4 for the guest to check which devices sit on top of CoIOMMU.
1073 let topologymap_mem =
1074 SharedMemory::new("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
1075 .context(Error::CreateSharedMemory)?;
1076 let topologymap_mmap = Arc::new(
1077 MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
1078 .from_shared_memory(&topologymap_mem)
1079 .offset(0)
1080 .build()?,
1081 );
1082
1083 ensure!(
1084 (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
1085 "Coiommu: too many endpoints"
1086 );
1087 topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
1088 for (index, endpoint) in endpoints.iter().enumerate() {
1089 topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
1090 }
1091
1092 let mut ioevents = Vec::new();
1093 for _ in 0..vcpu_count {
1094 ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
1095 }
1096
1097 Ok(Self {
1098 config_regs,
1099 pci_address: None,
1100 mem,
1101 coiommu_reg: Default::default(),
1102 endpoints,
1103 notifymap_mem: notifymap_mem.into(),
1104 notifymap_mmap,
1105 notifymap_addr: None,
1106 topologymap_mem: topologymap_mem.into(),
1107 topologymap_addr: None,
1108 mmapped: false,
1109 vm_memory_client,
1110 pin_thread: None,
1111 unpin_thread: None,
1112 unpin_tube,
1113 ioevents,
1114 vfio_container,
1115 pinstate: Arc::new(Mutex::new(CoIommuPinState {
1116 new_gen_pinned_pages: VecDeque::new(),
1117 old_gen_pinned_pages: VecDeque::new(),
1118 unpin_thread_state: UnpinThreadState::Unparked,
1119 unpin_park_count: 0,
1120 })),
1121 params,
1122 })
1123 }
1124
1125 fn register_mmap(
1126 &self,
1127 descriptor: SafeDescriptor,
1128 size: usize,
1129 offset: u64,
1130 gpa: u64,
1131 prot: Protection,
1132 ) -> Result<()> {
1133 let _region = self
1134 .vm_memory_client
1135 .register_memory(
1136 VmMemorySource::Descriptor {
1137 descriptor,
1138 offset,
1139 size: size as u64,
1140 },
1141 VmMemoryDestination::GuestPhysicalAddress(gpa),
1142 prot,
1143 MemCacheType::CacheCoherent,
1144 )
1145 .context("register_mmap register_memory failed")?;
1146 Ok(())
1147 }
1148
1149 fn mmap(&mut self) {
1150 if self.mmapped {
1151 return;
1152 }
1153
1154 if let Some(gpa) = self.notifymap_addr {
1155 match self.register_mmap(
1156 self.notifymap_mem.try_clone().unwrap(),
1157 COIOMMU_NOTIFYMAP_SIZE,
1158 0,
1159 gpa,
1160 Protection::read_write(),
1161 ) {
1162 Ok(_) => {}
1163 Err(e) => {
1164 panic!("{}: map notifymap failed: {}", self.debug_label(), e);
1165 }
1166 }
1167 }
1168
1169 if let Some(gpa) = self.topologymap_addr {
1170 match self.register_mmap(
1171 self.topologymap_mem.try_clone().unwrap(),
1172 COIOMMU_TOPOLOGYMAP_SIZE,
1173 0,
1174 gpa,
1175 Protection::read(),
1176 ) {
1177 Ok(_) => {}
1178 Err(e) => {
1179 panic!("{}: map topologymap failed: {}", self.debug_label(), e);
1180 }
1181 }
1182 }
1183
1184 self.mmapped = true;
1185 }
1186
1187 fn start_workers(&mut self) {
1188 if self.pin_thread.is_none() {
1189 self.start_pin_thread();
1190 }
1191
1192 if self.unpin_thread.is_none() {
1193 self.start_unpin_thread();
1194 }
1195 }
1196
1197 fn start_pin_thread(&mut self) {
1198 let mem = self.mem.clone();
1199 let endpoints = self.endpoints.to_vec();
1200 let notifymap_mmap = self.notifymap_mmap.clone();
1201 let dtt_root = self.coiommu_reg.dtt_root;
1202 let dtt_level = self.coiommu_reg.dtt_level;
1203 let ioevents: Vec<Event> = self
1204 .ioevents
1205 .iter()
1206 .map(|e| e.try_clone().unwrap())
1207 .collect();
1208
1209 let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR);
1210 let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
1211 for (i, evt) in self.ioevents.iter().enumerate() {
1212 self.vm_memory_client
1213 .register_io_event(
1214 evt.try_clone().expect("failed to clone event"),
1215 notify_base + i as u64,
1216 Datamatch::AnyLength,
1217 )
1218 .expect("failed to register ioevent");
1219 }
1220
1221 let vfio_container = self.vfio_container.clone();
1222 let pinstate = self.pinstate.clone();
1223 let params = self.params;
1224
1225 self.pin_thread = Some(WorkerThread::start("coiommu_pin", move |kill_evt| {
1226 let mut worker = PinWorker {
1227 mem,
1228 endpoints,
1229 notifymap_mmap,
1230 dtt_root,
1231 dtt_level,
1232 ioevents,
1233 vfio_container,
1234 pinstate,
1235 params,
1236 };
1237 worker.run(kill_evt);
1238 worker
1239 }));
1240 }
1241
1242 fn start_unpin_thread(&mut self) {
1243 let mem = self.mem.clone();
1244 let dtt_root = self.coiommu_reg.dtt_root;
1245 let dtt_level = self.coiommu_reg.dtt_level;
1246 let vfio_container = self.vfio_container.clone();
1247 let unpin_tube = self.unpin_tube.take();
1248 let pinstate = self.pinstate.clone();
1249 let params = self.params;
1250 self.unpin_thread = Some(WorkerThread::start("coiommu_unpin", move |kill_evt| {
1251 let mut worker = UnpinWorker {
1252 mem,
1253 dtt_level,
1254 dtt_root,
1255 vfio_container,
1256 unpin_tube,
1257 pinstate,
1258 params,
1259 unpin_gen_threshold: 0,
1260 };
1261 worker.run(kill_evt);
1262 worker
1263 }));
1264 }
1265
1266 fn allocate_bar_address(
1267 &mut self,
1268 resources: &mut SystemAllocator,
1269 address: PciAddress,
1270 size: u64,
1271 bar_num: u8,
1272 name: &str,
1273 ) -> PciResult<u64> {
1274 let addr = resources
1275 .allocate_mmio(
1276 size,
1277 Alloc::PciBar {
1278 bus: address.bus,
1279 dev: address.dev,
1280 func: address.func,
1281 bar: bar_num,
1282 },
1283 name.to_string(),
1284 AllocOptions::new().prefetchable(true).align(size),
1285 )
1286 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;
1287
1288 let bar = PciBarConfiguration::new(
1289 bar_num as usize,
1290 size,
1291 PciBarRegionType::Memory64BitRegion,
1292 PciBarPrefetchable::Prefetchable,
1293 )
1294 .set_address(addr);
1295
1296 self.config_regs
1297 .add_pci_bar(bar)
1298 .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;
1299
1300 Ok(addr)
1301 }
1302
1303 fn read_mmio(&mut self, offset: u64, data: &mut [u8]) {
1304 if offset >= mem::size_of::<CoIommuReg>() as u64 {
1305 error!(
1306 "{}: read_mmio: invalid offset 0x{:x}",
1307 self.debug_label(),
1308 offset
1309 );
1310 return;
1311 }
1312
1313 // Sanity check: accesses must be 64-bit aligned
1314 if offset % 8 != 0 || data.len() != 8 {
1315 error!(
1316 "{}: read_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1317 self.debug_label(),
1318 offset,
1319 data.len()
1320 );
1321 return;
1322 }
1323
1324 let v = match offset / 8 {
1325 0 => self.coiommu_reg.dtt_root,
1326 1 => self.coiommu_reg.cmd,
1327 2 => self.coiommu_reg.dtt_level,
1328 _ => return,
1329 };
1330
1331 data.copy_from_slice(&v.to_ne_bytes());
1332 }
1333
1334 fn write_mmio(&mut self, offset: u64, data: &[u8]) {
1335 let mmio_len = mem::size_of::<CoIommuReg>() as u64;
1336 if offset >= mmio_len {
1337 if data.len() != 1 {
1338 error!(
1339 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 1",
1340 self.debug_label(),
1341 offset,
1342 data.len()
1343 );
1344 return;
1345 }
1346
1347 // Usually we will not get here, since this range is the per-vcpu notify
1348 // registers, which are monitored by the ioevents. Notify registers
1349 // that are not covered by the ioevents are not used by the frontend
1350 // driver. In case the frontend driver does get here, handle it simply
1351 // to make sure the frontend driver will not be blocked, and log
1352 // an error.
1353 let index = (offset - mmio_len) as usize;
1354 if let Some(event) = self.ioevents.get(index) {
1355 let _ = event.signal();
1356 } else {
1357 self.notifymap_mmap
1358 .write_obj::<u64>(0, index * mem::size_of::<u64>())
1359 .unwrap();
1360 error!(
1361 "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
1362 self.debug_label(),
1363 offset
1364 );
1365 }
1366 return;
1367 }
1368
1369 // Sanity check: CoIommuReg accesses must be 64-bit aligned
1370 if offset % 8 != 0 || data.len() != 8 {
1371 error!(
1372 "{}: write_mmio: unaligned accessing: offset 0x{:x} actual len {} expect len 8",
1373 self.debug_label(),
1374 offset,
1375 data.len()
1376 );
1377 return;
1378 }
1379
1380 let index = offset / 8;
1381 let v = u64::from_ne_bytes(data.try_into().unwrap());
1382 match index {
1383 0 => {
1384 if self.coiommu_reg.dtt_root == 0 {
1385 self.coiommu_reg.dtt_root = v;
1386 }
1387 }
1388 1 => match v {
1389 // Deactivate can happen if the frontend driver in the guest
1390 // fails during probing or if the CoIommu device is removed
1391 // by the guest. Neither of these cases is expected, and if
1392 // either happens the guest will be non-functional, because the
1393 // pass-through devices which rely on CoIommu will not work.
1394 // So just fail hard and panic.
1395 COIOMMU_CMD_DEACTIVATE => {
1396 panic!("{}: Deactivate is not supported", self.debug_label())
1397 }
1398 COIOMMU_CMD_ACTIVATE => {
1399 if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
1400 self.start_workers();
1401 }
1402 }
1403 COIOMMU_CMD_PARK_UNPIN => {
1404 let mut pinstate = self.pinstate.lock();
1405 pinstate.unpin_thread_state = UnpinThreadState::Parked;
1406 if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
1407 pinstate.unpin_park_count = v;
1408 } else {
1409 panic!("{}: Park request overflowing", self.debug_label());
1410 }
1411 }
1412 COIOMMU_CMD_UNPARK_UNPIN => {
1413 let mut pinstate = self.pinstate.lock();
1414 if pinstate.unpin_thread_state == UnpinThreadState::Parked {
1415 if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
1416 pinstate.unpin_park_count = v;
1417 if pinstate.unpin_park_count == 0 {
1418 if let Some(worker_thread) = &self.unpin_thread {
1419 worker_thread.thread().unpark();
1420 }
1421 pinstate.unpin_thread_state = UnpinThreadState::Unparked;
1422 }
1423 } else {
1424 error!("{}: Park count is already reached to 0", self.debug_label());
1425 }
1426 }
1427 }
1428 _ => {}
1429 },
1430 2 => {
1431 if self.coiommu_reg.dtt_level == 0 {
1432 self.coiommu_reg.dtt_level = v;
1433 }
1434 }
1435 _ => {}
1436 }
1437 }
1438 }
1439
1440 impl PciDevice for CoIommuDev {
1441 fn debug_label(&self) -> String {
1442 "CoIommu".to_owned()
1443 }
1444
1445 fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
1446 if self.pci_address.is_none() {
1447 self.pci_address = match resources.allocate_pci(0, self.debug_label()) {
1448 Some(Alloc::PciBar {
1449 bus,
1450 dev,
1451 func,
1452 bar: _,
1453 }) => Some(PciAddress { bus, dev, func }),
1454 _ => None,
1455 }
1456 }
1457 self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
1458 }
1459
1460 fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
1461 let address = self
1462 .pci_address
1463 .expect("allocate_address must be called prior to allocate_io_bars");
1464
1465 // Allocate one bar for the structures pointed to by the capability structures.
1466 let mut ranges: Vec<BarRange> = Vec::new();
1467
1468 let mmio_addr = self.allocate_bar_address(
1469 resources,
1470 address,
1471 COIOMMU_MMIO_BAR_SIZE,
1472 COIOMMU_MMIO_BAR as u8,
1473 "coiommu-mmiobar",
1474 )?;
1475
1476 ranges.push(BarRange {
1477 addr: mmio_addr,
1478 size: COIOMMU_MMIO_BAR_SIZE,
1479 prefetchable: false,
1480 });
1481
1482 Ok(ranges)
1483 }
1484
1485 fn allocate_device_bars(
1486 &mut self,
1487 resources: &mut SystemAllocator,
1488 ) -> PciResult<Vec<BarRange>> {
1489 let address = self
1490 .pci_address
1491 .expect("allocate_address must be called prior to allocate_device_bars");
1492
1493 let mut ranges: Vec<BarRange> = Vec::new();
1494
1495 let topologymap_addr = self.allocate_bar_address(
1496 resources,
1497 address,
1498 COIOMMU_TOPOLOGYMAP_SIZE as u64,
1499 COIOMMU_TOPOLOGYMAP_BAR,
1500 "coiommu-topology",
1501 )?;
1502 self.topologymap_addr = Some(topologymap_addr);
1503 ranges.push(BarRange {
1504 addr: topologymap_addr,
1505 size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
1506 prefetchable: false,
1507 });
1508
1509 let notifymap_addr = self.allocate_bar_address(
1510 resources,
1511 address,
1512 COIOMMU_NOTIFYMAP_SIZE as u64,
1513 COIOMMU_NOTIFYMAP_BAR as u8,
1514 "coiommu-notifymap",
1515 )?;
1516 self.notifymap_addr = Some(notifymap_addr);
1517 ranges.push(BarRange {
1518 addr: notifymap_addr,
1519 size: COIOMMU_NOTIFYMAP_SIZE as u64,
1520 prefetchable: false,
1521 });
1522
1523 Ok(ranges)
1524 }
1525
1526 fn read_config_register(&self, reg_idx: usize) -> u32 {
1527 self.config_regs.read_reg(reg_idx)
1528 }
1529
1530 fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
1531 if reg_idx == COMMAND_REG
1532 && data.len() == 2
1533 && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
1534 && !self.mmapped
1535 {
1536 self.mmap();
1537 }
1538
1539 self.config_regs.write_reg(reg_idx, offset, data);
1540 }
1541
1542 fn keep_rds(&self) -> Vec<RawDescriptor> {
1543 let mut rds = vec![
1544 self.vfio_container.lock().as_raw_descriptor(),
1545 self.vm_memory_client.as_raw_descriptor(),
1546 self.notifymap_mem.as_raw_descriptor(),
1547 self.topologymap_mem.as_raw_descriptor(),
1548 ];
1549 if let Some(unpin_tube) = &self.unpin_tube {
1550 rds.push(unpin_tube.as_raw_descriptor());
1551 }
1552 rds.extend(self.ioevents.iter().map(Event::as_raw_descriptor));
1553 rds
1554 }
1555
1556 fn read_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &mut [u8]) {
1557 match bar_index {
1558 COIOMMU_MMIO_BAR => self.read_mmio(offset, data),
1559 COIOMMU_NOTIFYMAP_BAR => {
1560 // With the coiommu device activated, accesses to the notifymap bar
1561 // won't cause a vmexit. If we get here, the coiommu device is
1562 // deactivated and will not do the pin/unpin work, so there is no
1563 // need to handle this notifymap read.
1564 }
1565 _ => {}
1566 }
1567 }
1568
1569 fn write_bar(&mut self, bar_index: PciBarIndex, offset: u64, data: &[u8]) {
1570 match bar_index {
1571 COIOMMU_MMIO_BAR => self.write_mmio(offset, data),
1572 COIOMMU_NOTIFYMAP_BAR => {
1573 // With the coiommu device activated, accesses to the notifymap bar
1574 // won't cause a vmexit. If we get here, the coiommu device is
1575 // deactivated and will not do the pin/unpin work, so there is no
1576 // need to handle this notifymap write.
1577 }
1578 _ => {}
1579 }
1580 }
1581
1582 fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
1583 self.config_regs.get_bar_configuration(bar_num)
1584 }
1585 }
1586
1587 impl Suspendable for CoIommuDev {}
1588
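// The tests below are an illustrative sketch, not part of the original device
// logic: they only exercise the pure helpers in this file, use example gfn
// values and an example key=value string, and assume the
// serde_keyvalue::from_key_values helper for parsing.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn level_to_offset_splits_gfn_by_level() {
        // The leaf level uses the low 10 bits of the gfn.
        assert_eq!(level_to_offset(0x12345, 1).unwrap(), 0x345);
        // Level 2 uses the next 9 bits.
        assert_eq!(level_to_offset(0x12345, 2).unwrap(), 0x48);
        // Level 0 is invalid.
        assert!(level_to_offset(0x12345, 0).is_err());
    }

    #[test]
    fn coiommu_parameters_from_key_values() {
        // Example configuration string; unpin_interval is in seconds.
        let params: CoIommuParameters =
            serde_keyvalue::from_key_values("unpin_policy=lru,unpin_interval=120,unpin_limit=256")
                .unwrap();
        assert_eq!(params.unpin_policy, CoIommuUnpinPolicy::Lru);
        assert_eq!(params.unpin_interval, Duration::from_secs(120));
        assert_eq!(params.unpin_limit, Some(256));
        // Unspecified fields keep their defaults.
        assert_eq!(params.unpin_gen_threshold, UNPIN_GEN_DEFAULT_THRES);
    }
}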