// Copyright 2022 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! This is the CoIOMMU backend implementation. CoIOMMU is a virtual device
//! which provides fine-grained pinning for VFIO pci-passthrough devices,
//! so that the hypervisor doesn't need to pin the entire VM's memory,
//! improving memory utilization. CoIOMMU doesn't provide intra-guest
//! protection, so it can only be used for TRUSTED passthrough devices.
//!
//! CoIOMMU was presented at KVM Forum 2020:
//! https://kvmforum2020.sched.com/event/eE2z/a-virtual-iommu-with-cooperative
//! -dma-buffer-tracking-yu-zhang-intel
//!
//! It was also presented at USENIX ATC '20:
//! https://www.usenix.org/conference/atc20/presentation/tian

use std::collections::VecDeque;
use std::convert::TryInto;
use std::default::Default;
use std::panic;
use std::str::FromStr;
use std::sync::atomic::{fence, AtomicU32, Ordering};
use std::sync::Arc;
use std::time::Duration;
use std::{fmt, mem, thread};

use anyhow::{anyhow, bail, ensure, Context, Result};
use base::{
    error, info, AsRawDescriptor, Event, MemoryMapping, MemoryMappingBuilder, PollToken,
    RawDescriptor, SafeDescriptor, SharedMemory, Timer, Tube, TubeError, WaitContext,
};
use data_model::DataInit;
use hypervisor::Datamatch;
use resources::{Alloc, MmioType, SystemAllocator};
use serde::{Deserialize, Serialize};
use sync::Mutex;
use thiserror::Error as ThisError;

use vm_control::{VmMemoryDestination, VmMemoryRequest, VmMemoryResponse, VmMemorySource};
use vm_memory::{GuestAddress, GuestMemory};

use crate::pci::pci_configuration::{
    PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciClassCode, PciConfiguration,
    PciHeaderType, PciOtherSubclass, COMMAND_REG, COMMAND_REG_MEMORY_SPACE_MASK,
};
use crate::pci::pci_device::{BarRange, PciDevice, Result as PciResult};
use crate::pci::{PciAddress, PciDeviceError};
use crate::vfio::VfioContainer;
use crate::{UnpinRequest, UnpinResponse};

const PCI_VENDOR_ID_COIOMMU: u16 = 0x1234;
const PCI_DEVICE_ID_COIOMMU: u16 = 0xabcd;
const COIOMMU_CMD_DEACTIVATE: u64 = 0;
const COIOMMU_CMD_ACTIVATE: u64 = 1;
const COIOMMU_CMD_PARK_UNPIN: u64 = 2;
const COIOMMU_CMD_UNPARK_UNPIN: u64 = 3;
const COIOMMU_REVISION_ID: u8 = 0x10;
const COIOMMU_MMIO_BAR: u8 = 0;
const COIOMMU_MMIO_BAR_SIZE: u64 = 0x2000;
const COIOMMU_NOTIFYMAP_BAR: u8 = 2;
const COIOMMU_NOTIFYMAP_SIZE: usize = 0x2000;
const COIOMMU_TOPOLOGYMAP_BAR: u8 = 4;
const COIOMMU_TOPOLOGYMAP_SIZE: usize = 0x2000;
const PAGE_SIZE_4K: u64 = 4096;
const PAGE_SHIFT_4K: u64 = 12;
const PIN_PAGES_IN_BATCH: u64 = 1 << 63;

const DTTE_PINNED_FLAG: u32 = 1 << 31;
const DTTE_ACCESSED_FLAG: u32 = 1 << 30;
const DTT_ENTRY_PRESENT: u64 = 1;
const DTT_ENTRY_PFN_SHIFT: u64 = 12;
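
// Note on the leaf entry layout (inferred from how the flags above are used
// in pin_page()/unpin_page() below): a leaf DTT entry is a 32-bit word whose
// top two bits are the PINNED and ACCESSED flags, and whose remaining low
// bits appear to hold a guest-maintained DMA map count, so a value of exactly
// DTTE_PINNED_FLAG means "pinned, unmapped, and not recently accessed".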

#[derive(ThisError, Debug)]
enum Error {
    #[error("CoIommu failed to create shared memory")]
    CreateSharedMemory,
    #[error("Failed to get DTT entry")]
    GetDTTEntry,
    #[error("Tube error")]
    TubeError,
}

// The default unpin interval is 60s.
const UNPIN_DEFAULT_INTERVAL: Duration = Duration::from_secs(60);
const UNPIN_GEN_DEFAULT_THRES: u64 = 10;
/// Holds the coiommu unpin policy
#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)]
pub enum CoIommuUnpinPolicy {
    Off,
    Lru,
}

impl FromStr for CoIommuUnpinPolicy {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        match s {
            "off" => Ok(CoIommuUnpinPolicy::Off),
            "lru" => Ok(CoIommuUnpinPolicy::Lru),
            _ => Err(anyhow!("CoIommu has no such unpin policy: {}", s)),
        }
    }
}

impl fmt::Display for CoIommuUnpinPolicy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::CoIommuUnpinPolicy::*;

        match self {
            Off => write!(f, "off"),
            Lru => write!(f, "lru"),
        }
    }
}

/// Holds the parameters for a coiommu device
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct CoIommuParameters {
    pub unpin_policy: CoIommuUnpinPolicy,
    pub unpin_interval: Duration,
    pub unpin_limit: Option<u64>,
    // Number of unpin intervals a pinned page must be busy for to be aged into the
    // older, less frequently checked generation.
    pub unpin_gen_threshold: u64,
}

impl Default for CoIommuParameters {
    fn default() -> Self {
        Self {
            unpin_policy: CoIommuUnpinPolicy::Off,
            unpin_interval: UNPIN_DEFAULT_INTERVAL,
            unpin_limit: None,
            unpin_gen_threshold: UNPIN_GEN_DEFAULT_THRES,
        }
    }
}
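
// For illustration only: a hypothetical LRU configuration could look like
// this (the values below are made up for the example, not recommended
// defaults):
//
//     let params = CoIommuParameters {
//         unpin_policy: CoIommuUnpinPolicy::Lru,
//         unpin_interval: Duration::from_secs(30),
//         unpin_limit: Some(1024),
//         unpin_gen_threshold: 10,
//     };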
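// MMIO register block shared with the guest driver. These three u64 registers
// back the first 24 bytes of the MMIO BAR in this order (dtt_root at offset
// 0x0, cmd at 0x8, dtt_level at 0x10), matching the offset / 8 dispatch in
// read_mmio() and write_mmio() below.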
#[derive(Default, Debug, Copy, Clone)]
struct CoIommuReg {
    dtt_root: u64,
    cmd: u64,
    dtt_level: u64,
}

#[derive(Default, Debug, Copy, Clone, PartialEq)]
struct PinnedPageInfo {
    gfn: u64,
    unpin_busy_cnt: u64,
}

impl PinnedPageInfo {
    fn new(gfn: u64, unpin_busy_cnt: u64) -> Self {
        PinnedPageInfo {
            gfn,
            unpin_busy_cnt,
        }
    }
}

#[derive(PartialEq, Debug)]
enum UnpinThreadState {
    Unparked,
    Parked,
}

struct CoIommuPinState {
    new_gen_pinned_pages: VecDeque<PinnedPageInfo>,
    old_gen_pinned_pages: VecDeque<u64>,
    unpin_thread_state: UnpinThreadState,
    unpin_park_count: u64,
}

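// Safety: the caller must ensure that `user_addr` points to a valid host
// mapping covering `size` bytes that stays alive for the lifetime of the DMA
// mapping. The call sites below obtain it from
// GuestMemory::get_host_address_range().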
unsafe fn vfio_map(
    vfio_container: &Arc<Mutex<VfioContainer>>,
    iova: u64,
    size: u64,
    user_addr: u64,
) -> bool {
    match vfio_container
        .lock()
        .vfio_dma_map(iova, size, user_addr, true)
    {
        Ok(_) => true,
        Err(e) => {
            if let Some(errno) = std::io::Error::last_os_error().raw_os_error() {
                if errno == libc::EEXIST {
                    // Already pinned. Set the PINNED flag.
                    error!("CoIommu: iova 0x{:x} already pinned", iova);
                    return true;
                }
            }
            error!("CoIommu: failed to map iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

fn vfio_unmap(vfio_container: &Arc<Mutex<VfioContainer>>, iova: u64, size: u64) -> bool {
    match vfio_container.lock().vfio_dma_unmap(iova, size) {
        Ok(_) => true,
        Err(e) => {
            error!("CoIommu: failed to unmap iova 0x{:x}: {}", iova, e);
            false
        }
    }
}

#[derive(Default, Debug, Copy, Clone)]
#[repr(C)]
struct PinPageInfo {
    bdf: u16,
    pad: [u16; 3],
    nr_pages: u64,
}
// Safe because the PinPageInfo structure is raw data
unsafe impl DataInit for PinPageInfo {}

const COIOMMU_UPPER_LEVEL_STRIDE: u64 = 9;
const COIOMMU_UPPER_LEVEL_MASK: u64 = (1 << COIOMMU_UPPER_LEVEL_STRIDE) - 1;
const COIOMMU_PT_LEVEL_STRIDE: u64 = 10;
const COIOMMU_PT_LEVEL_MASK: u64 = (1 << COIOMMU_PT_LEVEL_STRIDE) - 1;

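// Returns the index of the DTT entry for `gfn` within its page-table page at
// the given level. Worked example with the strides above: level 1 uses gfn
// bits [0, 10) to index 1024 4-byte leaf entries in a 4K page, level 2 uses
// bits [10, 19), and level 3 uses bits [19, 28), each indexing 512 8-byte
// non-leaf entries in a 4K page.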
fn level_to_offset(gfn: u64, level: u64) -> Result<u64> {
    if level == 1 {
        return Ok(gfn & COIOMMU_PT_LEVEL_MASK);
    }

    if level == 0 {
        bail!("Invalid level for gfn 0x{:x}", gfn);
    }

    let offset = COIOMMU_PT_LEVEL_STRIDE + (level - 2) * COIOMMU_UPPER_LEVEL_STRIDE;

    Ok((gfn >> offset) & COIOMMU_UPPER_LEVEL_MASK)
}

struct DTTIter {
    ptr: *const u8,
    gfn: u64,
}

impl Default for DTTIter {
    fn default() -> Self {
        DTTIter {
            ptr: std::ptr::null(),
            gfn: 0,
        }
    }
}

// Get a DMA Tracking Table (DTT) entry associated with the gfn.
//
// There are two ways to get the entry:
// #1. Walk the DTT by the GFN to find the corresponding entry. The DTT is
//     shared between the frontend and the backend. It is a page-table-like
//     structure whose entries are indexed by GFN. The argument dtt_root is
//     the gpa of the root page table page and dtt_level is the maximum
//     page table level.
//
// #2. Calculate the entry address from the argument dtt_iter. dtt_iter
//     stores an entry address and the associated gfn. If the target gfn
//     lies in the same page table page as the gfn in dtt_iter, then the
//     target entry address can be calculated from the entry address in
//     dtt_iter.
//
// As the DTT entries are shared between the frontend and the backend,
// accesses must be atomic, so the returned value is converted to an
// AtomicU32 pointer.
fn gfn_to_dtt_pte(
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<*const AtomicU32> {
    let ptr = if dtt_iter.ptr.is_null()
        || dtt_iter.gfn >> COIOMMU_PT_LEVEL_STRIDE != gfn >> COIOMMU_PT_LEVEL_STRIDE
    {
        // Slow path to walk the DTT to get the pte entry
        let mut level = dtt_level;
        let mut pt_gpa = dtt_root;
        let dtt_nonleaf_entry_size = mem::size_of::<u64>() as u64;

        while level != 1 {
            let index = level_to_offset(gfn, level)? * dtt_nonleaf_entry_size;
            let parent_pt = mem
                .read_obj_from_addr::<u64>(GuestAddress(pt_gpa + index))
                .context(Error::GetDTTEntry)?;

            if (parent_pt & DTT_ENTRY_PRESENT) == 0 {
                bail!("DTT absent at level {} for gfn 0x{:x}", level, gfn);
            }

            pt_gpa = (parent_pt >> DTT_ENTRY_PFN_SHIFT) << PAGE_SHIFT_4K;
            level -= 1;
        }

        let index = level_to_offset(gfn, level)? * mem::size_of::<u32>() as u64;

        mem.get_host_address(GuestAddress(pt_gpa + index))
            .context(Error::GetDTTEntry)?
    } else {
        // Safe because we checked that dtt_iter.ptr is valid and that the dtt_pte
        // for gfn lies on the same dtt page as the dtt_pte for dtt_iter.gfn, which
        // means the calculated ptr will point to the same page as dtt_iter.ptr
        if gfn > dtt_iter.gfn {
            unsafe {
                dtt_iter
                    .ptr
                    .add(mem::size_of::<AtomicU32>() * (gfn - dtt_iter.gfn) as usize)
            }
        } else {
            unsafe {
                dtt_iter
                    .ptr
                    .sub(mem::size_of::<AtomicU32>() * (dtt_iter.gfn - gfn) as usize)
            }
        }
    };

    dtt_iter.ptr = ptr;
    dtt_iter.gfn = gfn;

    Ok(ptr as *const AtomicU32)
}

fn pin_page(
    pinstate: &mut CoIommuPinState,
    policy: CoIommuUnpinPolicy,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
) -> Result<()> {
    let leaf_entry = gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn)?;

    let gpa = (gfn << PAGE_SHIFT_4K) as u64;
    let host_addr = mem
        .get_host_address_range(GuestAddress(gpa), PAGE_SIZE_4K as usize)
        .context("failed to get host address")? as u64;

    // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
    // Test the PINNED flag.
    if (unsafe { (*leaf_entry).load(Ordering::Relaxed) } & DTTE_PINNED_FLAG) != 0 {
        info!("CoIommu: gfn 0x{:x} already pinned", gfn);
        return Ok(());
    }

    // Safe because the gpa is valid from gfn_to_dtt_pte and the host_addr
    // is guaranteed by the MemoryMapping interface.
    if unsafe { vfio_map(vfio_container, gpa, PAGE_SIZE_4K, host_addr) } {
        // Safe because the pointer is valid, as guaranteed by gfn_to_dtt_pte.
        // Set the PINNED flag.
        unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
        if policy == CoIommuUnpinPolicy::Lru {
            pinstate
                .new_gen_pinned_pages
                .push_back(PinnedPageInfo::new(gfn, 0));
        }
    }

    Ok(())
}
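// Outcome of a single unpin attempt; see unpin_page() below. NotUnpinned
// means the entry was busy (still DMA-mapped or recently accessed), while
// FailedUnpin means the vfio unmap itself failed and the gfn must stay in
// the pinned list with its PINNED flag restored.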
#[derive(PartialEq, Debug)]
enum UnpinResult {
    UnpinlistEmpty,
    Unpinned,
    NotPinned,
    NotUnpinned,
    FailedUnpin,
    UnpinParked,
}

fn unpin_page(
    pinstate: &mut CoIommuPinState,
    vfio_container: &Arc<Mutex<VfioContainer>>,
    mem: &GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    dtt_iter: &mut DTTIter,
    gfn: u64,
    force: bool,
) -> UnpinResult {
    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
        return UnpinResult::UnpinParked;
    }

    let leaf_entry = match gfn_to_dtt_pte(mem, dtt_level, dtt_root, dtt_iter, gfn) {
        Ok(v) => v,
        Err(_) => {
            // The force == true case may try to unpin a page which is not
            // mapped in the dtt. For such a page the pte doesn't exist yet,
            // so there is no need to log an error.
            // The force == false case is used by coiommu to periodically
            // unpin pages which have been mapped in the dtt, so the pte for
            // such a page does exist. However, with an unpin request from
            // virtio balloon, such pages can already be unpinned and the DTT
            // pages might have been reclaimed by the guest OS kernel as well,
            // so it is also possible to get here. Don't log an error either.
            return UnpinResult::NotPinned;
        }
    };

    if force {
        // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
        // This case is for the balloon to evict pages, so these pages should
        // already be locked by the balloon and no device driver in the VM is
        // able to access them. Just clear the ACCESSED flag first to make
        // sure the following unpin can succeed.
        unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
    }

    // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
    if let Err(entry) = unsafe {
        (*leaf_entry).compare_exchange(DTTE_PINNED_FLAG, 0, Ordering::SeqCst, Ordering::SeqCst)
    } {
        // The compare_exchange failed because the original leaf entry was
        // not exactly DTTE_PINNED_FLAG, so the unpin cannot be done.
        if entry == 0 {
            // The GFN is already unpinned. This is very similar to the
            // gfn_to_dtt_pte error case, with the only difference being
            // that the dtt_pte happens to be on a present page table.
            UnpinResult::NotPinned
        } else {
            if !force {
                // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
                // The ACCESSED flag is set by the guest if the guest requires a DMA
                // map for this page. It represents whether or not this page has been
                // touched by the guest. By clearing this flag after an unpin attempt,
                // we can detect whether the page has been touched by the guest by the
                // next round of unpin work. If the ACCESSED flag is set at the next
                // round, unpinning this page will fail and we will be here again to
                // clear the flag. If it is not set at the next round, unpinning this
                // page will probably succeed.
                unsafe { (*leaf_entry).fetch_and(!DTTE_ACCESSED_FLAG, Ordering::SeqCst) };
            } else {
                // If we're here, then the guest is trying to release a page via the
                // balloon that it still has pinned. This most likely means that
                // something is wrong in the guest kernel. Just leave the page pinned
                // and log an error.
                // This failure blocks the balloon from removing the page, which
                // ensures that the guest's view of memory will remain consistent with
                // device DMA's view of memory. Also note that the host kernel
                // maintains an elevated refcount for pinned pages, which is a second
                // guarantee that pages accessible by device DMA won't be freed until
                // after they are unpinned.
                error!(
                    "CoIommu: force case cannot unpin gfn 0x{:x} entry 0x{:x}",
                    gfn, entry
                );
            }
            // The GFN cannot be unpinned, either because its map count is
            // non-zero or because it has the ACCESSED flag set.
            UnpinResult::NotUnpinned
        }
    } else {
        // The compare_exchange succeeded: the original leaf entry was
        // DTTE_PINNED_FLAG and the new leaf entry is now 0. Unpin the
        // page.
        let gpa = (gfn << PAGE_SHIFT_4K) as u64;
        if vfio_unmap(vfio_container, gpa, PAGE_SIZE_4K) {
            UnpinResult::Unpinned
        } else {
            // Safe because leaf_entry is valid, as guaranteed by gfn_to_dtt_pte.
            // Make sure the PINNED flag is still set.
            unsafe { (*leaf_entry).fetch_or(DTTE_PINNED_FLAG, Ordering::SeqCst) };
            // This gfn needs to be put back into the pinned vector.
            UnpinResult::FailedUnpin
        }
    }
}
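// Worker thread that services pin requests from the guest. Each per-vCPU
// notify register is backed by an ioevent; when one fires, the worker reads
// the corresponding u64 slot in the notifymap, pins the requested page(s)
// via VFIO, and clears the slot to signal completion back to the guest.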
struct PinWorker {
    mem: GuestMemory,
    endpoints: Vec<u16>,
    notifymap_mmap: Arc<MemoryMapping>,
    dtt_level: u64,
    dtt_root: u64,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl PinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuPinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(PollToken)]
        enum Token {
            Kill,
            Pin { index: usize },
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        for (index, event) in self.ioevents.iter().enumerate() {
            match wait_ctx.add(event, Token::Pin { index }) {
                Ok(_) => {}
                Err(e) => {
                    error!(
                        "{}: failed to add ioevent for index {}: {}",
                        self.debug_label(),
                        index,
                        e
                    );
                    return;
                }
            }
        }

        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::Kill => break 'wait,
                    Token::Pin { index } => {
                        let offset = index * mem::size_of::<u64>() as usize;
                        if let Some(event) = self.ioevents.get(index) {
                            if let Err(e) = event.read() {
                                error!(
                                    "{}: failed reading event {}: {}",
                                    self.debug_label(),
                                    index,
                                    e
                                );
                                self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                                break 'wait;
                            }
                        }
                        if let Ok(data) = self.notifymap_mmap.read_obj::<u64>(offset) {
                            if let Err(e) = self.pin_pages(data) {
                                error!("{}: {}", self.debug_label(), e);
                            }
                        }
                        fence(Ordering::SeqCst);
                        self.notifymap_mmap.write_obj::<u64>(0, offset).unwrap();
                    }
                }
            }
        }
    }

    fn pin_pages_in_batch(&mut self, gpa: u64) -> Result<()> {
        let pin_page_info = self
            .mem
            .read_obj_from_addr::<PinPageInfo>(GuestAddress(gpa))
            .context("failed to get pin page info")?;

        let bdf = pin_page_info.bdf;
        ensure!(
            self.endpoints.iter().any(|&x| x == bdf),
            "pin page for unexpected bdf 0x{:x}",
            bdf
        );

        let mut nr_pages = pin_page_info.nr_pages;
        let mut offset = mem::size_of::<PinPageInfo>() as u64;
        let mut dtt_iter: DTTIter = Default::default();
        let mut pinstate = self.pinstate.lock();
        while nr_pages > 0 {
            let gfn = self
                .mem
                .read_obj_from_addr::<u64>(GuestAddress(gpa + offset))
                .context("failed to get pin page gfn")?;

            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )?;

            offset += mem::size_of::<u64>() as u64;
            nr_pages -= 1;
        }

        Ok(())
    }

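    // A notify value encodes either a single pin request (bit 63 clear: bits
    // [16, 63) hold the gfn and bits [0, 16) the endpoint bdf) or a batch
    // request (bit 63 set: the remaining bits hold the gpa of a PinPageInfo
    // header followed by the list of gfns to pin).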
    fn pin_pages(&mut self, gfn_bdf: u64) -> Result<()> {
        if gfn_bdf & PIN_PAGES_IN_BATCH != 0 {
            let gpa = gfn_bdf & !PIN_PAGES_IN_BATCH;
            self.pin_pages_in_batch(gpa)
        } else {
            let bdf = (gfn_bdf & 0xffff) as u16;
            let gfn = gfn_bdf >> 16;
            let mut dtt_iter: DTTIter = Default::default();
            ensure!(
                self.endpoints.iter().any(|&x| x == bdf),
                "pin page for unexpected bdf 0x{:x}",
                bdf
            );

            let mut pinstate = self.pinstate.lock();
            pin_page(
                &mut pinstate,
                self.params.unpin_policy,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn,
            )
        }
    }
}
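// Worker thread that unpins pages. It periodically ages and unpins pages on
// the pinned-page queues according to the LRU policy, and also services
// explicit unpin requests (e.g. from virtio-balloon) arriving over
// unpin_tube.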
struct UnpinWorker {
    mem: GuestMemory,
    dtt_level: u64,
    dtt_root: u64,
    vfio_container: Arc<Mutex<VfioContainer>>,
    unpin_tube: Option<Tube>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
    unpin_gen_threshold: u64,
}

impl UnpinWorker {
    fn debug_label(&self) -> &'static str {
        "CoIommuUnpinWorker"
    }

    fn run(&mut self, kill_evt: Event) {
        #[derive(PollToken)]
        enum Token {
            UnpinTimer,
            UnpinReq,
            Kill,
        }

        let wait_ctx: WaitContext<Token> =
            match WaitContext::build_with(&[(&kill_evt, Token::Kill)]) {
                Ok(pc) => pc,
                Err(e) => {
                    error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                    return;
                }
            };

        if let Some(tube) = &self.unpin_tube {
            if let Err(e) = wait_ctx.add(tube, Token::UnpinReq) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
        }

        let mut unpin_timer = if self.params.unpin_policy != CoIommuUnpinPolicy::Off
            && !self.params.unpin_interval.is_zero()
        {
            let duration = self.params.unpin_interval;
            let interval = Some(self.params.unpin_interval);
            let mut timer = match Timer::new() {
                Ok(t) => t,
                Err(e) => {
                    error!(
                        "{}: failed to create the unpin timer: {}",
                        self.debug_label(),
                        e
                    );
                    return;
                }
            };
            if let Err(e) = timer.reset(duration, interval) {
                error!(
                    "{}: failed to start the unpin timer: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
            if let Err(e) = wait_ctx.add(&timer, Token::UnpinTimer) {
                error!("{}: failed creating WaitContext: {}", self.debug_label(), e);
                return;
            }
            Some(timer)
        } else {
            None
        };

        let unpin_tube = self.unpin_tube.take();
        'wait: loop {
            let events = match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("{}: failed polling for events: {}", self.debug_label(), e);
                    break;
                }
            };

            for event in events.iter().filter(|e| e.is_readable) {
                match event.token {
                    Token::UnpinTimer => {
                        self.unpin_pages();
                        if let Some(timer) = &mut unpin_timer {
                            if let Err(e) = timer.wait() {
                                error!(
                                    "{}: failed to clear unpin timer: {}",
                                    self.debug_label(),
                                    e
                                );
                                break 'wait;
                            }
                        }
                    }
                    Token::UnpinReq => {
                        if let Some(tube) = &unpin_tube {
                            match tube.recv::<UnpinRequest>() {
                                Ok(req) => {
                                    let mut unpin_done = true;
                                    for range in req.ranges {
                                        // Locking with respect to pin_pages isn't necessary
                                        // for this case because the unpinned pages in the range
                                        // should all be in the balloon and so nothing will attempt
                                        // to pin them.
                                        if !self.unpin_pages_in_range(range.0, range.1) {
                                            unpin_done = false;
                                            break;
                                        }
                                    }
                                    let resp = if unpin_done {
                                        UnpinResponse::Success
                                    } else {
                                        UnpinResponse::Failed
                                    };
                                    if let Err(e) = tube.send(&resp) {
                                        error!(
                                            "{}: failed to send unpin response {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                                Err(e) => {
                                    if let TubeError::Disconnected = e {
                                        if let Err(e) = wait_ctx.delete(tube) {
                                            error!(
                                                "{}: failed to remove unpin_tube: {}",
                                                self.debug_label(),
                                                e
                                            );
                                        }
                                    } else {
                                        error!(
                                            "{}: failed to recv Unpin Request: {}",
                                            self.debug_label(),
                                            e
                                        );
                                    }
                                }
                            }
                        }
                    }
                    Token::Kill => break 'wait,
                }
            }
        }
        self.unpin_tube = unpin_tube;
    }

    fn unpin_pages(&mut self) {
        if self.params.unpin_policy == CoIommuUnpinPolicy::Lru {
            self.lru_unpin_pages();
        }
    }

    fn lru_unpin_page(
        &mut self,
        dtt_iter: &mut DTTIter,
        new_gen: bool,
    ) -> (UnpinResult, Option<PinnedPageInfo>) {
        let mut pinstate = self.pinstate.lock();
        let pageinfo = if new_gen {
            pinstate.new_gen_pinned_pages.pop_front()
        } else {
            pinstate
                .old_gen_pinned_pages
                .pop_front()
                .map(|gfn| PinnedPageInfo::new(gfn, 0))
        };

        pageinfo.map_or((UnpinResult::UnpinlistEmpty, None), |pageinfo| {
            (
                unpin_page(
                    &mut pinstate,
                    &self.vfio_container,
                    &self.mem,
                    self.dtt_level,
                    self.dtt_root,
                    dtt_iter,
                    pageinfo.gfn,
                    false,
                ),
                Some(pageinfo),
            )
        })
    }

    fn lru_unpin_pages_in_loop(&mut self, unpin_limit: Option<u64>, new_gen: bool) -> u64 {
        let mut not_unpinned_new_gen_pages = VecDeque::new();
        let mut not_unpinned_old_gen_pages = VecDeque::new();
        let mut unpinned_count = 0;
        let has_limit = unpin_limit.is_some();
        let limit_count = unpin_limit.unwrap_or(0);
        let mut dtt_iter: DTTIter = Default::default();

        // If has_limit is true but limit_count is 0, no unpinning is done.
        while !has_limit || unpinned_count != limit_count {
            let (result, pinned_page) = self.lru_unpin_page(&mut dtt_iter, new_gen);
            match result {
                UnpinResult::UnpinlistEmpty => break,
                UnpinResult::Unpinned => unpinned_count += 1,
                UnpinResult::NotPinned => {}
                UnpinResult::NotUnpinned => {
                    if let Some(mut page) = pinned_page {
                        if self.params.unpin_gen_threshold != 0 {
                            page.unpin_busy_cnt += 1;
                            // The page came off the new_gen queue but was not
                            // successfully unpinned, so check it against the
                            // unpin_gen threshold and move it to the old_gen
                            // queue if the threshold is reached. A page that
                            // didn't come from new_gen goes directly back to
                            // the old_gen queue.
                            if !new_gen || page.unpin_busy_cnt >= self.params.unpin_gen_threshold {
                                not_unpinned_old_gen_pages.push_back(page.gfn);
                            } else {
                                not_unpinned_new_gen_pages.push_back(page);
                            }
                        }
                    }
                }
                UnpinResult::FailedUnpin | UnpinResult::UnpinParked => {
                    // Although UnpinParked means we didn't actually try to unpin
                    // the gfn, it's not worth handling specifically since parking
                    // is expected to be relatively rare.
                    if let Some(page) = pinned_page {
                        if new_gen {
                            not_unpinned_new_gen_pages.push_back(page);
                        } else {
                            not_unpinned_old_gen_pages.push_back(page.gfn);
                        }
                    }
                    if result == UnpinResult::UnpinParked {
                        thread::park();
                    }
                }
            }
        }

        if !not_unpinned_new_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .new_gen_pinned_pages
                .append(&mut not_unpinned_new_gen_pages);
        }

        if !not_unpinned_old_gen_pages.is_empty() {
            let mut pinstate = self.pinstate.lock();
            pinstate
                .old_gen_pinned_pages
                .append(&mut not_unpinned_old_gen_pages);
        }

        unpinned_count
    }

    fn lru_unpin_pages(&mut self) {
        let mut unpin_count = 0;
        if self.params.unpin_gen_threshold != 0 {
            self.unpin_gen_threshold += 1;
            if self.unpin_gen_threshold == self.params.unpin_gen_threshold {
                self.unpin_gen_threshold = 0;
                // Try to unpin the inactive (old_gen) queue first once the
                // threshold is reached.
                unpin_count = self.lru_unpin_pages_in_loop(self.params.unpin_limit, false);
            }
        }
        // Unpin the new_gen queue with the unpin_limit reduced by whatever the
        // old_gen pass already unpinned.
        self.lru_unpin_pages_in_loop(
            self.params
                .unpin_limit
                .map(|limit| limit.saturating_sub(unpin_count)),
            true,
        );
    }

    fn unpin_pages_in_range(&self, gfn: u64, count: u64) -> bool {
        let mut dtt_iter: DTTIter = Default::default();
        let mut index = 0;
        while index != count {
            let mut pinstate = self.pinstate.lock();
            let result = unpin_page(
                &mut pinstate,
                &self.vfio_container,
                &self.mem,
                self.dtt_level,
                self.dtt_root,
                &mut dtt_iter,
                gfn + index,
                true,
            );
            drop(pinstate);

            match result {
                UnpinResult::Unpinned | UnpinResult::NotPinned => {}
                UnpinResult::UnpinParked => {
                    thread::park();
                    continue;
                }
                _ => {
                    error!("coiommu: force unpin failed: {:?}", result);
                    return false;
                }
            }
            index += 1;
        }
        true
    }
}
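// The CoIOMMU PCI device. BAR 0 exposes the CoIommuReg MMIO registers plus
// the per-vCPU notify registers, BAR 2 exposes the notifymap shared memory
// used to signal request completion, and BAR 4 exposes the read-only
// topology map listing the endpoint BDFs sitting behind CoIOMMU.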
pub struct CoIommuDev {
    config_regs: PciConfiguration,
    pci_address: Option<PciAddress>,
    mem: GuestMemory,
    coiommu_reg: CoIommuReg,
    endpoints: Vec<u16>,
    notifymap_mem: SafeDescriptor,
    notifymap_mmap: Arc<MemoryMapping>,
    notifymap_addr: Option<u64>,
    topologymap_mem: SafeDescriptor,
    topologymap_addr: Option<u64>,
    mmapped: bool,
    device_tube: Tube,
    pin_thread: Option<thread::JoinHandle<PinWorker>>,
    pin_kill_evt: Option<Event>,
    unpin_thread: Option<thread::JoinHandle<UnpinWorker>>,
    unpin_kill_evt: Option<Event>,
    unpin_tube: Option<Tube>,
    ioevents: Vec<Event>,
    vfio_container: Arc<Mutex<VfioContainer>>,
    pinstate: Arc<Mutex<CoIommuPinState>>,
    params: CoIommuParameters,
}

impl CoIommuDev {
    pub fn new(
        mem: GuestMemory,
        vfio_container: Arc<Mutex<VfioContainer>>,
        device_tube: Tube,
        unpin_tube: Tube,
        endpoints: Vec<u16>,
        vcpu_count: u64,
        params: CoIommuParameters,
    ) -> Result<Self> {
        let config_regs = PciConfiguration::new(
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            PciClassCode::Other,
            &PciOtherSubclass::Other,
            None, // No Programming interface.
            PciHeaderType::Device,
            PCI_VENDOR_ID_COIOMMU,
            PCI_DEVICE_ID_COIOMMU,
            COIOMMU_REVISION_ID,
        );

        // notifymap_mem is used as BAR 2, for the guest to check whether a
        // request has been completed by coIOMMU.
        let notifymap_mem = SharedMemory::named("coiommu_notifymap", COIOMMU_NOTIFYMAP_SIZE as u64)
            .context(Error::CreateSharedMemory)?;
        let notifymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_NOTIFYMAP_SIZE)
                .from_shared_memory(&notifymap_mem)
                .offset(0)
                .build()?,
        );

        // topologymap_mem is used as BAR 4, for the guest to check which
        // devices sit on top of coIOMMU.
        let topologymap_mem =
            SharedMemory::named("coiommu_topologymap", COIOMMU_TOPOLOGYMAP_SIZE as u64)
                .context(Error::CreateSharedMemory)?;
        let topologymap_mmap = Arc::new(
            MemoryMappingBuilder::new(COIOMMU_TOPOLOGYMAP_SIZE)
                .from_shared_memory(&topologymap_mem)
                .offset(0)
                .build()?,
        );

        ensure!(
            (endpoints.len() + 1) * mem::size_of::<u16>() <= COIOMMU_TOPOLOGYMAP_SIZE,
            "Coiommu: too many endpoints"
        );
        topologymap_mmap.write_obj::<u16>(endpoints.len() as u16, 0)?;
        for (index, endpoint) in endpoints.iter().enumerate() {
            topologymap_mmap.write_obj::<u16>(*endpoint, (index + 1) * mem::size_of::<u16>())?;
        }

        let mut ioevents = Vec::new();
        for _ in 0..vcpu_count {
            ioevents.push(Event::new().context("CoIommu failed to create event fd")?);
        }

        Ok(Self {
            config_regs,
            pci_address: None,
            mem,
            coiommu_reg: Default::default(),
            endpoints,
            notifymap_mem: notifymap_mem.into(),
            notifymap_mmap,
            notifymap_addr: None,
            topologymap_mem: topologymap_mem.into(),
            topologymap_addr: None,
            mmapped: false,
            device_tube,
            pin_thread: None,
            pin_kill_evt: None,
            unpin_thread: None,
            unpin_kill_evt: None,
            unpin_tube: Some(unpin_tube),
            ioevents,
            vfio_container,
            pinstate: Arc::new(Mutex::new(CoIommuPinState {
                new_gen_pinned_pages: VecDeque::new(),
                old_gen_pinned_pages: VecDeque::new(),
                unpin_thread_state: UnpinThreadState::Unparked,
                unpin_park_count: 0,
            })),
            params,
        })
    }

    fn send_msg(&self, msg: &VmMemoryRequest) -> Result<()> {
        self.device_tube.send(msg).context(Error::TubeError)?;
        let res = self.device_tube.recv().context(Error::TubeError)?;
        match res {
            VmMemoryResponse::RegisterMemory { .. } => Ok(()),
            VmMemoryResponse::Err(e) => Err(anyhow!("Receive msg err {}", e)),
            _ => Err(anyhow!("Msg cannot be handled")),
        }
    }

    fn register_mmap(
        &self,
        descriptor: SafeDescriptor,
        size: usize,
        offset: u64,
        gpa: u64,
        read_only: bool,
    ) -> Result<()> {
        let request = VmMemoryRequest::RegisterMemory {
            source: VmMemorySource::Descriptor {
                descriptor,
                offset,
                size: size as u64,
            },
            dest: VmMemoryDestination::GuestPhysicalAddress(gpa),
            read_only,
        };
        self.send_msg(&request)
    }

    fn mmap(&mut self) {
        if self.mmapped {
            return;
        }

        if let Some(gpa) = self.notifymap_addr {
            match self.register_mmap(
                self.notifymap_mem.try_clone().unwrap(),
                COIOMMU_NOTIFYMAP_SIZE,
                0,
                gpa,
                false,
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map notifymap failed: {}", self.debug_label(), e);
                }
            }
        }

        if let Some(gpa) = self.topologymap_addr {
            match self.register_mmap(
                self.topologymap_mem.try_clone().unwrap(),
                COIOMMU_TOPOLOGYMAP_SIZE,
                0,
                gpa,
                true,
            ) {
                Ok(_) => {}
                Err(e) => {
                    panic!("{}: map topologymap failed: {}", self.debug_label(), e);
                }
            }
        }

        self.mmapped = true;
    }

    fn start_workers(&mut self) {
        if self.pin_thread.is_none() {
            self.start_pin_thread();
        }

        if self.unpin_thread.is_none() {
            self.start_unpin_thread();
        }
    }

    fn start_pin_thread(&mut self) {
        let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{}: failed creating kill Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };

        let mem = self.mem.clone();
        let endpoints = self.endpoints.to_vec();
        let notifymap_mmap = self.notifymap_mmap.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let ioevents = self
            .ioevents
            .iter()
            .map(|e| e.try_clone().unwrap())
            .collect();
        let vfio_container = self.vfio_container.clone();
        let pinstate = self.pinstate.clone();
        let params = self.params;

        let worker_result = thread::Builder::new()
            .name("coiommu_pin".to_string())
            .spawn(move || {
                let mut worker = PinWorker {
                    mem,
                    endpoints,
                    notifymap_mmap,
                    dtt_root,
                    dtt_level,
                    ioevents,
                    vfio_container,
                    pinstate,
                    params,
                };
                worker.run(kill_evt);
                worker
            });

        match worker_result {
            Err(e) => error!(
                "{}: failed to spawn coiommu pin worker: {}",
                self.debug_label(),
                e
            ),
            Ok(join_handle) => {
                self.pin_thread = Some(join_handle);
                self.pin_kill_evt = Some(self_kill_evt);
            }
        }
    }

    fn start_unpin_thread(&mut self) {
        let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!(
                    "{}: failed creating kill Event pair: {}",
                    self.debug_label(),
                    e
                );
                return;
            }
        };

        let mem = self.mem.clone();
        let dtt_root = self.coiommu_reg.dtt_root;
        let dtt_level = self.coiommu_reg.dtt_level;
        let vfio_container = self.vfio_container.clone();
        let unpin_tube = self.unpin_tube.take();
        let pinstate = self.pinstate.clone();
        let params = self.params;
        let worker_result = thread::Builder::new()
            .name("coiommu_unpin".to_string())
            .spawn(move || {
                let mut worker = UnpinWorker {
                    mem,
                    dtt_level,
                    dtt_root,
                    vfio_container,
                    unpin_tube,
                    pinstate,
                    params,
                    unpin_gen_threshold: 0,
                };
                worker.run(kill_evt);
                worker
            });

        match worker_result {
            Err(e) => {
                error!(
                    "{}: failed to spawn coiommu unpin worker: {}",
                    self.debug_label(),
                    e
                );
            }
            Ok(join_handle) => {
                self.unpin_thread = Some(join_handle);
                self.unpin_kill_evt = Some(self_kill_evt);
            }
        }
    }

    fn allocate_bar_address(
        &mut self,
        resources: &mut SystemAllocator,
        address: PciAddress,
        size: u64,
        bar_num: u8,
        name: &str,
    ) -> PciResult<u64> {
        let addr = resources
            .mmio_allocator(MmioType::High)
            .allocate_with_align(
                size,
                Alloc::PciBar {
                    bus: address.bus,
                    dev: address.dev,
                    func: address.func,
                    bar: bar_num,
                },
                name.to_string(),
                size,
            )
            .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?;

        let bar = PciBarConfiguration::new(
            bar_num as usize,
            size,
            PciBarRegionType::Memory64BitRegion,
            PciBarPrefetchable::Prefetchable,
        )
        .set_address(addr);

        self.config_regs
            .add_pci_bar(bar)
            .map_err(|e| PciDeviceError::IoRegistrationFailed(addr, e))?;

        Ok(addr)
    }

    fn read_mmio(&mut self, addr: u64, data: &mut [u8]) {
        let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let offset = addr - bar;
        if offset >= mem::size_of::<CoIommuReg>() as u64 {
            error!(
                "{}: read_mmio: invalid addr 0x{:x} bar 0x{:x} offset 0x{:x}",
                self.debug_label(),
                addr,
                bar,
                offset
            );
            return;
        }

        // Sanity check: the access must be 64-bit aligned and 64 bits wide.
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: read_mmio: unaligned access: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let v = match offset / 8 {
            0 => self.coiommu_reg.dtt_root,
            1 => self.coiommu_reg.cmd,
            2 => self.coiommu_reg.dtt_level,
            _ => return,
        };

        data.copy_from_slice(&v.to_ne_bytes());
    }

    fn write_mmio(&mut self, addr: u64, data: &[u8]) {
        let bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let mmio_len = mem::size_of::<CoIommuReg>() as u64;
        let offset = addr - bar;
        if offset >= mmio_len {
            if data.len() != 1 {
                error!(
                    "{}: write_mmio: unaligned access: offset 0x{:x} actual len {} expect len 1",
                    self.debug_label(),
                    offset,
                    data.len()
                );
                return;
            }

            // Usually we will not get here, as this range holds the per-vcpu
            // notify registers, which are monitored by the ioevents. Notify
            // registers not covered by the ioevents are not used by the
            // frontend driver. If the frontend driver does end up here, handle
            // the write minimally so that the driver won't be blocked, and log
            // an error.
            let index = (offset - mmio_len) as usize * mem::size_of::<u64>();
            self.notifymap_mmap.write_obj::<u64>(0, index).unwrap();
            error!(
                "{}: No page will be pinned as driver is accessing unused trigger register: offset 0x{:x}",
                self.debug_label(),
                offset
            );
            return;
        }

        // Sanity check: CoIommuReg accesses must be 64-bit aligned and 64 bits wide.
        if offset % 8 != 0 || data.len() != 8 {
            error!(
                "{}: write_mmio: unaligned access: offset 0x{:x} actual len {} expect len 8",
                self.debug_label(),
                offset,
                data.len()
            );
            return;
        }

        let index = offset / 8;
        let v = u64::from_ne_bytes(data.try_into().unwrap());
        match index {
            0 => {
                if self.coiommu_reg.dtt_root == 0 {
                    self.coiommu_reg.dtt_root = v;
                }
            }
            1 => match v {
                // Deactivate can happen if the frontend driver in the guest
                // fails during probing or if the CoIommu device is removed
                // by the guest. Neither of these cases is expected, and if
                // either happens the guest will be non-functional due to
                // pass-through devices which rely on CoIommu not working.
                // So just fail hard and panic.
                COIOMMU_CMD_DEACTIVATE => {
                    panic!("{}: Deactivate is not supported", self.debug_label())
                }
                COIOMMU_CMD_ACTIVATE => {
                    if self.coiommu_reg.dtt_root != 0 && self.coiommu_reg.dtt_level != 0 {
                        self.start_workers();
                    }
                }
                COIOMMU_CMD_PARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    pinstate.unpin_thread_state = UnpinThreadState::Parked;
                    if let Some(v) = pinstate.unpin_park_count.checked_add(1) {
                        pinstate.unpin_park_count = v;
                    } else {
                        panic!("{}: Park request overflowing", self.debug_label());
                    }
                }
                COIOMMU_CMD_UNPARK_UNPIN => {
                    let mut pinstate = self.pinstate.lock();
                    if pinstate.unpin_thread_state == UnpinThreadState::Parked {
                        if let Some(v) = pinstate.unpin_park_count.checked_sub(1) {
                            pinstate.unpin_park_count = v;
                            if pinstate.unpin_park_count == 0 {
                                if let Some(worker_thread) = &self.unpin_thread {
                                    worker_thread.thread().unpark();
                                }
                                pinstate.unpin_thread_state = UnpinThreadState::Unparked;
                            }
                        } else {
                            error!("{}: Park count has already reached 0", self.debug_label());
                        }
                    }
                }
                _ => {}
            },
            2 => {
                if self.coiommu_reg.dtt_level == 0 {
                    self.coiommu_reg.dtt_level = v;
                }
            }
            _ => {}
        }
    }
}

impl PciDevice for CoIommuDev {
    fn debug_label(&self) -> String {
        "CoIommu".to_owned()
    }

    fn allocate_address(&mut self, resources: &mut SystemAllocator) -> PciResult<PciAddress> {
        if self.pci_address.is_none() {
            self.pci_address = match resources.allocate_pci(0, self.debug_label()) {
                Some(Alloc::PciBar {
                    bus,
                    dev,
                    func,
                    bar: _,
                }) => Some(PciAddress { bus, dev, func }),
                _ => None,
            }
        }
        self.pci_address.ok_or(PciDeviceError::PciAllocationFailed)
    }

    fn allocate_io_bars(&mut self, resources: &mut SystemAllocator) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_io_bars");

        // Allocate one bar for the structures pointed to by the capability structures.
        let mut ranges: Vec<BarRange> = Vec::new();

        let mmio_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_MMIO_BAR_SIZE as u64,
            COIOMMU_MMIO_BAR,
            "coiommu-mmiobar",
        )?;

        ranges.push(BarRange {
            addr: mmio_addr,
            size: COIOMMU_MMIO_BAR_SIZE,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn allocate_device_bars(
        &mut self,
        resources: &mut SystemAllocator,
    ) -> PciResult<Vec<BarRange>> {
        let address = self
            .pci_address
            .expect("allocate_address must be called prior to allocate_device_bars");

        let mut ranges: Vec<BarRange> = Vec::new();

        let topologymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_TOPOLOGYMAP_SIZE as u64,
            COIOMMU_TOPOLOGYMAP_BAR,
            "coiommu-topology",
        )?;
        self.topologymap_addr = Some(topologymap_addr);
        ranges.push(BarRange {
            addr: topologymap_addr,
            size: COIOMMU_TOPOLOGYMAP_SIZE as u64,
            prefetchable: false,
        });

        let notifymap_addr = self.allocate_bar_address(
            resources,
            address,
            COIOMMU_NOTIFYMAP_SIZE as u64,
            COIOMMU_NOTIFYMAP_BAR,
            "coiommu-notifymap",
        )?;
        self.notifymap_addr = Some(notifymap_addr);
        ranges.push(BarRange {
            addr: notifymap_addr,
            size: COIOMMU_NOTIFYMAP_SIZE as u64,
            prefetchable: false,
        });

        Ok(ranges)
    }

    fn read_config_register(&self, reg_idx: usize) -> u32 {
        self.config_regs.read_reg(reg_idx)
    }

    fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
        if reg_idx == COMMAND_REG
            && data.len() == 2
            && data[0] & COMMAND_REG_MEMORY_SPACE_MASK as u8 != 0
            && !self.mmapped
        {
            self.mmap();
        }

        self.config_regs.write_reg(reg_idx, offset, data);
    }

    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = vec![
            self.vfio_container.lock().as_raw_descriptor(),
            self.device_tube.as_raw_descriptor(),
            self.notifymap_mem.as_raw_descriptor(),
            self.topologymap_mem.as_raw_descriptor(),
        ];
        if let Some(unpin_tube) = &self.unpin_tube {
            rds.push(unpin_tube.as_raw_descriptor());
        }
        rds
    }

    fn read_bar(&mut self, addr: u64, data: &mut [u8]) {
        let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notifymap = self
            .config_regs
            .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
        match addr {
            o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE as u64 => {
                self.read_mmio(addr, data);
            }
            o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // When the coiommu device is activated, accessing the notifymap
                // bar won't cause a vmexit. Getting here means the coiommu
                // device is deactivated and won't do any pin/unpin work, so
                // there is no need to handle this notifymap read.
            }
            _ => {}
        }
    }

    fn write_bar(&mut self, addr: u64, data: &[u8]) {
        let mmio_bar = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notifymap = self
            .config_regs
            .get_bar_addr(COIOMMU_NOTIFYMAP_BAR as usize);
        match addr {
            o if mmio_bar <= o && o < mmio_bar + COIOMMU_MMIO_BAR_SIZE as u64 => {
                self.write_mmio(addr, data);
            }
            o if notifymap <= o && o < notifymap + COIOMMU_NOTIFYMAP_SIZE as u64 => {
                // When the coiommu device is activated, accessing the notifymap
                // bar won't cause a vmexit. Getting here means the coiommu
                // device is deactivated and won't do any pin/unpin work, so
                // there is no need to handle this notifymap write.
            }
            _ => {}
        }
    }
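    // The per-vCPU notify registers live immediately after CoIommuReg in
    // BAR 0, one byte apart. Each is registered as an any-length ioevent so
    // that a guest write wakes the pin worker directly instead of trapping
    // into write_mmio().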
    fn ioevents(&self) -> Vec<(&Event, u64, Datamatch)> {
        let bar0 = self.config_regs.get_bar_addr(COIOMMU_MMIO_BAR as usize);
        let notify_base = bar0 + mem::size_of::<CoIommuReg>() as u64;
        self.ioevents
            .iter()
            .enumerate()
            .map(|(i, event)| (event, notify_base + i as u64, Datamatch::AnyLength))
            .collect()
    }

    fn get_bar_configuration(&self, bar_num: usize) -> Option<PciBarConfiguration> {
        self.config_regs.get_bar_configuration(bar_num)
    }
}

impl Drop for CoIommuDev {
    fn drop(&mut self) {
        if let Some(kill_evt) = self.pin_kill_evt.take() {
            // Ignore the result because there is nothing we can do about it.
            if kill_evt.write(1).is_ok() {
                if let Some(worker_thread) = self.pin_thread.take() {
                    let _ = worker_thread.join();
                }
            } else {
                error!("CoIOMMU: failed to write to kill_evt to stop pin_thread");
            }
        }

        if let Some(kill_evt) = self.unpin_kill_evt.take() {
            // Ignore the result because there is nothing we can do about it.
            if kill_evt.write(1).is_ok() {
                if let Some(worker_thread) = self.unpin_thread.take() {
                    let _ = worker_thread.join();
                }
            } else {
                error!("CoIOMMU: failed to write to kill_evt to stop unpin_thread");
            }
        }
    }
}