// Copyright 2021 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

pub mod ipc_memory_mapper;
pub mod memory_mapper;
pub mod protocol;
pub(crate) mod sys;

use std::cell::RefCell;
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::io;
use std::io::Write;
use std::mem::size_of;
use std::ops::RangeInclusive;
use std::rc::Rc;
use std::result;
use std::sync::Arc;

#[cfg(target_arch = "x86_64")]
use acpi_tables::sdt::SDT;
use anyhow::anyhow;
use anyhow::Context;
use base::debug;
use base::error;
use base::pagesize;
#[cfg(target_arch = "x86_64")]
use base::warn;
use base::AsRawDescriptor;
use base::Error as SysError;
use base::Event;
use base::MappedRegion;
use base::MemoryMapping;
use base::Protection;
use base::RawDescriptor;
use base::Result as SysResult;
use base::Tube;
use base::TubeError;
use base::WorkerThread;
use cros_async::AsyncError;
use cros_async::AsyncTube;
use cros_async::EventAsync;
use cros_async::Executor;
use data_model::Le64;
use futures::select;
use futures::FutureExt;
use hypervisor::MemSlot;
use remain::sorted;
use sync::Mutex;
use thiserror::Error;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::GuestMemoryError;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use zerocopy::FromZeroes;

#[cfg(target_arch = "x86_64")]
use crate::pci::PciAddress;
use crate::virtio::async_utils;
use crate::virtio::copy_config;
use crate::virtio::iommu::memory_mapper::*;
use crate::virtio::iommu::protocol::*;
use crate::virtio::DescriptorChain;
use crate::virtio::DeviceType;
use crate::virtio::Interrupt;
use crate::virtio::Queue;
use crate::virtio::Reader;
use crate::virtio::VirtioDevice;
#[cfg(target_arch = "x86_64")]
use crate::virtio::Writer;

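// The virtio-iommu spec defines two virtqueues: a request queue and an event queue.
// Only the request queue (index 0) is serviced by the worker below; the event queue is
// currently left unused.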
const QUEUE_SIZE: u16 = 256;
const NUM_QUEUES: usize = 2;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES];

// Size of the virtio_iommu_probe_resv_mem property written in response to PROBE requests.
#[cfg(target_arch = "x86_64")]
const IOMMU_PROBE_SIZE: usize = size_of::<virtio_iommu_probe_resv_mem>();

#[cfg(target_arch = "x86_64")]
const VIRTIO_IOMMU_VIOT_NODE_PCI_RANGE: u8 = 1;
#[cfg(target_arch = "x86_64")]
const VIRTIO_IOMMU_VIOT_NODE_VIRTIO_IOMMU_PCI: u8 = 3;

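// ACPI VIOT (Virtual I/O Translation) table node layouts used by generate_acpi() below.
// They are `repr(C, packed)` and `AsBytes` so they can be appended to the table verbatim.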
#[derive(Copy, Clone, Debug, Default, FromZeroes, FromBytes, AsBytes)]
#[repr(C, packed)]
struct VirtioIommuViotHeader {
    node_count: u16,
    node_offset: u16,
    reserved: [u8; 8],
}

#[derive(Copy, Clone, Debug, Default, FromZeroes, FromBytes, AsBytes)]
#[repr(C, packed)]
struct VirtioIommuViotVirtioPciNode {
    type_: u8,
    reserved: [u8; 1],
    length: u16,
    segment: u16,
    bdf: u16,
    reserved2: [u8; 8],
}

#[derive(Copy, Clone, Debug, Default, FromZeroes, FromBytes, AsBytes)]
#[repr(C, packed)]
struct VirtioIommuViotPciRangeNode {
    type_: u8,
    reserved: [u8; 1],
    length: u16,
    endpoint_start: u32,
    segment_start: u16,
    segment_end: u16,
    bdf_start: u16,
    bdf_end: u16,
    output_node: u16,
    reserved2: [u8; 2],
    reserved3: [u8; 4],
}

type Result<T> = result::Result<T, IommuError>;

#[sorted]
#[derive(Error, Debug)]
pub enum IommuError {
    #[error("async executor error: {0}")]
    AsyncExec(AsyncError),
    #[error("failed to create wait context: {0}")]
    CreateWaitContext(SysError),
    #[error("failed getting host address: {0}")]
    GetHostAddress(GuestMemoryError),
    #[error("failed to read from guest address: {0}")]
    GuestMemoryRead(io::Error),
    #[error("failed to write to guest address: {0}")]
    GuestMemoryWrite(io::Error),
    #[error("memory mapper failed: {0}")]
    MemoryMapper(anyhow::Error),
    #[error("Failed to read descriptor asynchronously: {0}")]
    ReadAsyncDesc(AsyncError),
    #[error("failed to read from virtio queue Event: {0}")]
    ReadQueueEvent(SysError),
    #[error("tube error: {0}")]
    Tube(TubeError),
    #[error("unexpected descriptor error")]
    UnexpectedDescriptor,
    #[error("failed to receive virtio-iommu control request: {0}")]
    VirtioIOMMUReqError(TubeError),
    #[error("failed to send virtio-iommu control response: {0}")]
    VirtioIOMMUResponseError(TubeError),
    #[error("failed to wait for events: {0}")]
    WaitError(SysError),
    #[error("write buffer length too small")]
    WriteBufferTooSmall,
}

// key: domain ID
// value: reference counter and MemoryMapperTrait
type DomainMap = BTreeMap<u32, (u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>)>;

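// A dmabuf region registered with this device: the host mmap of the buffer, the VM memory
// slot it occupies, and its length in bytes.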
struct DmabufRegionEntry {
    mmap: MemoryMapping,
    mem_slot: MemSlot,
    len: u64,
}

// Shared state for the virtio-iommu device.
struct State {
    mem: GuestMemory,
    page_mask: u64,
    // Hot-pluggable PCI endpoint ranges
    // RangeInclusive: (start endpoint PCI address ..= end endpoint PCI address)
    #[cfg_attr(windows, allow(dead_code))]
    hp_endpoints_ranges: Vec<RangeInclusive<u32>>,
    // All PCI endpoints that are attached to an IOMMU domain
    // key: endpoint PCI address
    // value: attached domain ID
    endpoint_map: BTreeMap<u32, u32>,
    // All attached domains
    domain_map: DomainMap,
    // All pass-through endpoints that attach to this IOMMU device
    // key: endpoint PCI address
    // value: MemoryMapperTrait for that endpoint
    endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    // dmabuf regions registered with this device
    // key: guest physical address
    dmabuf_mem: BTreeMap<u64, DmabufRegionEntry>,
}

impl State {
    // Detach the given endpoint if possible, and return whether or not the endpoint
    // was actually detached. If a successfully detached endpoint has exported
    // memory, returns an event that will be signaled once all exported memory is released.
    //
    // The device MUST ensure that after being detached from a domain, the endpoint
    // cannot access any mapping from that domain.
    //
    // Currently, we only support detaching an endpoint if it is the only endpoint attached
    // to its domain.
    fn detach_endpoint(
        endpoint_map: &mut BTreeMap<u32, u32>,
        domain_map: &mut DomainMap,
        endpoint: u32,
    ) -> (bool, Option<EventAsync>) {
        let mut evt = None;
        // The endpoint has attached to an IOMMU domain
        if let Some(attached_domain) = endpoint_map.get(&endpoint) {
            // Remove the entry or update the domain reference count
            if let Entry::Occupied(o) = domain_map.entry(*attached_domain) {
                let (refs, mapper) = o.get();
                if !mapper.lock().supports_detach() {
                    return (false, None);
                }

                match refs {
                    0 => unreachable!(),
                    1 => {
                        evt = mapper.lock().reset_domain();
                        o.remove();
                    }
                    _ => return (false, None),
                }
            }
        }

        endpoint_map.remove(&endpoint);
        (true, evt)
    }

    // Processes an attach request. This may require detaching the endpoint from
    // its current domain before attaching it to the new one. If that happens
    // while the endpoint has exported memory, this function returns an event that
    // will be signaled once all exported memory is released.
    //
    // Note: if a VFIO group contains multiple devices, it could violate the following
    // requirement from the virtio IOMMU spec: If the VIRTIO_IOMMU_F_BYPASS feature
    // is negotiated, all accesses from unattached endpoints are allowed and translated
    // by the IOMMU using the identity function. If the feature is not negotiated, any
    // memory access from an unattached endpoint fails.
    //
    // This is because once the virtio-iommu device receives a VIRTIO_IOMMU_T_ATTACH
    // request for the first endpoint in a VFIO group, any not-yet-attached endpoints
    // in the VFIO group will be able to access the domain.
    //
    // This violation is benign for current virtualization use cases. Since device
    // topology in the guest matches topology in the host, the guest doesn't expect
    // the devices in the same VFIO group to be isolated from each other in the first place.
    fn process_attach_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<(usize, Option<EventAsync>)> {
        let req: virtio_iommu_req_attach =
            reader.read_obj().map_err(IommuError::GuestMemoryRead)?;
        let mut fault_resolved_event = None;

        // If the reserved field of an ATTACH request is not zero,
        // the device MUST reject the request and set status to
        // VIRTIO_IOMMU_S_INVAL.
        if req.reserved.iter().any(|&x| x != 0) {
            tail.status = VIRTIO_IOMMU_S_INVAL;
            return Ok((0, None));
        }

        let domain: u32 = req.domain.into();
        let endpoint: u32 = req.endpoint.into();

        if let Some(mapper) = self.endpoints.get(&endpoint) {
            // The same mapper can't be used for two domains at the same time,
            // since that would result in conflicts/permission leaks between
            // the two domains.
            let mapper_id = {
                let m = mapper.lock();
                ((**m).type_id(), m.id())
            };
            for (other_endpoint, other_mapper) in self.endpoints.iter() {
                if *other_endpoint == endpoint {
                    continue;
                }
                let other_id = {
                    let m = other_mapper.lock();
                    ((**m).type_id(), m.id())
                };
                if mapper_id == other_id {
                    if !self
                        .endpoint_map
                        .get(other_endpoint)
                        .map_or(true, |d| d == &domain)
                    {
                        tail.status = VIRTIO_IOMMU_S_UNSUPP;
                        return Ok((0, None));
                    }
                }
            }

            // If the endpoint identified by `endpoint` is already attached
            // to another domain, then the device SHOULD first detach it
            // from that domain and attach it to the one identified by domain.
            if self.endpoint_map.contains_key(&endpoint) {
                // In that case the device SHOULD behave as if the driver issued
                // a DETACH request with this endpoint, followed by the ATTACH
                // request. If the device cannot do so, it MUST reject the request
                // and set status to VIRTIO_IOMMU_S_UNSUPP.
                let (detached, evt) =
                    Self::detach_endpoint(&mut self.endpoint_map, &mut self.domain_map, endpoint);
                if !detached {
                    tail.status = VIRTIO_IOMMU_S_UNSUPP;
                    return Ok((0, None));
                }
                fault_resolved_event = evt;
            }

            let new_ref = match self.domain_map.get(&domain) {
                None => 1,
                Some(val) => val.0 + 1,
            };

            self.endpoint_map.insert(endpoint, domain);
            self.domain_map.insert(domain, (new_ref, mapper.clone()));
        } else {
            // If the endpoint identified by endpoint doesn't exist,
            // the device MUST reject the request and set status to
            // VIRTIO_IOMMU_S_NOENT.
            tail.status = VIRTIO_IOMMU_S_NOENT;
        }

        Ok((0, fault_resolved_event))
    }

    fn process_detach_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<(usize, Option<EventAsync>)> {
        let req: virtio_iommu_req_detach =
            reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        // If the endpoint identified by |req.endpoint| doesn't exist,
        // the device MUST reject the request and set status to
        // VIRTIO_IOMMU_S_NOENT.
        let endpoint: u32 = req.endpoint.into();
        if !self.endpoints.contains_key(&endpoint) {
            tail.status = VIRTIO_IOMMU_S_NOENT;
            return Ok((0, None));
        }

        let (detached, evt) =
            Self::detach_endpoint(&mut self.endpoint_map, &mut self.domain_map, endpoint);
        if !detached {
            tail.status = VIRTIO_IOMMU_S_UNSUPP;
        }
        Ok((0, evt))
    }

    fn process_dma_map_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<usize> {
        let req: virtio_iommu_req_map = reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        // If virt_start, phys_start or (virt_end + 1) is not aligned
        // on the page granularity, the device SHOULD reject the
        // request and set status to VIRTIO_IOMMU_S_RANGE
        if self.page_mask & u64::from(req.phys_start) != 0
            || self.page_mask & u64::from(req.virt_start) != 0
            || self.page_mask & (u64::from(req.virt_end) + 1) != 0
        {
            tail.status = VIRTIO_IOMMU_S_RANGE;
            return Ok(0);
        }

        // If the device doesn't recognize a flags bit, it MUST reject
        // the request and set status to VIRTIO_IOMMU_S_INVAL.
        if u32::from(req.flags) & !VIRTIO_IOMMU_MAP_F_MASK != 0 {
            tail.status = VIRTIO_IOMMU_S_INVAL;
            return Ok(0);
        }

        let domain: u32 = req.domain.into();
        if !self.domain_map.contains_key(&domain) {
            // If domain does not exist, the device SHOULD reject
            // the request and set status to VIRTIO_IOMMU_S_NOENT.
            tail.status = VIRTIO_IOMMU_S_NOENT;
            return Ok(0);
        }

        // The device MUST NOT allow writes to a range mapped
        // without the VIRTIO_IOMMU_MAP_F_WRITE flag.
        let write_en = u32::from(req.flags) & VIRTIO_IOMMU_MAP_F_WRITE != 0;

        if let Some(mapper) = self.domain_map.get(&domain) {
            let size = u64::from(req.virt_end) - u64::from(req.virt_start) + 1u64;

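            // Check whether the requested physical range falls inside a registered dmabuf
            // region. If so, translate phys_start into a host virtual address within that
            // region's mmap so it can be mapped directly; otherwise fall back to a regular
            // guest-memory mapping below.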
            let dmabuf_map = self
                .dmabuf_mem
                .range(..=u64::from(req.phys_start))
                .next_back()
                .and_then(|(addr, region)| {
                    if u64::from(req.phys_start) + size <= addr + region.len {
                        Some(region.mmap.as_ptr() as u64 + (u64::from(req.phys_start) - addr))
                    } else {
                        None
                    }
                });

            let prot = match write_en {
                true => Protection::read_write(),
                false => Protection::read(),
            };

            let vfio_map_result = match dmabuf_map {
                // SAFETY:
                // Safe because [dmabuf_map, dmabuf_map + size) refers to an external mmap'ed
                // region.
                Some(dmabuf_map) => unsafe {
                    mapper
                        .1
                        .lock()
                        .vfio_dma_map(req.virt_start.into(), dmabuf_map, size, prot)
                },
                None => mapper.1.lock().add_map(MappingInfo {
                    iova: req.virt_start.into(),
                    gpa: GuestAddress(req.phys_start.into()),
                    size,
                    prot,
                }),
            };

            match vfio_map_result {
                Ok(AddMapResult::Ok) => (),
                Ok(AddMapResult::OverlapFailure) => {
                    // If a mapping already exists in the requested range,
                    // the device SHOULD reject the request and set status
                    // to VIRTIO_IOMMU_S_INVAL.
                    tail.status = VIRTIO_IOMMU_S_INVAL;
                }
                Err(e) => return Err(IommuError::MemoryMapper(e)),
            }
        }

        Ok(0)
    }

    fn process_dma_unmap_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<(usize, Option<EventAsync>)> {
        let req: virtio_iommu_req_unmap = reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        let domain: u32 = req.domain.into();
        let fault_resolved_event = if let Some(mapper) = self.domain_map.get(&domain) {
            let size = u64::from(req.virt_end) - u64::from(req.virt_start) + 1;
            let res = mapper
                .1
                .lock()
                .remove_map(u64::from(req.virt_start), size)
                .map_err(IommuError::MemoryMapper)?;
            match res {
                RemoveMapResult::Success(evt) => evt,
                RemoveMapResult::OverlapFailure => {
                    // If a mapping affected by the range is not covered in its entirety by the
                    // range (the UNMAP request would split the mapping), then the device SHOULD
                    // set the request `status` to VIRTIO_IOMMU_S_RANGE, and SHOULD NOT remove
                    // any mapping.
                    tail.status = VIRTIO_IOMMU_S_RANGE;
                    None
                }
            }
        } else {
            // If domain does not exist, the device SHOULD set the
            // request status to VIRTIO_IOMMU_S_NOENT
            tail.status = VIRTIO_IOMMU_S_NOENT;
            None
        };

        Ok((0, fault_resolved_event))
    }

    #[cfg(target_arch = "x86_64")]
    fn process_probe_request(
        &mut self,
        reader: &mut Reader,
        writer: &mut Writer,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<usize> {
        let req: virtio_iommu_req_probe = reader.read_obj().map_err(IommuError::GuestMemoryRead)?;
        let endpoint: u32 = req.endpoint.into();

        // If the endpoint identified by endpoint doesn't exist,
        // then the device SHOULD reject the request and set status
        // to VIRTIO_IOMMU_S_NOENT.
        if !self.endpoints.contains_key(&endpoint) {
            tail.status = VIRTIO_IOMMU_S_NOENT;
        }

        let properties_size = writer.available_bytes() - size_of::<virtio_iommu_req_tail>();

        // A properties_size larger than probe_size is fine; a properties_size of 0 is
        // handled by the rejection branch below.
        if properties_size < IOMMU_PROBE_SIZE {
            // If the properties list is smaller than probe_size, the device
            // SHOULD NOT write any property. It SHOULD reject the request
            // and set status to VIRTIO_IOMMU_S_INVAL.
            tail.status = VIRTIO_IOMMU_S_INVAL;
        } else if tail.status == VIRTIO_IOMMU_S_OK {
            const VIRTIO_IOMMU_PROBE_T_RESV_MEM: u16 = 1;
            const VIRTIO_IOMMU_RESV_MEM_T_MSI: u8 = 1;
            const PROBE_PROPERTY_SIZE: u16 = 4;
            const X86_MSI_IOVA_START: u64 = 0xfee0_0000;
            const X86_MSI_IOVA_END: u64 = 0xfeef_ffff;

            let properties = virtio_iommu_probe_resv_mem {
                head: virtio_iommu_probe_property {
                    type_: VIRTIO_IOMMU_PROBE_T_RESV_MEM.into(),
                    length: (IOMMU_PROBE_SIZE as u16 - PROBE_PROPERTY_SIZE).into(),
                },
                subtype: VIRTIO_IOMMU_RESV_MEM_T_MSI,
                start: X86_MSI_IOVA_START.into(),
                end: X86_MSI_IOVA_END.into(),
                ..Default::default()
            };
            writer
                .write_all(properties.as_bytes())
                .map_err(IommuError::GuestMemoryWrite)?;
        }

        // If the device doesn't fill all probe_size bytes with properties,
        // it SHOULD fill the remaining bytes of properties with zeroes.
        let remaining_bytes = writer.available_bytes() - size_of::<virtio_iommu_req_tail>();

        if remaining_bytes > 0 {
            let buffer: Vec<u8> = vec![0; remaining_bytes];
            writer
                .write_all(buffer.as_slice())
                .map_err(IommuError::GuestMemoryWrite)?;
        }

        Ok(properties_size)
    }

    fn execute_request(
        &mut self,
        avail_desc: &mut DescriptorChain,
    ) -> Result<(usize, Option<EventAsync>)> {
        let reader = &mut avail_desc.reader;
        let writer = &mut avail_desc.writer;

        // We need at least enough space to write the virtio_iommu_req_tail.
        if writer.available_bytes() < size_of::<virtio_iommu_req_tail>() {
            return Err(IommuError::WriteBufferTooSmall);
        }

        let req_head: virtio_iommu_req_head =
            reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        let mut tail = virtio_iommu_req_tail {
            status: VIRTIO_IOMMU_S_OK,
            ..Default::default()
        };

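        // Dispatch on the request type. Each handler fills in `tail.status` and returns the
        // number of response bytes it wrote (excluding the tail), plus an optional event to
        // wait on before the buffer is returned to the guest.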
        let (reply_len, fault_resolved_event) = match req_head.type_ {
            VIRTIO_IOMMU_T_ATTACH => self.process_attach_request(reader, &mut tail)?,
            VIRTIO_IOMMU_T_DETACH => self.process_detach_request(reader, &mut tail)?,
            VIRTIO_IOMMU_T_MAP => (self.process_dma_map_request(reader, &mut tail)?, None),
            VIRTIO_IOMMU_T_UNMAP => self.process_dma_unmap_request(reader, &mut tail)?,
            #[cfg(target_arch = "x86_64")]
            VIRTIO_IOMMU_T_PROBE => (self.process_probe_request(reader, writer, &mut tail)?, None),
            _ => return Err(IommuError::UnexpectedDescriptor),
        };

        writer
            .write_all(tail.as_bytes())
            .map_err(IommuError::GuestMemoryWrite)?;
        Ok((
            reply_len + size_of::<virtio_iommu_req_tail>(),
            fault_resolved_event,
        ))
    }
}

async fn request_queue(
    state: &Rc<RefCell<State>>,
    mut queue: Queue,
    mut queue_event: EventAsync,
    interrupt: Interrupt,
) -> Result<()> {
    loop {
        let mut avail_desc = queue
            .next_async(&mut queue_event)
            .await
            .map_err(IommuError::ReadAsyncDesc)?;

        let (len, fault_resolved_event) = match state.borrow_mut().execute_request(&mut avail_desc)
        {
            Ok(res) => res,
            Err(e) => {
                error!("execute_request failed: {}", e);

                // If a request type is not recognized, the device SHOULD NOT write
                // the buffer and SHOULD set the used length to zero
                (0, None)
            }
        };

        if let Some(fault_resolved_event) = fault_resolved_event {
            debug!("waiting for iommu fault resolution");
            fault_resolved_event
                .next_val()
                .await
                .expect("failed waiting for fault");
            debug!("iommu fault resolved");
        }

        queue.add_used(avail_desc, len as u32);
        queue.trigger_interrupt(&interrupt);
    }
}

fn run(
    state: State,
    iommu_device_tube: Tube,
    mut queues: BTreeMap<usize, Queue>,
    kill_evt: Event,
    interrupt: Interrupt,
    translate_response_senders: Option<BTreeMap<u32, Tube>>,
    translate_request_rx: Option<Tube>,
) -> Result<()> {
    let state = Rc::new(RefCell::new(state));
    let ex = Executor::new().expect("Failed to create an executor");

    let req_queue = queues.remove(&0).unwrap();
    let req_evt = req_queue
        .event()
        .try_clone()
        .expect("Failed to clone queue event");
    let req_evt = EventAsync::new(req_evt, &ex).expect("Failed to create async event for queue");

    let f_resample = async_utils::handle_irq_resample(&ex, interrupt.clone());
    let f_kill = async_utils::await_and_exit(&ex, kill_evt);

    let request_tube = translate_request_rx
        .map(|t| AsyncTube::new(&ex, t).expect("Failed to create async tube for rx"));
    let response_tubes = translate_response_senders.map(|m| {
        m.into_iter()
            .map(|x| {
                (
                    x.0,
                    AsyncTube::new(&ex, x.1).expect("Failed to create async tube"),
                )
            })
            .collect()
    });

    let f_handle_translate_request =
        sys::handle_translate_request(&ex, &state, request_tube, response_tubes);
    let f_request = request_queue(&state, req_queue, req_evt, interrupt);

    let command_tube = AsyncTube::new(&ex, iommu_device_tube).unwrap();
    // Future to handle command messages from host, such as passing vfio containers.
    let f_cmd = sys::handle_command_tube(&state, command_tube);

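    // Run until any one of the futures completes; whichever finishes first ends the worker,
    // with its error (if any) annotated via `context` for the log message below.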
    let done = async {
        select! {
            res = f_request.fuse() => res.context("error in handling request queue"),
            res = f_resample.fuse() => res.context("error in handle_irq_resample"),
            res = f_kill.fuse() => res.context("error in await_and_exit"),
            res = f_handle_translate_request.fuse() => {
                res.context("error in handle_translate_request")
            }
            res = f_cmd.fuse() => res.context("error in handling host request"),
        }
    };
    match ex.run_until(done) {
        Ok(Ok(())) => {}
        Ok(Err(e)) => error!("Error in worker: {:#}", e),
        Err(e) => return Err(IommuError::AsyncExec(e)),
    }

    Ok(())
}

/// Virtio device for IOMMU memory management.
pub struct Iommu {
    worker_thread: Option<WorkerThread<()>>,
    config: virtio_iommu_config,
    avail_features: u64,
    // Attached endpoints
    // key: endpoint PCI address
    // value: MemoryMapperTrait for that endpoint
    endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    // Hot-pluggable PCI endpoint ranges
    // RangeInclusive: (start endpoint PCI address ..= end endpoint PCI address)
    hp_endpoints_ranges: Vec<RangeInclusive<u32>>,
    translate_response_senders: Option<BTreeMap<u32, Tube>>,
    translate_request_rx: Option<Tube>,
    iommu_device_tube: Option<Tube>,
}

impl Iommu {
    /// Create a new virtio IOMMU device.
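    ///
    /// A minimal usage sketch (hypothetical names: `mapper`, `base_features`, `iova_max_addr`
    /// and `tube` are assumed to already exist, so this is not a complete doctest):
    ///
    /// ```ignore
    /// let mut endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> = BTreeMap::new();
    /// // 0x0008 is PCI address 00:01.0 encoded as (bus << 8) | (device << 3) | function.
    /// endpoints.insert(0x0008, Arc::new(Mutex::new(mapper)));
    /// let iommu = Iommu::new(
    ///     base_features,
    ///     endpoints,
    ///     iova_max_addr,
    ///     Vec::new(), // no hot-pluggable endpoint ranges
    ///     None,       // no translate response senders
    ///     None,       // no translate request receiver
    ///     Some(tube), // control tube for the worker
    /// )?;
    /// ```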
    pub fn new(
        base_features: u64,
        endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
        iova_max_addr: u64,
        hp_endpoints_ranges: Vec<RangeInclusive<u32>>,
        translate_response_senders: Option<BTreeMap<u32, Tube>>,
        translate_request_rx: Option<Tube>,
        iommu_device_tube: Option<Tube>,
    ) -> SysResult<Iommu> {
        let mut page_size_mask = !((pagesize() as u64) - 1);
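        // Intersect the host page-size mask with the mask supported by every endpoint's
        // mapper; if no common page size remains, there is no usable mapping granularity.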
        for (_, container) in endpoints.iter() {
            page_size_mask &= container
                .lock()
                .get_mask()
                .map_err(|_e| SysError::new(libc::EIO))?;
        }

        if page_size_mask == 0 {
            return Err(SysError::new(libc::EIO));
        }

        let input_range = virtio_iommu_range_64 {
            start: Le64::from(0),
            end: iova_max_addr.into(),
        };

        let config = virtio_iommu_config {
            page_size_mask: page_size_mask.into(),
            input_range,
            #[cfg(target_arch = "x86_64")]
            probe_size: (IOMMU_PROBE_SIZE as u32).into(),
            ..Default::default()
        };

        let mut avail_features: u64 = base_features;
        avail_features |= 1 << VIRTIO_IOMMU_F_MAP_UNMAP
            | 1 << VIRTIO_IOMMU_F_INPUT_RANGE
            | 1 << VIRTIO_IOMMU_F_MMIO;

        if cfg!(target_arch = "x86_64") {
            avail_features |= 1 << VIRTIO_IOMMU_F_PROBE;
        }

        Ok(Iommu {
            worker_thread: None,
            config,
            avail_features,
            endpoints,
            hp_endpoints_ranges,
            translate_response_senders,
            translate_request_rx,
            iommu_device_tube,
        })
    }
}

impl VirtioDevice for Iommu {
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = Vec::new();

        for (_, mapper) in self.endpoints.iter() {
            rds.append(&mut mapper.lock().as_raw_descriptors());
        }
        if let Some(senders) = &self.translate_response_senders {
            for (_, tube) in senders.iter() {
                rds.push(tube.as_raw_descriptor());
            }
        }
        if let Some(rx) = &self.translate_request_rx {
            rds.push(rx.as_raw_descriptor());
        }

        if let Some(iommu_device_tube) = &self.iommu_device_tube {
            rds.push(iommu_device_tube.as_raw_descriptor());
        }

        rds
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Iommu
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.avail_features
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        let mut config: Vec<u8> = Vec::new();
        config.extend_from_slice(self.config.as_bytes());
        copy_config(data, 0, config.as_slice(), offset);
    }

    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        if queues.len() != QUEUE_SIZES.len() {
            return Err(anyhow!(
                "expected {} queues, got {}",
                QUEUE_SIZES.len(),
                queues.len()
            ));
        }

        // The least significant set bit of page_size_mask defines the page
        // granularity of IOMMU mappings.
        let page_mask = (1u64 << u64::from(self.config.page_size_mask).trailing_zeros()) - 1;
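        // page_mask has all bits below that granularity set; process_dma_map_request uses it
        // to detect misaligned addresses (addr & page_mask != 0).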
        let eps = self.endpoints.clone();
        let hp_endpoints_ranges = self.hp_endpoints_ranges.to_owned();

        let translate_response_senders = self.translate_response_senders.take();
        let translate_request_rx = self.translate_request_rx.take();

        let iommu_device_tube = self
            .iommu_device_tube
            .take()
            .context("failed to start virtio-iommu worker: No control tube")?;

        self.worker_thread = Some(WorkerThread::start("v_iommu", move |kill_evt| {
            let state = State {
                mem,
                page_mask,
                hp_endpoints_ranges,
                endpoint_map: BTreeMap::new(),
                domain_map: BTreeMap::new(),
                endpoints: eps,
                dmabuf_mem: BTreeMap::new(),
            };
            let result = run(
                state,
                iommu_device_tube,
                queues,
                kill_evt,
                interrupt,
                translate_response_senders,
                translate_request_rx,
            );
            if let Err(e) = result {
                error!("virtio-iommu worker thread exited with error: {}", e);
            }
        }));
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn generate_acpi(
        &mut self,
        pci_address: &Option<PciAddress>,
        mut sdts: Vec<SDT>,
    ) -> Option<Vec<SDT>> {
        const OEM_REVISION: u32 = 1;
        const VIOT_REVISION: u8 = 0;

        for sdt in sdts.iter() {
            // there should only be one VIOT table
            if sdt.is_signature(b"VIOT") {
                warn!("vIOMMU: duplicate VIOT table detected");
                return None;
            }
        }

        let mut viot = SDT::new(
            *b"VIOT",
            acpi_tables::HEADER_LEN,
            VIOT_REVISION,
            *b"CROSVM",
            *b"CROSVMDT",
            OEM_REVISION,
        );
        viot.append(VirtioIommuViotHeader {
            // # of PCI range nodes + 1 virtio-pci node
            node_count: (self.endpoints.len() + self.hp_endpoints_ranges.len() + 1) as u16,
            node_offset: (viot.len() + std::mem::size_of::<VirtioIommuViotHeader>()) as u16,
            ..Default::default()
        });

        let bdf = pci_address
            .or_else(|| {
                error!("vIOMMU device has no PCI address");
                None
            })?
            .to_u32() as u16;
        let iommu_offset = viot.len();

        viot.append(VirtioIommuViotVirtioPciNode {
            type_: VIRTIO_IOMMU_VIOT_NODE_VIRTIO_IOMMU_PCI,
            length: size_of::<VirtioIommuViotVirtioPciNode>() as u16,
            bdf,
            ..Default::default()
        });

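        // Each statically attached endpoint gets its own single-BDF PCI range node whose
        // output_node points back at the virtio-iommu PCI node appended above.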
        for (endpoint, _) in self.endpoints.iter() {
            viot.append(VirtioIommuViotPciRangeNode {
                type_: VIRTIO_IOMMU_VIOT_NODE_PCI_RANGE,
                length: size_of::<VirtioIommuViotPciRangeNode>() as u16,
                endpoint_start: *endpoint,
                bdf_start: *endpoint as u16,
                bdf_end: *endpoint as u16,
                output_node: iommu_offset as u16,
                ..Default::default()
            });
        }

        for endpoints_range in self.hp_endpoints_ranges.iter() {
            let (endpoint_start, endpoint_end) = endpoints_range.clone().into_inner();
            viot.append(VirtioIommuViotPciRangeNode {
                type_: VIRTIO_IOMMU_VIOT_NODE_PCI_RANGE,
                length: size_of::<VirtioIommuViotPciRangeNode>() as u16,
                endpoint_start,
                bdf_start: endpoint_start as u16,
                bdf_end: endpoint_end as u16,
                output_node: iommu_offset as u16,
                ..Default::default()
            });
        }

        sdts.push(viot);
        Some(sdts)
    }
}