// Copyright 2021 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

pub mod ipc_memory_mapper;
pub mod memory_mapper;
pub mod protocol;
pub(crate) mod sys;

use std::cell::RefCell;
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;
use std::io;
use std::io::Write;
use std::mem::size_of;
use std::ops::RangeInclusive;
use std::rc::Rc;
use std::result;
use std::sync::Arc;

#[cfg(target_arch = "x86_64")]
use acpi_tables::sdt::SDT;
use anyhow::anyhow;
use anyhow::Context;
use base::debug;
use base::error;
use base::pagesize;
#[cfg(target_arch = "x86_64")]
use base::warn;
use base::AsRawDescriptor;
use base::Error as SysError;
use base::Event;
use base::MappedRegion;
use base::MemoryMapping;
use base::Protection;
use base::RawDescriptor;
use base::Result as SysResult;
use base::Tube;
use base::TubeError;
use base::WorkerThread;
use cros_async::AsyncError;
use cros_async::AsyncTube;
use cros_async::EventAsync;
use cros_async::Executor;
use data_model::Le64;
use futures::select;
use futures::FutureExt;
use hypervisor::MemSlot;
use remain::sorted;
use sync::Mutex;
use thiserror::Error;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::GuestMemoryError;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use zerocopy::FromZeroes;

#[cfg(target_arch = "x86_64")]
use crate::pci::PciAddress;
use crate::virtio::async_utils;
use crate::virtio::copy_config;
use crate::virtio::iommu::memory_mapper::*;
use crate::virtio::iommu::protocol::*;
use crate::virtio::DescriptorChain;
use crate::virtio::DeviceType;
use crate::virtio::Interrupt;
use crate::virtio::Queue;
use crate::virtio::Reader;
use crate::virtio::VirtioDevice;
#[cfg(target_arch = "x86_64")]
use crate::virtio::Writer;

const QUEUE_SIZE: u16 = 256;
const NUM_QUEUES: usize = 2;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES];
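// Queue 0 is the request queue (requestq) and queue 1 is the event queue (eventq), as defined by
// the virtio-iommu specification.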

// Size of struct virtio_iommu_probe_resv_mem, the single property written in response to a
// PROBE request.
#[cfg(target_arch = "x86_64")]
const IOMMU_PROBE_SIZE: usize = size_of::<virtio_iommu_probe_resv_mem>();

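// ACPI VIOT node type values: 1 identifies a PCI range node and 3 a virtio-iommu device based on
// a PCI endpoint.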
#[cfg(target_arch = "x86_64")]
const VIRTIO_IOMMU_VIOT_NODE_PCI_RANGE: u8 = 1;
#[cfg(target_arch = "x86_64")]
const VIRTIO_IOMMU_VIOT_NODE_VIRTIO_IOMMU_PCI: u8 = 3;

#[derive(Copy, Clone, Debug, Default, FromZeroes, FromBytes, AsBytes)]
#[repr(C, packed)]
struct VirtioIommuViotHeader {
    node_count: u16,
    node_offset: u16,
    reserved: [u8; 8],
}

#[derive(Copy, Clone, Debug, Default, FromZeroes, FromBytes, AsBytes)]
#[repr(C, packed)]
struct VirtioIommuViotVirtioPciNode {
    type_: u8,
    reserved: [u8; 1],
    length: u16,
    segment: u16,
    bdf: u16,
    reserved2: [u8; 8],
}

#[derive(Copy, Clone, Debug, Default, FromZeroes, FromBytes, AsBytes)]
#[repr(C, packed)]
struct VirtioIommuViotPciRangeNode {
    type_: u8,
    reserved: [u8; 1],
    length: u16,
    endpoint_start: u32,
    segment_start: u16,
    segment_end: u16,
    bdf_start: u16,
    bdf_end: u16,
    output_node: u16,
    reserved2: [u8; 2],
    reserved3: [u8; 4],
}

type Result<T> = result::Result<T, IommuError>;

#[sorted]
#[derive(Error, Debug)]
pub enum IommuError {
    #[error("async executor error: {0}")]
    AsyncExec(AsyncError),
    #[error("failed to create wait context: {0}")]
    CreateWaitContext(SysError),
    #[error("failed getting host address: {0}")]
    GetHostAddress(GuestMemoryError),
    #[error("failed to read from guest address: {0}")]
    GuestMemoryRead(io::Error),
    #[error("failed to write to guest address: {0}")]
    GuestMemoryWrite(io::Error),
    #[error("memory mapper failed: {0}")]
    MemoryMapper(anyhow::Error),
    #[error("Failed to read descriptor asynchronously: {0}")]
    ReadAsyncDesc(AsyncError),
    #[error("failed to read from virtio queue Event: {0}")]
    ReadQueueEvent(SysError),
    #[error("tube error: {0}")]
    Tube(TubeError),
    #[error("unexpected descriptor error")]
    UnexpectedDescriptor,
    #[error("failed to receive virtio-iommu control request: {0}")]
    VirtioIOMMUReqError(TubeError),
    #[error("failed to send virtio-iommu control response: {0}")]
    VirtioIOMMUResponseError(TubeError),
    #[error("failed to wait for events: {0}")]
    WaitError(SysError),
    #[error("write buffer length too small")]
    WriteBufferTooSmall,
}

// key: domain ID
// value: reference counter and MemoryMapperTrait
type DomainMap = BTreeMap<u32, (u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>)>;

struct DmabufRegionEntry {
    mmap: MemoryMapping,
    mem_slot: MemSlot,
    len: u64,
}

// Shared state for the virtio-iommu device.
struct State {
    mem: GuestMemory,
    page_mask: u64,
    // Hot-pluggable PCI endpoint ranges
    // RangeInclusive: (start endpoint PCI address ..= end endpoint PCI address)
    #[cfg_attr(windows, allow(dead_code))]
    hp_endpoints_ranges: Vec<RangeInclusive<u32>>,
    // All PCI endpoints that are attached to an IOMMU domain
    // key: endpoint PCI address
    // value: attached domain ID
    endpoint_map: BTreeMap<u32, u32>,
    // All attached domains
    domain_map: DomainMap,
    // Contains all pass-through endpoints attached to this IOMMU device
    // key: endpoint PCI address
    // value: MemoryMapperTrait
    endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    // Contains dmabuf regions
    // key: guest physical address
    dmabuf_mem: BTreeMap<u64, DmabufRegionEntry>,
}

impl State {
    // Detach the given endpoint if possible, and return whether or not the endpoint
    // was actually detached. If a successfully detached endpoint has exported
    // memory, returns an event that will be signaled once all exported memory is released.
    //
    // The device MUST ensure that after being detached from a domain, the endpoint
    // cannot access any mapping from that domain.
    //
    // Currently, we only support detaching an endpoint if it is the only endpoint attached
    // to its domain.
    fn detach_endpoint(
        endpoint_map: &mut BTreeMap<u32, u32>,
        domain_map: &mut DomainMap,
        endpoint: u32,
    ) -> (bool, Option<EventAsync>) {
        let mut evt = None;
        // The endpoint has attached to an IOMMU domain
        if let Some(attached_domain) = endpoint_map.get(&endpoint) {
            // Remove the entry or update the domain reference count
            if let Entry::Occupied(o) = domain_map.entry(*attached_domain) {
                let (refs, mapper) = o.get();
                if !mapper.lock().supports_detach() {
                    return (false, None);
                }

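                // A reference count of 1 means this endpoint is the only one attached to the
                // domain, so the domain can be reset and removed; with more endpoints attached
                // the detach request is refused (see the function comment above).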
                match refs {
                    0 => unreachable!(),
                    1 => {
                        evt = mapper.lock().reset_domain();
                        o.remove();
                    }
                    _ => return (false, None),
                }
            }
        }

        endpoint_map.remove(&endpoint);
        (true, evt)
    }

    // Processes an attach request. This may require detaching the endpoint from
    // its current domain before attaching it to the new domain. If that happens
    // while the endpoint has exported memory, this function returns an event that
    // will be signaled once all exported memory is released.
    //
    // Note: if a VFIO group contains multiple devices, it could violate the following
    // requirement from the virtio IOMMU spec: If the VIRTIO_IOMMU_F_BYPASS feature
    // is negotiated, all accesses from unattached endpoints are allowed and translated
    // by the IOMMU using the identity function. If the feature is not negotiated, any
    // memory access from an unattached endpoint fails.
    //
    // The violation arises because, once the virtio-iommu device receives a
    // VIRTIO_IOMMU_T_ATTACH request for the first endpoint in a VFIO group, any
    // not-yet-attached endpoints in that VFIO group will also be able to access the domain.
    //
    // This violation is benign for current virtualization use cases. Since device
    // topology in the guest matches topology in the host, the guest doesn't expect
    // devices in the same VFIO group to be isolated from each other in the first place.
    fn process_attach_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<(usize, Option<EventAsync>)> {
        let req: virtio_iommu_req_attach =
            reader.read_obj().map_err(IommuError::GuestMemoryRead)?;
        let mut fault_resolved_event = None;

        // If the reserved field of an ATTACH request is not zero,
        // the device MUST reject the request and set status to
        // VIRTIO_IOMMU_S_INVAL.
        if req.reserved.iter().any(|&x| x != 0) {
            tail.status = VIRTIO_IOMMU_S_INVAL;
            return Ok((0, None));
        }

        let domain: u32 = req.domain.into();
        let endpoint: u32 = req.endpoint.into();

        if let Some(mapper) = self.endpoints.get(&endpoint) {
            // The same mapper can't be used for two domains at the same time,
            // since that would result in conflicts/permission leaks between
            // the two domains.
            let mapper_id = {
                let m = mapper.lock();
                ((**m).type_id(), m.id())
            };
            for (other_endpoint, other_mapper) in self.endpoints.iter() {
                if *other_endpoint == endpoint {
                    continue;
                }
                let other_id = {
                    let m = other_mapper.lock();
                    ((**m).type_id(), m.id())
                };
                if mapper_id == other_id {
                    if !self
                        .endpoint_map
                        .get(other_endpoint)
                        .map_or(true, |d| d == &domain)
                    {
                        tail.status = VIRTIO_IOMMU_S_UNSUPP;
                        return Ok((0, None));
                    }
                }
            }

            // If the endpoint identified by `endpoint` is already attached
            // to another domain, then the device SHOULD first detach it
            // from that domain and attach it to the one identified by domain.
            if self.endpoint_map.contains_key(&endpoint) {
                // In that case the device SHOULD behave as if the driver issued
                // a DETACH request with this endpoint, followed by the ATTACH
                // request. If the device cannot do so, it MUST reject the request
                // and set status to VIRTIO_IOMMU_S_UNSUPP.
                let (detached, evt) =
                    Self::detach_endpoint(&mut self.endpoint_map, &mut self.domain_map, endpoint);
                if !detached {
                    tail.status = VIRTIO_IOMMU_S_UNSUPP;
                    return Ok((0, None));
                }
                fault_resolved_event = evt;
            }

            let new_ref = match self.domain_map.get(&domain) {
                None => 1,
                Some(val) => val.0 + 1,
            };

            self.endpoint_map.insert(endpoint, domain);
            self.domain_map.insert(domain, (new_ref, mapper.clone()));
        } else {
            // If the endpoint identified by endpoint doesn’t exist,
            // the device MUST reject the request and set status to
            // VIRTIO_IOMMU_S_NOENT.
            tail.status = VIRTIO_IOMMU_S_NOENT;
        }

        Ok((0, fault_resolved_event))
    }

    fn process_detach_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<(usize, Option<EventAsync>)> {
        let req: virtio_iommu_req_detach =
            reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        // If the endpoint identified by |req.endpoint| doesn’t exist,
        // the device MUST reject the request and set status to
        // VIRTIO_IOMMU_S_NOENT.
        let endpoint: u32 = req.endpoint.into();
        if !self.endpoints.contains_key(&endpoint) {
            tail.status = VIRTIO_IOMMU_S_NOENT;
            return Ok((0, None));
        }

        let (detached, evt) =
            Self::detach_endpoint(&mut self.endpoint_map, &mut self.domain_map, endpoint);
        if !detached {
            tail.status = VIRTIO_IOMMU_S_UNSUPP;
        }
        Ok((0, evt))
    }

    fn process_dma_map_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<usize> {
        let req: virtio_iommu_req_map = reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        // If virt_start, phys_start or (virt_end + 1) is not aligned
        // on the page granularity, the device SHOULD reject the
        // request and set status to VIRTIO_IOMMU_S_RANGE
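        // For example, with 4 KiB pages page_mask is 0xfff: phys_start = 0x1000 and
        // virt_end = 0x1fff pass this check, while virt_start = 0x1080 does not.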
        if self.page_mask & u64::from(req.phys_start) != 0
            || self.page_mask & u64::from(req.virt_start) != 0
            || self.page_mask & (u64::from(req.virt_end) + 1) != 0
        {
            tail.status = VIRTIO_IOMMU_S_RANGE;
            return Ok(0);
        }

        // If the device doesn’t recognize a flags bit, it MUST reject
        // the request and set status to VIRTIO_IOMMU_S_INVAL.
        if u32::from(req.flags) & !VIRTIO_IOMMU_MAP_F_MASK != 0 {
            tail.status = VIRTIO_IOMMU_S_INVAL;
            return Ok(0);
        }

        let domain: u32 = req.domain.into();
        if !self.domain_map.contains_key(&domain) {
            // If domain does not exist, the device SHOULD reject
            // the request and set status to VIRTIO_IOMMU_S_NOENT.
            tail.status = VIRTIO_IOMMU_S_NOENT;
            return Ok(0);
        }

        // The device MUST NOT allow writes to a range mapped
        // without the VIRTIO_IOMMU_MAP_F_WRITE flag.
        let write_en = u32::from(req.flags) & VIRTIO_IOMMU_MAP_F_WRITE != 0;

        if let Some(mapper) = self.domain_map.get(&domain) {
            let size = u64::from(req.virt_end) - u64::from(req.virt_start) + 1u64;

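            // Look up the dmabuf region with the greatest start address that is <= phys_start;
            // it backs this mapping only if [phys_start, phys_start + size) lies entirely within
            // the region.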
            let dmabuf_map = self
                .dmabuf_mem
                .range(..=u64::from(req.phys_start))
                .next_back()
                .and_then(|(addr, region)| {
                    if u64::from(req.phys_start) + size <= addr + region.len {
                        Some(region.mmap.as_ptr() as u64 + (u64::from(req.phys_start) - addr))
                    } else {
                        None
                    }
                });

            let prot = match write_en {
                true => Protection::read_write(),
                false => Protection::read(),
            };

            let vfio_map_result = match dmabuf_map {
                // SAFETY:
                // Safe because [dmabuf_map, dmabuf_map + size) refers to an external mmap'ed
                // region.
                Some(dmabuf_map) => unsafe {
                    mapper
                        .1
                        .lock()
                        .vfio_dma_map(req.virt_start.into(), dmabuf_map, size, prot)
                },
                None => mapper.1.lock().add_map(MappingInfo {
                    iova: req.virt_start.into(),
                    gpa: GuestAddress(req.phys_start.into()),
                    size,
                    prot,
                }),
            };

            match vfio_map_result {
                Ok(AddMapResult::Ok) => (),
                Ok(AddMapResult::OverlapFailure) => {
                    // If a mapping already exists in the requested range,
                    // the device SHOULD reject the request and set status
                    // to VIRTIO_IOMMU_S_INVAL.
                    tail.status = VIRTIO_IOMMU_S_INVAL;
                }
                Err(e) => return Err(IommuError::MemoryMapper(e)),
            }
        }

        Ok(0)
    }

    fn process_dma_unmap_request(
        &mut self,
        reader: &mut Reader,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<(usize, Option<EventAsync>)> {
        let req: virtio_iommu_req_unmap = reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        let domain: u32 = req.domain.into();
        let fault_resolved_event = if let Some(mapper) = self.domain_map.get(&domain) {
            let size = u64::from(req.virt_end) - u64::from(req.virt_start) + 1;
            let res = mapper
                .1
                .lock()
                .remove_map(u64::from(req.virt_start), size)
                .map_err(IommuError::MemoryMapper)?;
            match res {
                RemoveMapResult::Success(evt) => evt,
                RemoveMapResult::OverlapFailure => {
                    // If a mapping affected by the range is not covered in its entirety by the
                    // range (the UNMAP request would split the mapping), then the device SHOULD
                    // set the request `status` to VIRTIO_IOMMU_S_RANGE, and SHOULD NOT remove
                    // any mapping.
                    tail.status = VIRTIO_IOMMU_S_RANGE;
                    None
                }
            }
        } else {
            // If domain does not exist, the device SHOULD set the
            // request status to VIRTIO_IOMMU_S_NOENT
            tail.status = VIRTIO_IOMMU_S_NOENT;
            None
        };

        Ok((0, fault_resolved_event))
    }

    #[cfg(target_arch = "x86_64")]
    fn process_probe_request(
        &mut self,
        reader: &mut Reader,
        writer: &mut Writer,
        tail: &mut virtio_iommu_req_tail,
    ) -> Result<usize> {
        let req: virtio_iommu_req_probe = reader.read_obj().map_err(IommuError::GuestMemoryRead)?;
        let endpoint: u32 = req.endpoint.into();

        // If the endpoint identified by endpoint doesn’t exist,
        // then the device SHOULD reject the request and set status
        // to VIRTIO_IOMMU_S_NOENT.
        if !self.endpoints.contains_key(&endpoint) {
            tail.status = VIRTIO_IOMMU_S_NOENT;
        }

        let properties_size = writer.available_bytes() - size_of::<virtio_iommu_req_tail>();

        // properties_size may legitimately be larger than probe_size. A properties_size of 0 is
        // also handled: the request is simply rejected below without writing any property.
        if properties_size < IOMMU_PROBE_SIZE {
            // If the properties list is smaller than probe_size, the device
            // SHOULD NOT write any property. It SHOULD reject the request
            // and set status to VIRTIO_IOMMU_S_INVAL.
            tail.status = VIRTIO_IOMMU_S_INVAL;
        } else if tail.status == VIRTIO_IOMMU_S_OK {
            const VIRTIO_IOMMU_PROBE_T_RESV_MEM: u16 = 1;
            const VIRTIO_IOMMU_RESV_MEM_T_MSI: u8 = 1;
            const PROBE_PROPERTY_SIZE: u16 = 4;
            const X86_MSI_IOVA_START: u64 = 0xfee0_0000;
            const X86_MSI_IOVA_END: u64 = 0xfeef_ffff;

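            // Report the x86 MSI doorbell range as a reserved MSI region so that the guest driver
            // does not use these IOVAs for regular DMA mappings.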
            let properties = virtio_iommu_probe_resv_mem {
                head: virtio_iommu_probe_property {
                    type_: VIRTIO_IOMMU_PROBE_T_RESV_MEM.into(),
                    length: (IOMMU_PROBE_SIZE as u16 - PROBE_PROPERTY_SIZE).into(),
                },
                subtype: VIRTIO_IOMMU_RESV_MEM_T_MSI,
                start: X86_MSI_IOVA_START.into(),
                end: X86_MSI_IOVA_END.into(),
                ..Default::default()
            };
            writer
                .write_all(properties.as_bytes())
                .map_err(IommuError::GuestMemoryWrite)?;
        }

        // If the device doesn’t fill all probe_size bytes with properties,
        // it SHOULD fill the remaining bytes of properties with zeroes.
        let remaining_bytes = writer.available_bytes() - size_of::<virtio_iommu_req_tail>();

        if remaining_bytes > 0 {
            let buffer: Vec<u8> = vec![0; remaining_bytes];
            writer
                .write_all(buffer.as_slice())
                .map_err(IommuError::GuestMemoryWrite)?;
        }

        Ok(properties_size)
    }

    fn execute_request(
        &mut self,
        avail_desc: &mut DescriptorChain,
    ) -> Result<(usize, Option<EventAsync>)> {
        let reader = &mut avail_desc.reader;
        let writer = &mut avail_desc.writer;

        // We need at least enough space to write the virtio_iommu_req_tail.
        if writer.available_bytes() < size_of::<virtio_iommu_req_tail>() {
            return Err(IommuError::WriteBufferTooSmall);
        }

        let req_head: virtio_iommu_req_head =
            reader.read_obj().map_err(IommuError::GuestMemoryRead)?;

        let mut tail = virtio_iommu_req_tail {
            status: VIRTIO_IOMMU_S_OK,
            ..Default::default()
        };

        let (reply_len, fault_resolved_event) = match req_head.type_ {
            VIRTIO_IOMMU_T_ATTACH => self.process_attach_request(reader, &mut tail)?,
            VIRTIO_IOMMU_T_DETACH => self.process_detach_request(reader, &mut tail)?,
            VIRTIO_IOMMU_T_MAP => (self.process_dma_map_request(reader, &mut tail)?, None),
            VIRTIO_IOMMU_T_UNMAP => self.process_dma_unmap_request(reader, &mut tail)?,
            #[cfg(target_arch = "x86_64")]
            VIRTIO_IOMMU_T_PROBE => (self.process_probe_request(reader, writer, &mut tail)?, None),
            _ => return Err(IommuError::UnexpectedDescriptor),
        };

        writer
            .write_all(tail.as_bytes())
            .map_err(IommuError::GuestMemoryWrite)?;
        Ok((
            reply_len + size_of::<virtio_iommu_req_tail>(),
            fault_resolved_event,
        ))
    }
}

async fn request_queue(
    state: &Rc<RefCell<State>>,
    mut queue: Queue,
    mut queue_event: EventAsync,
    interrupt: Interrupt,
) -> Result<()> {
    loop {
        let mut avail_desc = queue
            .next_async(&mut queue_event)
            .await
            .map_err(IommuError::ReadAsyncDesc)?;

        let (len, fault_resolved_event) = match state.borrow_mut().execute_request(&mut avail_desc)
        {
            Ok(res) => res,
            Err(e) => {
                error!("execute_request failed: {}", e);

                // If a request type is not recognized, the device SHOULD NOT write
                // the buffer and SHOULD set the used length to zero
                (0, None)
            }
        };

        if let Some(fault_resolved_event) = fault_resolved_event {
            debug!("waiting for iommu fault resolution");
            fault_resolved_event
                .next_val()
                .await
                .expect("failed waiting for fault");
            debug!("iommu fault resolved");
        }

        queue.add_used(avail_desc, len as u32);
        queue.trigger_interrupt(&interrupt);
    }
}

fn run(
    state: State,
    iommu_device_tube: Tube,
    mut queues: BTreeMap<usize, Queue>,
    kill_evt: Event,
    interrupt: Interrupt,
    translate_response_senders: Option<BTreeMap<u32, Tube>>,
    translate_request_rx: Option<Tube>,
) -> Result<()> {
    let state = Rc::new(RefCell::new(state));
    let ex = Executor::new().expect("Failed to create an executor");

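    // Only the request queue (index 0) is serviced; the event queue stays in `queues` unused, as
    // this device does not currently report translation faults to the guest.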
    let req_queue = queues.remove(&0).unwrap();
    let req_evt = req_queue
        .event()
        .try_clone()
        .expect("Failed to clone queue event");
    let req_evt = EventAsync::new(req_evt, &ex).expect("Failed to create async event for queue");

    let f_resample = async_utils::handle_irq_resample(&ex, interrupt.clone());
    let f_kill = async_utils::await_and_exit(&ex, kill_evt);

    let request_tube = translate_request_rx
        .map(|t| AsyncTube::new(&ex, t).expect("Failed to create async tube for rx"));
    let response_tubes = translate_response_senders.map(|m| {
        m.into_iter()
            .map(|x| {
                (
                    x.0,
                    AsyncTube::new(&ex, x.1).expect("Failed to create async tube"),
                )
            })
            .collect()
    });

    let f_handle_translate_request =
        sys::handle_translate_request(&ex, &state, request_tube, response_tubes);
    let f_request = request_queue(&state, req_queue, req_evt, interrupt);

    let command_tube = AsyncTube::new(&ex, iommu_device_tube).unwrap();
    // Future to handle command messages from host, such as passing vfio containers.
    let f_cmd = sys::handle_command_tube(&state, command_tube);

    let done = async {
        select! {
            res = f_request.fuse() => res.context("error in handling request queue"),
            res = f_resample.fuse() => res.context("error in handle_irq_resample"),
            res = f_kill.fuse() => res.context("error in await_and_exit"),
            res = f_handle_translate_request.fuse() => {
                res.context("error in handle_translate_request")
            }
            res = f_cmd.fuse() => res.context("error in handling host request"),
        }
    };
    match ex.run_until(done) {
        Ok(Ok(())) => {}
        Ok(Err(e)) => error!("Error in worker: {:#}", e),
        Err(e) => return Err(IommuError::AsyncExec(e)),
    }

    Ok(())
}

/// Virtio device for IOMMU memory management.
pub struct Iommu {
    worker_thread: Option<WorkerThread<()>>,
    config: virtio_iommu_config,
    avail_features: u64,
    // Attached endpoints
    // key: endpoint PCI address
    // value: MemoryMapperTrait
    endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    // Hot-pluggable PCI endpoint ranges
    // RangeInclusive: (start endpoint PCI address ..= end endpoint PCI address)
    hp_endpoints_ranges: Vec<RangeInclusive<u32>>,
    translate_response_senders: Option<BTreeMap<u32, Tube>>,
    translate_request_rx: Option<Tube>,
    iommu_device_tube: Option<Tube>,
}

impl Iommu {
    /// Create a new virtio IOMMU device.
    pub fn new(
        base_features: u64,
        endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
        iova_max_addr: u64,
        hp_endpoints_ranges: Vec<RangeInclusive<u32>>,
        translate_response_senders: Option<BTreeMap<u32, Tube>>,
        translate_request_rx: Option<Tube>,
        iommu_device_tube: Option<Tube>,
    ) -> SysResult<Iommu> {
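        // Start from the host page mask (e.g. !0xfff for 4 KiB pages) and intersect it with each
        // endpoint mapper's mask; a result of zero means no page size is supported by every
        // endpoint, which is reported as EIO.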
        let mut page_size_mask = !((pagesize() as u64) - 1);
        for (_, container) in endpoints.iter() {
            page_size_mask &= container
                .lock()
                .get_mask()
                .map_err(|_e| SysError::new(libc::EIO))?;
        }

        if page_size_mask == 0 {
            return Err(SysError::new(libc::EIO));
        }

        let input_range = virtio_iommu_range_64 {
            start: Le64::from(0),
            end: iova_max_addr.into(),
        };

        let config = virtio_iommu_config {
            page_size_mask: page_size_mask.into(),
            input_range,
            #[cfg(target_arch = "x86_64")]
            probe_size: (IOMMU_PROBE_SIZE as u32).into(),
            ..Default::default()
        };

        let mut avail_features: u64 = base_features;
        avail_features |= 1 << VIRTIO_IOMMU_F_MAP_UNMAP
            | 1 << VIRTIO_IOMMU_F_INPUT_RANGE
            | 1 << VIRTIO_IOMMU_F_MMIO;

        if cfg!(target_arch = "x86_64") {
            avail_features |= 1 << VIRTIO_IOMMU_F_PROBE;
        }

        Ok(Iommu {
            worker_thread: None,
            config,
            avail_features,
            endpoints,
            hp_endpoints_ranges,
            translate_response_senders,
            translate_request_rx,
            iommu_device_tube,
        })
    }
}

impl VirtioDevice for Iommu {
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut rds = Vec::new();

        for (_, mapper) in self.endpoints.iter() {
            rds.append(&mut mapper.lock().as_raw_descriptors());
        }
        if let Some(senders) = &self.translate_response_senders {
            for (_, tube) in senders.iter() {
                rds.push(tube.as_raw_descriptor());
            }
        }
        if let Some(rx) = &self.translate_request_rx {
            rds.push(rx.as_raw_descriptor());
        }

        if let Some(iommu_device_tube) = &self.iommu_device_tube {
            rds.push(iommu_device_tube.as_raw_descriptor());
        }

        rds
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Iommu
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.avail_features
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        let mut config: Vec<u8> = Vec::new();
        config.extend_from_slice(self.config.as_bytes());
        copy_config(data, 0, config.as_slice(), offset);
    }

    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        if queues.len() != QUEUE_SIZES.len() {
            return Err(anyhow!(
                "expected {} queues, got {}",
                QUEUE_SIZES.len(),
                queues.len()
            ));
        }

        // The least significant bit of page_size_mask defines the page
        // granularity of IOMMU mappings.
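        // For example, a page_size_mask of !0xfff has 12 trailing zeros, so page_mask becomes
        // 0xfff (4 KiB granularity).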
        let page_mask = (1u64 << u64::from(self.config.page_size_mask).trailing_zeros()) - 1;
        let eps = self.endpoints.clone();
        let hp_endpoints_ranges = self.hp_endpoints_ranges.to_owned();

        let translate_response_senders = self.translate_response_senders.take();
        let translate_request_rx = self.translate_request_rx.take();

        let iommu_device_tube = self
            .iommu_device_tube
            .take()
            .context("failed to start virtio-iommu worker: No control tube")?;

        self.worker_thread = Some(WorkerThread::start("v_iommu", move |kill_evt| {
            let state = State {
                mem,
                page_mask,
                hp_endpoints_ranges,
                endpoint_map: BTreeMap::new(),
                domain_map: BTreeMap::new(),
                endpoints: eps,
                dmabuf_mem: BTreeMap::new(),
            };
            let result = run(
                state,
                iommu_device_tube,
                queues,
                kill_evt,
                interrupt,
                translate_response_senders,
                translate_request_rx,
            );
            if let Err(e) = result {
                error!("virtio-iommu worker thread exited with error: {}", e);
            }
        }));
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn generate_acpi(
        &mut self,
        pci_address: &Option<PciAddress>,
        mut sdts: Vec<SDT>,
    ) -> Option<Vec<SDT>> {
        const OEM_REVISION: u32 = 1;
        const VIOT_REVISION: u8 = 0;

        for sdt in sdts.iter() {
            // there should only be one VIOT table
            if sdt.is_signature(b"VIOT") {
                warn!("vIOMMU: duplicate VIOT table detected");
                return None;
            }
        }

        let mut viot = SDT::new(
            *b"VIOT",
            acpi_tables::HEADER_LEN,
            VIOT_REVISION,
            *b"CROSVM",
            *b"CROSVMDT",
            OEM_REVISION,
        );
        viot.append(VirtioIommuViotHeader {
            // # of PCI range nodes + 1 virtio-pci node
            node_count: (self.endpoints.len() + self.hp_endpoints_ranges.len() + 1) as u16,
            node_offset: (viot.len() + std::mem::size_of::<VirtioIommuViotHeader>()) as u16,
            ..Default::default()
        });

        let bdf = pci_address
            .or_else(|| {
                error!("vIOMMU device has no PCI address");
                None
            })?
            .to_u32() as u16;
        let iommu_offset = viot.len();
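        // PCI range nodes appended below refer back to the virtio-iommu node at this byte offset
        // through their output_node field.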

        viot.append(VirtioIommuViotVirtioPciNode {
            type_: VIRTIO_IOMMU_VIOT_NODE_VIRTIO_IOMMU_PCI,
            length: size_of::<VirtioIommuViotVirtioPciNode>() as u16,
            bdf,
            ..Default::default()
        });

        for (endpoint, _) in self.endpoints.iter() {
            viot.append(VirtioIommuViotPciRangeNode {
                type_: VIRTIO_IOMMU_VIOT_NODE_PCI_RANGE,
                length: size_of::<VirtioIommuViotPciRangeNode>() as u16,
                endpoint_start: *endpoint,
                bdf_start: *endpoint as u16,
                bdf_end: *endpoint as u16,
                output_node: iommu_offset as u16,
                ..Default::default()
            });
        }

        for endpoints_range in self.hp_endpoints_ranges.iter() {
            let (endpoint_start, endpoint_end) = endpoints_range.clone().into_inner();
            viot.append(VirtioIommuViotPciRangeNode {
                type_: VIRTIO_IOMMU_VIOT_NODE_PCI_RANGE,
                length: size_of::<VirtioIommuViotPciRangeNode>() as u16,
                endpoint_start,
                bdf_start: endpoint_start as u16,
                bdf_end: endpoint_end as u16,
                output_node: iommu_offset as u16,
                ..Default::default()
            });
        }

        sdts.push(viot);
        Some(sdts)
    }
}
930