1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::num::Wrapping;
6 use std::sync::atomic::fence;
7 use std::sync::atomic::Ordering;
8 use std::sync::Arc;
9 
10 use anyhow::bail;
11 use anyhow::Context;
12 use anyhow::Result;
13 use base::error;
14 use base::warn;
15 use base::Protection;
16 use cros_async::AsyncError;
17 use cros_async::EventAsync;
18 use data_model::Le16;
19 use data_model::Le32;
20 use data_model::Le64;
21 use smallvec::smallvec;
22 use smallvec::SmallVec;
23 use sync::Mutex;
24 use virtio_sys::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
25 use vm_memory::GuestAddress;
26 use vm_memory::GuestMemory;
27 use zerocopy::AsBytes;
28 use zerocopy::FromBytes;
29 
30 use super::SignalableInterrupt;
31 use super::VIRTIO_MSI_NO_VECTOR;
32 use crate::virtio::ipc_memory_mapper::ExportedRegion;
33 use crate::virtio::ipc_memory_mapper::IpcMemoryMapper;
34 use crate::virtio::memory_mapper::MemRegion;
35 use crate::virtio::memory_util::read_obj_from_addr_wrapper;
36 use crate::virtio::memory_util::write_obj_at_addr_wrapper;
37 
38 const VIRTQ_DESC_F_NEXT: u16 = 0x1;
39 const VIRTQ_DESC_F_WRITE: u16 = 0x2;
40 #[allow(dead_code)]
41 const VIRTQ_DESC_F_INDIRECT: u16 = 0x4;
42 
43 #[allow(dead_code)]
44 const VIRTQ_USED_F_NO_NOTIFY: u16 = 0x1;
45 #[allow(dead_code)]
46 const VIRTQ_AVAIL_F_NO_INTERRUPT: u16 = 0x1;
47 
48 /// An iterator over a single descriptor chain.  Not to be confused with AvailIter,
49 /// which iterates over the descriptor chain heads in a queue.
50 pub struct DescIter {
51     next: Option<DescriptorChain>,
52 }
53 
54 impl DescIter {
55     /// Returns an iterator that only yields the readable descriptors in the chain.
56     pub fn readable(self) -> impl Iterator<Item = DescriptorChain> {
57         self.take_while(DescriptorChain::is_read_only)
58     }
59 
60     /// Returns an iterator that only yields the writable descriptors in the chain.
61     pub fn writable(self) -> impl Iterator<Item = DescriptorChain> {
62         self.skip_while(DescriptorChain::is_read_only)
63     }
64 }
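// A minimal usage sketch (illustrative only, not part of this file's API surface): a
// device that has popped a `DescriptorChain` from its queue might split the chain into
// its driver-readable and device-writable parts like this:
//
//     let readable: Vec<DescriptorChain> = chain.clone().into_iter().readable().collect();
//     let writable: Vec<DescriptorChain> = chain.into_iter().writable().collect();
//
// This relies on the split-virtqueue convention that all read-only descriptors in a
// chain precede the write-only ones.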
65 
66 impl Iterator for DescIter {
67     type Item = DescriptorChain;
68 
69     fn next(&mut self) -> Option<Self::Item> {
70         if let Some(current) = self.next.take() {
71             self.next = current.next_descriptor();
72             Some(current)
73         } else {
74             None
75         }
76     }
77 }
78 
79 /// A virtio descriptor chain.
80 #[derive(Clone)]
81 pub struct DescriptorChain {
82     mem: GuestMemory,
83     desc_table: GuestAddress,
84     queue_size: u16,
85     ttl: u16, // used to prevent infinite chain cycles
86 
87     /// Index into the descriptor table
88     pub index: u16,
89 
90     /// Guest physical address of device specific data, or IO virtual address
91     /// if iommu is used
92     pub addr: GuestAddress,
93 
94     /// Length of device specific data
95     pub len: u32,
96 
97     /// Includes next, write, and indirect bits
98     pub flags: u16,
99 
100     /// Index into the descriptor table of the next descriptor if flags has
101     /// the next bit set
102     pub next: u16,
103 
104     /// The memory regions associated with the current descriptor.
105     regions: SmallVec<[MemRegion; 1]>,
106 
107     /// Translates `addr` to guest physical address
108     iommu: Option<Arc<Mutex<IpcMemoryMapper>>>,
109 
110     /// The exported descriptor table of this chain's queue. Present
111     /// iff iommu is present.
112     exported_desc_table: Option<ExportedRegion>,
113 
114     /// The exported iommu region of the current descriptor. Present iff
115     /// iommu is present.
116     exported_region: Option<ExportedRegion>,
117 }
118 
119 #[derive(Copy, Clone, Debug, FromBytes, AsBytes)]
120 #[repr(C)]
121 pub struct Desc {
122     pub addr: Le64,
123     pub len: Le32,
124     pub flags: Le16,
125     pub next: Le16,
126 }
127 
128 impl DescriptorChain {
129     pub(crate) fn checked_new(
130         mem: &GuestMemory,
131         desc_table: GuestAddress,
132         queue_size: u16,
133         index: u16,
134         required_flags: u16,
135         iommu: Option<Arc<Mutex<IpcMemoryMapper>>>,
136         exported_desc_table: Option<ExportedRegion>,
137     ) -> Result<DescriptorChain> {
138         if index >= queue_size {
139             bail!("index ({}) >= queue_size ({})", index, queue_size);
140         }
141 
142         let desc_head = desc_table
143             .checked_add((index as u64) * 16)
144             .context("integer overflow")?;
145         let desc: Desc = read_obj_from_addr_wrapper(mem, &exported_desc_table, desc_head)
146             .with_context(|| format!("failed to read desc {:x}", desc_head.offset()))?;
147 
148         let addr = GuestAddress(desc.addr.into());
149         let len = desc.len.to_native();
150         let (regions, exported_region) = if let Some(iommu) = &iommu {
151             if exported_desc_table.is_none() {
152                 bail!("missing exported descriptor table");
153             }
154 
155             let exported_region =
156                 ExportedRegion::new(mem, iommu.clone(), addr.offset(), len.into())
157                     .context("failed to get mem regions")?;
158 
159             let regions = exported_region.get_mem_regions();
160             let required_prot = if required_flags & VIRTQ_DESC_F_WRITE == 0 {
161                 Protection::read()
162             } else {
163                 Protection::write()
164             };
165             for r in &regions {
166                 if !r.prot.allows(&required_prot) {
167                     bail!("missing RW permissions for descriptor");
168                 }
169             }
170 
171             (regions, Some(exported_region))
172         } else {
173             (
174                 smallvec![MemRegion {
175                     gpa: addr,
176                     len: len.into(),
177                     prot: Protection::read_write(),
178                 }],
179                 None,
180             )
181         };
182 
183         let chain = DescriptorChain {
184             mem: mem.clone(),
185             desc_table,
186             queue_size,
187             ttl: queue_size,
188             index,
189             addr,
190             len,
191             flags: desc.flags.into(),
192             next: desc.next.into(),
193             iommu,
194             regions,
195             exported_region,
196             exported_desc_table,
197         };
198 
199         if chain.is_valid() && chain.flags & required_flags == required_flags {
200             Ok(chain)
201         } else {
202             bail!("chain is invalid")
203         }
204     }
205 
206     pub fn into_mem_regions(self) -> (SmallVec<[MemRegion; 1]>, Option<ExportedRegion>) {
207         (self.regions, self.exported_region)
208     }
209 
210     fn is_valid(&self) -> bool {
211         if self.len > 0 {
212             // Each region in `self.regions` must be a contiguous range in `self.mem`.
213             if !self
214                 .regions
215                 .iter()
216                 .all(|r| self.mem.is_valid_range(r.gpa, r.len as u64))
217             {
218                 return false;
219             }
220         }
221 
222         !self.has_next() || self.next < self.queue_size
223     }
224 
225     /// Returns true if this descriptor chain has another descriptor linked after it.
226     pub fn has_next(&self) -> bool {
227         self.flags & VIRTQ_DESC_F_NEXT != 0 && self.ttl > 1
228     }
229 
230     /// If the driver designated this as a write only descriptor.
231     ///
232     /// If this is false, this descriptor is read only.
233     /// Write only means the emulated device can write and the driver can read.
234     pub fn is_write_only(&self) -> bool {
235         self.flags & VIRTQ_DESC_F_WRITE != 0
236     }
237 
238     /// If the driver designated this as a read only descriptor.
239     ///
240     /// If this is false, this descriptor is write only.
241     /// Read only means the emulated device can read and the driver can write.
242     pub fn is_read_only(&self) -> bool {
243         self.flags & VIRTQ_DESC_F_WRITE == 0
244     }
245 
246     /// Gets the next descriptor in this descriptor chain, if there is one.
247     ///
248     /// Note that this is distinct from the next descriptor chain returned by `AvailIter`, which is
249     /// the head of the next _available_ descriptor chain.
250     pub fn next_descriptor(&self) -> Option<DescriptorChain> {
251         if self.has_next() {
252             // Once we see a write-only descriptor, all subsequent descriptors must be write-only.
253             let required_flags = self.flags & VIRTQ_DESC_F_WRITE;
254             let iommu = self.iommu.as_ref().map(Arc::clone);
255             match DescriptorChain::checked_new(
256                 &self.mem,
257                 self.desc_table,
258                 self.queue_size,
259                 self.next,
260                 required_flags,
261                 iommu,
262                 self.exported_desc_table.clone(),
263             ) {
264                 Ok(mut c) => {
265                     c.ttl = self.ttl - 1;
266                     Some(c)
267                 }
268                 Err(e) => {
269                     error!("{:#}", e);
270                     None
271                 }
272             }
273         } else {
274             None
275         }
276     }
277 
278     /// Produces an iterator over all the descriptors in this chain.
279     pub fn into_iter(self) -> DescIter {
280         DescIter { next: Some(self) }
281     }
282 }
283 
284 /// Consuming iterator over all available descriptor chain heads in the queue.
285 pub struct AvailIter<'a, 'b> {
286     mem: &'a GuestMemory,
287     queue: &'b mut Queue,
288 }
289 
290 impl<'a, 'b> Iterator for AvailIter<'a, 'b> {
291     type Item = DescriptorChain;
292 
293     fn next(&mut self) -> Option<Self::Item> {
294         self.queue.pop(self.mem)
295     }
296 }
297 
298 /// A virtio queue's parameters.
299 pub struct Queue {
300     /// Whether this queue has already been activated.
301     activated: bool,
302 
303     /// The maximal size in elements offered by the device
304     max_size: u16,
305 
306     /// The queue size in elements the driver selected. This is always guaranteed to be a power of
307     /// two less than or equal to `max_size`, as required for split virtqueues. These invariants are
308     /// enforced by `set_size()`.
309     size: u16,
310 
311     /// Indicates whether the queue has finished configuration
312     ready: bool,
313 
314     /// MSI-X vector for the queue. Don't care for INTx
315     vector: u16,
316 
317     /// Guest physical address of the descriptor table
318     desc_table: GuestAddress,
319 
320     /// Guest physical address of the available ring
321     avail_ring: GuestAddress,
322 
323     /// Guest physical address of the used ring
324     used_ring: GuestAddress,
325 
326     pub next_avail: Wrapping<u16>,
327     pub next_used: Wrapping<u16>,
328 
329     // Device feature bits accepted by the driver
330     features: u64,
331     last_used: Wrapping<u16>,
332 
333     iommu: Option<Arc<Mutex<IpcMemoryMapper>>>,
334 
335     // When |iommu| is present, |desc_table| and the rings are IOVAs rather than real
336     // GPAs. These are the exported regions used to access the underlying GPAs. They
337     // are initialized by |export_memory| and released by |release_exported_memory|.
338     exported_desc_table: Option<ExportedRegion>,
339     exported_avail_ring: Option<ExportedRegion>,
340     exported_used_ring: Option<ExportedRegion>,
341 }
342 
343 macro_rules! accessors {
344     ($var:ident, $t:ty, $setter:ident) => {
345         pub fn $var(&self) -> $t {
346             self.$var
347         }
348 
349         pub fn $setter(&mut self, val: $t) {
350             if self.ready {
351                 warn!("ignoring write to {} on ready queue", stringify!($var));
352                 return;
353             }
354             self.$var = val;
355         }
356     };
357 }
358 
359 impl Queue {
360     /// Constructs an empty virtio queue with the given `max_size`.
361     pub fn new(max_size: u16) -> Queue {
362         assert!(max_size.is_power_of_two());
363         Queue {
364             activated: false,
365             max_size,
366             size: max_size,
367             ready: false,
368             vector: VIRTIO_MSI_NO_VECTOR,
369             desc_table: GuestAddress(0),
370             avail_ring: GuestAddress(0),
371             used_ring: GuestAddress(0),
372             next_avail: Wrapping(0),
373             next_used: Wrapping(0),
374             features: 0,
375             last_used: Wrapping(0),
376             iommu: None,
377             exported_desc_table: None,
378             exported_avail_ring: None,
379             exported_used_ring: None,
380         }
381     }
382 
383     accessors!(vector, u16, set_vector);
384     accessors!(desc_table, GuestAddress, set_desc_table);
385     accessors!(avail_ring, GuestAddress, set_avail_ring);
386     accessors!(used_ring, GuestAddress, set_used_ring);
387 
388     /// Return the maximum size of this queue.
389     pub fn max_size(&self) -> u16 {
390         self.max_size
391     }
392 
393     /// Return the actual size of the queue, as the driver may not set up a
394     /// queue as big as the device allows.
395     pub fn size(&self) -> u16 {
396         self.size
397     }
398 
399     /// Set the queue size requested by the driver, which may be smaller than the maximum size.
400     pub fn set_size(&mut self, val: u16) {
401         if self.ready {
402             warn!("ignoring write to queue_size on ready queue");
403             return;
404         }
405 
406         if val > self.max_size || !val.is_power_of_two() {
407             warn!(
408                 "ignoring invalid queue_size {} (max_size {})",
409                 val, self.max_size,
410             );
411             return;
412         }
413 
414         self.size = val;
415     }
416 
417     /// Return whether the driver has enabled this queue.
418     pub fn ready(&self) -> bool {
419         self.ready
420     }
421 
422     /// Signal that the driver has completed queue configuration.
423     pub fn set_ready(&mut self, enable: bool) {
424         // If the queue is already in the desired state, return early.
425         if enable == self.ready {
426             return;
427         }
428 
429         if enable {
430             // Validate addresses and queue size to ensure that address calculation won't overflow.
431             let ring_sizes = self.ring_sizes();
432             let rings =
433                 ring_sizes
434                     .iter()
435                     .zip(vec!["descriptor table", "available ring", "used ring"]);
436 
437             for ((addr, size), name) in rings {
438                 if addr.checked_add(*size as u64).is_none() {
439                     error!(
440                         "virtio queue {} goes out of bounds: start:0x{:08x} size:0x{:08x}",
441                         name,
442                         addr.offset(),
443                         size,
444                     );
445                     return;
446                 }
447             }
448         }
449 
450         self.ready = enable;
451     }
452 
453     /// Convert the queue configuration into an active queue.
454     pub fn activate(&mut self) -> Result<Queue> {
455         if !self.ready {
456             bail!("attempted to activate a non-ready queue");
457         }
458 
459         if self.activated {
460             bail!("queue is already activated");
461         }
462 
463         self.activated = true;
464 
465         let queue = Queue {
466             activated: self.activated,
467             max_size: self.max_size,
468             size: self.size,
469             ready: self.ready,
470             vector: self.vector,
471             desc_table: self.desc_table,
472             avail_ring: self.avail_ring,
473             used_ring: self.used_ring,
474             next_avail: self.next_avail,
475             next_used: self.next_used,
476             features: self.features,
477             last_used: self.last_used,
478             iommu: self.iommu.as_ref().map(Arc::clone),
479             exported_desc_table: self.exported_desc_table.clone(),
480             exported_avail_ring: self.exported_avail_ring.clone(),
481             exported_used_ring: self.exported_used_ring.clone(),
482         };
483         Ok(queue)
484     }
485 
486     // Return `index` modulo the currently configured queue size.
487     fn wrap_queue_index(&self, index: Wrapping<u16>) -> u16 {
488         // We know that `self.size` is a power of two (enforced by `set_size()`), so the modulus can
489         // be calculated with a bitmask rather than actual division.
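        // For example (illustrative values): with `size == 8` the mask is 0b0111, so an
        // index of `Wrapping(11)` maps to `11 & 7 == 3`, the same result as `11 % 8`.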
490         debug_assert!(self.size.is_power_of_two());
491         index.0 & (self.size - 1)
492     }
493 
494     /// Reset queue to a clean state
495     pub fn reset(&mut self) {
496         self.activated = false;
497         self.ready = false;
498         self.size = self.max_size;
499         self.vector = VIRTIO_MSI_NO_VECTOR;
500         self.desc_table = GuestAddress(0);
501         self.avail_ring = GuestAddress(0);
502         self.used_ring = GuestAddress(0);
503         self.next_avail = Wrapping(0);
504         self.next_used = Wrapping(0);
505         self.features = 0;
506         self.last_used = Wrapping(0);
507         self.exported_desc_table = None;
508         self.exported_avail_ring = None;
509         self.exported_used_ring = None;
510     }
511 
512     /// Reset queue's counters.
513     /// This method doesn't change the queue's metadata so it's reusable without initializing it
514     /// again.
515     pub fn reset_counters(&mut self) {
516         self.next_avail = Wrapping(0);
517         self.next_used = Wrapping(0);
518         self.last_used = Wrapping(0);
519     }
520 
521     fn ring_sizes(&self) -> Vec<(GuestAddress, usize)> {
522         let queue_size = self.size as usize;
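        // These sizes follow the split virtqueue layout: each descriptor entry is 16 bytes;
        // the available ring is flags (2) + idx (2) + ring (2 * size) + used_event (2);
        // the used ring is flags (2) + idx (2) + ring (8 * size) + avail_event (2).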
523         vec![
524             (self.desc_table, 16 * queue_size),
525             (self.avail_ring, 6 + 2 * queue_size),
526             (self.used_ring, 6 + 8 * queue_size),
527         ]
528     }
529 
530     /// If this queue is for a device that sits behind a virtio-iommu device, exports
531     /// this queue's memory. After the queue becomes ready, this must be called before
532     /// using the queue, to convert the IOVA-based configuration to GuestAddresses.
533     pub fn export_memory(&mut self, mem: &GuestMemory) -> Result<()> {
534         if !self.ready {
535             bail!("not ready");
536         }
537         if self.exported_desc_table.is_some() {
538             bail!("already exported");
539         }
540 
541         let iommu = self.iommu.as_ref().context("no iommu to export with")?;
542 
543         let ring_sizes = self.ring_sizes();
544         let rings = ring_sizes.iter().zip(vec![
545             &mut self.exported_desc_table,
546             &mut self.exported_avail_ring,
547             &mut self.exported_used_ring,
548         ]);
549 
550         for ((addr, size), region) in rings {
551             *region = Some(
552                 ExportedRegion::new(mem, iommu.clone(), addr.offset(), *size as u64)
553                     .context("failed to export region")?,
554             );
555         }
556         Ok(())
557     }
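    // Illustrative ordering for a queue that sits behind virtio-iommu (a sketch, assuming
    // `mapper` and `mem` are set up elsewhere):
    //
    //     queue.set_iommu(mapper);
    //     // ... the driver configures and enables the queue ...
    //     queue.export_memory(&mem)?;   // after ready, before the queue is first used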
558 
559     /// Releases memory exported by a previous call to [`Queue::export_memory()`].
560     pub fn release_exported_memory(&mut self) {
561         self.exported_desc_table = None;
562         self.exported_avail_ring = None;
563         self.exported_used_ring = None;
564     }
565 
566     // Get the index of the first available descriptor chain in the available ring
567     // (the next one that the driver will fill).
568     //
569     // All available ring entries between `self.next_avail` and `get_avail_index()` are available
570     // to be processed by the device.
571     fn get_avail_index(&self, mem: &GuestMemory) -> Wrapping<u16> {
572         fence(Ordering::SeqCst);
573 
574         let avail_index_addr = self.avail_ring.unchecked_add(2);
575         let avail_index: u16 =
576             read_obj_from_addr_wrapper(mem, &self.exported_avail_ring, avail_index_addr).unwrap();
577 
578         Wrapping(avail_index)
579     }
580 
581     // Set the `avail_event` field in the used ring.
582     //
583     // This allows the device to inform the driver that driver-to-device notification
584     // (kicking the ring) is not necessary until the driver reaches the `avail_index` descriptor.
585     //
586     // This value is only used if the `VIRTIO_F_EVENT_IDX` feature has been negotiated.
587     fn set_avail_event(&mut self, mem: &GuestMemory, avail_index: Wrapping<u16>) {
588         fence(Ordering::SeqCst);
589 
590         let avail_event_addr = self.used_ring.unchecked_add(4 + 8 * u64::from(self.size));
591         write_obj_at_addr_wrapper(
592             mem,
593             &self.exported_used_ring,
594             avail_index.0,
595             avail_event_addr,
596         )
597         .unwrap();
598     }
599 
600     // Query the value of a single-bit flag in the available ring.
601     //
602     // Returns `true` if `flag` is currently set (by the driver) in the available ring flags.
603     fn get_avail_flag(&self, mem: &GuestMemory, flag: u16) -> bool {
604         fence(Ordering::SeqCst);
605 
606         let avail_flags: u16 =
607             read_obj_from_addr_wrapper(mem, &self.exported_avail_ring, self.avail_ring).unwrap();
608 
609         avail_flags & flag == flag
610     }
611 
612     // Get the `used_event` field in the available ring.
613     //
614     // The returned value is the index of the next descriptor chain entry for which the driver
615     // needs to be notified upon use.  Entries before this index may be used without notifying
616     // the driver.
617     //
618     // This value is only valid if the `VIRTIO_F_EVENT_IDX` feature has been negotiated.
619     fn get_used_event(&self, mem: &GuestMemory) -> Wrapping<u16> {
620         fence(Ordering::SeqCst);
621 
622         let used_event_addr = self.avail_ring.unchecked_add(4 + 2 * u64::from(self.size));
623         let used_event: u16 =
624             read_obj_from_addr_wrapper(mem, &self.exported_avail_ring, used_event_addr).unwrap();
625 
626         Wrapping(used_event)
627     }
628 
629     // Set the `idx` field in the used ring.
630     //
631     // This indicates to the driver that all entries up to (but not including) `used_index` have
632     // been used by the device and may be processed by the driver.
633     fn set_used_index(&mut self, mem: &GuestMemory, used_index: Wrapping<u16>) {
634         fence(Ordering::SeqCst);
635 
636         let used_index_addr = self.used_ring.unchecked_add(2);
637         write_obj_at_addr_wrapper(mem, &self.exported_used_ring, used_index.0, used_index_addr)
638             .unwrap();
639     }
640 
641     /// Get the first available descriptor chain without removing it from the queue.
642     /// Call `pop_peeked` to remove the returned descriptor chain from the queue.
643     pub fn peek(&mut self, mem: &GuestMemory) -> Option<DescriptorChain> {
644         if !self.ready {
645             error!("attempt to use virtio queue that is not marked ready");
646             return None;
647         }
648 
649         let avail_index = self.get_avail_index(mem);
650         if self.next_avail == avail_index {
651             return None;
652         }
653 
654         // This fence ensures that subsequent reads from the descriptor do not
655         // get reordered and happen only after fetching the available_index and
656         // checking that there is a slot available.
657         fence(Ordering::SeqCst);
658 
659         let desc_idx_addr_offset = 4 + (u64::from(self.wrap_queue_index(self.next_avail)) * 2);
660         let desc_idx_addr = self.avail_ring.checked_add(desc_idx_addr_offset)?;
661 
662         // This index is checked below in checked_new.
663         let descriptor_index: u16 =
664             read_obj_from_addr_wrapper(mem, &self.exported_avail_ring, desc_idx_addr).unwrap();
665 
666         let iommu = self.iommu.as_ref().map(Arc::clone);
667         DescriptorChain::checked_new(
668             mem,
669             self.desc_table,
670             self.size,
671             descriptor_index,
672             0,
673             iommu,
674             self.exported_desc_table.clone(),
675         )
676         .map_err(|e| {
677             error!("{:#}", e);
678             e
679         })
680         .ok()
681     }
682 
683     /// Remove the first available descriptor chain from the queue.
684     /// This function should only be called immediately following `peek`.
685     pub fn pop_peeked(&mut self, mem: &GuestMemory) {
686         self.next_avail += Wrapping(1);
687         if self.features & ((1u64) << VIRTIO_RING_F_EVENT_IDX) != 0 {
688             self.set_avail_event(mem, self.next_avail);
689         }
690     }
691 
692     /// If a new descriptor chain head is available, returns it and removes it from the queue.
693     pub fn pop(&mut self, mem: &GuestMemory) -> Option<DescriptorChain> {
694         let descriptor_chain = self.peek(mem);
695         if descriptor_chain.is_some() {
696             self.pop_peeked(mem);
697         }
698         descriptor_chain
699     }
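    // A minimal device-side processing loop (a sketch; `process` is a hypothetical
    // device-specific helper, and `interrupt` is some `SignalableInterrupt`):
    //
    //     while let Some(chain) = queue.pop(&mem) {
    //         let desc_index = chain.index;
    //         let bytes_written = process(chain);
    //         queue.add_used(&mem, desc_index, bytes_written);
    //     }
    //     queue.trigger_interrupt(&mem, &interrupt);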
700 
701     /// A consuming iterator over all available descriptor chain heads offered by the driver.
702     pub fn iter<'a, 'b>(&'b mut self, mem: &'a GuestMemory) -> AvailIter<'a, 'b> {
703         AvailIter { mem, queue: self }
704     }
705 
706     /// Asynchronously read the next descriptor chain from the queue.
707     /// Returns a `DescriptorChain` when it is `await`ed.
708     pub async fn next_async(
709         &mut self,
710         mem: &GuestMemory,
711         eventfd: &mut EventAsync,
712     ) -> std::result::Result<DescriptorChain, AsyncError> {
713         loop {
714             // Check if there are more descriptors available.
715             if let Some(chain) = self.pop(mem) {
716                 return Ok(chain);
717             }
718             eventfd.next_val().await?;
719         }
720     }
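    // Async usage sketch (assuming `kick_evt` is an `EventAsync` bound to this queue's
    // kick event):
    //
    //     let chain = queue.next_async(&mem, &mut kick_evt).await?;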
721 
722     /// Puts an available descriptor head into the used ring for use by the guest.
723     pub fn add_used(&mut self, mem: &GuestMemory, desc_index: u16, len: u32) {
724         if desc_index >= self.size {
725             error!(
726                 "attempted to add out of bounds descriptor to used ring: {}",
727                 desc_index
728             );
729             return;
730         }
731 
732         let used_ring = self.used_ring;
733         let next_used = self.wrap_queue_index(self.next_used) as usize;
734         let used_elem = used_ring.unchecked_add((4 + next_used * 8) as u64);
735 
736         // These writes can't fail as we are guaranteed to be within the used ring.
737         write_obj_at_addr_wrapper(mem, &self.exported_used_ring, desc_index as u32, used_elem)
738             .unwrap();
739         write_obj_at_addr_wrapper(
740             mem,
741             &self.exported_used_ring,
742             len as u32,
743             used_elem.unchecked_add(4),
744         )
745         .unwrap();
746 
747         self.next_used += Wrapping(1);
748         self.set_used_index(mem, self.next_used);
749     }
750 
751     /// Returns if the queue should have an interrupt sent based on its state.
752     ///
753     /// This function implements `VIRTIO_RING_F_EVENT_IDX`, otherwise known as
754     /// interrupt suppression. The virtio spec provides the driver with a field,
755     /// `used_event`, which says that once we write that descriptor (or several
756     /// in the case of a flurry of `add_used` calls), we should send a
757     /// notification. Because the values involved wrap around `u16::MAX`, and to
758     /// avoid checking the condition on every `add_used` call, the math is a
759     /// little complicated.
760     ///
761     /// The critical inequality is:
762     /// ```text
763     ///      (next_used - 1) - used_event < next_used - last_used
764     /// ```
765     ///
766     /// For illustration purposes, we label it as `A < B`, where
767     /// `A = (next_used - 1) - used_event`, and `B = next_used - last_used`.
768     ///
769     /// `A` and `B` represent two distances, measured in a wrapping ring of size
770     /// `u16::MAX`. In the "send intr" case, the inequality is true. In the
771     /// "don't send intr" case, the inequality is false. We must be very careful
772     /// in assigning a direction to the ring, so that when we
773     /// graph the subtraction operations, we are measuring the right distance
774     /// (similar to how DC circuits are analyzed).
775     ///
776     /// The two distances are as follows:
777     ///  * `A` is the distance between the driver's requested notification
778     ///    point, and the current position in the ring.
779     ///
780     ///  * `B` is the distance between the last time we notified the guest,
781     ///    and the current position in the ring.
782     ///
783     /// If we graph these distances for the situation where we want to notify
784     /// the guest, and when we don't want to notify the guest, we see that
785     /// `A < B` becomes true the moment `next_used - 1` passes `used_event`. See
786     /// the graphs at the bottom of this comment block for a more visual
787     /// explanation.
788     ///
789     /// Once an interrupt is sent, we have a final useful property: last_used
790     /// moves up next_used, which causes the inequality to be false. Thus, we
791     /// won't send notifications again until `used_event` is moved forward by
792     /// the driver.
793     ///
794     /// Finally, let's talk about a couple of ways to write this inequality
795     /// that don't work, and critically, explain *why*.
796     ///
797     /// First, a naive reading of the virtio spec might lead us to ask: why not
798     /// just use the following inequality:
799     /// ```text
800     ///      next_used - 1 >= used_event
801     /// ```
802     ///
803     /// because that's much simpler, right? The trouble is that the ring wraps,
804     /// so it could be that a smaller index is actually ahead of a larger one.
805     /// That's why we have to use distances in the ring instead.
806     ///
807     /// Second, one might look at the correct inequality:
808     /// ```text
809     ///      (next_used - 1) - used_event < next_used - last_used
810     /// ```
811     ///
812     /// And try to simplify it to:
813     /// ```text
814     ///      last_used - 1 < used_event
815     /// ```
816     ///
817     /// Functionally, this won't work because next_used isn't present at all
818     /// anymore. (Notifications will never be sent.) But why is that? The algebra
819     /// here *appears* to work out, but all semantic meaning is lost. There are
820     /// two explanations for why this happens:
821     /// * The intuitive one: the terms in the inequality are not actually
822     ///   separable; in other words, (next_used - last_used) is an inseparable
823     ///   term, so subtracting next_used from both sides of the original
824     ///   inequality and zeroing them out is semantically invalid. But why aren't
825     ///   they separable? See below.
826     /// * The theoretical one: canceling like terms relies on a vector space law:
827     ///   a + x = b + x => a = b (cancellation law). For congruences / equality
828     ///   under modulo, this law is satisfied, but for inequalities under mod, it
829     ///   is not; therefore, we cannot cancel like terms.
830     ///
831     /// ```text
832     /// ┌──────────────────────────────────┐
833     /// │                                  │
834     /// │                                  │
835     /// │                                  │
836     /// │           ┌────────────  next_used - 1
837     /// │           │A                   x
838     /// │           │       ┌────────────x────────────┐
839     /// │           │       │            x            │
840     /// │           │       │                         │
841     /// │           │       │               │         │
842     /// │           │       │               │         │
843     /// │     used_event  xxxx        + ◄───┘       xxxxx last_used
844     /// │                   │                         │      │
845     /// │                   │        Send intr        │      │
846     /// │                   │                         │      │
847     /// │                   └─────────────────────────┘      │
848     /// │                                                    │
849     /// │ B                                                  │
850     /// └────────────────────────────────────────────────────┘
851     ///
852     ///             ┌───────────────────────────────────────────────────┐
853     ///             │                                                 A │
854     ///             │       ┌────────────────────────┐                  │
855     ///             │       │                        │                  │
856     ///             │       │                        │                  │
857     ///             │       │              │         │                  │
858     ///             │       │              │         │                  │
859     ///       used_event  xxxx             │       xxxxx last_used      │
860     ///                     │        + ◄───┘         │       │          │
861     ///                     │                        │       │          │
862     ///                     │     Don't send intr    │       │          │
863     ///                     │                        │       │          │
864     ///                     └───────────x────────────┘       │          │
865     ///                                 x                    │          │
866     ///                              next_used - 1           │          │
867     ///                              │  │                  B │          │
868     ///                              │  └────────────────────┘          │
869     ///                              │                                  │
870     ///                              └──────────────────────────────────┘
871     /// ```
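    /// As a concrete numeric instance (illustrative values): suppose `next_used = 5`,
    /// `last_used = 3`, and the driver wrote `used_event = 3`. Then `A = (5 - 1) - 3 = 1`
    /// and `B = 5 - 3 = 2`, so `A < B` holds and an interrupt is sent. If instead
    /// `used_event = 7`, `A` wraps to `65533` while `B` is still `2`, the inequality
    /// fails, and no interrupt is sent until the ring advances past the driver's
    /// requested notification point.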
872     fn queue_wants_interrupt(&self, mem: &GuestMemory) -> bool {
873         if self.features & ((1u64) << VIRTIO_RING_F_EVENT_IDX) != 0 {
874             let used_event = self.get_used_event(mem);
875             self.next_used - used_event - Wrapping(1) < self.next_used - self.last_used
876         } else {
877             !self.get_avail_flag(mem, VIRTQ_AVAIL_F_NO_INTERRUPT)
878         }
879     }
880 
881     /// Injects an interrupt into the guest on this queue if one is needed.
882     /// Returns `true` if an interrupt was injected into the guest for this queue,
883     /// and `false` if no interrupt was injected.
884     pub fn trigger_interrupt<I: SignalableInterrupt>(
885         &mut self,
886         mem: &GuestMemory,
887         interrupt: &I,
888     ) -> bool {
889         if self.queue_wants_interrupt(mem) {
890             self.last_used = self.next_used;
891             interrupt.signal_used_queue(self.vector);
892             true
893         } else {
894             false
895         }
896     }
897 
898     /// Acknowledges that this set of features should be enabled on this queue.
899     pub fn ack_features(&mut self, features: u64) {
900         self.features |= features;
901     }
902 
903     pub fn set_iommu(&mut self, iommu: Arc<Mutex<IpcMemoryMapper>>) {
904         self.iommu = Some(iommu);
905     }
906 }
907 
908 #[cfg(test)]
909 mod tests {
910     use std::convert::TryInto;
911 
912     use memoffset::offset_of;
913 
914     use super::super::Interrupt;
915     use super::*;
916     use crate::IrqLevelEvent;
917 
918     const GUEST_MEMORY_SIZE: u64 = 0x10000;
919     const DESC_OFFSET: u64 = 0;
920     const AVAIL_OFFSET: u64 = 0x200;
921     const USED_OFFSET: u64 = 0x400;
922     const QUEUE_SIZE: usize = 0x10;
923     const BUFFER_OFFSET: u64 = 0x8000;
924     const BUFFER_LEN: u32 = 0x400;
925 
926     #[derive(Copy, Clone, Debug, FromBytes, AsBytes)]
927     #[repr(C)]
928     struct Avail {
929         flags: Le16,
930         idx: Le16,
931         ring: [Le16; QUEUE_SIZE],
932         used_event: Le16,
933     }
934 
935     impl Default for Avail {
936         fn default() -> Self {
937             Avail {
938                 flags: Le16::from(0u16),
939                 idx: Le16::from(0u16),
940                 ring: [Le16::from(0u16); QUEUE_SIZE],
941                 used_event: Le16::from(0u16),
942             }
943         }
944     }
945 
946     #[derive(Copy, Clone, Debug, FromBytes, AsBytes)]
947     #[repr(C)]
948     struct UsedElem {
949         id: Le32,
950         len: Le32,
951     }
952 
953     impl Default for UsedElem {
954         fn default() -> Self {
955             UsedElem {
956                 id: Le32::from(0u32),
957                 len: Le32::from(0u32),
958             }
959         }
960     }
961 
962     #[derive(Copy, Clone, Debug, FromBytes, AsBytes)]
963     #[repr(C, packed)]
964     struct Used {
965         flags: Le16,
966         idx: Le16,
967         used_elem_ring: [UsedElem; QUEUE_SIZE],
968         avail_event: Le16,
969     }
970 
971     impl Default for Used {
972         fn default() -> Self {
973             Used {
974                 flags: Le16::from(0u16),
975                 idx: Le16::from(0u16),
976                 used_elem_ring: [UsedElem::default(); QUEUE_SIZE],
977                 avail_event: Le16::from(0u16),
978             }
979         }
980     }
981 
982     fn setup_vq(queue: &mut Queue, mem: &GuestMemory) {
983         let desc = Desc {
984             addr: Le64::from(BUFFER_OFFSET),
985             len: Le32::from(BUFFER_LEN),
986             flags: Le16::from(0u16),
987             next: Le16::from(1u16),
988         };
989         let _ = mem.write_obj_at_addr(desc, GuestAddress(DESC_OFFSET));
990 
991         let avail = Avail::default();
992         let _ = mem.write_obj_at_addr(avail, GuestAddress(AVAIL_OFFSET));
993 
994         let used = Used::default();
995         let _ = mem.write_obj_at_addr(used, GuestAddress(USED_OFFSET));
996 
997         queue.desc_table = GuestAddress(DESC_OFFSET);
998         queue.avail_ring = GuestAddress(AVAIL_OFFSET);
999         queue.used_ring = GuestAddress(USED_OFFSET);
1000         queue.ack_features((1u64) << VIRTIO_RING_F_EVENT_IDX);
1001     }
1002 
1003     #[test]
1004     fn queue_event_id_guest_fast() {
1005         let mut queue = Queue::new(QUEUE_SIZE.try_into().unwrap());
1006         let memory_start_addr = GuestAddress(0x0);
1007         let mem = GuestMemory::new(&[(memory_start_addr, GUEST_MEMORY_SIZE)]).unwrap();
1008         setup_vq(&mut queue, &mem);
1009 
1010         let interrupt = Interrupt::new(IrqLevelEvent::new().unwrap(), None, 10);
1011 
1012         // Offset of used_event within Avail structure
1013         let used_event_offset = offset_of!(Avail, used_event) as u64;
1014         let used_event_address = GuestAddress(AVAIL_OFFSET + used_event_offset);
1015 
1016         // Assume the driver submits 0x100 requests to the device,
1017         // and the device has handled them, so self.next_used increases to 0x100.
1018         let mut device_generate: Wrapping<u16> = Wrapping(0x100);
1019         for _ in 0..device_generate.0 {
1020             queue.add_used(&mem, 0x0, BUFFER_LEN);
1021         }
1022 
1023         // At this moment the driver hasn't handled any interrupts yet, so the
1024         // device should inject an interrupt.
1025         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), true);
1026 
1027         // The driver handles all the interrupts and updates avail.used_event to 0x100.
1028         let mut driver_handled = device_generate;
1029         let _ = mem.write_obj_at_addr(Le16::from(driver_handled.0), used_event_address);
1030 
1031         // At this moment the driver has handled all the interrupts, and the
1032         // device hasn't generated more data, so no interrupt is needed.
1033         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1034 
1035         // Assume the driver submits another u16::MAX - 0x100 requests to the device,
1036         // and the device has handled all of them, so self.next_used increases to u16::MAX.
1037         for _ in device_generate.0..u16::max_value() {
1038             queue.add_used(&mem, 0x0, BUFFER_LEN);
1039         }
1040         device_generate = Wrapping(u16::max_value());
1041 
1042         // At this moment the driver has only handled 0x100 interrupts, so the
1043         // device should inject an interrupt.
1044         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), true);
1045 
1046         // The driver handles all the interrupts and updates avail.used_event to u16::MAX.
1047         driver_handled = device_generate;
1048         let _ = mem.write_obj_at_addr(Le16::from(driver_handled.0), used_event_address);
1049 
1050         // At this moment the driver has handled all the interrupts, and the
1051         // device hasn't generated more data, so no interrupt is needed.
1052         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1053 
1054         // Assume the driver submits one more request,
1055         // and the device has handled it, so self.next_used wraps to 0.
1056         queue.add_used(&mem, 0x0, BUFFER_LEN);
1057         device_generate += Wrapping(1);
1058 
1059         // At this moment the driver has handled all the previous interrupts, so the
1060         // device should inject an interrupt again.
1061         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), true);
1062 
1063         // The driver handles that interrupt and updates avail.used_event to 0.
1064         driver_handled = device_generate;
1065         let _ = mem.write_obj_at_addr(Le16::from(driver_handled.0), used_event_address);
1066 
1067         // At this moment the driver has handled all the interrupts, and the
1068         // device hasn't generated more data, so no interrupt is needed.
1069         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1070     }
1071 
1072     #[test]
1073     fn queue_event_id_guest_slow() {
1074         let mut queue = Queue::new(QUEUE_SIZE.try_into().unwrap());
1075         let memory_start_addr = GuestAddress(0x0);
1076         let mem = GuestMemory::new(&[(memory_start_addr, GUEST_MEMORY_SIZE)]).unwrap();
1077         setup_vq(&mut queue, &mem);
1078 
1079         let interrupt = Interrupt::new(IrqLevelEvent::new().unwrap(), None, 10);
1080 
1081         // Offset of used_event within Avail structure
1082         let used_event_offset = offset_of!(Avail, used_event) as u64;
1083         let used_event_address = GuestAddress(AVAIL_OFFSET + used_event_offset);
1084 
1085         // Assume the driver submits 0x100 requests to the device,
1086         // and the device has handled them, so self.next_used increases to 0x100.
1087         let mut device_generate: Wrapping<u16> = Wrapping(0x100);
1088         for _ in 0..device_generate.0 {
1089             queue.add_used(&mem, 0x0, BUFFER_LEN);
1090         }
1091 
1092         // At this moment the driver hasn't handled any interrupts yet, so the
1093         // device should inject an interrupt.
1094         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), true);
1095 
1096         // The driver handles part of the interrupts and updates avail.used_event to 0x80.
1097         let mut driver_handled = Wrapping(0x80);
1098         let _ = mem.write_obj_at_addr(Le16::from(driver_handled.0), used_event_address);
1099 
1100         // At this moment the driver hasn't finished the last interrupt yet,
1101         // so no interrupt is needed.
1102         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1103 
1104         // Assume the driver submits one more request,
1105         // and the device has handled it, so self.next_used is incremented.
1106         queue.add_used(&mem, 0x0, BUFFER_LEN);
1107         device_generate += Wrapping(1);
1108 
1109         // At this moment the driver hasn't finished the last interrupt yet,
1110         // so no interrupt is needed.
1111         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1112 
1113         // Assume the driver submits another u16::MAX - 0x101 requests to the device,
1114         // and the device has handled all of them, so self.next_used increases to u16::MAX.
1115         for _ in device_generate.0..u16::max_value() {
1116             queue.add_used(&mem, 0x0, BUFFER_LEN);
1117         }
1118         device_generate = Wrapping(u16::max_value());
1119 
1120         // At this moment the driver hasn't finished the last interrupt yet,
1121         // so no interrupt is needed.
1122         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1123 
1124         // The driver handles most of the interrupts and updates avail.used_event to u16::MAX - 1.
1125         driver_handled = device_generate - Wrapping(1);
1126         let _ = mem.write_obj_at_addr(Le16::from(driver_handled.0), used_event_address);
1127 
1128         // Assume the driver submits one more request,
1129         // and the device has handled it, so self.next_used wraps to 0.
1130         queue.add_used(&mem, 0x0, BUFFER_LEN);
1131         device_generate += Wrapping(1);
1132 
1133         // At this moment the driver has already finished the last interrupt (0x100),
1134         // and the device has serviced other requests, so a new interrupt is needed.
1135         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), true);
1136 
1137         // Assume the driver submits one more request,
1138         // and the device has handled it, so self.next_used is incremented to 1.
1139         queue.add_used(&mem, 0x0, BUFFER_LEN);
1140         device_generate += Wrapping(1);
1141 
1142         // At this moment the driver hasn't finished the last interrupt (Wrapping(0)) yet,
1143         // so no interrupt is needed.
1144         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1145 
1146         // The driver handles all the remaining interrupts and wraps avail.used_event to 0x1.
1147         driver_handled = device_generate;
1148         let _ = mem.write_obj_at_addr(Le16::from(driver_handled.0), used_event_address);
1149 
1150         // At this moment the driver has handled all the interrupts, and the
1151         // device hasn't generated more data, so no interrupt is needed.
1152         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), false);
1153 
1154         // Assume the driver submits one more request,
1155         // and the device has handled it, so self.next_used is incremented.
1156         queue.add_used(&mem, 0x0, BUFFER_LEN);
1157         device_generate += Wrapping(1);
1158 
1159         // At this moment the driver has finished all the previous interrupts, so the
1160         // device should inject an interrupt again.
1161         assert_eq!(queue.trigger_interrupt(&mem, &interrupt), true);
1162     }
1163 }
1164