• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 pub mod sys;
6 
7 use std::cell::RefCell;
8 use std::rc::Rc;
9 use std::sync::Arc;
10 
11 use anyhow::anyhow;
12 use anyhow::bail;
13 use anyhow::Context;
14 use base::error;
15 use base::warn;
16 use base::Tube;
17 use cros_async::EventAsync;
18 use cros_async::Executor;
19 use cros_async::TaskHandle;
20 use sync::Mutex;
21 pub use sys::run_gpu_device;
22 pub use sys::Options;
23 use vm_memory::GuestMemory;
24 use vmm_vhost::message::VhostUserProtocolFeatures;
25 use vmm_vhost::VHOST_USER_F_PROTOCOL_FEATURES;
26 
27 use crate::virtio::device_constants::gpu::NUM_QUEUES;
28 use crate::virtio::gpu;
29 use crate::virtio::gpu::QueueReader;
30 use crate::virtio::vhost::user::device::handler::Error as DeviceError;
31 use crate::virtio::vhost::user::device::handler::VhostBackendReqConnection;
32 use crate::virtio::vhost::user::device::handler::VhostUserDevice;
33 use crate::virtio::vhost::user::device::handler::WorkerState;
34 use crate::virtio::DescriptorChain;
35 use crate::virtio::Gpu;
36 use crate::virtio::Interrupt;
37 use crate::virtio::Queue;
38 use crate::virtio::SharedMemoryMapper;
39 use crate::virtio::SharedMemoryRegion;
40 use crate::virtio::VirtioDevice;
41 
42 const MAX_QUEUE_NUM: usize = NUM_QUEUES;
43 
44 #[derive(Clone)]
45 struct SharedReader {
46     queue: Arc<Mutex<Queue>>,
47     doorbell: Interrupt,
48 }
49 
50 impl gpu::QueueReader for SharedReader {
pop(&self) -> Option<DescriptorChain>51     fn pop(&self) -> Option<DescriptorChain> {
52         self.queue.lock().pop()
53     }
54 
add_used(&self, desc_chain: DescriptorChain, len: u32)55     fn add_used(&self, desc_chain: DescriptorChain, len: u32) {
56         self.queue.lock().add_used(desc_chain, len)
57     }
58 
signal_used(&self)59     fn signal_used(&self) {
60         self.queue.lock().trigger_interrupt(&self.doorbell);
61     }
62 }
63 
run_ctrl_queue( reader: SharedReader, mem: GuestMemory, kick_evt: EventAsync, state: Rc<RefCell<gpu::Frontend>>, )64 async fn run_ctrl_queue(
65     reader: SharedReader,
66     mem: GuestMemory,
67     kick_evt: EventAsync,
68     state: Rc<RefCell<gpu::Frontend>>,
69 ) {
70     loop {
71         if let Err(e) = kick_evt.next_val().await {
72             error!("Failed to read kick event for ctrl queue: {}", e);
73             break;
74         }
75 
76         let mut state = state.borrow_mut();
77         let needs_interrupt = state.process_queue(&mem, &reader);
78 
79         if needs_interrupt {
80             reader.signal_used();
81         }
82     }
83 }
84 
85 struct GpuBackend {
86     ex: Executor,
87     gpu: Rc<RefCell<Gpu>>,
88     resource_bridges: Arc<Mutex<Vec<Tube>>>,
89     acked_protocol_features: u64,
90     state: Option<Rc<RefCell<gpu::Frontend>>>,
91     fence_state: Arc<Mutex<gpu::FenceState>>,
92     queue_workers: [Option<WorkerState<Arc<Mutex<Queue>>, ()>>; MAX_QUEUE_NUM],
93     platform_workers: Rc<RefCell<Vec<TaskHandle<()>>>>,
94     shmem_mapper: Arc<Mutex<Option<Box<dyn SharedMemoryMapper>>>>,
95 }
96 
97 impl VhostUserDevice for GpuBackend {
max_queue_num(&self) -> usize98     fn max_queue_num(&self) -> usize {
99         MAX_QUEUE_NUM
100     }
101 
features(&self) -> u64102     fn features(&self) -> u64 {
103         self.gpu.borrow().features() | 1 << VHOST_USER_F_PROTOCOL_FEATURES
104     }
105 
ack_features(&mut self, value: u64) -> anyhow::Result<()>106     fn ack_features(&mut self, value: u64) -> anyhow::Result<()> {
107         self.gpu.borrow_mut().ack_features(value);
108         Ok(())
109     }
110 
acked_features(&self) -> u64111     fn acked_features(&self) -> u64 {
112         self.features()
113     }
114 
protocol_features(&self) -> VhostUserProtocolFeatures115     fn protocol_features(&self) -> VhostUserProtocolFeatures {
116         VhostUserProtocolFeatures::CONFIG
117             | VhostUserProtocolFeatures::BACKEND_REQ
118             | VhostUserProtocolFeatures::MQ
119             | VhostUserProtocolFeatures::SHARED_MEMORY_REGIONS
120     }
121 
ack_protocol_features(&mut self, features: u64) -> anyhow::Result<()>122     fn ack_protocol_features(&mut self, features: u64) -> anyhow::Result<()> {
123         let unrequested_features = features & !self.protocol_features().bits();
124         if unrequested_features != 0 {
125             bail!("Unexpected protocol features: {:#x}", unrequested_features);
126         }
127 
128         self.acked_protocol_features |= features;
129         Ok(())
130     }
131 
acked_protocol_features(&self) -> u64132     fn acked_protocol_features(&self) -> u64 {
133         self.acked_protocol_features
134     }
135 
read_config(&self, offset: u64, dst: &mut [u8])136     fn read_config(&self, offset: u64, dst: &mut [u8]) {
137         self.gpu.borrow().read_config(offset, dst)
138     }
139 
write_config(&self, offset: u64, data: &[u8])140     fn write_config(&self, offset: u64, data: &[u8]) {
141         self.gpu.borrow_mut().write_config(offset, data)
142     }
143 
start_queue( &mut self, idx: usize, queue: Queue, mem: GuestMemory, doorbell: Interrupt, ) -> anyhow::Result<()>144     fn start_queue(
145         &mut self,
146         idx: usize,
147         queue: Queue,
148         mem: GuestMemory,
149         doorbell: Interrupt,
150     ) -> anyhow::Result<()> {
151         if self.queue_workers[idx].is_some() {
152             warn!("Starting new queue handler without stopping old handler");
153             self.stop_queue(idx)?;
154         }
155 
156         // Create a refcounted queue. The GPU control queue uses a SharedReader which allows us to
157         // handle fences in the RutabagaFenceHandler, and also handle queue messages in
158         // `run_ctrl_queue`.
159         // For the cursor queue, we still create the refcounted queue to support retrieving the
160         // queue for snapshotting (but don't handle any messages).
161         let queue = Arc::new(Mutex::new(queue));
162 
163         // Spawn a worker for the queue.
164         let queue_task = match idx {
165             0 => {
166                 // Set up worker for the control queue.
167                 let kick_evt = queue
168                     .lock()
169                     .event()
170                     .try_clone()
171                     .context("failed to clone queue event")?;
172                 let kick_evt = EventAsync::new(kick_evt, &self.ex)
173                     .context("failed to create EventAsync for kick_evt")?;
174                 let reader = SharedReader {
175                     queue: queue.clone(),
176                     doorbell: doorbell.clone(),
177                 };
178 
179                 let state = if let Some(s) = self.state.as_ref() {
180                     s.clone()
181                 } else {
182                     let fence_handler_resources =
183                         Arc::new(Mutex::new(Some(gpu::FenceHandlerActivationResources {
184                             mem: mem.clone(),
185                             ctrl_queue: reader.clone(),
186                         })));
187                     let fence_handler = gpu::create_fence_handler(
188                         fence_handler_resources,
189                         self.fence_state.clone(),
190                     );
191 
192                     let state = Rc::new(RefCell::new(
193                         self.gpu
194                             .borrow_mut()
195                             .initialize_frontend(
196                                 self.fence_state.clone(),
197                                 fence_handler,
198                                 Arc::clone(&self.shmem_mapper),
199                             )
200                             .ok_or_else(|| anyhow!("failed to initialize gpu frontend"))?,
201                     ));
202                     self.state = Some(state.clone());
203                     state
204                 };
205 
206                 // Start handling platform-specific workers.
207                 self.start_platform_workers(doorbell)?;
208 
209                 // Start handling the control queue.
210                 self.ex
211                     .spawn_local(run_ctrl_queue(reader, mem, kick_evt, state))
212             }
213             1 => {
214                 // For the cursor queue, spawn an empty worker, as we don't process it at all.
215                 // We don't handle the cursor queue because no current users of vhost-user GPU pass
216                 // any messages on it.
217                 self.ex.spawn_local(async {})
218             }
219             _ => bail!("attempted to start unknown queue: {}", idx),
220         };
221 
222         self.queue_workers[idx] = Some(WorkerState { queue_task, queue });
223         Ok(())
224     }
225 
stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue>226     fn stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue> {
227         if let Some(worker) = self.queue_workers.get_mut(idx).and_then(Option::take) {
228             // Wait for queue_task to be aborted.
229             let _ = self.ex.run_until(worker.queue_task.cancel());
230 
231             if idx == 0 {
232                 // Stop the non-queue workers if this is the control queue (where we start them).
233                 self.stop_non_queue_workers()?;
234 
235                 // After we stop all workers, we have only one reference left to self.state.
236                 // Clearing it allows the GPU state to be destroyed, which gets rid of the
237                 // remaining control queue reference from RutabagaFenceHandler.
238                 // This allows our worker.queue to be recovered as it has no further references.
239                 self.state = None;
240             }
241 
242             let queue = match Arc::try_unwrap(worker.queue) {
243                 Ok(queue_mutex) => queue_mutex.into_inner(),
244                 Err(_) => panic!("failed to recover queue from worker"),
245             };
246 
247             Ok(queue)
248         } else {
249             Err(anyhow::Error::new(DeviceError::WorkerNotFound))
250         }
251     }
252 
stop_non_queue_workers(&mut self) -> anyhow::Result<()>253     fn stop_non_queue_workers(&mut self) -> anyhow::Result<()> {
254         for handle in self.platform_workers.borrow_mut().drain(..) {
255             let _ = self.ex.run_until(handle.cancel());
256         }
257         Ok(())
258     }
259 
reset(&mut self)260     fn reset(&mut self) {
261         self.stop_non_queue_workers()
262             .expect("Failed to stop platform workers.");
263 
264         for queue_num in 0..self.max_queue_num() {
265             // The cursor queue is never used, so we should check if the queue is set before
266             // stopping.
267             if self.queue_workers[queue_num].is_some() {
268                 if let Err(e) = self.stop_queue(queue_num) {
269                     error!("Failed to stop_queue during reset: {}", e);
270                 }
271             }
272         }
273     }
274 
get_shared_memory_region(&self) -> Option<SharedMemoryRegion>275     fn get_shared_memory_region(&self) -> Option<SharedMemoryRegion> {
276         self.gpu.borrow().get_shared_memory_region()
277     }
278 
set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>)279     fn set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>) {
280         if self
281             .shmem_mapper
282             .lock()
283             .replace(conn.take_shmem_mapper().unwrap())
284             .is_some()
285         {
286             warn!("Connection already established. Overwriting shmem_mapper");
287         }
288     }
289 
snapshot(&self) -> anyhow::Result<Vec<u8>>290     fn snapshot(&self) -> anyhow::Result<Vec<u8>> {
291         // TODO(b/289431114): Snapshot more fields if needed. Right now we just need a bare bones
292         // snapshot of the GPU to create a POC.
293         serde_json::to_vec(&serde_json::Value::Null)
294             .context("Failed to serialize Null in the GPU device")
295     }
296 
restore(&mut self, data: Vec<u8>) -> anyhow::Result<()>297     fn restore(&mut self, data: Vec<u8>) -> anyhow::Result<()> {
298         let data: serde_json::Value = serde_json::from_slice(data.as_slice())
299             .context("Failed to deserialize NULL in the GPU device")?;
300         anyhow::ensure!(
301             data.is_null(),
302             "unexpected snapshot data: should be null, got {}",
303             data
304         );
305         Ok(())
306     }
307 }
308 
309 impl Drop for GpuBackend {
drop(&mut self)310     fn drop(&mut self) {
311         // Workers are detached and will leak unless they are aborted. Aborting marks the
312         // Abortable task, then wakes it up. This means the executor should be asked to continue
313         // running for one more step after the backend is destroyed.
314         self.reset();
315     }
316 }
317