// Copyright 2021 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

pub mod sys;

use std::cell::RefCell;
use std::rc::Rc;
use std::sync::Arc;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use base::error;
use base::warn;
use base::Tube;
use cros_async::EventAsync;
use cros_async::Executor;
use cros_async::TaskHandle;
use futures::FutureExt;
use futures::StreamExt;
use snapshot::AnySnapshot;
use sync::Mutex;
pub use sys::run_gpu_device;
pub use sys::Options;
use vm_memory::GuestMemory;
use vmm_vhost::message::VhostUserProtocolFeatures;
use vmm_vhost::VHOST_USER_F_PROTOCOL_FEATURES;

use crate::virtio::device_constants::gpu::NUM_QUEUES;
use crate::virtio::gpu;
use crate::virtio::gpu::QueueReader;
use crate::virtio::vhost::user::device::handler::Error as DeviceError;
use crate::virtio::vhost::user::device::handler::VhostBackendReqConnection;
use crate::virtio::vhost::user::device::handler::VhostUserDevice;
use crate::virtio::vhost::user::device::handler::WorkerState;
use crate::virtio::DescriptorChain;
use crate::virtio::Gpu;
use crate::virtio::Queue;
use crate::virtio::SharedMemoryMapper;
use crate::virtio::SharedMemoryRegion;
use crate::virtio::VirtioDevice;

const MAX_QUEUE_NUM: usize = NUM_QUEUES;

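/// Clonable `QueueReader` over a refcounted queue. The control-queue worker and the fence
/// handler each hold a copy so both can pop descriptors and return used buffers.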
#[derive(Clone)]
struct SharedReader {
    queue: Arc<Mutex<Queue>>,
}

impl gpu::QueueReader for SharedReader {
    fn pop(&self) -> Option<DescriptorChain> {
        self.queue.lock().pop()
    }

    fn add_used(&self, desc_chain: DescriptorChain, len: u32) {
        self.queue.lock().add_used(desc_chain, len)
    }

    fn signal_used(&self) {
        self.queue.lock().trigger_interrupt();
    }
}

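/// Worker loop for the GPU control queue: waits for kick events, lets the GPU frontend process
/// available descriptors, and signals the guest when an interrupt is needed. Exits if reading the
/// kick event fails.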
async fn run_ctrl_queue(
    reader: SharedReader,
    mem: GuestMemory,
    kick_evt: EventAsync,
    state: Rc<RefCell<gpu::Frontend>>,
) {
    loop {
        if let Err(e) = kick_evt.next_val().await {
            error!("Failed to read kick event for ctrl queue: {}", e);
            break;
        }

        let mut state = state.borrow_mut();
        let needs_interrupt = state.process_queue(&mem, &reader);

        if needs_interrupt {
            reader.signal_used();
        }
    }
}

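/// vhost-user backend for the virtio-gpu device. Wraps a `Gpu` and drives its queue and platform
/// workers on a local `Executor`.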
struct GpuBackend {
    ex: Executor,
    gpu: Rc<RefCell<Gpu>>,
    resource_bridges: Arc<Mutex<Vec<Tube>>>,
    state: Option<Rc<RefCell<gpu::Frontend>>>,
    fence_state: Arc<Mutex<gpu::FenceState>>,
    queue_workers: [Option<WorkerState<Arc<Mutex<Queue>>, ()>>; MAX_QUEUE_NUM],
    // In the downstream, we may add platform workers after start_platform_workers returns.
    platform_worker_tx: futures::channel::mpsc::UnboundedSender<TaskHandle<()>>,
    platform_worker_rx: futures::channel::mpsc::UnboundedReceiver<TaskHandle<()>>,
    shmem_mapper: Arc<Mutex<Option<Box<dyn SharedMemoryMapper>>>>,
}

impl GpuBackend {
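    /// Cancels all platform workers currently queued on `platform_worker_rx`, blocking on the
    /// executor until each cancellation completes.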
    fn stop_non_queue_workers(&mut self) -> anyhow::Result<()> {
        self.ex
            .run_until(async {
                while let Some(Some(handle)) = self.platform_worker_rx.next().now_or_never() {
                    handle.cancel().await;
                }
            })
            .context("stopping the non-queue workers for GPU")?;
        Ok(())
    }
}

impl VhostUserDevice for GpuBackend {
    fn max_queue_num(&self) -> usize {
        MAX_QUEUE_NUM
    }

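    // Advertise the GPU device's virtio features plus VHOST_USER_F_PROTOCOL_FEATURES, which
    // enables the protocol feature negotiation below.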
    fn features(&self) -> u64 {
        self.gpu.borrow().features() | 1 << VHOST_USER_F_PROTOCOL_FEATURES
    }

    fn ack_features(&mut self, value: u64) -> anyhow::Result<()> {
        self.gpu.borrow_mut().ack_features(value);
        Ok(())
    }

    fn protocol_features(&self) -> VhostUserProtocolFeatures {
        VhostUserProtocolFeatures::CONFIG
            | VhostUserProtocolFeatures::BACKEND_REQ
            | VhostUserProtocolFeatures::MQ
            | VhostUserProtocolFeatures::SHARED_MEMORY_REGIONS
            | VhostUserProtocolFeatures::DEVICE_STATE
    }

    fn read_config(&self, offset: u64, dst: &mut [u8]) {
        self.gpu.borrow().read_config(offset, dst)
    }

    fn write_config(&self, offset: u64, data: &[u8]) {
        self.gpu.borrow_mut().write_config(offset, data)
    }

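    // Queue 0 is the control queue and queue 1 is the cursor queue; any other index is an error.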
    fn start_queue(&mut self, idx: usize, queue: Queue, mem: GuestMemory) -> anyhow::Result<()> {
        if self.queue_workers[idx].is_some() {
            warn!("Starting new queue handler without stopping old handler");
            self.stop_queue(idx)?;
        }

        let doorbell = queue.interrupt().clone();

        // Create a refcounted queue. The GPU control queue uses a SharedReader which allows us to
        // handle fences in the RutabagaFenceHandler, and also handle queue messages in
        // `run_ctrl_queue`.
        // For the cursor queue, we still create the refcounted queue to support retrieving the
        // queue for snapshotting (but don't handle any messages).
        let queue = Arc::new(Mutex::new(queue));

        // Spawn a worker for the queue.
        let queue_task = match idx {
            0 => {
                // Set up worker for the control queue.
                let kick_evt = queue
                    .lock()
                    .event()
                    .try_clone()
                    .context("failed to clone queue event")?;
                let kick_evt = EventAsync::new(kick_evt, &self.ex)
                    .context("failed to create EventAsync for kick_evt")?;
                let reader = SharedReader {
                    queue: queue.clone(),
                };

                let state = if let Some(s) = self.state.as_ref() {
                    s.clone()
                } else {
                    let fence_handler_resources =
                        Arc::new(Mutex::new(Some(gpu::FenceHandlerActivationResources {
                            mem: mem.clone(),
                            ctrl_queue: reader.clone(),
                        })));
                    let fence_handler = gpu::create_fence_handler(
                        fence_handler_resources,
                        self.fence_state.clone(),
                    );

                    let state = Rc::new(RefCell::new(
                        self.gpu
                            .borrow_mut()
                            .initialize_frontend(
                                self.fence_state.clone(),
                                fence_handler,
                                Arc::clone(&self.shmem_mapper),
                            )
                            .ok_or_else(|| anyhow!("failed to initialize gpu frontend"))?,
                    ));
                    self.state = Some(state.clone());
                    state
                };

                // Start handling platform-specific workers.
                self.start_platform_workers(doorbell)?;

                // Start handling the control queue.
                self.ex
                    .spawn_local(run_ctrl_queue(reader, mem, kick_evt, state))
            }
            1 => {
                // For the cursor queue, spawn an empty worker, as we don't process it at all.
                // We don't handle the cursor queue because no current users of vhost-user GPU pass
                // any messages on it.
                self.ex.spawn_local(async {})
            }
            _ => bail!("attempted to start unknown queue: {}", idx),
        };

        self.queue_workers[idx] = Some(WorkerState { queue_task, queue });
        Ok(())
    }

    fn stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue> {
        if let Some(worker) = self.queue_workers.get_mut(idx).and_then(Option::take) {
            // Wait for queue_task to be aborted.
            let _ = self.ex.run_until(worker.queue_task.cancel());

            if idx == 0 {
                // Stop the non-queue workers if this is the control queue (where we start them).
                self.stop_non_queue_workers()?;

                // After we stop all workers, we have only one reference left to self.state.
                // Clearing it allows the GPU state to be destroyed, which gets rid of the
                // remaining control queue reference from RutabagaFenceHandler.
                // This allows our worker.queue to be recovered as it has no further references.
                self.state = None;
            }

            let queue = match Arc::try_unwrap(worker.queue) {
                Ok(queue_mutex) => queue_mutex.into_inner(),
                Err(_) => panic!("failed to recover queue from worker"),
            };

            Ok(queue)
        } else {
            Err(anyhow::Error::new(DeviceError::WorkerNotFound))
        }
    }

    fn enter_suspended_state(&mut self) -> anyhow::Result<()> {
        self.stop_non_queue_workers()?;
        Ok(())
    }

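    // Stops the platform workers and any active queue workers. Also called from `Drop` so that
    // detached worker tasks are cancelled before the backend is destroyed.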
    fn reset(&mut self) {
        self.stop_non_queue_workers()
            .expect("Failed to stop platform workers.");

        for queue_num in 0..self.max_queue_num() {
            // The cursor queue is never used, so we should check if the queue is set before
            // stopping.
            if self.queue_workers[queue_num].is_some() {
                if let Err(e) = self.stop_queue(queue_num) {
                    error!("Failed to stop_queue during reset: {}", e);
                }
            }
        }
    }

    fn get_shared_memory_region(&self) -> Option<SharedMemoryRegion> {
        self.gpu.borrow().get_shared_memory_region()
    }

    fn set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>) {
        if self
            .shmem_mapper
            .lock()
            .replace(conn.take_shmem_mapper().unwrap())
            .is_some()
        {
            warn!("Connection already established. Overwriting shmem_mapper");
        }
    }

    fn snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
        // TODO(b/289431114): Snapshot more fields if needed. Right now we just need a bare bones
        // snapshot of the GPU to create a POC.
        AnySnapshot::to_any(())
    }

    fn restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
        let () = AnySnapshot::from_any(data)?;
        Ok(())
    }
}

impl Drop for GpuBackend {
    fn drop(&mut self) {
        // Workers are detached and will leak unless they are aborted. Aborting marks the
        // Abortable task, then wakes it up. This means the executor should be asked to continue
        // running for one more step after the backend is destroyed.
        self.reset();
    }
}