1 // Copyright 2021 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 pub mod sys;
6
7 use std::cell::RefCell;
8 use std::rc::Rc;
9 use std::sync::Arc;
10
11 use anyhow::anyhow;
12 use anyhow::bail;
13 use anyhow::Context;
14 use base::error;
15 use base::warn;
16 use base::Tube;
17 use cros_async::EventAsync;
18 use cros_async::Executor;
19 use cros_async::TaskHandle;
20 use futures::FutureExt;
21 use futures::StreamExt;
22 use snapshot::AnySnapshot;
23 use sync::Mutex;
24 pub use sys::run_gpu_device;
25 pub use sys::Options;
26 use vm_memory::GuestMemory;
27 use vmm_vhost::message::VhostUserProtocolFeatures;
28 use vmm_vhost::VHOST_USER_F_PROTOCOL_FEATURES;
29
30 use crate::virtio::device_constants::gpu::NUM_QUEUES;
31 use crate::virtio::gpu;
32 use crate::virtio::gpu::QueueReader;
33 use crate::virtio::vhost::user::device::handler::Error as DeviceError;
34 use crate::virtio::vhost::user::device::handler::VhostBackendReqConnection;
35 use crate::virtio::vhost::user::device::handler::VhostUserDevice;
36 use crate::virtio::vhost::user::device::handler::WorkerState;
37 use crate::virtio::DescriptorChain;
38 use crate::virtio::Gpu;
39 use crate::virtio::Queue;
40 use crate::virtio::SharedMemoryMapper;
41 use crate::virtio::SharedMemoryRegion;
42 use crate::virtio::VirtioDevice;
43
/// Number of virtqueues this backend supports; an alias of the virtio-gpu `NUM_QUEUES` device
/// constant. It sizes `GpuBackend::queue_workers` and is the value reported by
/// `max_queue_num()`.
const MAX_QUEUE_NUM: usize = NUM_QUEUES;
45
/// Cloneable handle to a virtqueue that is shared behind an `Arc<Mutex<_>>`.
///
/// Cloning the reader clones the `Arc`, so every clone operates on the same underlying `Queue`.
#[derive(Clone)]
struct SharedReader {
    /// The shared queue; it is locked for the duration of each individual queue operation.
    queue: Arc<Mutex<Queue>>,
}
50
51 impl gpu::QueueReader for SharedReader {
pop(&self) -> Option<DescriptorChain>52 fn pop(&self) -> Option<DescriptorChain> {
53 self.queue.lock().pop()
54 }
55
add_used(&self, desc_chain: DescriptorChain, len: u32)56 fn add_used(&self, desc_chain: DescriptorChain, len: u32) {
57 self.queue.lock().add_used(desc_chain, len)
58 }
59
signal_used(&self)60 fn signal_used(&self) {
61 self.queue.lock().trigger_interrupt();
62 }
63 }
64
run_ctrl_queue( reader: SharedReader, mem: GuestMemory, kick_evt: EventAsync, state: Rc<RefCell<gpu::Frontend>>, )65 async fn run_ctrl_queue(
66 reader: SharedReader,
67 mem: GuestMemory,
68 kick_evt: EventAsync,
69 state: Rc<RefCell<gpu::Frontend>>,
70 ) {
71 loop {
72 if let Err(e) = kick_evt.next_val().await {
73 error!("Failed to read kick event for ctrl queue: {}", e);
74 break;
75 }
76
77 let mut state = state.borrow_mut();
78 let needs_interrupt = state.process_queue(&mem, &reader);
79
80 if needs_interrupt {
81 reader.signal_used();
82 }
83 }
84 }
85
/// vhost-user backend state for the virtio GPU device.
struct GpuBackend {
    /// Executor used to spawn the queue tasks and to drive their cancellation to completion.
    ex: Executor,
    /// The wrapped virtio GPU device; feature and config accesses are forwarded to it.
    gpu: Rc<RefCell<Gpu>>,
    /// Resource bridge tubes. Not referenced in this part of the file; presumably consumed by
    /// the platform-specific code in `sys` (TODO confirm).
    resource_bridges: Arc<Mutex<Vec<Tube>>>,
    /// GPU frontend, created when the control queue is started without an existing frontend
    /// and cleared again when the control queue is stopped.
    state: Option<Rc<RefCell<gpu::Frontend>>>,
    /// Fence state handed to both the fence handler and the frontend on initialization.
    fence_state: Arc<Mutex<gpu::FenceState>>,
    /// Per-queue worker state, indexed by queue index; `None` while the queue is not started.
    queue_workers: [Option<WorkerState<Arc<Mutex<Queue>>, ()>>; MAX_QUEUE_NUM],
    // In the downstream, we may add platform workers after start_platform_workers returns.
    /// Sending half of the platform worker channel. Not used in this part of the file;
    /// presumably platform code pushes worker handles through it (TODO confirm).
    platform_worker_tx: futures::channel::mpsc::UnboundedSender<TaskHandle<()>>,
    /// Receiving half of the platform worker channel; drained and cancelled by
    /// `stop_non_queue_workers`.
    platform_worker_rx: futures::channel::mpsc::UnboundedReceiver<TaskHandle<()>>,
    /// Shared memory mapper, installed by `set_backend_req_connection` and shared with the
    /// frontend when it is initialized.
    shmem_mapper: Arc<Mutex<Option<Box<dyn SharedMemoryMapper>>>>,
}
98
99 impl GpuBackend {
stop_non_queue_workers(&mut self) -> anyhow::Result<()>100 fn stop_non_queue_workers(&mut self) -> anyhow::Result<()> {
101 self.ex
102 .run_until(async {
103 while let Some(Some(handle)) = self.platform_worker_rx.next().now_or_never() {
104 handle.cancel().await;
105 }
106 })
107 .context("stopping the non-queue workers for GPU")?;
108 Ok(())
109 }
110 }
111
112 impl VhostUserDevice for GpuBackend {
max_queue_num(&self) -> usize113 fn max_queue_num(&self) -> usize {
114 MAX_QUEUE_NUM
115 }
116
features(&self) -> u64117 fn features(&self) -> u64 {
118 self.gpu.borrow().features() | 1 << VHOST_USER_F_PROTOCOL_FEATURES
119 }
120
ack_features(&mut self, value: u64) -> anyhow::Result<()>121 fn ack_features(&mut self, value: u64) -> anyhow::Result<()> {
122 self.gpu.borrow_mut().ack_features(value);
123 Ok(())
124 }
125
protocol_features(&self) -> VhostUserProtocolFeatures126 fn protocol_features(&self) -> VhostUserProtocolFeatures {
127 VhostUserProtocolFeatures::CONFIG
128 | VhostUserProtocolFeatures::BACKEND_REQ
129 | VhostUserProtocolFeatures::MQ
130 | VhostUserProtocolFeatures::SHARED_MEMORY_REGIONS
131 | VhostUserProtocolFeatures::DEVICE_STATE
132 }
133
read_config(&self, offset: u64, dst: &mut [u8])134 fn read_config(&self, offset: u64, dst: &mut [u8]) {
135 self.gpu.borrow().read_config(offset, dst)
136 }
137
write_config(&self, offset: u64, data: &[u8])138 fn write_config(&self, offset: u64, data: &[u8]) {
139 self.gpu.borrow_mut().write_config(offset, data)
140 }
141
start_queue(&mut self, idx: usize, queue: Queue, mem: GuestMemory) -> anyhow::Result<()>142 fn start_queue(&mut self, idx: usize, queue: Queue, mem: GuestMemory) -> anyhow::Result<()> {
143 if self.queue_workers[idx].is_some() {
144 warn!("Starting new queue handler without stopping old handler");
145 self.stop_queue(idx)?;
146 }
147
148 let doorbell = queue.interrupt().clone();
149
150 // Create a refcounted queue. The GPU control queue uses a SharedReader which allows us to
151 // handle fences in the RutabagaFenceHandler, and also handle queue messages in
152 // `run_ctrl_queue`.
153 // For the cursor queue, we still create the refcounted queue to support retrieving the
154 // queue for snapshotting (but don't handle any messages).
155 let queue = Arc::new(Mutex::new(queue));
156
157 // Spawn a worker for the queue.
158 let queue_task = match idx {
159 0 => {
160 // Set up worker for the control queue.
161 let kick_evt = queue
162 .lock()
163 .event()
164 .try_clone()
165 .context("failed to clone queue event")?;
166 let kick_evt = EventAsync::new(kick_evt, &self.ex)
167 .context("failed to create EventAsync for kick_evt")?;
168 let reader = SharedReader {
169 queue: queue.clone(),
170 };
171
172 let state = if let Some(s) = self.state.as_ref() {
173 s.clone()
174 } else {
175 let fence_handler_resources =
176 Arc::new(Mutex::new(Some(gpu::FenceHandlerActivationResources {
177 mem: mem.clone(),
178 ctrl_queue: reader.clone(),
179 })));
180 let fence_handler = gpu::create_fence_handler(
181 fence_handler_resources,
182 self.fence_state.clone(),
183 );
184
185 let state = Rc::new(RefCell::new(
186 self.gpu
187 .borrow_mut()
188 .initialize_frontend(
189 self.fence_state.clone(),
190 fence_handler,
191 Arc::clone(&self.shmem_mapper),
192 )
193 .ok_or_else(|| anyhow!("failed to initialize gpu frontend"))?,
194 ));
195 self.state = Some(state.clone());
196 state
197 };
198
199 // Start handling platform-specific workers.
200 self.start_platform_workers(doorbell)?;
201
202 // Start handling the control queue.
203 self.ex
204 .spawn_local(run_ctrl_queue(reader, mem, kick_evt, state))
205 }
206 1 => {
207 // For the cursor queue, spawn an empty worker, as we don't process it at all.
208 // We don't handle the cursor queue because no current users of vhost-user GPU pass
209 // any messages on it.
210 self.ex.spawn_local(async {})
211 }
212 _ => bail!("attempted to start unknown queue: {}", idx),
213 };
214
215 self.queue_workers[idx] = Some(WorkerState { queue_task, queue });
216 Ok(())
217 }
218
stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue>219 fn stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue> {
220 if let Some(worker) = self.queue_workers.get_mut(idx).and_then(Option::take) {
221 // Wait for queue_task to be aborted.
222 let _ = self.ex.run_until(worker.queue_task.cancel());
223
224 if idx == 0 {
225 // Stop the non-queue workers if this is the control queue (where we start them).
226 self.stop_non_queue_workers()?;
227
228 // After we stop all workers, we have only one reference left to self.state.
229 // Clearing it allows the GPU state to be destroyed, which gets rid of the
230 // remaining control queue reference from RutabagaFenceHandler.
231 // This allows our worker.queue to be recovered as it has no further references.
232 self.state = None;
233 }
234
235 let queue = match Arc::try_unwrap(worker.queue) {
236 Ok(queue_mutex) => queue_mutex.into_inner(),
237 Err(_) => panic!("failed to recover queue from worker"),
238 };
239
240 Ok(queue)
241 } else {
242 Err(anyhow::Error::new(DeviceError::WorkerNotFound))
243 }
244 }
245
enter_suspended_state(&mut self) -> anyhow::Result<()>246 fn enter_suspended_state(&mut self) -> anyhow::Result<()> {
247 self.stop_non_queue_workers()?;
248 Ok(())
249 }
250
reset(&mut self)251 fn reset(&mut self) {
252 self.stop_non_queue_workers()
253 .expect("Failed to stop platform workers.");
254
255 for queue_num in 0..self.max_queue_num() {
256 // The cursor queue is never used, so we should check if the queue is set before
257 // stopping.
258 if self.queue_workers[queue_num].is_some() {
259 if let Err(e) = self.stop_queue(queue_num) {
260 error!("Failed to stop_queue during reset: {}", e);
261 }
262 }
263 }
264 }
265
get_shared_memory_region(&self) -> Option<SharedMemoryRegion>266 fn get_shared_memory_region(&self) -> Option<SharedMemoryRegion> {
267 self.gpu.borrow().get_shared_memory_region()
268 }
269
set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>)270 fn set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>) {
271 if self
272 .shmem_mapper
273 .lock()
274 .replace(conn.take_shmem_mapper().unwrap())
275 .is_some()
276 {
277 warn!("Connection already established. Overwriting shmem_mapper");
278 }
279 }
280
snapshot(&mut self) -> anyhow::Result<AnySnapshot>281 fn snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
282 // TODO(b/289431114): Snapshot more fields if needed. Right now we just need a bare bones
283 // snapshot of the GPU to create a POC.
284 AnySnapshot::to_any(())
285 }
286
restore(&mut self, data: AnySnapshot) -> anyhow::Result<()>287 fn restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
288 let () = AnySnapshot::from_any(data)?;
289 Ok(())
290 }
291 }
292
293 impl Drop for GpuBackend {
drop(&mut self)294 fn drop(&mut self) {
295 // Workers are detached and will leak unless they are aborted. Aborting marks the
296 // Abortable task, then wakes it up. This means the executor should be asked to continue
297 // running for one more step after the backend is destroyed.
298 self.reset();
299 }
300 }
301