1 // Copyright 2021 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 pub mod sys;
6
7 use std::cell::RefCell;
8 use std::rc::Rc;
9 use std::sync::Arc;
10
11 use anyhow::anyhow;
12 use anyhow::bail;
13 use anyhow::Context;
14 use base::error;
15 use base::warn;
16 use base::Tube;
17 use cros_async::EventAsync;
18 use cros_async::Executor;
19 use cros_async::TaskHandle;
20 use sync::Mutex;
21 pub use sys::run_gpu_device;
22 pub use sys::Options;
23 use vm_memory::GuestMemory;
24 use vmm_vhost::message::VhostUserProtocolFeatures;
25 use vmm_vhost::VHOST_USER_F_PROTOCOL_FEATURES;
26
27 use crate::virtio::device_constants::gpu::NUM_QUEUES;
28 use crate::virtio::gpu;
29 use crate::virtio::gpu::QueueReader;
30 use crate::virtio::vhost::user::device::handler::Error as DeviceError;
31 use crate::virtio::vhost::user::device::handler::VhostBackendReqConnection;
32 use crate::virtio::vhost::user::device::handler::VhostUserDevice;
33 use crate::virtio::vhost::user::device::handler::WorkerState;
34 use crate::virtio::DescriptorChain;
35 use crate::virtio::Gpu;
36 use crate::virtio::Interrupt;
37 use crate::virtio::Queue;
38 use crate::virtio::SharedMemoryMapper;
39 use crate::virtio::SharedMemoryRegion;
40 use crate::virtio::VirtioDevice;
41
42 const MAX_QUEUE_NUM: usize = NUM_QUEUES;
43
44 #[derive(Clone)]
45 struct SharedReader {
46 queue: Arc<Mutex<Queue>>,
47 doorbell: Interrupt,
48 }
49
50 impl gpu::QueueReader for SharedReader {
pop(&self) -> Option<DescriptorChain>51 fn pop(&self) -> Option<DescriptorChain> {
52 self.queue.lock().pop()
53 }
54
add_used(&self, desc_chain: DescriptorChain, len: u32)55 fn add_used(&self, desc_chain: DescriptorChain, len: u32) {
56 self.queue.lock().add_used(desc_chain, len)
57 }
58
signal_used(&self)59 fn signal_used(&self) {
60 self.queue.lock().trigger_interrupt(&self.doorbell);
61 }
62 }
63
run_ctrl_queue( reader: SharedReader, mem: GuestMemory, kick_evt: EventAsync, state: Rc<RefCell<gpu::Frontend>>, )64 async fn run_ctrl_queue(
65 reader: SharedReader,
66 mem: GuestMemory,
67 kick_evt: EventAsync,
68 state: Rc<RefCell<gpu::Frontend>>,
69 ) {
70 loop {
71 if let Err(e) = kick_evt.next_val().await {
72 error!("Failed to read kick event for ctrl queue: {}", e);
73 break;
74 }
75
76 let mut state = state.borrow_mut();
77 let needs_interrupt = state.process_queue(&mem, &reader);
78
79 if needs_interrupt {
80 reader.signal_used();
81 }
82 }
83 }
84
85 struct GpuBackend {
86 ex: Executor,
87 gpu: Rc<RefCell<Gpu>>,
88 resource_bridges: Arc<Mutex<Vec<Tube>>>,
89 acked_protocol_features: u64,
90 state: Option<Rc<RefCell<gpu::Frontend>>>,
91 fence_state: Arc<Mutex<gpu::FenceState>>,
92 queue_workers: [Option<WorkerState<Arc<Mutex<Queue>>, ()>>; MAX_QUEUE_NUM],
93 platform_workers: Rc<RefCell<Vec<TaskHandle<()>>>>,
94 shmem_mapper: Arc<Mutex<Option<Box<dyn SharedMemoryMapper>>>>,
95 }
96
97 impl VhostUserDevice for GpuBackend {
max_queue_num(&self) -> usize98 fn max_queue_num(&self) -> usize {
99 MAX_QUEUE_NUM
100 }
101
features(&self) -> u64102 fn features(&self) -> u64 {
103 self.gpu.borrow().features() | 1 << VHOST_USER_F_PROTOCOL_FEATURES
104 }
105
ack_features(&mut self, value: u64) -> anyhow::Result<()>106 fn ack_features(&mut self, value: u64) -> anyhow::Result<()> {
107 self.gpu.borrow_mut().ack_features(value);
108 Ok(())
109 }
110
acked_features(&self) -> u64111 fn acked_features(&self) -> u64 {
112 self.features()
113 }
114
protocol_features(&self) -> VhostUserProtocolFeatures115 fn protocol_features(&self) -> VhostUserProtocolFeatures {
116 VhostUserProtocolFeatures::CONFIG
117 | VhostUserProtocolFeatures::BACKEND_REQ
118 | VhostUserProtocolFeatures::MQ
119 | VhostUserProtocolFeatures::SHARED_MEMORY_REGIONS
120 }
121
ack_protocol_features(&mut self, features: u64) -> anyhow::Result<()>122 fn ack_protocol_features(&mut self, features: u64) -> anyhow::Result<()> {
123 let unrequested_features = features & !self.protocol_features().bits();
124 if unrequested_features != 0 {
125 bail!("Unexpected protocol features: {:#x}", unrequested_features);
126 }
127
128 self.acked_protocol_features |= features;
129 Ok(())
130 }
131
acked_protocol_features(&self) -> u64132 fn acked_protocol_features(&self) -> u64 {
133 self.acked_protocol_features
134 }
135
read_config(&self, offset: u64, dst: &mut [u8])136 fn read_config(&self, offset: u64, dst: &mut [u8]) {
137 self.gpu.borrow().read_config(offset, dst)
138 }
139
write_config(&self, offset: u64, data: &[u8])140 fn write_config(&self, offset: u64, data: &[u8]) {
141 self.gpu.borrow_mut().write_config(offset, data)
142 }
143
start_queue( &mut self, idx: usize, queue: Queue, mem: GuestMemory, doorbell: Interrupt, ) -> anyhow::Result<()>144 fn start_queue(
145 &mut self,
146 idx: usize,
147 queue: Queue,
148 mem: GuestMemory,
149 doorbell: Interrupt,
150 ) -> anyhow::Result<()> {
151 if self.queue_workers[idx].is_some() {
152 warn!("Starting new queue handler without stopping old handler");
153 self.stop_queue(idx)?;
154 }
155
156 // Create a refcounted queue. The GPU control queue uses a SharedReader which allows us to
157 // handle fences in the RutabagaFenceHandler, and also handle queue messages in
158 // `run_ctrl_queue`.
159 // For the cursor queue, we still create the refcounted queue to support retrieving the
160 // queue for snapshotting (but don't handle any messages).
161 let queue = Arc::new(Mutex::new(queue));
162
163 // Spawn a worker for the queue.
164 let queue_task = match idx {
165 0 => {
166 // Set up worker for the control queue.
167 let kick_evt = queue
168 .lock()
169 .event()
170 .try_clone()
171 .context("failed to clone queue event")?;
172 let kick_evt = EventAsync::new(kick_evt, &self.ex)
173 .context("failed to create EventAsync for kick_evt")?;
174 let reader = SharedReader {
175 queue: queue.clone(),
176 doorbell: doorbell.clone(),
177 };
178
179 let state = if let Some(s) = self.state.as_ref() {
180 s.clone()
181 } else {
182 let fence_handler_resources =
183 Arc::new(Mutex::new(Some(gpu::FenceHandlerActivationResources {
184 mem: mem.clone(),
185 ctrl_queue: reader.clone(),
186 })));
187 let fence_handler = gpu::create_fence_handler(
188 fence_handler_resources,
189 self.fence_state.clone(),
190 );
191
192 let state = Rc::new(RefCell::new(
193 self.gpu
194 .borrow_mut()
195 .initialize_frontend(
196 self.fence_state.clone(),
197 fence_handler,
198 Arc::clone(&self.shmem_mapper),
199 )
200 .ok_or_else(|| anyhow!("failed to initialize gpu frontend"))?,
201 ));
202 self.state = Some(state.clone());
203 state
204 };
205
206 // Start handling platform-specific workers.
207 self.start_platform_workers(doorbell)?;
208
209 // Start handling the control queue.
210 self.ex
211 .spawn_local(run_ctrl_queue(reader, mem, kick_evt, state))
212 }
213 1 => {
214 // For the cursor queue, spawn an empty worker, as we don't process it at all.
215 // We don't handle the cursor queue because no current users of vhost-user GPU pass
216 // any messages on it.
217 self.ex.spawn_local(async {})
218 }
219 _ => bail!("attempted to start unknown queue: {}", idx),
220 };
221
222 self.queue_workers[idx] = Some(WorkerState { queue_task, queue });
223 Ok(())
224 }
225
stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue>226 fn stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue> {
227 if let Some(worker) = self.queue_workers.get_mut(idx).and_then(Option::take) {
228 // Wait for queue_task to be aborted.
229 let _ = self.ex.run_until(worker.queue_task.cancel());
230
231 if idx == 0 {
232 // Stop the non-queue workers if this is the control queue (where we start them).
233 self.stop_non_queue_workers()?;
234
235 // After we stop all workers, we have only one reference left to self.state.
236 // Clearing it allows the GPU state to be destroyed, which gets rid of the
237 // remaining control queue reference from RutabagaFenceHandler.
238 // This allows our worker.queue to be recovered as it has no further references.
239 self.state = None;
240 }
241
242 let queue = match Arc::try_unwrap(worker.queue) {
243 Ok(queue_mutex) => queue_mutex.into_inner(),
244 Err(_) => panic!("failed to recover queue from worker"),
245 };
246
247 Ok(queue)
248 } else {
249 Err(anyhow::Error::new(DeviceError::WorkerNotFound))
250 }
251 }
252
stop_non_queue_workers(&mut self) -> anyhow::Result<()>253 fn stop_non_queue_workers(&mut self) -> anyhow::Result<()> {
254 for handle in self.platform_workers.borrow_mut().drain(..) {
255 let _ = self.ex.run_until(handle.cancel());
256 }
257 Ok(())
258 }
259
reset(&mut self)260 fn reset(&mut self) {
261 self.stop_non_queue_workers()
262 .expect("Failed to stop platform workers.");
263
264 for queue_num in 0..self.max_queue_num() {
265 // The cursor queue is never used, so we should check if the queue is set before
266 // stopping.
267 if self.queue_workers[queue_num].is_some() {
268 if let Err(e) = self.stop_queue(queue_num) {
269 error!("Failed to stop_queue during reset: {}", e);
270 }
271 }
272 }
273 }
274
get_shared_memory_region(&self) -> Option<SharedMemoryRegion>275 fn get_shared_memory_region(&self) -> Option<SharedMemoryRegion> {
276 self.gpu.borrow().get_shared_memory_region()
277 }
278
set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>)279 fn set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>) {
280 if self
281 .shmem_mapper
282 .lock()
283 .replace(conn.take_shmem_mapper().unwrap())
284 .is_some()
285 {
286 warn!("Connection already established. Overwriting shmem_mapper");
287 }
288 }
289
snapshot(&self) -> anyhow::Result<Vec<u8>>290 fn snapshot(&self) -> anyhow::Result<Vec<u8>> {
291 // TODO(b/289431114): Snapshot more fields if needed. Right now we just need a bare bones
292 // snapshot of the GPU to create a POC.
293 serde_json::to_vec(&serde_json::Value::Null)
294 .context("Failed to serialize Null in the GPU device")
295 }
296
restore(&mut self, data: Vec<u8>) -> anyhow::Result<()>297 fn restore(&mut self, data: Vec<u8>) -> anyhow::Result<()> {
298 let data: serde_json::Value = serde_json::from_slice(data.as_slice())
299 .context("Failed to deserialize NULL in the GPU device")?;
300 anyhow::ensure!(
301 data.is_null(),
302 "unexpected snapshot data: should be null, got {}",
303 data
304 );
305 Ok(())
306 }
307 }
308
309 impl Drop for GpuBackend {
drop(&mut self)310 fn drop(&mut self) {
311 // Workers are detached and will leak unless they are aborted. Aborting marks the
312 // Abortable task, then wakes it up. This means the executor should be asked to continue
313 // running for one more step after the backend is destroyed.
314 self.reset();
315 }
316 }
317