1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::collections::BTreeMap;
6 use std::fs::File;
7 use std::io;
8 use std::mem::size_of;
9 use std::time::Duration;
10
11 use anyhow::anyhow;
12 use anyhow::Context;
13 use base::error;
14 use base::AsRawDescriptor;
15 use base::Error as SysError;
16 use base::Event;
17 use base::RawDescriptor;
18 use base::Result as SysResult;
19 use base::Timer;
20 use base::Tube;
21 use base::TubeError;
22 use base::WorkerThread;
23 use cros_async::select2;
24 use cros_async::select3;
25 use cros_async::AsyncError;
26 use cros_async::EventAsync;
27 use cros_async::Executor;
28 use cros_async::TimerAsync;
29 use data_model::Le32;
30 use data_model::Le64;
31 use futures::pin_mut;
32 use remain::sorted;
33 use snapshot::AnySnapshot;
34 use thiserror::Error;
35 use vm_control::MemSlot;
36 use vm_control::VmMemoryMappingRequest;
37 use vm_control::VmMemoryMappingResponse;
38 use vm_memory::GuestAddress;
39 use vm_memory::GuestMemory;
40 use zerocopy::FromBytes;
41 use zerocopy::Immutable;
42 use zerocopy::IntoBytes;
43 use zerocopy::KnownLayout;
44
45 use super::async_utils;
46 use super::copy_config;
47 use super::DescriptorChain;
48 use super::DeviceType;
49 use super::Interrupt;
50 use super::Queue;
51 use super::VirtioDevice;
52
// virtio-pmem exposes a single request queue.
const QUEUE_SIZE: u16 = 256;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

/* Feature bits */
// Advertises support for discard (MadviseRemove) requests.
// NOTE(review): bit 63 is outside the virtio-spec pmem feature range — presumably a
// crosvm-private extension paired with VIRTIO_PMEM_REQ_TYPE_DISCARD; confirm against
// the guest driver.
const VIRTIO_PMEM_F_DISCARD: u32 = 63;

// Request types read from the guest (virtio_pmem_req/virtio_pmem_range_req `type_`).
const VIRTIO_PMEM_REQ_TYPE_FLUSH: u32 = 0;
// Discard is a nonstandard extension; see VIRTIO_PMEM_F_DISCARD above.
const VIRTIO_PMEM_REQ_TYPE_DISCARD: u32 = u32::MAX;
// Status codes written back to the guest in virtio_pmem_resp.
const VIRTIO_PMEM_RESP_TYPE_OK: u32 = 0;
const VIRTIO_PMEM_RESP_TYPE_EIO: u32 = 1;
63
/// Device configuration space layout: guest physical start address and size of the
/// mapped region, both little-endian (served to the guest by `read_config`).
#[derive(Copy, Clone, Debug, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[repr(C)]
struct virtio_pmem_config {
    start_address: Le64,
    size: Le64,
}
70
/// Response written back to the guest: one of the `VIRTIO_PMEM_RESP_TYPE_*` codes.
#[derive(Copy, Clone, Debug, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[repr(C)]
struct virtio_pmem_resp {
    status_code: Le32,
}
76
/// Basic guest request carrying only a request type (used for FLUSH).
#[derive(Copy, Clone, Debug, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[repr(C)]
struct virtio_pmem_req {
    type_: Le32,
}
82
/// Extended guest request carrying a range within the mapping (used by the
/// nonstandard discard extension). `start_address` is relative to the mapping start.
#[derive(Copy, Clone, Debug, Default, FromBytes, Immutable, IntoBytes, KnownLayout)]
#[repr(C)]
struct virtio_pmem_range_req {
    type_: Le32,
    padding_: Le32,
    start_address: Le64,
    size: Le64,
}
91
/// Errors that can terminate the worker's queue-handling or pageout tasks.
/// Variants must stay alphabetically ordered for `#[sorted]`.
#[sorted]
#[derive(Error, Debug)]
enum Error {
    /// Failed to get value from pageout timer.
    #[error("failed to get value from pageout timer: {0}")]
    PageoutTimer(AsyncError),
    /// Failed to read from virtqueue.
    #[error("failed to read from virtqueue: {0}")]
    ReadQueue(io::Error),
    /// Failed to receive tube response.
    #[error("failed to receive tube response: {0}")]
    ReceiveResponse(TubeError),
    /// Failed to send tube request.
    #[error("failed to send tube request: {0}")]
    SendingRequest(TubeError),
    /// Failed to write to virtqueue.
    #[error("failed to write to virtqueue: {0}")]
    WriteQueue(io::Error),
}
111
// Local result alias for the worker-task error type above.
type Result<T> = ::std::result::Result<T, Error>;
113
pageout( ex: &Executor, swap_interval: Duration, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, ) -> Result<()>114 async fn pageout(
115 ex: &Executor,
116 swap_interval: Duration,
117 pmem_device_tube: &Tube,
118 mapping_arena_slot: u32,
119 mapping_size: usize,
120 ) -> Result<()> {
121 let timer = Timer::new().expect("Failed to create a timer");
122 let mut pageout_timer =
123 TimerAsync::new(timer, ex).expect("Failed to create an async pageout timer");
124 pageout_timer
125 .reset_repeating(swap_interval)
126 .expect("Failed to reset pageout timer");
127
128 loop {
129 pageout_timer.wait().await.map_err(Error::PageoutTimer)?;
130 let request = VmMemoryMappingRequest::MadvisePageout {
131 slot: mapping_arena_slot,
132 offset: 0,
133 size: mapping_size,
134 };
135
136 pmem_device_tube
137 .send(&request)
138 .map_err(Error::SendingRequest)?;
139 match pmem_device_tube
140 .recv::<VmMemoryMappingResponse>()
141 .map_err(Error::ReceiveResponse)?
142 {
143 VmMemoryMappingResponse::Ok => {}
144 VmMemoryMappingResponse::Err(e) => {
145 error!("failed to page out the memory mapping: {}", e);
146 }
147 };
148 }
149 }
150
execute_request( request_type: u32, start_address: u64, size: u64, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, ) -> u32151 fn execute_request(
152 request_type: u32,
153 start_address: u64,
154 size: u64,
155 pmem_device_tube: &Tube,
156 mapping_arena_slot: u32,
157 mapping_size: usize,
158 ) -> u32 {
159 match request_type {
160 VIRTIO_PMEM_REQ_TYPE_FLUSH => {
161 let request = VmMemoryMappingRequest::MsyncArena {
162 slot: mapping_arena_slot,
163 offset: 0, // The pmem backing file is always at offset 0 in the arena.
164 size: mapping_size,
165 };
166
167 if let Err(e) = pmem_device_tube.send(&request) {
168 error!("failed to send request: {}", e);
169 return VIRTIO_PMEM_RESP_TYPE_EIO;
170 }
171
172 match pmem_device_tube.recv() {
173 Ok(response) => match response {
174 VmMemoryMappingResponse::Ok => VIRTIO_PMEM_RESP_TYPE_OK,
175 VmMemoryMappingResponse::Err(e) => {
176 error!("failed flushing disk image: {}", e);
177 VIRTIO_PMEM_RESP_TYPE_EIO
178 }
179 },
180 Err(e) => {
181 error!("failed to receive data: {}", e);
182 VIRTIO_PMEM_RESP_TYPE_EIO
183 }
184 }
185 }
186
187 VIRTIO_PMEM_REQ_TYPE_DISCARD => {
188 let request = VmMemoryMappingRequest::MadviseRemove {
189 slot: mapping_arena_slot,
190 offset: usize::try_from(start_address).unwrap(),
191 size: usize::try_from(size).unwrap(),
192 };
193
194 if let Err(e) = pmem_device_tube.send(&request) {
195 error!("failed to send request: {}", e);
196 return VIRTIO_PMEM_RESP_TYPE_EIO;
197 }
198
199 match pmem_device_tube.recv() {
200 Ok(response) => match response {
201 VmMemoryMappingResponse::Ok => VIRTIO_PMEM_RESP_TYPE_OK,
202 VmMemoryMappingResponse::Err(e) => {
203 error!("failed to discard memory range: {}", e);
204 VIRTIO_PMEM_RESP_TYPE_EIO
205 }
206 },
207 Err(e) => {
208 error!("failed to receive data: {}", e);
209 VIRTIO_PMEM_RESP_TYPE_EIO
210 }
211 }
212 }
213
214 _ => {
215 error!("unknown request type: {}", request_type);
216 VIRTIO_PMEM_RESP_TYPE_EIO
217 }
218 }
219 }
220
handle_request( avail_desc: &mut DescriptorChain, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, ) -> Result<usize>221 fn handle_request(
222 avail_desc: &mut DescriptorChain,
223 pmem_device_tube: &Tube,
224 mapping_arena_slot: u32,
225 mapping_size: usize,
226 ) -> Result<usize> {
227 let (request_type, start_address, size) =
228 if avail_desc.reader.available_bytes() == size_of::<virtio_pmem_req>() {
229 let request = avail_desc
230 .reader
231 .read_obj::<virtio_pmem_req>()
232 .map_err(Error::ReadQueue)?;
233 (request.type_.to_native(), 0, 0)
234 } else {
235 let request = avail_desc
236 .reader
237 .read_obj::<virtio_pmem_range_req>()
238 .map_err(Error::ReadQueue)?;
239 (
240 request.type_.to_native(),
241 request.start_address.to_native(),
242 request.size.to_native(),
243 )
244 };
245 let status_code = execute_request(
246 request_type,
247 start_address,
248 size,
249 pmem_device_tube,
250 mapping_arena_slot,
251 mapping_size,
252 );
253
254 let response = virtio_pmem_resp {
255 status_code: status_code.into(),
256 };
257
258 avail_desc
259 .writer
260 .write_obj(response)
261 .map_err(Error::WriteQueue)?;
262
263 Ok(avail_desc.writer.bytes_written())
264 }
265
handle_queue( queue: &mut Queue, mut queue_event: EventAsync, pmem_device_tube: &Tube, mapping_arena_slot: u32, mapping_size: usize, )266 async fn handle_queue(
267 queue: &mut Queue,
268 mut queue_event: EventAsync,
269 pmem_device_tube: &Tube,
270 mapping_arena_slot: u32,
271 mapping_size: usize,
272 ) {
273 loop {
274 let mut avail_desc = match queue.next_async(&mut queue_event).await {
275 Err(e) => {
276 error!("Failed to read descriptor {}", e);
277 return;
278 }
279 Ok(d) => d,
280 };
281
282 let written = match handle_request(
283 &mut avail_desc,
284 pmem_device_tube,
285 mapping_arena_slot,
286 mapping_size,
287 ) {
288 Ok(n) => n,
289 Err(e) => {
290 error!("pmem: failed to handle request: {}", e);
291 0
292 }
293 };
294 queue.add_used(avail_desc, written as u32);
295 queue.trigger_interrupt();
296 }
297 }
298
/// Worker-thread entry point: drives the request queue (and, when a swap interval
/// is configured, the periodic pageout task) on a local async executor until
/// `kill_evt` fires or a task exits.
fn run_worker(
    queue: &mut Queue,
    pmem_device_tube: &Tube,
    kill_evt: Event,
    mapping_arena_slot: u32,
    mapping_size: usize,
    swap_interval: Option<Duration>,
) {
    let ex = Executor::new().unwrap();

    let queue_evt = queue
        .event()
        .try_clone()
        .expect("failed to clone queue event");
    let queue_evt = EventAsync::new(queue_evt, &ex).expect("failed to set up the queue event");

    // Process requests from the virtio queue.
    let queue_fut = handle_queue(
        queue,
        queue_evt,
        pmem_device_tube,
        mapping_arena_slot,
        mapping_size,
    );
    pin_mut!(queue_fut);

    // Exit if the kill event is triggered.
    let kill = async_utils::await_and_exit(&ex, kill_evt);
    pin_mut!(kill);

    // `None` and a zero interval both mean periodic pageout is disabled.
    let interval = swap_interval.unwrap_or(Duration::ZERO);
    if interval.is_zero() {
        // select2/select3 complete when the first future finishes, so the kill
        // event tears down the whole loop.
        if let Err(e) = ex.run_until(select2(queue_fut, kill)) {
            error!("error happened in executor: {}", e);
        }
    } else {
        // Also page out the mapping periodically while serving requests.
        let pageout_fut = pageout(
            &ex,
            interval,
            pmem_device_tube,
            mapping_arena_slot,
            mapping_size,
        );
        pin_mut!(pageout_fut);
        if let Err(e) = ex.run_until(select3(queue_fut, kill, pageout_fut)) {
            error!("error happened in executor: {}", e);
        }
    }
}
348
/// Specifies how the memory slot is initialized.
pub enum MemSlotConfig {
    /// The memory region has already been mapped to the guest.
    MemSlot {
        /// Index of the guest-mapped memory region.
        idx: MemSlot,
    },
    /// The memory region that is not initialized yet and whose slot index will be provided via
    /// `Tube` later. e.g. pmem-ext2 device, where fs construction will be done in the main
    /// process.
    LazyInit { tube: Tube },
}
361
/// State of a virtio-pmem device.
pub struct Pmem {
    // Running worker; on stop it hands back the queue and the device tube.
    worker_thread: Option<WorkerThread<(Queue, Tube)>>,
    // Negotiable virtio feature bits (base features plus optional DISCARD).
    features: u64,
    // Backing disk image, if the mapping is file-backed; kept only so its
    // descriptor survives sandboxing (see `keep_rds`).
    disk_image: Option<File>,
    // Guest physical address of the mapping.
    mapping_address: GuestAddress,
    // How the memory slot backing the mapping is obtained.
    mem_slot: MemSlotConfig,
    // Size of the mapping in bytes (validated to fit in usize in `new`).
    mapping_size: u64,
    // Tube to the main process; `None` while a worker owns it.
    pmem_device_tube: Option<Tube>,
    // Interval for the periodic pageout task; `None` disables it.
    swap_interval: Option<Duration>,
}
372
/// Serialized state for snapshot/restore. Only the mapping geometry is recorded;
/// restore verifies it matches the current configuration.
#[derive(serde::Serialize, serde::Deserialize)]
struct PmemSnapshot {
    mapping_address: GuestAddress,
    mapping_size: u64,
}
378
/// Configuration of a virtio-pmem device.
pub struct PmemConfig {
    /// Disk image exposed to the guest.
    /// If the memory region is not backed by a file, this should be `None`.
    pub disk_image: Option<File>,
    /// Guest physical address where the memory will be mapped.
    pub mapping_address: GuestAddress,
    /// How the memory slot backing the mapping is initialized.
    pub mem_slot: MemSlotConfig,
    /// The size of the mapped region.
    pub mapping_size: u64,
    /// A communication channel to the main process to send memory requests.
    pub pmem_device_tube: Tube,
    /// Interval for periodic swap out of memory mapping
    pub swap_interval: Option<Duration>,
    /// Whether the region is writable or not.
    pub mapping_writable: bool,
}
396
397 impl Pmem {
new(base_features: u64, cfg: PmemConfig) -> SysResult<Pmem>398 pub fn new(base_features: u64, cfg: PmemConfig) -> SysResult<Pmem> {
399 if cfg.mapping_size > usize::MAX as u64 {
400 return Err(SysError::new(libc::EOVERFLOW));
401 }
402
403 let mut avail_features = base_features;
404 if cfg.mapping_writable {
405 if let MemSlotConfig::LazyInit { .. } = cfg.mem_slot {
406 error!("pmem-ext2 must be a read-only device");
407 return Err(SysError::new(libc::EINVAL));
408 }
409
410 avail_features |= 1 << VIRTIO_PMEM_F_DISCARD;
411 }
412
413 Ok(Pmem {
414 worker_thread: None,
415 features: avail_features,
416 disk_image: cfg.disk_image,
417 mapping_address: cfg.mapping_address,
418 mem_slot: cfg.mem_slot,
419 mapping_size: cfg.mapping_size,
420 pmem_device_tube: Some(cfg.pmem_device_tube),
421 swap_interval: cfg.swap_interval,
422 })
423 }
424 }
425
impl VirtioDevice for Pmem {
    // Descriptors that must remain open across sandboxing: the backing image,
    // the device tube, and (for lazy init) the slot-delivery tube.
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut keep_rds = Vec::new();
        if let Some(disk_image) = &self.disk_image {
            keep_rds.push(disk_image.as_raw_descriptor());
        }

        if let Some(ref pmem_device_tube) = self.pmem_device_tube {
            keep_rds.push(pmem_device_tube.as_raw_descriptor());
        }

        if let MemSlotConfig::LazyInit { tube } = &self.mem_slot {
            keep_rds.push(tube.as_raw_descriptor());
        }

        keep_rds
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Pmem
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.features
    }

    // Serves the virtio_pmem_config layout (start address + size) to the guest.
    fn read_config(&self, offset: u64, data: &mut [u8]) {
        let config = virtio_pmem_config {
            start_address: Le64::from(self.mapping_address.offset()),
            size: Le64::from(self.mapping_size),
        };
        copy_config(data, 0, config.as_bytes(), offset);
    }

    // Starts the worker thread that serves the single request queue. Takes
    // ownership of the device tube; it is returned when the worker stops.
    fn activate(
        &mut self,
        _memory: GuestMemory,
        _interrupt: Interrupt,
        mut queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        if queues.len() != 1 {
            return Err(anyhow!("expected 1 queue, got {}", queues.len()));
        }

        let mut queue = queues.remove(&0).unwrap();

        // We checked that this fits in a usize in `Pmem::new`.
        let mapping_size = self.mapping_size as usize;

        let pmem_device_tube = self
            .pmem_device_tube
            .take()
            .context("missing pmem device tube")?;

        let swap_interval = self.swap_interval;

        // For lazy init, this blocks until the main process sends the slot index.
        let mapping_arena_slot = match &self.mem_slot {
            MemSlotConfig::MemSlot { idx } => *idx,
            MemSlotConfig::LazyInit { tube } => tube
                .recv::<u32>()
                .context("failed to receive memory slot for ext2 pmem device")?,
        };

        self.worker_thread = Some(WorkerThread::start("v_pmem", move |kill_event| {
            run_worker(
                &mut queue,
                &pmem_device_tube,
                kill_event,
                mapping_arena_slot,
                mapping_size,
                swap_interval,
            );
            // Return the queue and tube so the device can be re-activated later.
            (queue, pmem_device_tube)
        }));

        Ok(())
    }

    // Stops the worker (if any) and reclaims the device tube for re-activation.
    fn reset(&mut self) -> anyhow::Result<()> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let (_queue, pmem_device_tube) = worker_thread.stop();
            self.pmem_device_tube = Some(pmem_device_tube);
        }
        Ok(())
    }

    // Stops the worker and hands the queue back to the caller so the device can
    // be resumed later via `virtio_wake`.
    fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
        if let Some(worker_thread) = self.worker_thread.take() {
            let (queue, pmem_device_tube) = worker_thread.stop();
            self.pmem_device_tube = Some(pmem_device_tube);
            return Ok(Some(BTreeMap::from([(0, queue)])));
        }
        Ok(None)
    }

    // Resumes by re-activating with the state captured at sleep time.
    fn virtio_wake(
        &mut self,
        queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
    ) -> anyhow::Result<()> {
        if let Some((mem, interrupt, queues)) = queues_state {
            self.activate(mem, interrupt, queues)?;
        }
        Ok(())
    }

    // Only the mapping geometry is snapshotted; the mapping contents live in
    // guest memory / the backing file and are handled elsewhere.
    fn virtio_snapshot(&mut self) -> anyhow::Result<AnySnapshot> {
        AnySnapshot::to_any(PmemSnapshot {
            mapping_address: self.mapping_address,
            mapping_size: self.mapping_size,
        })
        .context("failed to serialize pmem snapshot")
    }

    // Restore only validates that the snapshot matches the current config.
    fn virtio_restore(&mut self, data: AnySnapshot) -> anyhow::Result<()> {
        let snapshot: PmemSnapshot =
            AnySnapshot::from_any(data).context("failed to deserialize pmem snapshot")?;
        anyhow::ensure!(
            snapshot.mapping_address == self.mapping_address
                && snapshot.mapping_size == self.mapping_size,
            "pmem snapshot doesn't match config: expected {:?}, got {:?}",
            (self.mapping_address, self.mapping_size),
            (snapshot.mapping_address, snapshot.mapping_size),
        );
        Ok(())
    }
}
556