1 // Copyright 2021 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::cell::RefCell;
6 use std::io::{self, Write};
7 use std::mem::size_of;
8 use std::rc::Rc;
9 use std::result;
10 use std::sync::{atomic::AtomicU64, atomic::Ordering, Arc};
11 use std::thread;
12 use std::time::Duration;
13 use std::u32;
14
15 use futures::pin_mut;
16 use futures::stream::{FuturesUnordered, StreamExt};
17 use remain::sorted;
18 use thiserror::Error as ThisError;
19
20 use base::Error as SysError;
21 use base::Result as SysResult;
22 use base::{error, info, warn, AsRawDescriptor, Event, RawDescriptor, Timer, Tube, TubeError};
23 use cros_async::{
24 select5, sync::Mutex as AsyncMutex, AsyncError, AsyncTube, EventAsync, Executor, SelectResult,
25 TimerAsync,
26 };
27 use data_model::DataInit;
28 use disk::{AsyncDisk, ToAsyncDisk};
29 use vm_control::{DiskControlCommand, DiskControlResult};
30 use vm_memory::GuestMemory;
31
32 use super::common::*;
33 use crate::virtio::{
34 async_utils, block::sys::*, copy_config, DescriptorChain, DescriptorError, Interrupt, Queue,
35 Reader, SignalableInterrupt, VirtioDevice, Writer, TYPE_BLOCK,
36 };
37
38 const QUEUE_SIZE: u16 = 256;
39 const NUM_QUEUES: u16 = 16;
40 const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES as usize];
41
42 #[sorted]
43 #[derive(ThisError, Debug)]
44 enum ExecuteError {
45 #[error("failed to copy ID string: {0}")]
46 CopyId(io::Error),
47 #[error("virtio descriptor error: {0}")]
48 Descriptor(DescriptorError),
49 #[error("failed to perform discard or write zeroes; sector={sector} num_sectors={num_sectors} flags={flags}; {ioerr:?}")]
50 DiscardWriteZeroes {
51 ioerr: Option<disk::Error>,
52 sector: u64,
53 num_sectors: u32,
54 flags: u32,
55 },
56 #[error("failed to flush: {0}")]
57 Flush(disk::Error),
58 #[error("not enough space in descriptor chain to write status")]
59 MissingStatus,
60 #[error("out of range")]
61 OutOfRange,
62 #[error("failed to read message: {0}")]
63 Read(io::Error),
64 #[error("io error reading {length} bytes from sector {sector}: {desc_error}")]
65 ReadIo {
66 length: usize,
67 sector: u64,
68 desc_error: disk::Error,
69 },
70 #[error("read only; request_type={request_type}")]
71 ReadOnly { request_type: u32 },
72 #[error("failed to recieve command message: {0}")]
73 ReceivingCommand(TubeError),
74 #[error("failed to send command response: {0}")]
75 SendingResponse(TubeError),
76 #[error("couldn't reset the timer: {0}")]
77 TimerReset(base::Error),
78 #[error("unsupported ({0})")]
79 Unsupported(u32),
80 #[error("io error writing {length} bytes from sector {sector}: {desc_error}")]
81 WriteIo {
82 length: usize,
83 sector: u64,
84 desc_error: disk::Error,
85 },
86 #[error("failed to write request status: {0}")]
87 WriteStatus(io::Error),
88 }
89
90 impl ExecuteError {
status(&self) -> u891 fn status(&self) -> u8 {
92 match self {
93 ExecuteError::CopyId(_) => VIRTIO_BLK_S_IOERR,
94 ExecuteError::Descriptor(_) => VIRTIO_BLK_S_IOERR,
95 ExecuteError::DiscardWriteZeroes { .. } => VIRTIO_BLK_S_IOERR,
96 ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
97 ExecuteError::MissingStatus => VIRTIO_BLK_S_IOERR,
98 ExecuteError::OutOfRange { .. } => VIRTIO_BLK_S_IOERR,
99 ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
100 ExecuteError::ReadIo { .. } => VIRTIO_BLK_S_IOERR,
101 ExecuteError::ReadOnly { .. } => VIRTIO_BLK_S_IOERR,
102 ExecuteError::ReceivingCommand(_) => VIRTIO_BLK_S_IOERR,
103 ExecuteError::SendingResponse(_) => VIRTIO_BLK_S_IOERR,
104 ExecuteError::TimerReset(_) => VIRTIO_BLK_S_IOERR,
105 ExecuteError::WriteIo { .. } => VIRTIO_BLK_S_IOERR,
106 ExecuteError::WriteStatus(_) => VIRTIO_BLK_S_IOERR,
107 ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
108 }
109 }
110 }
111
/// Errors that happen in block outside of executing a request.
/// These arise from the background tasks: the periodic flush task and
/// irq-resample handling.
#[sorted]
#[derive(ThisError, Debug)]
pub enum ControlError {
    #[error("couldn't create an async resample event: {0}")]
    AsyncResampleCreate(AsyncError),
    #[error("couldn't clone the resample event: {0}")]
    CloneResampleEvent(base::Error),
    #[error("couldn't get a value from a timer for flushing: {0}")]
    FlushTimer(AsyncError),
    #[error("failed to fsync the disk: {0}")]
    FsyncDisk(disk::Error),
    #[error("couldn't read the resample event: {0}")]
    ReadResampleEvent(AsyncError),
}
128
/// Tracks the state of an asynchronous disk.
pub struct DiskState {
    /// The backing disk image, already converted for asynchronous I/O.
    pub disk_image: Box<dyn AsyncDisk>,
    /// Current disk size in bytes. Shared with the device config space:
    /// stored with `Release` on resize and loaded with `Acquire` in `read_config`.
    pub disk_size: Arc<AtomicU64>,
    /// Whether the guest is allowed to write to the disk.
    pub read_only: bool,
    /// Whether discard (hole punching) requests are honored.
    pub sparse: bool,
    /// Optional ID string returned for VIRTIO_BLK_T_GET_ID requests.
    pub id: Option<BlockId>,
}
137
138 impl DiskState {
139 /// Creates a `DiskState` with the given params.
new( disk_image: Box<dyn AsyncDisk>, disk_size: Arc<AtomicU64>, read_only: bool, sparse: bool, id: Option<BlockId>, ) -> DiskState140 pub fn new(
141 disk_image: Box<dyn AsyncDisk>,
142 disk_size: Arc<AtomicU64>,
143 read_only: bool,
144 sparse: bool,
145 id: Option<BlockId>,
146 ) -> DiskState {
147 DiskState {
148 disk_image,
149 disk_size,
150 read_only,
151 sparse,
152 id,
153 }
154 }
155 }
156
process_one_request( avail_desc: DescriptorChain, disk_state: Rc<AsyncMutex<DiskState>>, flush_timer: Rc<RefCell<TimerAsync>>, flush_timer_armed: Rc<RefCell<bool>>, mem: &GuestMemory, ) -> result::Result<usize, ExecuteError>157 async fn process_one_request(
158 avail_desc: DescriptorChain,
159 disk_state: Rc<AsyncMutex<DiskState>>,
160 flush_timer: Rc<RefCell<TimerAsync>>,
161 flush_timer_armed: Rc<RefCell<bool>>,
162 mem: &GuestMemory,
163 ) -> result::Result<usize, ExecuteError> {
164 let mut reader =
165 Reader::new(mem.clone(), avail_desc.clone()).map_err(ExecuteError::Descriptor)?;
166 let mut writer = Writer::new(mem.clone(), avail_desc).map_err(ExecuteError::Descriptor)?;
167
168 // The last byte of the buffer is virtio_blk_req::status.
169 // Split it into a separate Writer so that status_writer is the final byte and
170 // the original writer is left with just the actual block I/O data.
171 let available_bytes = writer.available_bytes();
172 let status_offset = available_bytes
173 .checked_sub(1)
174 .ok_or(ExecuteError::MissingStatus)?;
175 let mut status_writer = writer.split_at(status_offset);
176
177 let status = match BlockAsync::execute_request(
178 &mut reader,
179 &mut writer,
180 disk_state,
181 flush_timer,
182 flush_timer_armed,
183 )
184 .await
185 {
186 Ok(()) => VIRTIO_BLK_S_OK,
187 Err(e) => {
188 if !matches!(e, ExecuteError::Unsupported(VIRTIO_BLK_T_GET_ID)) {
189 error!("failed executing disk request: {}", e);
190 }
191 e.status()
192 }
193 };
194
195 status_writer
196 .write_all(&[status])
197 .map_err(ExecuteError::WriteStatus)?;
198 Ok(available_bytes)
199 }
200
201 /// Process one descriptor chain asynchronously.
process_one_chain<I: SignalableInterrupt>( queue: Rc<RefCell<Queue>>, avail_desc: DescriptorChain, disk_state: Rc<AsyncMutex<DiskState>>, mem: GuestMemory, interrupt: &I, flush_timer: Rc<RefCell<TimerAsync>>, flush_timer_armed: Rc<RefCell<bool>>, )202 pub async fn process_one_chain<I: SignalableInterrupt>(
203 queue: Rc<RefCell<Queue>>,
204 avail_desc: DescriptorChain,
205 disk_state: Rc<AsyncMutex<DiskState>>,
206 mem: GuestMemory,
207 interrupt: &I,
208 flush_timer: Rc<RefCell<TimerAsync>>,
209 flush_timer_armed: Rc<RefCell<bool>>,
210 ) {
211 let descriptor_index = avail_desc.index;
212 let len =
213 match process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem)
214 .await
215 {
216 Ok(len) => len,
217 Err(e) => {
218 error!("block: failed to handle request: {}", e);
219 0
220 }
221 };
222
223 let mut queue = queue.borrow_mut();
224 queue.add_used(&mem, descriptor_index, len as u32);
225 queue.trigger_interrupt(&mem, interrupt);
226 }
227
228 // There is one async task running `handle_queue` per virtio queue in use.
229 // Receives messages from the guest and queues a task to complete the operations with the async
230 // executor.
handle_queue<I: SignalableInterrupt + Clone + 'static>( ex: Executor, mem: GuestMemory, disk_state: Rc<AsyncMutex<DiskState>>, queue: Rc<RefCell<Queue>>, evt: EventAsync, interrupt: I, flush_timer: Rc<RefCell<TimerAsync>>, flush_timer_armed: Rc<RefCell<bool>>, )231 pub async fn handle_queue<I: SignalableInterrupt + Clone + 'static>(
232 ex: Executor,
233 mem: GuestMemory,
234 disk_state: Rc<AsyncMutex<DiskState>>,
235 queue: Rc<RefCell<Queue>>,
236 evt: EventAsync,
237 interrupt: I,
238 flush_timer: Rc<RefCell<TimerAsync>>,
239 flush_timer_armed: Rc<RefCell<bool>>,
240 ) {
241 loop {
242 if let Err(e) = evt.next_val().await {
243 error!("Failed to read the next queue event: {}", e);
244 continue;
245 }
246 while let Some(descriptor_chain) = queue.borrow_mut().pop(&mem) {
247 let queue = Rc::clone(&queue);
248 let disk_state = Rc::clone(&disk_state);
249 let mem = mem.clone();
250 let interrupt = interrupt.clone();
251 let flush_timer = Rc::clone(&flush_timer);
252 let flush_timer_armed = Rc::clone(&flush_timer_armed);
253
254 ex.spawn_local(async move {
255 process_one_chain(
256 queue,
257 descriptor_chain,
258 disk_state,
259 mem,
260 &interrupt,
261 flush_timer,
262 flush_timer_armed,
263 )
264 .await
265 })
266 .detach();
267 }
268 }
269 }
270
handle_command_tube( command_tube: &Option<AsyncTube>, interrupt: Rc<RefCell<Interrupt>>, disk_state: Rc<AsyncMutex<DiskState>>, ) -> Result<(), ExecuteError>271 async fn handle_command_tube(
272 command_tube: &Option<AsyncTube>,
273 interrupt: Rc<RefCell<Interrupt>>,
274 disk_state: Rc<AsyncMutex<DiskState>>,
275 ) -> Result<(), ExecuteError> {
276 let command_tube = match command_tube {
277 Some(c) => c,
278 None => {
279 let () = futures::future::pending().await;
280 return Ok(());
281 }
282 };
283 loop {
284 match command_tube.next().await {
285 Ok(command) => {
286 let resp = match command {
287 DiskControlCommand::Resize { new_size } => {
288 resize(Rc::clone(&disk_state), new_size).await
289 }
290 };
291
292 let resp_clone = resp.clone();
293 command_tube
294 .send(resp_clone)
295 .await
296 .map_err(ExecuteError::SendingResponse)?;
297 if let DiskControlResult::Ok = resp {
298 interrupt.borrow().signal_config_changed();
299 }
300 }
301 Err(e) => return Err(ExecuteError::ReceivingCommand(e)),
302 }
303 }
304 }
305
// Resizes the backing disk image to `new_size` bytes, returning the result
// that will be sent back over the control tube.
async fn resize(disk_state: Rc<AsyncMutex<DiskState>>, new_size: u64) -> DiskControlResult {
    // Acquire exclusive, mutable access to the state so the virtqueue task won't be able to read
    // the state while resizing.
    let mut disk_state = disk_state.lock().await;

    if disk_state.read_only {
        error!("Attempted to resize read-only block device");
        return DiskControlResult::Err(SysError::new(libc::EROFS));
    }

    info!("Resizing block device to {} bytes", new_size);

    if let Err(e) = disk_state.disk_image.set_len(new_size) {
        error!("Resizing disk failed! {}", e);
        return DiskControlResult::Err(SysError::new(libc::EIO));
    }

    // Allocate new space if the disk image is not sparse.
    // NOTE(review): the allocation below runs unconditionally even though this
    // comment suggests it should only apply to non-sparse images — confirm
    // whether a `!disk_state.sparse` guard was intended.
    if let Err(e) = disk_state.disk_image.allocate(0, new_size) {
        error!("Allocating disk space after resize failed! {}", e);
        return DiskControlResult::Err(SysError::new(libc::EIO));
    }

    // After allocation the image is fully backed, so discard requests become
    // no-op hints from here on (see the DISCARD arm in execute_request).
    disk_state.sparse = false;

    if let Ok(new_disk_size) = disk_state.disk_image.get_len() {
        // Release pairs with the Acquire load in read_config so the guest sees
        // the new capacity once the config-changed interrupt fires.
        disk_state.disk_size.store(new_disk_size, Ordering::Release);
    }
    DiskControlResult::Ok
}
336
337 /// Periodically flushes the disk when the given timer fires.
flush_disk( disk_state: Rc<AsyncMutex<DiskState>>, timer: TimerAsync, armed: Rc<RefCell<bool>>, ) -> Result<(), ControlError>338 pub async fn flush_disk(
339 disk_state: Rc<AsyncMutex<DiskState>>,
340 timer: TimerAsync,
341 armed: Rc<RefCell<bool>>,
342 ) -> Result<(), ControlError> {
343 loop {
344 timer.next_val().await.map_err(ControlError::FlushTimer)?;
345 if !*armed.borrow() {
346 continue;
347 }
348
349 // Reset armed before calling fsync to guarantee that IO requests that started after we call
350 // fsync will be committed eventually.
351 *armed.borrow_mut() = false;
352
353 disk_state
354 .read_lock()
355 .await
356 .disk_image
357 .fsync()
358 .await
359 .map_err(ControlError::FsyncDisk)?;
360 }
361 }
362
// The main worker thread. Initializes the asynchronous worker tasks and passes them to the executor
// to be processed.
//
// `disk_state` is wrapped by `AsyncMutex`, which provides both shared and exclusive locks. It's
// because the state can be read from the virtqueue task while the control task is processing
// a resizing command.
//
// Returns Ok(()) when the kill event fires, or Err with a description if any of
// the long-running tasks fails first.
fn run_worker(
    ex: Executor,
    interrupt: Interrupt,
    queues: Vec<Queue>,
    mem: GuestMemory,
    disk_state: &Rc<AsyncMutex<DiskState>>,
    control_tube: &Option<AsyncTube>,
    queue_evts: Vec<Event>,
    kill_evt: Event,
) -> Result<(), String> {
    if queues.len() != queue_evts.len() {
        return Err("Number of queues and events must match.".to_string());
    }

    // Shared by the queue handlers, the control task, and the resample task.
    let interrupt = Rc::new(RefCell::new(interrupt));

    // One flush timer per disk.
    let timer = Timer::new().expect("Failed to create a timer");
    let flush_timer_armed = Rc::new(RefCell::new(false));

    // Process any requests to resample the irq value.
    let resample = async_utils::handle_irq_resample(&ex, Rc::clone(&interrupt));
    pin_mut!(resample);

    // Handles control requests.
    let control = handle_command_tube(control_tube, Rc::clone(&interrupt), disk_state.clone());
    pin_mut!(control);

    // Handle all the queues in one sub-select call.
    let flush_timer = Rc::new(RefCell::new(
        TimerAsync::new(
            // Call try_clone() to share the same underlying FD with the `flush_disk` task.
            timer.0.try_clone().expect("Failed to clone flush_timer"),
            &ex,
        )
        .expect("Failed to create an async timer"),
    ));

    // Pair each queue with its event and collect all handlers into a single
    // future that completes when the first handler does.
    let queue_handlers =
        queues
            .into_iter()
            .map(|q| Rc::new(RefCell::new(q)))
            .zip(queue_evts.into_iter().map(|e| {
                EventAsync::new(e.0, &ex).expect("Failed to create async event for queue")
            }))
            .map(|(queue, event)| {
                handle_queue(
                    ex.clone(),
                    mem.clone(),
                    Rc::clone(disk_state),
                    Rc::clone(&queue),
                    event,
                    Rc::clone(&interrupt),
                    Rc::clone(&flush_timer),
                    Rc::clone(&flush_timer_armed),
                )
            })
            .collect::<FuturesUnordered<_>>()
            .into_future();

    // Flushes the disk periodically.
    let flush_timer = TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer");
    let disk_flush = flush_disk(disk_state.clone(), flush_timer, flush_timer_armed);
    pin_mut!(disk_flush);

    // Exit if the kill event is triggered.
    let kill = async_utils::await_and_exit(&ex, kill_evt);
    pin_mut!(kill);

    // Run until any of the five futures completes; then inspect which ones
    // finished with an error.
    match ex.run_until(select5(queue_handlers, disk_flush, control, resample, kill)) {
        Ok((_, flush_res, control_res, resample_res, _)) => {
            if let SelectResult::Finished(Err(e)) = flush_res {
                return Err(format!("failed to flush a disk: {}", e));
            }
            if let SelectResult::Finished(Err(e)) = control_res {
                return Err(format!("failed to handle a control request: {}", e));
            }
            if let SelectResult::Finished(Err(e)) = resample_res {
                return Err(format!("failed to resample a irq value: {:?}", e));
            }
            Ok(())
        }
        Err(e) => Err(e.to_string()),
    }
}
454
/// Virtio device for exposing block level read/write operations on a host file.
pub struct BlockAsync {
    // Signals the worker thread to exit; written on drop/reset.
    kill_evt: Option<Event>,
    // Worker thread handle; joining returns the disk image and control tube
    // so the device can be reactivated after reset.
    worker_thread: Option<thread::JoinHandle<(Box<dyn ToAsyncDisk>, Option<Tube>)>>,
    // Backing disk; taken by the worker on activate, restored on reset.
    disk_image: Option<Box<dyn ToAsyncDisk>>,
    // Disk size in bytes, shared with the worker/config space.
    disk_size: Arc<AtomicU64>,
    // Virtio feature bits advertised to the guest.
    avail_features: u64,
    read_only: bool,
    sparse: bool,
    // Maximum number of segments per request (derived from QUEUE_SIZE).
    seg_max: u32,
    block_size: u32,
    // Optional ID returned for VIRTIO_BLK_T_GET_ID; taken by the worker.
    id: Option<BlockId>,
    // Optional tube for runtime control commands (e.g. resize).
    control_tube: Option<Tube>,
}
469
impl BlockAsync {
    /// Create a new virtio block device that operates on the given AsyncDisk.
    ///
    /// `block_size` must be a multiple of `SECTOR_SIZE` or `EINVAL` is
    /// returned. A disk size that is not a multiple of `block_size` is
    /// accepted, but the remainder is not visible to the guest.
    pub fn new(
        base_features: u64,
        disk_image: Box<dyn ToAsyncDisk>,
        read_only: bool,
        sparse: bool,
        block_size: u32,
        id: Option<BlockId>,
        control_tube: Option<Tube>,
    ) -> SysResult<BlockAsync> {
        if block_size % SECTOR_SIZE as u32 != 0 {
            error!(
                "Block size {} is not a multiple of {}.",
                block_size, SECTOR_SIZE,
            );
            return Err(SysError::new(libc::EINVAL));
        }
        let disk_size = disk_image.get_len()?;
        if disk_size % block_size as u64 != 0 {
            warn!(
                "Disk size {} is not a multiple of block size {}; \
                 the remainder will not be visible to the guest.",
                disk_size, block_size,
            );
        }

        // NOTE(review): the final `true` flag presumably enables the
        // multi-queue feature set — confirm against build_avail_features.
        let avail_features = build_avail_features(base_features, read_only, sparse, true);

        let seg_max = get_seg_max(QUEUE_SIZE);

        Ok(BlockAsync {
            kill_evt: None,
            worker_thread: None,
            disk_image: Some(disk_image),
            disk_size: Arc::new(AtomicU64::new(disk_size)),
            avail_features,
            read_only,
            sparse,
            seg_max,
            block_size,
            id,
            control_tube,
        })
    }

    // Execute a single block device request.
    // `writer` includes the data region only; the status byte is not included.
    // It is up to the caller to convert the result of this function into a status byte
    // and write it to the expected location in guest memory.
    async fn execute_request(
        reader: &mut Reader,
        writer: &mut Writer,
        disk_state: Rc<AsyncMutex<DiskState>>,
        flush_timer: Rc<RefCell<TimerAsync>>,
        flush_timer_armed: Rc<RefCell<bool>>,
    ) -> result::Result<(), ExecuteError> {
        // Acquire immutable access to disk_state to prevent the disk from being resized.
        let disk_state = disk_state.read_lock().await;

        let req_header: virtio_blk_req_header = reader.read_obj().map_err(ExecuteError::Read)?;

        let req_type = req_header.req_type.to_native();
        let sector = req_header.sector.to_native();

        // On a read-only disk, only reads and ID queries are permitted.
        if disk_state.read_only && req_type != VIRTIO_BLK_T_IN && req_type != VIRTIO_BLK_T_GET_ID {
            return Err(ExecuteError::ReadOnly {
                request_type: req_type,
            });
        }

        /// Check that a request accesses only data within the disk's current size.
        /// All parameters are in units of bytes.
        fn check_range(
            io_start: u64,
            io_length: u64,
            disk_size: u64,
        ) -> result::Result<(), ExecuteError> {
            let io_end = io_start
                .checked_add(io_length)
                .ok_or(ExecuteError::OutOfRange)?;
            if io_end > disk_size {
                Err(ExecuteError::OutOfRange)
            } else {
                Ok(())
            }
        }

        let disk_size = disk_state.disk_size.load(Ordering::Relaxed);
        match req_type {
            VIRTIO_BLK_T_IN => {
                // Guest read: copy data_len bytes from the disk into the
                // writable portion of the descriptor chain.
                let data_len = writer.available_bytes();
                if data_len == 0 {
                    return Ok(());
                }
                // checked_shl catches sector values whose byte offset would
                // overflow u64.
                let offset = sector
                    .checked_shl(u32::from(SECTOR_SHIFT))
                    .ok_or(ExecuteError::OutOfRange)?;
                check_range(offset, data_len as u64, disk_size)?;
                let disk_image = &disk_state.disk_image;
                writer
                    .write_all_from_at_fut(&**disk_image, data_len, offset)
                    .await
                    .map_err(|desc_error| ExecuteError::ReadIo {
                        length: data_len,
                        sector,
                        desc_error,
                    })?;
            }
            VIRTIO_BLK_T_OUT => {
                // Guest write: copy data_len bytes from the descriptor chain
                // to the disk.
                let data_len = reader.available_bytes();
                if data_len == 0 {
                    return Ok(());
                }
                let offset = sector
                    .checked_shl(u32::from(SECTOR_SHIFT))
                    .ok_or(ExecuteError::OutOfRange)?;
                check_range(offset, data_len as u64, disk_size)?;
                let disk_image = &disk_state.disk_image;
                reader
                    .read_exact_to_at_fut(&**disk_image, data_len, offset)
                    .await
                    .map_err(|desc_error| ExecuteError::WriteIo {
                        length: data_len,
                        sector,
                        desc_error,
                    })?;

                // Arm a one-shot flush 60 seconds after the first write so
                // data is eventually committed even without an explicit FLUSH.
                if !*flush_timer_armed.borrow() {
                    *flush_timer_armed.borrow_mut() = true;

                    let flush_delay = Duration::from_secs(60);
                    flush_timer
                        .borrow_mut()
                        .reset(flush_delay, None)
                        .map_err(ExecuteError::TimerReset)?;
                }
            }
            VIRTIO_BLK_T_DISCARD | VIRTIO_BLK_T_WRITE_ZEROES => {
                if req_type == VIRTIO_BLK_T_DISCARD && !disk_state.sparse {
                    // Discard is a hint; if this is a non-sparse disk, just ignore it.
                    return Ok(());
                }

                // A single request may carry multiple discard/write-zeroes
                // segments; process them in order.
                while reader.available_bytes() >= size_of::<virtio_blk_discard_write_zeroes>() {
                    let seg: virtio_blk_discard_write_zeroes =
                        reader.read_obj().map_err(ExecuteError::Read)?;

                    let sector = seg.sector.to_native();
                    let num_sectors = seg.num_sectors.to_native();
                    let flags = seg.flags.to_native();

                    // UNMAP is only a valid flag for WRITE_ZEROES.
                    let valid_flags = if req_type == VIRTIO_BLK_T_WRITE_ZEROES {
                        VIRTIO_BLK_DISCARD_WRITE_ZEROES_FLAG_UNMAP
                    } else {
                        0
                    };

                    if (flags & !valid_flags) != 0 {
                        return Err(ExecuteError::DiscardWriteZeroes {
                            ioerr: None,
                            sector,
                            num_sectors,
                            flags,
                        });
                    }

                    let offset = sector
                        .checked_shl(u32::from(SECTOR_SHIFT))
                        .ok_or(ExecuteError::OutOfRange)?;
                    let length = u64::from(num_sectors)
                        .checked_shl(u32::from(SECTOR_SHIFT))
                        .ok_or(ExecuteError::OutOfRange)?;
                    check_range(offset, length, disk_size)?;

                    if req_type == VIRTIO_BLK_T_DISCARD {
                        // Since Discard is just a hint and some filesystems may not implement
                        // FALLOC_FL_PUNCH_HOLE, ignore punch_hole errors.
                        let _ = disk_state.disk_image.punch_hole(offset, length).await;
                    } else {
                        disk_state
                            .disk_image
                            .write_zeroes_at(offset, length)
                            .await
                            .map_err(|e| ExecuteError::DiscardWriteZeroes {
                                ioerr: Some(e),
                                sector,
                                num_sectors,
                                flags,
                            })?;
                    }
                }
            }
            VIRTIO_BLK_T_FLUSH => {
                disk_state
                    .disk_image
                    .fsync()
                    .await
                    .map_err(ExecuteError::Flush)?;
            }
            VIRTIO_BLK_T_GET_ID => {
                // Report Unsupported when no ID was configured; the caller
                // suppresses the error log for this specific case.
                if let Some(id) = disk_state.id {
                    writer.write_all(&id).map_err(ExecuteError::CopyId)?;
                } else {
                    return Err(ExecuteError::Unsupported(req_type));
                }
            }
            t => return Err(ExecuteError::Unsupported(t)),
        };
        Ok(())
    }
}
682
683 impl Drop for BlockAsync {
drop(&mut self)684 fn drop(&mut self) {
685 if let Some(kill_evt) = self.kill_evt.take() {
686 // Ignore the result because there is nothing we can do about it.
687 let _ = kill_evt.write(1);
688 }
689
690 if let Some(worker_thread) = self.worker_thread.take() {
691 let _ = worker_thread.join();
692 }
693 }
694 }
695
impl VirtioDevice for BlockAsync {
    // Raw descriptors that must stay open across sandboxing/fork: the disk
    // image's FDs and the control tube, when present.
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        let mut keep_rds = Vec::new();

        if let Some(disk_image) = &self.disk_image {
            keep_rds.extend(disk_image.as_raw_descriptors());
        }

        if let Some(control_tube) = &self.control_tube {
            keep_rds.push(control_tube.as_raw_descriptor());
        }

        keep_rds
    }

    fn features(&self) -> u64 {
        self.avail_features
    }

    fn device_type(&self) -> u32 {
        TYPE_BLOCK
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    // Serves reads of the virtio-blk config space (capacity, seg_max,
    // blk_size, num_queues).
    fn read_config(&self, offset: u64, data: &mut [u8]) {
        let config_space = {
            // Acquire pairs with the Release store done on resize.
            let disk_size = self.disk_size.load(Ordering::Acquire);
            build_config_space(disk_size, self.seg_max, self.block_size, NUM_QUEUES)
        };
        copy_config(data, 0, config_space.as_slice(), offset);
    }

    // Spawns the worker thread that owns the executor and runs all queue,
    // control, flush, and resample tasks until the kill event fires. The
    // worker returns the disk image and control tube on exit so `reset` can
    // restore them.
    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: Vec<Queue>,
        queue_evts: Vec<Event>,
    ) {
        // One event, two handles: `self_kill_evt` stays with the device to
        // signal shutdown, `kill_evt` moves into the worker.
        let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!("failed creating kill Event pair: {}", e);
                return;
            }
        };
        self.kill_evt = Some(self_kill_evt);

        let read_only = self.read_only;
        let sparse = self.sparse;
        let disk_size = self.disk_size.clone();
        let id = self.id.take();
        if let Some(disk_image) = self.disk_image.take() {
            let control_tube = self.control_tube.take();
            let worker_result =
                thread::Builder::new()
                    .name("virtio_blk".to_string())
                    .spawn(move || {
                        let ex = Executor::new().expect("Failed to create an executor");

                        let async_control = control_tube
                            .map(|c| AsyncTube::new(&ex, c).expect("failed to create async tube"));
                        let async_image = match disk_image.to_async_disk(&ex) {
                            Ok(d) => d,
                            Err(e) => panic!("Failed to create async disk {}", e),
                        };
                        let disk_state = Rc::new(AsyncMutex::new(DiskState {
                            disk_image: async_image,
                            disk_size,
                            read_only,
                            sparse,
                            id,
                        }));
                        if let Err(err_string) = run_worker(
                            ex,
                            interrupt,
                            queues,
                            mem,
                            &disk_state,
                            &async_control,
                            queue_evts,
                            kill_evt,
                        ) {
                            error!("{}", err_string);
                        }

                        // All tasks have finished, so this must be the only
                        // remaining reference to the disk state.
                        let disk_state = match Rc::try_unwrap(disk_state) {
                            Ok(d) => d.into_inner(),
                            Err(_) => panic!("too many refs to the disk"),
                        };
                        // Hand the disk image and control tube back to `reset`.
                        (
                            disk_state.disk_image.into_inner(),
                            async_control.map(|c| c.into()),
                        )
                    });

            match worker_result {
                Err(e) => {
                    error!("failed to spawn virtio_blk worker: {}", e);
                    return;
                }
                Ok(join_handle) => {
                    self.worker_thread = Some(join_handle);
                }
            }
        }
    }

    // Stops the worker and reclaims the disk image and control tube so the
    // device can be activated again. Returns true on success.
    fn reset(&mut self) -> bool {
        if let Some(kill_evt) = self.kill_evt.take() {
            if kill_evt.write(1).is_err() {
                error!("{}: failed to notify the kill event", self.debug_label());
                return false;
            }
        }

        if let Some(worker_thread) = self.worker_thread.take() {
            match worker_thread.join() {
                Err(_) => {
                    error!("{}: failed to get back resources", self.debug_label());
                    return false;
                }
                Ok((disk_image, control_tube)) => {
                    self.disk_image = Some(disk_image);
                    self.control_tube = control_tube;
                    return true;
                }
            }
        }
        false
    }
}
831
832 #[cfg(test)]
833 mod tests {
834 use std::fs::{File, OpenOptions};
835 use std::mem::size_of_val;
836 use std::sync::atomic::AtomicU64;
837
838 use data_model::{Le32, Le64};
839 use disk::SingleFileDisk;
840 use hypervisor::ProtectionType;
841 use tempfile::TempDir;
842 use vm_memory::GuestAddress;
843
844 use crate::virtio::base_features;
845 use crate::virtio::block::common::*;
846 use crate::virtio::descriptor_utils::{create_descriptor_chain, DescriptorType};
847
848 use super::*;
849
850 #[test]
read_size()851 fn read_size() {
852 let tempdir = TempDir::new().unwrap();
853 let mut path = tempdir.path().to_owned();
854 path.push("disk_image");
855 let f = File::create(&path).unwrap();
856 f.set_len(0x1000).unwrap();
857
858 let features = base_features(ProtectionType::Unprotected);
859 let b = BlockAsync::new(features, Box::new(f), true, false, 512, None, None).unwrap();
860 let mut num_sectors = [0u8; 4];
861 b.read_config(0, &mut num_sectors);
862 // size is 0x1000, so num_sectors is 8 (4096/512).
863 assert_eq!([0x08, 0x00, 0x00, 0x00], num_sectors);
864 let mut msw_sectors = [0u8; 4];
865 b.read_config(4, &mut msw_sectors);
866 // size is 0x1000, so msw_sectors is 0.
867 assert_eq!([0x00, 0x00, 0x00, 0x00], msw_sectors);
868 }
869
870 #[test]
read_block_size()871 fn read_block_size() {
872 let tempdir = TempDir::new().unwrap();
873 let mut path = tempdir.path().to_owned();
874 path.push("disk_image");
875 let f = File::create(&path).unwrap();
876 f.set_len(0x1000).unwrap();
877
878 let features = base_features(ProtectionType::Unprotected);
879 let b = BlockAsync::new(features, Box::new(f), true, false, 4096, None, None).unwrap();
880 let mut blk_size = [0u8; 4];
881 b.read_config(20, &mut blk_size);
882 // blk_size should be 4096 (0x1000).
883 assert_eq!([0x00, 0x10, 0x00, 0x00], blk_size);
884 }
885
    #[test]
    fn read_features() {
        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");

        // read-write block device
        {
            let f = File::create(&path).unwrap();
            let features = base_features(ProtectionType::Unprotected);
            let b = BlockAsync::new(features, Box::new(f), false, true, 512, None, None).unwrap();
            // writable device should set VIRTIO_BLK_F_FLUSH + VIRTIO_BLK_F_DISCARD
            // + VIRTIO_BLK_F_WRITE_ZEROES + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE
            // + VIRTIO_BLK_F_SEG_MAX + VIRTIO_BLK_F_MQ + VIRTIO_RING_F_EVENT_IDX
            assert_eq!(0x120007244, b.features());
        }

        // read-write block device, non-sparse
        {
            let f = File::create(&path).unwrap();
            let features = base_features(ProtectionType::Unprotected);
            let b = BlockAsync::new(features, Box::new(f), false, false, 512, None, None).unwrap();
            // non-sparse writable device should set the same features as the
            // sparse case above except VIRTIO_BLK_F_DISCARD: VIRTIO_BLK_F_FLUSH
            // + VIRTIO_BLK_F_WRITE_ZEROES + VIRTIO_F_VERSION_1
            // + VIRTIO_BLK_F_BLK_SIZE + VIRTIO_BLK_F_SEG_MAX + VIRTIO_BLK_F_MQ
            // + VIRTIO_RING_F_EVENT_IDX
            assert_eq!(0x120005244, b.features());
        }

        // read-only block device
        {
            let f = File::create(&path).unwrap();
            let features = base_features(ProtectionType::Unprotected);
            let b = BlockAsync::new(features, Box::new(f), true, true, 512, None, None).unwrap();
            // read-only device should set VIRTIO_BLK_F_FLUSH and VIRTIO_BLK_F_RO
            // + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE + VIRTIO_BLK_F_SEG_MAX
            // + VIRTIO_BLK_F_MQ + VIRTIO_RING_F_EVENT_IDX
            assert_eq!(0x120001264, b.features());
        }
    }
925
    #[test]
    fn read_last_sector() {
        let ex = Executor::new().expect("creating an executor failed");

        // Create a 4 KiB (8-sector) disk image.
        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(&path)
            .unwrap();
        let disk_size = 0x1000;
        f.set_len(disk_size).unwrap();
        let af = SingleFileDisk::new(f, &ex).expect("Failed to create SFD");

        let mem = Rc::new(
            GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
                .expect("Creating guest memory failed."),
        );

        // Build a read request targeting the last in-bounds sector.
        let req_hdr = virtio_blk_req_header {
            req_type: Le32::from(VIRTIO_BLK_T_IN),
            reserved: Le32::from(0),
            sector: Le64::from(7), // Disk is 8 sectors long, so this is the last valid sector.
        };
        mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
            .expect("writing req failed");

        let avail_desc = create_descriptor_chain(
            &mem,
            GuestAddress(0x100), // Place descriptor chain at 0x100.
            GuestAddress(0x1000), // Describe buffer at 0x1000.
            vec![
                // Request header
                (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
                // I/O buffer (1 sector of data)
                (DescriptorType::Writable, 512),
                // Request status
                (DescriptorType::Writable, 1),
            ],
            0,
        )
        .expect("create_descriptor_chain failed");

        let timer = Timer::new().expect("Failed to create a timer");
        let flush_timer = Rc::new(RefCell::new(
            TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer"),
        ));
        let flush_timer_armed = Rc::new(RefCell::new(false));

        let disk_state = Rc::new(AsyncMutex::new(DiskState {
            disk_image: Box::new(af),
            disk_size: Arc::new(AtomicU64::new(disk_size)),
            read_only: false,
            sparse: true,
            id: None,
        }));

        let fut = process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem);

        ex.run_until(fut)
            .expect("running executor failed")
            .expect("execute failed");

        // The status byte follows the header and the 512-byte data buffer;
        // an in-bounds read must complete with VIRTIO_BLK_S_OK.
        let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 512) as u64);
        let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
        assert_eq!(status, VIRTIO_BLK_S_OK);
    }
996
997 #[test]
read_beyond_last_sector()998 fn read_beyond_last_sector() {
999 let tempdir = TempDir::new().unwrap();
1000 let mut path = tempdir.path().to_owned();
1001 path.push("disk_image");
1002 let f = OpenOptions::new()
1003 .read(true)
1004 .write(true)
1005 .create(true)
1006 .open(&path)
1007 .unwrap();
1008 let disk_size = 0x1000;
1009 f.set_len(disk_size).unwrap();
1010 let mem = Rc::new(
1011 GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
1012 .expect("Creating guest memory failed."),
1013 );
1014
1015 let req_hdr = virtio_blk_req_header {
1016 req_type: Le32::from(VIRTIO_BLK_T_IN),
1017 reserved: Le32::from(0),
1018 sector: Le64::from(7), // Disk is 8 sectors long, so this is the last valid sector.
1019 };
1020 mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
1021 .expect("writing req failed");
1022
1023 let avail_desc = create_descriptor_chain(
1024 &mem,
1025 GuestAddress(0x100), // Place descriptor chain at 0x100.
1026 GuestAddress(0x1000), // Describe buffer at 0x1000.
1027 vec![
1028 // Request header
1029 (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
1030 // I/O buffer (2 sectors of data - overlap the end of the disk).
1031 (DescriptorType::Writable, 512 * 2),
1032 // Request status
1033 (DescriptorType::Writable, 1),
1034 ],
1035 0,
1036 )
1037 .expect("create_descriptor_chain failed");
1038
1039 let ex = Executor::new().expect("creating an executor failed");
1040
1041 let af = SingleFileDisk::new(f, &ex).expect("Failed to create SFD");
1042 let timer = Timer::new().expect("Failed to create a timer");
1043 let flush_timer = Rc::new(RefCell::new(
1044 TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer"),
1045 ));
1046 let flush_timer_armed = Rc::new(RefCell::new(false));
1047 let disk_state = Rc::new(AsyncMutex::new(DiskState {
1048 disk_image: Box::new(af),
1049 disk_size: Arc::new(AtomicU64::new(disk_size)),
1050 read_only: false,
1051 sparse: true,
1052 id: None,
1053 }));
1054
1055 let fut = process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem);
1056
1057 ex.run_until(fut)
1058 .expect("running executor failed")
1059 .expect("execute failed");
1060
1061 let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 512 * 2) as u64);
1062 let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
1063 assert_eq!(status, VIRTIO_BLK_S_IOERR);
1064 }
1065
1066 #[test]
get_id()1067 fn get_id() {
1068 let ex = Executor::new().expect("creating an executor failed");
1069
1070 let tempdir = TempDir::new().unwrap();
1071 let mut path = tempdir.path().to_owned();
1072 path.push("disk_image");
1073 let f = OpenOptions::new()
1074 .read(true)
1075 .write(true)
1076 .create(true)
1077 .open(&path)
1078 .unwrap();
1079 let disk_size = 0x1000;
1080 f.set_len(disk_size).unwrap();
1081
1082 let mem = GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
1083 .expect("Creating guest memory failed.");
1084
1085 let req_hdr = virtio_blk_req_header {
1086 req_type: Le32::from(VIRTIO_BLK_T_GET_ID),
1087 reserved: Le32::from(0),
1088 sector: Le64::from(0),
1089 };
1090 mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
1091 .expect("writing req failed");
1092
1093 let avail_desc = create_descriptor_chain(
1094 &mem,
1095 GuestAddress(0x100), // Place descriptor chain at 0x100.
1096 GuestAddress(0x1000), // Describe buffer at 0x1000.
1097 vec![
1098 // Request header
1099 (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
1100 // I/O buffer (20 bytes for serial)
1101 (DescriptorType::Writable, 20),
1102 // Request status
1103 (DescriptorType::Writable, 1),
1104 ],
1105 0,
1106 )
1107 .expect("create_descriptor_chain failed");
1108
1109 let af = SingleFileDisk::new(f, &ex).expect("Failed to create SFD");
1110 let timer = Timer::new().expect("Failed to create a timer");
1111 let flush_timer = Rc::new(RefCell::new(
1112 TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer"),
1113 ));
1114 let flush_timer_armed = Rc::new(RefCell::new(false));
1115
1116 let id = b"a20-byteserialnumber";
1117
1118 let disk_state = Rc::new(AsyncMutex::new(DiskState {
1119 disk_image: Box::new(af),
1120 disk_size: Arc::new(AtomicU64::new(disk_size)),
1121 read_only: false,
1122 sparse: true,
1123 id: Some(*id),
1124 }));
1125
1126 let fut = process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem);
1127
1128 ex.run_until(fut)
1129 .expect("running executor failed")
1130 .expect("execute failed");
1131
1132 let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 512) as u64);
1133 let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
1134 assert_eq!(status, VIRTIO_BLK_S_OK);
1135
1136 let id_offset = GuestAddress(0x1000 + size_of_val(&req_hdr) as u64);
1137 let returned_id = mem.read_obj_from_addr::<[u8; 20]>(id_offset).unwrap();
1138 assert_eq!(returned_id, *id);
1139 }
1140 }
1141