• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::cell::RefCell;
6 use std::io::{self, Write};
7 use std::mem::size_of;
8 use std::rc::Rc;
9 use std::result;
10 use std::sync::{atomic::AtomicU64, atomic::Ordering, Arc};
11 use std::thread;
12 use std::time::Duration;
13 use std::u32;
14 
15 use futures::pin_mut;
16 use futures::stream::{FuturesUnordered, StreamExt};
17 use remain::sorted;
18 use thiserror::Error as ThisError;
19 
20 use base::Error as SysError;
21 use base::Result as SysResult;
22 use base::{error, info, warn, AsRawDescriptor, Event, RawDescriptor, Timer, Tube, TubeError};
23 use cros_async::{
24     select5, sync::Mutex as AsyncMutex, AsyncError, AsyncTube, EventAsync, Executor, SelectResult,
25     TimerAsync,
26 };
27 use data_model::DataInit;
28 use disk::{AsyncDisk, ToAsyncDisk};
29 use vm_control::{DiskControlCommand, DiskControlResult};
30 use vm_memory::GuestMemory;
31 
32 use super::common::*;
33 use crate::virtio::{
34     async_utils, block::sys::*, copy_config, DescriptorChain, DescriptorError, Interrupt, Queue,
35     Reader, SignalableInterrupt, VirtioDevice, Writer, TYPE_BLOCK,
36 };
37 
// Maximum number of descriptors in each virtqueue.
const QUEUE_SIZE: u16 = 256;
// Number of virtqueues exposed by the device.
const NUM_QUEUES: u16 = 16;
// One max-size entry per queue, as reported by `queue_max_sizes()`.
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES as usize];
41 
42 #[sorted]
43 #[derive(ThisError, Debug)]
44 enum ExecuteError {
45     #[error("failed to copy ID string: {0}")]
46     CopyId(io::Error),
47     #[error("virtio descriptor error: {0}")]
48     Descriptor(DescriptorError),
49     #[error("failed to perform discard or write zeroes; sector={sector} num_sectors={num_sectors} flags={flags}; {ioerr:?}")]
50     DiscardWriteZeroes {
51         ioerr: Option<disk::Error>,
52         sector: u64,
53         num_sectors: u32,
54         flags: u32,
55     },
56     #[error("failed to flush: {0}")]
57     Flush(disk::Error),
58     #[error("not enough space in descriptor chain to write status")]
59     MissingStatus,
60     #[error("out of range")]
61     OutOfRange,
62     #[error("failed to read message: {0}")]
63     Read(io::Error),
64     #[error("io error reading {length} bytes from sector {sector}: {desc_error}")]
65     ReadIo {
66         length: usize,
67         sector: u64,
68         desc_error: disk::Error,
69     },
70     #[error("read only; request_type={request_type}")]
71     ReadOnly { request_type: u32 },
72     #[error("failed to recieve command message: {0}")]
73     ReceivingCommand(TubeError),
74     #[error("failed to send command response: {0}")]
75     SendingResponse(TubeError),
76     #[error("couldn't reset the timer: {0}")]
77     TimerReset(base::Error),
78     #[error("unsupported ({0})")]
79     Unsupported(u32),
80     #[error("io error writing {length} bytes from sector {sector}: {desc_error}")]
81     WriteIo {
82         length: usize,
83         sector: u64,
84         desc_error: disk::Error,
85     },
86     #[error("failed to write request status: {0}")]
87     WriteStatus(io::Error),
88 }
89 
90 impl ExecuteError {
status(&self) -> u891     fn status(&self) -> u8 {
92         match self {
93             ExecuteError::CopyId(_) => VIRTIO_BLK_S_IOERR,
94             ExecuteError::Descriptor(_) => VIRTIO_BLK_S_IOERR,
95             ExecuteError::DiscardWriteZeroes { .. } => VIRTIO_BLK_S_IOERR,
96             ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
97             ExecuteError::MissingStatus => VIRTIO_BLK_S_IOERR,
98             ExecuteError::OutOfRange { .. } => VIRTIO_BLK_S_IOERR,
99             ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
100             ExecuteError::ReadIo { .. } => VIRTIO_BLK_S_IOERR,
101             ExecuteError::ReadOnly { .. } => VIRTIO_BLK_S_IOERR,
102             ExecuteError::ReceivingCommand(_) => VIRTIO_BLK_S_IOERR,
103             ExecuteError::SendingResponse(_) => VIRTIO_BLK_S_IOERR,
104             ExecuteError::TimerReset(_) => VIRTIO_BLK_S_IOERR,
105             ExecuteError::WriteIo { .. } => VIRTIO_BLK_S_IOERR,
106             ExecuteError::WriteStatus(_) => VIRTIO_BLK_S_IOERR,
107             ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
108         }
109     }
110 }
111 
/// Errors that happen in block outside of executing a request.
/// This includes errors during resize and flush operations.
#[sorted]
#[derive(ThisError, Debug)]
pub enum ControlError {
    #[error("couldn't create an async resample event: {0}")]
    AsyncResampleCreate(AsyncError),
    #[error("couldn't clone the resample event: {0}")]
    CloneResampleEvent(base::Error),
    #[error("couldn't get a value from a timer for flushing: {0}")]
    FlushTimer(AsyncError),
    #[error("failed to fsync the disk: {0}")]
    FsyncDisk(disk::Error),
    #[error("couldn't read the resample event: {0}")]
    ReadResampleEvent(AsyncError),
}
128 
/// Tracks the state of an asynchronous disk.
pub struct DiskState {
    /// The underlying disk image, already converted for async I/O.
    pub disk_image: Box<dyn AsyncDisk>,
    /// Current disk size in bytes. Shared with the config-space reader;
    /// updated with `Ordering::Release` after a resize.
    pub disk_size: Arc<AtomicU64>,
    /// When true, any request other than IN/GET_ID is rejected.
    pub read_only: bool,
    /// When false, DISCARD requests are ignored (discard is only a hint).
    pub sparse: bool,
    /// Optional device ID returned for VIRTIO_BLK_T_GET_ID requests.
    pub id: Option<BlockId>,
}
137 
138 impl DiskState {
139     /// Creates a `DiskState` with the given params.
new( disk_image: Box<dyn AsyncDisk>, disk_size: Arc<AtomicU64>, read_only: bool, sparse: bool, id: Option<BlockId>, ) -> DiskState140     pub fn new(
141         disk_image: Box<dyn AsyncDisk>,
142         disk_size: Arc<AtomicU64>,
143         read_only: bool,
144         sparse: bool,
145         id: Option<BlockId>,
146     ) -> DiskState {
147         DiskState {
148             disk_image,
149             disk_size,
150             read_only,
151             sparse,
152             id,
153         }
154     }
155 }
156 
process_one_request( avail_desc: DescriptorChain, disk_state: Rc<AsyncMutex<DiskState>>, flush_timer: Rc<RefCell<TimerAsync>>, flush_timer_armed: Rc<RefCell<bool>>, mem: &GuestMemory, ) -> result::Result<usize, ExecuteError>157 async fn process_one_request(
158     avail_desc: DescriptorChain,
159     disk_state: Rc<AsyncMutex<DiskState>>,
160     flush_timer: Rc<RefCell<TimerAsync>>,
161     flush_timer_armed: Rc<RefCell<bool>>,
162     mem: &GuestMemory,
163 ) -> result::Result<usize, ExecuteError> {
164     let mut reader =
165         Reader::new(mem.clone(), avail_desc.clone()).map_err(ExecuteError::Descriptor)?;
166     let mut writer = Writer::new(mem.clone(), avail_desc).map_err(ExecuteError::Descriptor)?;
167 
168     // The last byte of the buffer is virtio_blk_req::status.
169     // Split it into a separate Writer so that status_writer is the final byte and
170     // the original writer is left with just the actual block I/O data.
171     let available_bytes = writer.available_bytes();
172     let status_offset = available_bytes
173         .checked_sub(1)
174         .ok_or(ExecuteError::MissingStatus)?;
175     let mut status_writer = writer.split_at(status_offset);
176 
177     let status = match BlockAsync::execute_request(
178         &mut reader,
179         &mut writer,
180         disk_state,
181         flush_timer,
182         flush_timer_armed,
183     )
184     .await
185     {
186         Ok(()) => VIRTIO_BLK_S_OK,
187         Err(e) => {
188             if !matches!(e, ExecuteError::Unsupported(VIRTIO_BLK_T_GET_ID)) {
189                 error!("failed executing disk request: {}", e);
190             }
191             e.status()
192         }
193     };
194 
195     status_writer
196         .write_all(&[status])
197         .map_err(ExecuteError::WriteStatus)?;
198     Ok(available_bytes)
199 }
200 
201 /// Process one descriptor chain asynchronously.
process_one_chain<I: SignalableInterrupt>( queue: Rc<RefCell<Queue>>, avail_desc: DescriptorChain, disk_state: Rc<AsyncMutex<DiskState>>, mem: GuestMemory, interrupt: &I, flush_timer: Rc<RefCell<TimerAsync>>, flush_timer_armed: Rc<RefCell<bool>>, )202 pub async fn process_one_chain<I: SignalableInterrupt>(
203     queue: Rc<RefCell<Queue>>,
204     avail_desc: DescriptorChain,
205     disk_state: Rc<AsyncMutex<DiskState>>,
206     mem: GuestMemory,
207     interrupt: &I,
208     flush_timer: Rc<RefCell<TimerAsync>>,
209     flush_timer_armed: Rc<RefCell<bool>>,
210 ) {
211     let descriptor_index = avail_desc.index;
212     let len =
213         match process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem)
214             .await
215         {
216             Ok(len) => len,
217             Err(e) => {
218                 error!("block: failed to handle request: {}", e);
219                 0
220             }
221         };
222 
223     let mut queue = queue.borrow_mut();
224     queue.add_used(&mem, descriptor_index, len as u32);
225     queue.trigger_interrupt(&mem, interrupt);
226 }
227 
228 // There is one async task running `handle_queue` per virtio queue in use.
229 // Receives messages from the guest and queues a task to complete the operations with the async
230 // executor.
handle_queue<I: SignalableInterrupt + Clone + 'static>( ex: Executor, mem: GuestMemory, disk_state: Rc<AsyncMutex<DiskState>>, queue: Rc<RefCell<Queue>>, evt: EventAsync, interrupt: I, flush_timer: Rc<RefCell<TimerAsync>>, flush_timer_armed: Rc<RefCell<bool>>, )231 pub async fn handle_queue<I: SignalableInterrupt + Clone + 'static>(
232     ex: Executor,
233     mem: GuestMemory,
234     disk_state: Rc<AsyncMutex<DiskState>>,
235     queue: Rc<RefCell<Queue>>,
236     evt: EventAsync,
237     interrupt: I,
238     flush_timer: Rc<RefCell<TimerAsync>>,
239     flush_timer_armed: Rc<RefCell<bool>>,
240 ) {
241     loop {
242         if let Err(e) = evt.next_val().await {
243             error!("Failed to read the next queue event: {}", e);
244             continue;
245         }
246         while let Some(descriptor_chain) = queue.borrow_mut().pop(&mem) {
247             let queue = Rc::clone(&queue);
248             let disk_state = Rc::clone(&disk_state);
249             let mem = mem.clone();
250             let interrupt = interrupt.clone();
251             let flush_timer = Rc::clone(&flush_timer);
252             let flush_timer_armed = Rc::clone(&flush_timer_armed);
253 
254             ex.spawn_local(async move {
255                 process_one_chain(
256                     queue,
257                     descriptor_chain,
258                     disk_state,
259                     mem,
260                     &interrupt,
261                     flush_timer,
262                     flush_timer_armed,
263                 )
264                 .await
265             })
266             .detach();
267         }
268     }
269 }
270 
handle_command_tube( command_tube: &Option<AsyncTube>, interrupt: Rc<RefCell<Interrupt>>, disk_state: Rc<AsyncMutex<DiskState>>, ) -> Result<(), ExecuteError>271 async fn handle_command_tube(
272     command_tube: &Option<AsyncTube>,
273     interrupt: Rc<RefCell<Interrupt>>,
274     disk_state: Rc<AsyncMutex<DiskState>>,
275 ) -> Result<(), ExecuteError> {
276     let command_tube = match command_tube {
277         Some(c) => c,
278         None => {
279             let () = futures::future::pending().await;
280             return Ok(());
281         }
282     };
283     loop {
284         match command_tube.next().await {
285             Ok(command) => {
286                 let resp = match command {
287                     DiskControlCommand::Resize { new_size } => {
288                         resize(Rc::clone(&disk_state), new_size).await
289                     }
290                 };
291 
292                 let resp_clone = resp.clone();
293                 command_tube
294                     .send(resp_clone)
295                     .await
296                     .map_err(ExecuteError::SendingResponse)?;
297                 if let DiskControlResult::Ok = resp {
298                     interrupt.borrow().signal_config_changed();
299                 }
300             }
301             Err(e) => return Err(ExecuteError::ReceivingCommand(e)),
302         }
303     }
304 }
305 
/// Resizes the disk image to `new_size` bytes, fully allocating the backing
/// storage and publishing the new size for the guest-visible config space.
async fn resize(disk_state: Rc<AsyncMutex<DiskState>>, new_size: u64) -> DiskControlResult {
    // Acquire exclusive, mutable access to the state so the virtqueue task won't be able to read
    // the state while resizing.
    let mut disk_state = disk_state.lock().await;

    if disk_state.read_only {
        error!("Attempted to resize read-only block device");
        return DiskControlResult::Err(SysError::new(libc::EROFS));
    }

    info!("Resizing block device to {} bytes", new_size);

    if let Err(e) = disk_state.disk_image.set_len(new_size) {
        error!("Resizing disk failed! {}", e);
        return DiskControlResult::Err(SysError::new(libc::EIO));
    }

    // Allocate backing storage for the full new length. Note this happens
    // unconditionally, so the image ends up fully provisioned.
    if let Err(e) = disk_state.disk_image.allocate(0, new_size) {
        error!("Allocating disk space after resize failed! {}", e);
        return DiskControlResult::Err(SysError::new(libc::EIO));
    }

    // The image is now fully allocated; discard requests become no-ops.
    disk_state.sparse = false;

    // Publish the new size; readers pair this Release with an Acquire load.
    if let Ok(new_disk_size) = disk_state.disk_image.get_len() {
        disk_state.disk_size.store(new_disk_size, Ordering::Release);
    }
    DiskControlResult::Ok
}
336 
337 /// Periodically flushes the disk when the given timer fires.
flush_disk( disk_state: Rc<AsyncMutex<DiskState>>, timer: TimerAsync, armed: Rc<RefCell<bool>>, ) -> Result<(), ControlError>338 pub async fn flush_disk(
339     disk_state: Rc<AsyncMutex<DiskState>>,
340     timer: TimerAsync,
341     armed: Rc<RefCell<bool>>,
342 ) -> Result<(), ControlError> {
343     loop {
344         timer.next_val().await.map_err(ControlError::FlushTimer)?;
345         if !*armed.borrow() {
346             continue;
347         }
348 
349         // Reset armed before calling fsync to guarantee that IO requests that started after we call
350         // fsync will be committed eventually.
351         *armed.borrow_mut() = false;
352 
353         disk_state
354             .read_lock()
355             .await
356             .disk_image
357             .fsync()
358             .await
359             .map_err(ControlError::FsyncDisk)?;
360     }
361 }
362 
// The main worker thread. Initializes the asynchronous worker tasks and passes them to the
// executor to be processed.
//
// `disk_state` is wrapped by `AsyncMutex`, which provides both shared and exclusive locks. It's
// because the state can be read from the virtqueue task while the control task is processing
// a resizing command.
fn run_worker(
    ex: Executor,
    interrupt: Interrupt,
    queues: Vec<Queue>,
    mem: GuestMemory,
    disk_state: &Rc<AsyncMutex<DiskState>>,
    control_tube: &Option<AsyncTube>,
    queue_evts: Vec<Event>,
    kill_evt: Event,
) -> Result<(), String> {
    // Each queue must come with its own kick event.
    if queues.len() != queue_evts.len() {
        return Err("Number of queues and events must match.".to_string());
    }

    let interrupt = Rc::new(RefCell::new(interrupt));

    // One flush timer per disk.
    let timer = Timer::new().expect("Failed to create a timer");
    let flush_timer_armed = Rc::new(RefCell::new(false));

    // Process any requests to resample the irq value.
    let resample = async_utils::handle_irq_resample(&ex, Rc::clone(&interrupt));
    pin_mut!(resample);

    // Handles control requests.
    let control = handle_command_tube(control_tube, Rc::clone(&interrupt), disk_state.clone());
    pin_mut!(control);

    // Handle all the queues in one sub-select call.
    let flush_timer = Rc::new(RefCell::new(
        TimerAsync::new(
            // Call try_clone() to share the same underlying FD with the `flush_disk` task.
            timer.0.try_clone().expect("Failed to clone flush_timer"),
            &ex,
        )
        .expect("Failed to create an async timer"),
    ));

    // Build one `handle_queue` future per (queue, kick event) pair; all of
    // them are polled together as a single future below.
    let queue_handlers =
        queues
            .into_iter()
            .map(|q| Rc::new(RefCell::new(q)))
            .zip(queue_evts.into_iter().map(|e| {
                EventAsync::new(e.0, &ex).expect("Failed to create async event for queue")
            }))
            .map(|(queue, event)| {
                handle_queue(
                    ex.clone(),
                    mem.clone(),
                    Rc::clone(disk_state),
                    Rc::clone(&queue),
                    event,
                    Rc::clone(&interrupt),
                    Rc::clone(&flush_timer),
                    Rc::clone(&flush_timer_armed),
                )
            })
            .collect::<FuturesUnordered<_>>()
            .into_future();

    // Flushes the disk periodically.
    let flush_timer = TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer");
    let disk_flush = flush_disk(disk_state.clone(), flush_timer, flush_timer_armed);
    pin_mut!(disk_flush);

    // Exit if the kill event is triggered.
    let kill = async_utils::await_and_exit(&ex, kill_evt);
    pin_mut!(kill);

    // Run the tasks under select5 and report the first failure observed.
    match ex.run_until(select5(queue_handlers, disk_flush, control, resample, kill)) {
        Ok((_, flush_res, control_res, resample_res, _)) => {
            if let SelectResult::Finished(Err(e)) = flush_res {
                return Err(format!("failed to flush a disk: {}", e));
            }
            if let SelectResult::Finished(Err(e)) = control_res {
                return Err(format!("failed to handle a control request: {}", e));
            }
            if let SelectResult::Finished(Err(e)) = resample_res {
                return Err(format!("failed to resample a irq value: {:?}", e));
            }
            Ok(())
        }
        Err(e) => Err(e.to_string()),
    }
}
454 
/// Virtio device for exposing block level read/write operations on a host file.
pub struct BlockAsync {
    // Event used to signal the worker thread to stop; created in `activate`.
    kill_evt: Option<Event>,
    // Join handle for the worker; yields back the disk image and control tube.
    worker_thread: Option<thread::JoinHandle<(Box<dyn ToAsyncDisk>, Option<Tube>)>>,
    // Taken by the worker thread on activation; `None` while a worker runs.
    disk_image: Option<Box<dyn ToAsyncDisk>>,
    // Disk size in bytes, shared with the worker so resizes are visible here.
    disk_size: Arc<AtomicU64>,
    // Feature bits advertised through `features()`.
    avail_features: u64,
    read_only: bool,
    sparse: bool,
    // Maximum number of segments per request, derived from QUEUE_SIZE.
    seg_max: u32,
    block_size: u32,
    // Optional device ID served for VIRTIO_BLK_T_GET_ID requests.
    id: Option<BlockId>,
    // Tube for resize commands; moved to the worker thread on activation.
    control_tube: Option<Tube>,
}
469 
impl BlockAsync {
    /// Create a new virtio block device that operates on the given AsyncDisk.
    ///
    /// Returns an error if `block_size` is not a whole number of sectors.
    pub fn new(
        base_features: u64,
        disk_image: Box<dyn ToAsyncDisk>,
        read_only: bool,
        sparse: bool,
        block_size: u32,
        id: Option<BlockId>,
        control_tube: Option<Tube>,
    ) -> SysResult<BlockAsync> {
        // The logical block size must contain a whole number of sectors.
        if block_size % SECTOR_SIZE as u32 != 0 {
            error!(
                "Block size {} is not a multiple of {}.",
                block_size, SECTOR_SIZE,
            );
            return Err(SysError::new(libc::EINVAL));
        }
        let disk_size = disk_image.get_len()?;
        // A trailing partial block is tolerated but hidden from the guest.
        if disk_size % block_size as u64 != 0 {
            warn!(
                "Disk size {} is not a multiple of block size {}; \
                 the remainder will not be visible to the guest.",
                disk_size, block_size,
            );
        }

        let avail_features = build_avail_features(base_features, read_only, sparse, true);

        let seg_max = get_seg_max(QUEUE_SIZE);

        Ok(BlockAsync {
            kill_evt: None,
            worker_thread: None,
            disk_image: Some(disk_image),
            disk_size: Arc::new(AtomicU64::new(disk_size)),
            avail_features,
            read_only,
            sparse,
            seg_max,
            block_size,
            id,
            control_tube,
        })
    }

    // Execute a single block device request.
    // `writer` includes the data region only; the status byte is not included.
    // It is up to the caller to convert the result of this function into a status byte
    // and write it to the expected location in guest memory.
    async fn execute_request(
        reader: &mut Reader,
        writer: &mut Writer,
        disk_state: Rc<AsyncMutex<DiskState>>,
        flush_timer: Rc<RefCell<TimerAsync>>,
        flush_timer_armed: Rc<RefCell<bool>>,
    ) -> result::Result<(), ExecuteError> {
        // Acquire immutable access to disk_state to prevent the disk from being resized.
        let disk_state = disk_state.read_lock().await;

        let req_header: virtio_blk_req_header = reader.read_obj().map_err(ExecuteError::Read)?;

        let req_type = req_header.req_type.to_native();
        let sector = req_header.sector.to_native();

        // On a read-only disk, only reads and GET_ID are permitted.
        if disk_state.read_only && req_type != VIRTIO_BLK_T_IN && req_type != VIRTIO_BLK_T_GET_ID {
            return Err(ExecuteError::ReadOnly {
                request_type: req_type,
            });
        }

        /// Check that a request accesses only data within the disk's current size.
        /// All parameters are in units of bytes.
        fn check_range(
            io_start: u64,
            io_length: u64,
            disk_size: u64,
        ) -> result::Result<(), ExecuteError> {
            let io_end = io_start
                .checked_add(io_length)
                .ok_or(ExecuteError::OutOfRange)?;
            if io_end > disk_size {
                Err(ExecuteError::OutOfRange)
            } else {
                Ok(())
            }
        }

        let disk_size = disk_state.disk_size.load(Ordering::Relaxed);
        match req_type {
            // Read from the disk into the writable part of the chain.
            VIRTIO_BLK_T_IN => {
                let data_len = writer.available_bytes();
                if data_len == 0 {
                    return Ok(());
                }
                // Convert the sector number to a byte offset; checked_shl
                // guards against overflow of huge sector values.
                let offset = sector
                    .checked_shl(u32::from(SECTOR_SHIFT))
                    .ok_or(ExecuteError::OutOfRange)?;
                check_range(offset, data_len as u64, disk_size)?;
                let disk_image = &disk_state.disk_image;
                writer
                    .write_all_from_at_fut(&**disk_image, data_len, offset)
                    .await
                    .map_err(|desc_error| ExecuteError::ReadIo {
                        length: data_len,
                        sector,
                        desc_error,
                    })?;
            }
            // Write the readable part of the chain out to the disk.
            VIRTIO_BLK_T_OUT => {
                let data_len = reader.available_bytes();
                if data_len == 0 {
                    return Ok(());
                }
                let offset = sector
                    .checked_shl(u32::from(SECTOR_SHIFT))
                    .ok_or(ExecuteError::OutOfRange)?;
                check_range(offset, data_len as u64, disk_size)?;
                let disk_image = &disk_state.disk_image;
                reader
                    .read_exact_to_at_fut(&**disk_image, data_len, offset)
                    .await
                    .map_err(|desc_error| ExecuteError::WriteIo {
                        length: data_len,
                        sector,
                        desc_error,
                    })?;

                // Arm the 60-second flush timer (if not already armed) so the
                // data just written is eventually fsync'd by `flush_disk`.
                if !*flush_timer_armed.borrow() {
                    *flush_timer_armed.borrow_mut() = true;

                    let flush_delay = Duration::from_secs(60);
                    flush_timer
                        .borrow_mut()
                        .reset(flush_delay, None)
                        .map_err(ExecuteError::TimerReset)?;
                }
            }
            VIRTIO_BLK_T_DISCARD | VIRTIO_BLK_T_WRITE_ZEROES => {
                if req_type == VIRTIO_BLK_T_DISCARD && !disk_state.sparse {
                    // Discard is a hint; if this is a non-sparse disk, just ignore it.
                    return Ok(());
                }

                // The request body is a sequence of discard/write-zeroes
                // segment descriptors; process each one in turn.
                while reader.available_bytes() >= size_of::<virtio_blk_discard_write_zeroes>() {
                    let seg: virtio_blk_discard_write_zeroes =
                        reader.read_obj().map_err(ExecuteError::Read)?;

                    let sector = seg.sector.to_native();
                    let num_sectors = seg.num_sectors.to_native();
                    let flags = seg.flags.to_native();

                    // UNMAP is only meaningful for write-zeroes requests.
                    let valid_flags = if req_type == VIRTIO_BLK_T_WRITE_ZEROES {
                        VIRTIO_BLK_DISCARD_WRITE_ZEROES_FLAG_UNMAP
                    } else {
                        0
                    };

                    if (flags & !valid_flags) != 0 {
                        return Err(ExecuteError::DiscardWriteZeroes {
                            ioerr: None,
                            sector,
                            num_sectors,
                            flags,
                        });
                    }

                    let offset = sector
                        .checked_shl(u32::from(SECTOR_SHIFT))
                        .ok_or(ExecuteError::OutOfRange)?;
                    let length = u64::from(num_sectors)
                        .checked_shl(u32::from(SECTOR_SHIFT))
                        .ok_or(ExecuteError::OutOfRange)?;
                    check_range(offset, length, disk_size)?;

                    if req_type == VIRTIO_BLK_T_DISCARD {
                        // Since Discard is just a hint and some filesystems may not implement
                        // FALLOC_FL_PUNCH_HOLE, ignore punch_hole errors.
                        let _ = disk_state.disk_image.punch_hole(offset, length).await;
                    } else {
                        disk_state
                            .disk_image
                            .write_zeroes_at(offset, length)
                            .await
                            .map_err(|e| ExecuteError::DiscardWriteZeroes {
                                ioerr: Some(e),
                                sector,
                                num_sectors,
                                flags,
                            })?;
                    }
                }
            }
            // Commit any buffered writes to the backing storage.
            VIRTIO_BLK_T_FLUSH => {
                disk_state
                    .disk_image
                    .fsync()
                    .await
                    .map_err(ExecuteError::Flush)?;
            }
            // Report the configured device ID, or "unsupported" if none set.
            VIRTIO_BLK_T_GET_ID => {
                if let Some(id) = disk_state.id {
                    writer.write_all(&id).map_err(ExecuteError::CopyId)?;
                } else {
                    return Err(ExecuteError::Unsupported(req_type));
                }
            }
            t => return Err(ExecuteError::Unsupported(t)),
        };
        Ok(())
    }
}
682 
impl Drop for BlockAsync {
    fn drop(&mut self) {
        // Signal the worker thread to stop.
        if let Some(kill_evt) = self.kill_evt.take() {
            // Ignore the result because there is nothing we can do about it.
            let _ = kill_evt.write(1);
        }

        // Wait for the worker to finish before its resources go away.
        if let Some(worker_thread) = self.worker_thread.take() {
            let _ = worker_thread.join();
        }
    }
}
695 
696 impl VirtioDevice for BlockAsync {
keep_rds(&self) -> Vec<RawDescriptor>697     fn keep_rds(&self) -> Vec<RawDescriptor> {
698         let mut keep_rds = Vec::new();
699 
700         if let Some(disk_image) = &self.disk_image {
701             keep_rds.extend(disk_image.as_raw_descriptors());
702         }
703 
704         if let Some(control_tube) = &self.control_tube {
705             keep_rds.push(control_tube.as_raw_descriptor());
706         }
707 
708         keep_rds
709     }
710 
    /// Returns the feature bits advertised to the guest.
    fn features(&self) -> u64 {
        self.avail_features
    }

    /// Returns the virtio device type identifier (block device).
    fn device_type(&self) -> u32 {
        TYPE_BLOCK
    }

    /// Returns the maximum size of each of this device's virtqueues.
    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }
722 
read_config(&self, offset: u64, data: &mut [u8])723     fn read_config(&self, offset: u64, data: &mut [u8]) {
724         let config_space = {
725             let disk_size = self.disk_size.load(Ordering::Acquire);
726             build_config_space(disk_size, self.seg_max, self.block_size, NUM_QUEUES)
727         };
728         copy_config(data, 0, config_space.as_slice(), offset);
729     }
730 
    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: Vec<Queue>,
        queue_evts: Vec<Event>,
    ) {
        // Create a paired event: one half stays on this struct to signal
        // shutdown, the clone goes to the worker thread.
        let (self_kill_evt, kill_evt) = match Event::new().and_then(|e| Ok((e.try_clone()?, e))) {
            Ok(v) => v,
            Err(e) => {
                error!("failed creating kill Event pair: {}", e);
                return;
            }
        };
        self.kill_evt = Some(self_kill_evt);

        // Snapshot the state the worker thread will own.
        let read_only = self.read_only;
        let sparse = self.sparse;
        let disk_size = self.disk_size.clone();
        let id = self.id.take();
        if let Some(disk_image) = self.disk_image.take() {
            let control_tube = self.control_tube.take();
            let worker_result =
                thread::Builder::new()
                    .name("virtio_blk".to_string())
                    .spawn(move || {
                        let ex = Executor::new().expect("Failed to create an executor");

                        let async_control = control_tube
                            .map(|c| AsyncTube::new(&ex, c).expect("failed to create async tube"));
                        let async_image = match disk_image.to_async_disk(&ex) {
                            Ok(d) => d,
                            Err(e) => panic!("Failed to create async disk {}", e),
                        };
                        let disk_state = Rc::new(AsyncMutex::new(DiskState {
                            disk_image: async_image,
                            disk_size,
                            read_only,
                            sparse,
                            id,
                        }));
                        if let Err(err_string) = run_worker(
                            ex,
                            interrupt,
                            queues,
                            mem,
                            &disk_state,
                            &async_control,
                            queue_evts,
                            kill_evt,
                        ) {
                            error!("{}", err_string);
                        }

                        // Reclaim sole ownership of the disk state; panics if
                        // any worker task still holds a reference.
                        let disk_state = match Rc::try_unwrap(disk_state) {
                            Ok(d) => d.into_inner(),
                            Err(_) => panic!("too many refs to the disk"),
                        };
                        // Return the disk image and control tube through the
                        // thread's join handle.
                        (
                            disk_state.disk_image.into_inner(),
                            async_control.map(|c| c.into()),
                        )
                    });

            match worker_result {
                Err(e) => {
                    error!("failed to spawn virtio_blk worker: {}", e);
                    return;
                }
                Ok(join_handle) => {
                    self.worker_thread = Some(join_handle);
                }
            }
        }
    }
806 
reset(&mut self) -> bool807     fn reset(&mut self) -> bool {
808         if let Some(kill_evt) = self.kill_evt.take() {
809             if kill_evt.write(1).is_err() {
810                 error!("{}: failed to notify the kill event", self.debug_label());
811                 return false;
812             }
813         }
814 
815         if let Some(worker_thread) = self.worker_thread.take() {
816             match worker_thread.join() {
817                 Err(_) => {
818                     error!("{}: failed to get back resources", self.debug_label());
819                     return false;
820                 }
821                 Ok((disk_image, control_tube)) => {
822                     self.disk_image = Some(disk_image);
823                     self.control_tube = control_tube;
824                     return true;
825                 }
826             }
827         }
828         false
829     }
830 }
831 
#[cfg(test)]
mod tests {
    use std::fs::{File, OpenOptions};
    use std::mem::size_of_val;
    use std::sync::atomic::AtomicU64;

    use data_model::{Le32, Le64};
    use disk::SingleFileDisk;
    use hypervisor::ProtectionType;
    use tempfile::TempDir;
    use vm_memory::GuestAddress;

    use crate::virtio::base_features;
    use crate::virtio::block::common::*;
    use crate::virtio::descriptor_utils::{create_descriptor_chain, DescriptorType};

    use super::*;

    // The start of the virtio-blk config space is the disk capacity in
    // 512-byte sectors, split into low (offset 0) and high (offset 4)
    // 32-bit little-endian words.
    #[test]
    fn read_size() {
        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");
        let f = File::create(&path).unwrap();
        f.set_len(0x1000).unwrap();

        let features = base_features(ProtectionType::Unprotected);
        let b = BlockAsync::new(features, Box::new(f), true, false, 512, None, None).unwrap();
        let mut num_sectors = [0u8; 4];
        b.read_config(0, &mut num_sectors);
        // size is 0x1000, so num_sectors is 8 (4096/512).
        assert_eq!([0x08, 0x00, 0x00, 0x00], num_sectors);
        let mut msw_sectors = [0u8; 4];
        b.read_config(4, &mut msw_sectors);
        // size is 0x1000, so msw_sectors is 0.
        assert_eq!([0x00, 0x00, 0x00, 0x00], msw_sectors);
    }

    // The device's configured block size is reported at config offset 20.
    #[test]
    fn read_block_size() {
        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");
        let f = File::create(&path).unwrap();
        f.set_len(0x1000).unwrap();

        let features = base_features(ProtectionType::Unprotected);
        let b = BlockAsync::new(features, Box::new(f), true, false, 4096, None, None).unwrap();
        let mut blk_size = [0u8; 4];
        b.read_config(20, &mut blk_size);
        // blk_size should be 4096 (0x1000).
        assert_eq!([0x00, 0x10, 0x00, 0x00], blk_size);
    }

    // The advertised feature bits must track the read_only/sparse flags the
    // device was constructed with.
    #[test]
    fn read_features() {
        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");

        // read-write block device
        {
            let f = File::create(&path).unwrap();
            let features = base_features(ProtectionType::Unprotected);
            let b = BlockAsync::new(features, Box::new(f), false, true, 512, None, None).unwrap();
            // writable device should set VIRTIO_BLK_F_FLUSH + VIRTIO_BLK_F_DISCARD
            // + VIRTIO_BLK_F_WRITE_ZEROES + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE
            // + VIRTIO_BLK_F_SEG_MAX + VIRTIO_BLK_F_MQ + VIRTIO_RING_F_EVENT_IDX
            assert_eq!(0x120007244, b.features());
        }

        // read-write block device, non-sparse
        {
            let f = File::create(&path).unwrap();
            let features = base_features(ProtectionType::Unprotected);
            let b = BlockAsync::new(features, Box::new(f), false, false, 512, None, None).unwrap();
            // non-sparse device should set VIRTIO_BLK_F_FLUSH + VIRTIO_BLK_F_WRITE_ZEROES
            // (but not VIRTIO_BLK_F_DISCARD) + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE
            // + VIRTIO_BLK_F_SEG_MAX + VIRTIO_BLK_F_MQ + VIRTIO_RING_F_EVENT_IDX
            assert_eq!(0x120005244, b.features());
        }

        // read-only block device
        {
            let f = File::create(&path).unwrap();
            let features = base_features(ProtectionType::Unprotected);
            let b = BlockAsync::new(features, Box::new(f), true, true, 512, None, None).unwrap();
            // read-only device should set VIRTIO_BLK_F_FLUSH and VIRTIO_BLK_F_RO
            // + VIRTIO_F_VERSION_1 + VIRTIO_BLK_F_BLK_SIZE + VIRTIO_BLK_F_SEG_MAX
            // + VIRTIO_BLK_F_MQ + VIRTIO_RING_F_EVENT_IDX
            assert_eq!(0x120001264, b.features());
        }
    }

    // A read of the last in-bounds sector must complete with VIRTIO_BLK_S_OK.
    #[test]
    fn read_last_sector() {
        let ex = Executor::new().expect("creating an executor failed");

        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(&path)
            .unwrap();
        let disk_size = 0x1000;
        f.set_len(disk_size).unwrap();
        let af = SingleFileDisk::new(f, &ex).expect("Failed to create SFD");

        let mem = Rc::new(
            GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
                .expect("Creating guest memory failed."),
        );

        let req_hdr = virtio_blk_req_header {
            req_type: Le32::from(VIRTIO_BLK_T_IN),
            reserved: Le32::from(0),
            sector: Le64::from(7), // Disk is 8 sectors long, so this is the last valid sector.
        };
        mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
            .expect("writing req failed");

        let avail_desc = create_descriptor_chain(
            &mem,
            GuestAddress(0x100),  // Place descriptor chain at 0x100.
            GuestAddress(0x1000), // Describe buffer at 0x1000.
            vec![
                // Request header
                (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
                // I/O buffer (1 sector of data)
                (DescriptorType::Writable, 512),
                // Request status
                (DescriptorType::Writable, 1),
            ],
            0,
        )
        .expect("create_descriptor_chain failed");

        let timer = Timer::new().expect("Failed to create a timer");
        let flush_timer = Rc::new(RefCell::new(
            TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer"),
        ));
        let flush_timer_armed = Rc::new(RefCell::new(false));

        let disk_state = Rc::new(AsyncMutex::new(DiskState {
            disk_image: Box::new(af),
            disk_size: Arc::new(AtomicU64::new(disk_size)),
            read_only: false,
            sparse: true,
            id: None,
        }));

        let fut = process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem);

        ex.run_until(fut)
            .expect("running executor failed")
            .expect("execute failed");

        // The status byte follows the header and the 512-byte data buffer,
        // since the descriptor buffers are laid out contiguously from 0x1000.
        let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 512) as u64);
        let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
        assert_eq!(status, VIRTIO_BLK_S_OK);
    }

    // A read that starts in bounds but runs past the end of the disk must
    // fail with VIRTIO_BLK_S_IOERR.
    #[test]
    fn read_beyond_last_sector() {
        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(&path)
            .unwrap();
        let disk_size = 0x1000;
        f.set_len(disk_size).unwrap();
        let mem = Rc::new(
            GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
                .expect("Creating guest memory failed."),
        );

        let req_hdr = virtio_blk_req_header {
            req_type: Le32::from(VIRTIO_BLK_T_IN),
            reserved: Le32::from(0),
            // Last valid sector of the 8-sector disk; the 2-sector read below
            // therefore extends one sector past the end of the disk.
            sector: Le64::from(7),
        };
        mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
            .expect("writing req failed");

        let avail_desc = create_descriptor_chain(
            &mem,
            GuestAddress(0x100),  // Place descriptor chain at 0x100.
            GuestAddress(0x1000), // Describe buffer at 0x1000.
            vec![
                // Request header
                (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
                // I/O buffer (2 sectors of data - overlap the end of the disk).
                (DescriptorType::Writable, 512 * 2),
                // Request status
                (DescriptorType::Writable, 1),
            ],
            0,
        )
        .expect("create_descriptor_chain failed");

        let ex = Executor::new().expect("creating an executor failed");

        let af = SingleFileDisk::new(f, &ex).expect("Failed to create SFD");
        let timer = Timer::new().expect("Failed to create a timer");
        let flush_timer = Rc::new(RefCell::new(
            TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer"),
        ));
        let flush_timer_armed = Rc::new(RefCell::new(false));
        let disk_state = Rc::new(AsyncMutex::new(DiskState {
            disk_image: Box::new(af),
            disk_size: Arc::new(AtomicU64::new(disk_size)),
            read_only: false,
            sparse: true,
            id: None,
        }));

        let fut = process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem);

        ex.run_until(fut)
            .expect("running executor failed")
            .expect("execute failed");

        // The status byte follows the header and the 2-sector data buffer.
        let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 512 * 2) as u64);
        let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
        assert_eq!(status, VIRTIO_BLK_S_IOERR);
    }

    // A VIRTIO_BLK_T_GET_ID request must copy the device's serial into the
    // guest buffer and report VIRTIO_BLK_S_OK.
    #[test]
    fn get_id() {
        let ex = Executor::new().expect("creating an executor failed");

        let tempdir = TempDir::new().unwrap();
        let mut path = tempdir.path().to_owned();
        path.push("disk_image");
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .open(&path)
            .unwrap();
        let disk_size = 0x1000;
        f.set_len(disk_size).unwrap();

        let mem = GuestMemory::new(&[(GuestAddress(0u64), 4 * 1024 * 1024)])
            .expect("Creating guest memory failed.");

        let req_hdr = virtio_blk_req_header {
            req_type: Le32::from(VIRTIO_BLK_T_GET_ID),
            reserved: Le32::from(0),
            sector: Le64::from(0),
        };
        mem.write_obj_at_addr(req_hdr, GuestAddress(0x1000))
            .expect("writing req failed");

        let avail_desc = create_descriptor_chain(
            &mem,
            GuestAddress(0x100),  // Place descriptor chain at 0x100.
            GuestAddress(0x1000), // Describe buffer at 0x1000.
            vec![
                // Request header
                (DescriptorType::Readable, size_of_val(&req_hdr) as u32),
                // I/O buffer (20 bytes for serial)
                (DescriptorType::Writable, 20),
                // Request status
                (DescriptorType::Writable, 1),
            ],
            0,
        )
        .expect("create_descriptor_chain failed");

        let af = SingleFileDisk::new(f, &ex).expect("Failed to create SFD");
        let timer = Timer::new().expect("Failed to create a timer");
        let flush_timer = Rc::new(RefCell::new(
            TimerAsync::new(timer.0, &ex).expect("Failed to create an async timer"),
        ));
        let flush_timer_armed = Rc::new(RefCell::new(false));

        let id = b"a20-byteserialnumber";

        let disk_state = Rc::new(AsyncMutex::new(DiskState {
            disk_image: Box::new(af),
            disk_size: Arc::new(AtomicU64::new(disk_size)),
            read_only: false,
            sparse: true,
            id: Some(*id),
        }));

        let fut = process_one_request(avail_desc, disk_state, flush_timer, flush_timer_armed, &mem);

        ex.run_until(fut)
            .expect("running executor failed")
            .expect("execute failed");

        // The status byte is the third descriptor's buffer. The buffers are
        // laid out contiguously from 0x1000: request header, then the 20-byte
        // ID buffer, then the one-byte status. The previous offset of
        // header + 512 pointed at untouched (zeroed) guest memory, which
        // happens to equal VIRTIO_BLK_S_OK, making the assertion vacuous.
        let status_offset = GuestAddress((0x1000 + size_of_val(&req_hdr) + 20) as u64);
        let status = mem.read_obj_from_addr::<u8>(status_offset).unwrap();
        assert_eq!(status, VIRTIO_BLK_S_OK);

        let id_offset = GuestAddress(0x1000 + size_of_val(&req_hdr) as u64);
        let returned_id = mem.read_obj_from_addr::<[u8; 20]>(id_offset).unwrap();
        assert_eq!(returned_id, *id);
    }
}
1141