1 // Copyright (C) 2019 Alibaba Cloud. All rights reserved.
2 // SPDX-License-Identifier: Apache-2.0 or BSD-3-Clause
3
4 //! Virtio Vhost Backend Drivers
5 //!
6 //! Virtio devices use virtqueues to transport data efficiently. The first generation of virtqueue
7 //! is a set of three different single-producer, single-consumer ring structures designed to store
8 //! generic scatter-gather I/O. The virtio specification 1.1 introduces an alternative compact
9 //! virtqueue layout named "Packed Virtqueue", which is more friendly to memory cache system and
10 //! hardware implemented virtio devices. The packed virtqueue uses read-write memory, that means
11 //! the memory will be both read and written by both host and guest. The new Packed Virtqueue is
12 //! preferred for performance.
13 //!
//! Vhost is a mechanism to improve performance of Virtio devices by delegating data plane
//! operations to dedicated IO service processes. Only the configuration, I/O submission
//! notification, and I/O completion interruption are piped through the hypervisor.
17 //! It uses the same virtqueue layout as Virtio to allow Vhost devices to be mapped directly to
18 //! Virtio devices. This allows a Vhost device to be accessed directly by a guest OS inside a
19 //! hypervisor process with an existing Virtio (PCI) driver.
20 //!
21 //! The initial vhost implementation is a part of the Linux kernel and uses ioctl interface to
22 //! communicate with userspace applications. Dedicated kernel worker threads are created to handle
23 //! IO requests from the guest.
24 //!
//! Later, the vhost-user protocol was introduced to complement the ioctl interface used to control
//! the vhost implementation in the Linux kernel. It implements the control plane needed to
//! establish virtqueue sharing with a user space process on the same host. It uses communication
//! over a Unix domain socket to share file descriptors in the ancillary data of the message.
29 //! The protocol defines 2 sides of the communication, master and slave. Master is the application
30 //! that shares its virtqueues. Slave is the consumer of the virtqueues. Master and slave can be
31 //! either a client (i.e. connecting) or server (listening) in the socket communication.
32
33 #![deny(missing_docs)]
34
35 use std::fs::File;
36 use std::io::Error as IOError;
37
38 use remain::sorted;
39 use thiserror::Error as ThisError;
40
41 mod backend;
42 pub use backend::*;
43
44 pub mod message;
45
46 pub mod connection;
47
48 mod sys;
49 pub use sys::SystemStream;
50 pub use sys::*;
51
52 cfg_if::cfg_if! {
53 if #[cfg(feature = "vmm")] {
54 pub(crate) mod master;
55 pub use self::master::{Master, VhostUserMaster};
56 mod master_req_handler;
57 pub use self::master_req_handler::{VhostUserMasterReqHandler,
58 VhostUserMasterReqHandlerMut};
59 }
60 }
61 cfg_if::cfg_if! {
62 if #[cfg(feature = "device")] {
63 mod slave_req_handler;
64 mod slave_proxy;
65 pub use self::slave_req_handler::{
66 Protocol, SlaveReqHandler, SlaveReqHelper, VhostUserSlaveReqHandler,
67 VhostUserSlaveReqHandlerMut,
68 };
69 pub use self::slave_proxy::Slave;
70 }
71 }
72 cfg_if::cfg_if! {
73 if #[cfg(all(feature = "device", unix))] {
74 mod slave;
75 pub use self::slave::SlaveListener;
76 }
77 }
78 cfg_if::cfg_if! {
79 if #[cfg(feature = "vmm")] {
80 pub use self::master_req_handler::MasterReqHandler;
81 }
82 }
83
84 /// Errors for vhost-user operations
85 #[sorted]
86 #[derive(Debug, ThisError)]
87 pub enum Error {
88 /// client exited properly.
89 #[error("client exited properly")]
90 ClientExit,
91 /// client disconnected.
92 /// If connection is closed properly, use `ClientExit` instead.
93 #[error("client closed the connection")]
94 Disconnect,
95 /// Virtio/protocol features mismatch.
96 #[error("virtio features mismatch")]
97 FeatureMismatch,
98 /// Fd array in question is too big or too small
99 #[error("wrong number of attached fds")]
100 IncorrectFds,
101 /// Invalid message format, flag or content.
102 #[error("invalid message")]
103 InvalidMessage,
104 /// Unsupported operations due to that the protocol feature hasn't been negotiated.
105 #[error("invalid operation")]
106 InvalidOperation,
107 /// Invalid parameters.
108 #[error("invalid parameters")]
109 InvalidParam,
110 /// Failure from the master side.
111 #[error("master Internal error")]
112 MasterInternalError,
113 /// Message is too large
114 #[error("oversized message")]
115 OversizedMsg,
116 /// Only part of a message have been sent or received successfully
117 #[error("partial message")]
118 PartialMessage,
119 /// Provided recv buffer was too small, and data was dropped.
120 #[error("buffer for recv was too small, data was dropped: got size {got}, needed {want}")]
121 RecvBufferTooSmall {
122 /// The size of the buffer received.
123 got: usize,
124 /// The expected size of the buffer.
125 want: usize,
126 },
127 /// Error from request handler
128 #[error("handler failed to handle request: {0}")]
129 ReqHandlerError(IOError),
130 /// Failure from the slave side.
131 #[error("slave internal error")]
132 SlaveInternalError,
133 /// The socket is broken or has been closed.
134 #[error("socket is broken: {0}")]
135 SocketBroken(std::io::Error),
136 /// Can't connect to peer.
137 #[error("can't connect to peer: {0}")]
138 SocketConnect(std::io::Error),
139 /// Generic socket errors.
140 #[error("socket error: {0}")]
141 SocketError(std::io::Error),
142 /// Should retry the socket operation again.
143 #[error("temporary socket error: {0}")]
144 SocketRetry(std::io::Error),
145 /// Error from tx/rx on a Tube.
146 #[error("failed to read/write on Tube: {0}")]
147 TubeError(base::TubeError),
148 /// Error from VFIO device.
149 #[error("error occurred in VFIO device: {0}")]
150 VfioDeviceError(anyhow::Error),
151 }
152
153 impl From<base::TubeError> for Error {
from(err: base::TubeError) -> Self154 fn from(err: base::TubeError) -> Self {
155 Error::TubeError(err)
156 }
157 }
158
159 impl From<std::io::Error> for Error {
from(err: std::io::Error) -> Self160 fn from(err: std::io::Error) -> Self {
161 Error::SocketError(err)
162 }
163 }
164
165 impl From<base::Error> for Error {
166 /// Convert raw socket errors into meaningful vhost-user errors.
167 ///
168 /// The base::Error is a simple wrapper over the raw errno, which doesn't means
169 /// much to the vhost-user connection manager. So convert it into meaningful errors to simplify
170 /// the connection manager logic.
171 ///
172 /// # Return:
173 /// * - Error::SocketRetry: temporary error caused by signals or short of resources.
174 /// * - Error::SocketBroken: the underline socket is broken.
175 /// * - Error::SocketError: other socket related errors.
176 #[allow(unreachable_patterns)] // EWOULDBLOCK equals to EGAIN on linux
from(err: base::Error) -> Self177 fn from(err: base::Error) -> Self {
178 match err.errno() {
179 // Retry:
180 // * EAGAIN, EWOULDBLOCK: The socket is marked nonblocking and the requested operation
181 // would block.
182 // * EINTR: A signal occurred before any data was transmitted
183 // * ENOBUFS: The output queue for a network interface was full. This generally
184 // indicates that the interface has stopped sending, but may be caused by transient
185 // congestion.
186 // * ENOMEM: No memory available.
187 libc::EAGAIN | libc::EWOULDBLOCK | libc::EINTR | libc::ENOBUFS | libc::ENOMEM => {
188 Error::SocketRetry(err.into())
189 }
190 // Broken:
191 // * ECONNRESET: Connection reset by peer.
192 // * EPIPE: The local end has been shut down on a connection oriented socket. In this
193 // case the process will also receive a SIGPIPE unless MSG_NOSIGNAL is set.
194 libc::ECONNRESET | libc::EPIPE => Error::SocketBroken(err.into()),
195 // Write permission is denied on the destination socket file, or search permission is
196 // denied for one of the directories the path prefix.
197 libc::EACCES => Error::SocketConnect(IOError::from_raw_os_error(libc::EACCES)),
198 // Catch all other errors
199 e => Error::SocketError(IOError::from_raw_os_error(e)),
200 }
201 }
202 }
203
/// Result of vhost-user operations.
///
/// Shorthand for `std::result::Result` with this crate's [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;

/// Result of request handler.
///
/// Shorthand for `std::result::Result` with `std::io::Error` (imported in this file as
/// `IOError`) as the error type.
pub type HandlerResult<T> = std::result::Result<T, IOError>;
209
/// Extracts the only file from an optional list of files.
///
/// Returns `Some(file)` when `files` holds exactly one file. Returns `None` when `files` is
/// `None`, is empty, or holds more than one file.
pub(crate) fn take_single_file(files: Option<Vec<File>>) -> Option<File> {
    files
        // Only a list of exactly one element qualifies.
        .filter(|list| list.len() == 1)
        // With a single element, popping the last one yields that element.
        .and_then(|mut list| list.pop())
}
219
220 #[cfg(all(test, feature = "device"))]
221 mod dummy_slave;
222
223 #[cfg(all(test, feature = "vmm", feature = "device"))]
224 mod tests {
225 use std::sync::Arc;
226 use std::sync::Barrier;
227 use std::sync::Mutex;
228 use std::thread;
229
230 use base::AsRawDescriptor;
231 use tempfile::tempfile;
232
233 use super::*;
234 use crate::backend::VhostBackend;
235 use crate::connection::tests::*;
236 use crate::dummy_slave::DummySlaveReqHandler;
237 use crate::dummy_slave::VIRTIO_FEATURES;
238 use crate::message::*;
239 use crate::VhostUserMemoryRegionInfo;
240 use crate::VringConfigData;
241
242 /// Utility function to process a header and a message together.
handle_request( h: &mut SlaveReqHandler<Mutex<DummySlaveReqHandler>, MasterReqEndpoint>, ) -> Result<()>243 fn handle_request(
244 h: &mut SlaveReqHandler<Mutex<DummySlaveReqHandler>, MasterReqEndpoint>,
245 ) -> Result<()> {
246 // We assume that a header comes together with message body in tests so we don't wait before
247 // calling `process_message()`.
248 let (hdr, files) = h.recv_header()?;
249 h.process_message(hdr, files)
250 }
251
#[test]
fn create_dummy_slave() {
    let backend = Mutex::new(DummySlaveReqHandler::new());

    // The first call to `set_owner()` succeeds ...
    backend.set_owner().unwrap();
    // ... and a second call is expected to fail.
    let second_attempt = backend.set_owner();
    assert!(second_attempt.is_err());
}
259
#[test]
fn test_set_owner() {
    let backend = Mutex::new(DummySlaveReqHandler::new());
    let (master, mut slave) = create_master_slave_pair(backend);

    // The backend starts out without an owner.
    assert!(!slave.as_ref().lock().unwrap().owned);

    // The first set_owner request is handled successfully and marks the backend as owned.
    master.set_owner().unwrap();
    handle_request(&mut slave).unwrap();
    assert!(slave.as_ref().lock().unwrap().owned);

    // A second set_owner request fails on the slave side and the backend stays owned.
    master.set_owner().unwrap();
    let second_attempt = handle_request(&mut slave);
    assert!(second_attempt.is_err());
    assert!(slave.as_ref().lock().unwrap().owned);
}
273
#[test]
fn test_set_features() {
    // Both sides meet at this barrier once they have finished their part of the exchange.
    let master_barrier = Arc::new(Barrier::new(2));
    let slave_barrier = Arc::clone(&master_barrier);
    let backend = Mutex::new(DummySlaveReqHandler::new());
    let (mut master, mut slave) = create_master_slave_pair(backend);

    thread::spawn(move || {
        // set_owner()
        handle_request(&mut slave).unwrap();
        assert!(slave.as_ref().lock().unwrap().owned);

        // get_features() followed by set_features()
        for _ in 0..2 {
            handle_request(&mut slave).unwrap();
        }
        assert_eq!(
            slave.as_ref().lock().unwrap().acked_features,
            VIRTIO_FEATURES & !0x1
        );

        // get_protocol_features() followed by set_protocol_features()
        for _ in 0..2 {
            handle_request(&mut slave).unwrap();
        }
        assert_eq!(
            slave.as_ref().lock().unwrap().acked_protocol_features,
            VhostUserProtocolFeatures::all().bits()
        );

        slave_barrier.wait();
    });

    master.set_owner().unwrap();

    // Negotiate the virtio features, leaving the lowest feature bit unacknowledged.
    let virtio_features = master.get_features().unwrap();
    assert_eq!(virtio_features, VIRTIO_FEATURES);
    master.set_features(VIRTIO_FEATURES & !0x1).unwrap();

    // Negotiate the vhost protocol features by acknowledging everything that was offered.
    let protocol_features = master.get_protocol_features().unwrap();
    assert_eq!(
        protocol_features.bits(),
        VhostUserProtocolFeatures::all().bits()
    );
    master.set_protocol_features(protocol_features).unwrap();

    master_barrier.wait();
}
316
/// Drives a full sequence of master requests against a dummy slave backend.
///
/// The slave side runs in its own thread and handles exactly one request per call the master
/// makes, in the same order. The two sides meet at a barrier once both are done.
#[test]
fn test_master_slave_process() {
    let mbar = Arc::new(Barrier::new(2));
    let sbar = mbar.clone();
    let slave_be = Mutex::new(DummySlaveReqHandler::new());
    let (mut master, mut slave) = create_master_slave_pair(slave_be);

    thread::spawn(move || {
        // set_owner()
        handle_request(&mut slave).unwrap();
        assert!(slave.as_ref().lock().unwrap().owned);

        // get/set_features()
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();
        assert_eq!(
            slave.as_ref().lock().unwrap().acked_features,
            VIRTIO_FEATURES & !0x1
        );

        // get/set_protocol_features()
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();
        assert_eq!(
            slave.as_ref().lock().unwrap().acked_protocol_features,
            VhostUserProtocolFeatures::all().bits()
        );

        // get_inflight_fd()
        handle_request(&mut slave).unwrap();
        // set_inflight_fd()
        handle_request(&mut slave).unwrap();

        // get_queue_num()
        handle_request(&mut slave).unwrap();

        // set_mem_table()
        handle_request(&mut slave).unwrap();

        // set_config() followed by get_config()
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();

        // set_slave_request_fd()
        handle_request(&mut slave).unwrap();

        // set_vring_enable()
        handle_request(&mut slave).unwrap();

        // set_log_base() and set_log_fd(): both are expected to fail on the slave side.
        handle_request(&mut slave).unwrap_err();
        handle_request(&mut slave).unwrap_err();

        // set_vring_num(), set_vring_base(), set_vring_addr(), set_vring_call(),
        // set_vring_kick() and set_vring_err()
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();
        handle_request(&mut slave).unwrap();

        // get_max_mem_slots()
        handle_request(&mut slave).unwrap();

        // add_mem_region()
        handle_request(&mut slave).unwrap();

        // remove_mem_region()
        handle_request(&mut slave).unwrap();

        sbar.wait();
    });

    master.set_owner().unwrap();

    // set virtio features
    let features = master.get_features().unwrap();
    assert_eq!(features, VIRTIO_FEATURES);
    master.set_features(VIRTIO_FEATURES & !0x1).unwrap();

    // set vhost protocol features
    let features = master.get_protocol_features().unwrap();
    assert_eq!(features.bits(), VhostUserProtocolFeatures::all().bits());
    master.set_protocol_features(features).unwrap();

    // Retrieve inflight I/O tracking information
    let (inflight_info, inflight_file) = master
        .get_inflight_fd(&VhostUserInflight {
            num_queues: 2,
            queue_size: 256,
            ..Default::default()
        })
        .unwrap();
    // Set the buffer back to the backend
    master
        .set_inflight_fd(&inflight_info, inflight_file.as_raw_descriptor())
        .unwrap();

    let num = master.get_queue_num().unwrap();
    assert_eq!(num, 2);

    let event = base::Event::new().unwrap();
    let mem = [VhostUserMemoryRegionInfo {
        guest_phys_addr: 0,
        memory_size: 0x10_0000,
        userspace_addr: 0,
        mmap_offset: 0,
        mmap_handle: event.as_raw_descriptor(),
    }];
    master.set_mem_table(&mem).unwrap();

    // Write one config byte at offset 0x100, then read it back.
    master
        .set_config(0x100, VhostUserConfigFlags::WRITABLE, &[0xa5u8])
        .unwrap();
    let buf = [0x0u8; 4];
    let (reply_body, reply_payload) = master
        .get_config(0x100, 4, VhostUserConfigFlags::empty(), &buf)
        .unwrap();
    let offset = reply_body.offset;
    assert_eq!(offset, 0x100);
    assert_eq!(reply_payload[0], 0xa5);

    #[cfg(windows)]
    let tubes = base::Tube::pair().unwrap();
    #[cfg(windows)]
    // SAFETY: Safe because we will be importing the Tube in the other thread.
    let descriptor =
        unsafe { tube_transporter::packed_tube::pack(tubes.0, std::process::id()).unwrap() };

    #[cfg(unix)]
    let descriptor = base::Event::new().unwrap();

    master.set_slave_request_fd(&descriptor).unwrap();
    master.set_vring_enable(0, true).unwrap();

    // Not implemented yet on the slave side: sending succeeds here, while the slave thread
    // expects handling of these two requests to fail.
    master
        .set_log_base(0, Some(event.as_raw_descriptor()))
        .unwrap();
    master.set_log_fd(event.as_raw_descriptor()).unwrap();

    master.set_vring_num(0, 256).unwrap();
    master.set_vring_base(0, 0).unwrap();
    let config = VringConfigData {
        queue_max_size: 256,
        queue_size: 128,
        flags: VhostUserVringAddrFlags::VHOST_VRING_F_LOG.bits(),
        desc_table_addr: 0x1000,
        used_ring_addr: 0x2000,
        avail_ring_addr: 0x3000,
        log_addr: Some(0x4000),
    };
    master.set_vring_addr(0, &config).unwrap();
    master.set_vring_call(0, &event).unwrap();
    master.set_vring_kick(0, &event).unwrap();
    master.set_vring_err(0, &event).unwrap();

    let max_mem_slots = master.get_max_mem_slots().unwrap();
    assert_eq!(max_mem_slots, 32);

    let region_file = tempfile().unwrap();
    let region = VhostUserMemoryRegionInfo {
        guest_phys_addr: 0x10_0000,
        memory_size: 0x10_0000,
        userspace_addr: 0,
        mmap_offset: 0,
        mmap_handle: region_file.as_raw_descriptor(),
    };
    master.add_mem_region(&region).unwrap();

    master.remove_mem_region(&region).unwrap();

    mbar.wait();
}
490
#[test]
fn test_error_display() {
    // The `Display` output must match the text given in the `#[error(...)]` attributes.
    assert_eq!(Error::InvalidParam.to_string(), "invalid parameters");
    assert_eq!(Error::InvalidOperation.to_string(), "invalid operation");
}
496
#[test]
fn test_error_from_base_error() {
    // EAGAIN must be converted into a retryable socket error that keeps the errno.
    let converted: Error = base::Error::new(libc::EAGAIN).into();
    match converted {
        Error::SocketRetry(io_err) => {
            assert_eq!(io_err.raw_os_error().unwrap(), libc::EAGAIN);
        }
        _ => panic!("invalid error code conversion!"),
    }
}
506 }
507