1 // Copyright (C) 2019 Alibaba Cloud. All rights reserved.
2 // SPDX-License-Identifier: Apache-2.0 or BSD-3-Clause
3
4 //! Virtio Vhost Backend Drivers
5 //!
6 //! Virtio devices use virtqueues to transport data efficiently. The first generation of virtqueue
7 //! is a set of three different single-producer, single-consumer ring structures designed to store
8 //! generic scatter-gather I/O. The virtio specification 1.1 introduces an alternative compact
9 //! virtqueue layout named "Packed Virtqueue", which is more friendly to memory cache system and
//! hardware-implemented virtio devices. The packed virtqueue uses read-write memory, which
//! means the memory is both read and written by both host and guest. The new Packed Virtqueue
//! is preferred for performance.
13 //!
//! Vhost is a mechanism to improve performance of Virtio devices by delegating data plane
//! operations to dedicated IO service processes. Only the configuration, I/O submission
//! notification, and I/O completion interruption are piped through the hypervisor.
17 //! It uses the same virtqueue layout as Virtio to allow Vhost devices to be mapped directly to
18 //! Virtio devices. This allows a Vhost device to be accessed directly by a guest OS inside a
19 //! hypervisor process with an existing Virtio (PCI) driver.
20 //!
21 //! The initial vhost implementation is a part of the Linux kernel and uses ioctl interface to
22 //! communicate with userspace applications. Dedicated kernel worker threads are created to handle
23 //! IO requests from the guest.
24 //!
//! Later, the vhost-user protocol was introduced to complement the ioctl interface used to
//! control the vhost implementation in the Linux kernel. It implements the control plane needed
//! to establish virtqueue sharing with a user space process on the same host. It communicates
//! over a Unix domain socket and shares file descriptors in the ancillary data of the messages.
29 //! The protocol defines 2 sides of the communication, master and slave. Master is the application
30 //! that shares its virtqueues. Slave is the consumer of the virtqueues. Master and slave can be
31 //! either a client (i.e. connecting) or server (listening) in the socket communication.
32
33 #![deny(missing_docs)]
34
35 #[cfg(any(feature = "vmm", feature = "device"))]
36 use std::fs::File;
37 use std::io::Error as IOError;
38
39 use remain::sorted;
40 use thiserror::Error as ThisError;
41
42 mod backend;
43 pub use backend::*;
44
45 pub mod message;
46
47 pub mod connection;
48
49 mod sys;
50 pub use sys::{SystemStream, *};
51
// Master-side modules and re-exports; only compiled when the `vmm` feature is enabled.
cfg_if::cfg_if! {
    if #[cfg(feature = "vmm")] {
        pub(crate) mod master;
        pub use self::master::{Master, VhostUserMaster};
        mod master_req_handler;
        pub use self::master_req_handler::{VhostUserMasterReqHandler,
                                    VhostUserMasterReqHandlerMut};
    }
}
// Slave-side request handling modules and re-exports; only compiled when the `device` feature
// is enabled.
cfg_if::cfg_if! {
    if #[cfg(feature = "device")] {
        mod slave_req_handler;
        mod slave_fs_cache;
        pub use self::slave_req_handler::{
            Protocol, SlaveReqHandler, SlaveReqHelper, VhostUserSlaveReqHandler,
            VhostUserSlaveReqHandlerMut,
        };
        pub use self::slave_fs_cache::SlaveFsCacheReq;
    }
}
// The `slave` module and `SlaveListener` additionally require a unix target.
cfg_if::cfg_if! {
    if #[cfg(all(feature = "device", unix))] {
        mod slave;
        pub use self::slave::SlaveListener;
    }
}
// `MasterReqHandler` is only re-exported on unix targets with the `vmm` feature enabled.
cfg_if::cfg_if! {
    if #[cfg(all(feature = "vmm", unix))] {
        pub use self::master_req_handler::MasterReqHandler;
    }
}
83
84 /// Errors for vhost-user operations
85 #[sorted]
86 #[derive(Debug, ThisError)]
87 pub enum Error {
88 /// client exited properly.
89 #[error("client exited properly")]
90 ClientExit,
91 /// client disconnected.
92 /// If connection is closed properly, use `ClientExit` instead.
93 #[error("client closed the connection")]
94 Disconnect,
95 /// Virtio/protocol features mismatch.
96 #[error("virtio features mismatch")]
97 FeatureMismatch,
98 /// Fd array in question is too big or too small
99 #[error("wrong number of attached fds")]
100 IncorrectFds,
101 /// Invalid message format, flag or content.
102 #[error("invalid message")]
103 InvalidMessage,
104 /// Unsupported operations due to that the protocol feature hasn't been negotiated.
105 #[error("invalid operation")]
106 InvalidOperation,
107 /// Invalid parameters.
108 #[error("invalid parameters")]
109 InvalidParam,
110 /// Failure from the master side.
111 #[error("master Internal error")]
112 MasterInternalError,
113 /// Message is too large
114 #[error("oversized message")]
115 OversizedMsg,
116 /// Only part of a message have been sent or received successfully
117 #[error("partial message")]
118 PartialMessage,
119 /// Provided recv buffer was too small, and data was dropped.
120 #[error("buffer for recv was too small, data was dropped: got size {got}, needed {want}")]
121 RecvBufferTooSmall {
122 /// The size of the buffer received.
123 got: usize,
124 /// The expected size of the buffer.
125 want: usize,
126 },
127 /// Error from request handler
128 #[error("handler failed to handle request: {0}")]
129 ReqHandlerError(IOError),
130 /// Failure from the slave side.
131 #[error("slave internal error")]
132 SlaveInternalError,
133 /// The socket is broken or has been closed.
134 #[error("socket is broken: {0}")]
135 SocketBroken(std::io::Error),
136 /// Can't connect to peer.
137 #[error("can't connect to peer: {0}")]
138 SocketConnect(std::io::Error),
139 /// Generic socket errors.
140 #[error("socket error: {0}")]
141 SocketError(std::io::Error),
142 /// Should retry the socket operation again.
143 #[error("temporary socket error: {0}")]
144 SocketRetry(std::io::Error),
145 /// Error from tx/rx on a Tube.
146 #[error("failed to read/write on Tube: {0}")]
147 TubeError(base::TubeError),
148 /// Error from VFIO device.
149 #[error("error occurred in VFIO device: {0}")]
150 VfioDeviceError(anyhow::Error),
151 }
152
153 impl std::convert::From<base::Error> for Error {
154 /// Convert raw socket errors into meaningful vhost-user errors.
155 ///
156 /// The base::Error is a simple wrapper over the raw errno, which doesn't means
157 /// much to the vhost-user connection manager. So convert it into meaningful errors to simplify
158 /// the connection manager logic.
159 ///
160 /// # Return:
161 /// * - Error::SocketRetry: temporary error caused by signals or short of resources.
162 /// * - Error::SocketBroken: the underline socket is broken.
163 /// * - Error::SocketError: other socket related errors.
164 #[allow(unreachable_patterns)] // EWOULDBLOCK equals to EGAIN on linux
from(err: base::Error) -> Self165 fn from(err: base::Error) -> Self {
166 match err.errno() {
167 // Retry:
168 // * EAGAIN, EWOULDBLOCK: The socket is marked nonblocking and the requested operation
169 // would block.
170 // * EINTR: A signal occurred before any data was transmitted
171 // * ENOBUFS: The output queue for a network interface was full. This generally
172 // indicates that the interface has stopped sending, but may be caused by transient
173 // congestion.
174 // * ENOMEM: No memory available.
175 libc::EAGAIN | libc::EWOULDBLOCK | libc::EINTR | libc::ENOBUFS | libc::ENOMEM => {
176 Error::SocketRetry(err.into())
177 }
178 // Broken:
179 // * ECONNRESET: Connection reset by peer.
180 // * EPIPE: The local end has been shut down on a connection oriented socket. In this
181 // case the process will also receive a SIGPIPE unless MSG_NOSIGNAL is set.
182 libc::ECONNRESET | libc::EPIPE => Error::SocketBroken(err.into()),
183 // Write permission is denied on the destination socket file, or search permission is
184 // denied for one of the directories the path prefix.
185 libc::EACCES => Error::SocketConnect(IOError::from_raw_os_error(libc::EACCES)),
186 // Catch all other errors
187 e => Error::SocketError(IOError::from_raw_os_error(e)),
188 }
189 }
190 }
191
/// Result of vhost-user operations.
///
/// Shorthand for `std::result::Result` with this module's [`Error`] as the error type.
pub type Result<T> = std::result::Result<T, Error>;
194
/// Result of request handler.
///
/// Shorthand for `std::result::Result` with `std::io::Error` (imported here as `IOError`) as
/// the error type.
pub type HandlerResult<T> = std::result::Result<T, IOError>;
197
/// Extracts the only file from an optional vector of files.
///
/// Returns `Some(file)` when `files` is `Some` and holds exactly one element. Returns `None`
/// when `files` is `None`, is empty, or holds more than one file.
#[cfg(any(feature = "vmm", feature = "device"))]
pub(crate) fn take_single_file(files: Option<Vec<File>>) -> Option<File> {
    let mut remaining = files?.into_iter();
    match (remaining.next(), remaining.next()) {
        // Exactly one element: a first item exists and there is no second one.
        (Some(only), None) => Some(only),
        _ => None,
    }
}
208
209 #[cfg(all(test, feature = "device"))]
210 mod dummy_slave;
211
#[cfg(all(test, feature = "vmm", feature = "device"))]
mod tests {
    use base::AsRawDescriptor;
    use std::sync::{Arc, Barrier, Mutex};
    use std::thread;

    use super::connection::tests::*;
    use super::dummy_slave::{DummySlaveReqHandler, VIRTIO_FEATURES};
    use super::message::*;
    use super::*;
    use crate::backend::VhostBackend;
    use crate::{VhostUserMemoryRegionInfo, VringConfigData};
    use tempfile::tempfile;

    /// Calling `set_owner()` directly on the dummy slave succeeds once and fails the second time.
    #[test]
    fn create_dummy_slave() {
        let slave = Arc::new(Mutex::new(DummySlaveReqHandler::new()));

        slave.set_owner().unwrap();
        assert!(slave.set_owner().is_err());
    }

    /// A `set_owner()` request from the master marks the slave backend as owned; a second
    /// request is rejected by the slave and leaves it owned.
    #[test]
    fn test_set_owner() {
        let slave_be = Arc::new(Mutex::new(DummySlaveReqHandler::new()));
        let (master, mut slave) = create_master_slave_pair(slave_be.clone());

        assert!(!slave_be.lock().unwrap().owned);
        master.set_owner().unwrap();
        slave.handle_request().unwrap();
        assert!(slave_be.lock().unwrap().owned);
        master.set_owner().unwrap();
        assert!(slave.handle_request().is_err());
        assert!(slave_be.lock().unwrap().owned);
    }

    /// Negotiates virtio features and protocol features between a master and a slave that
    /// handles the requests on its own thread.
    #[test]
    fn test_set_features() {
        let mbar = Arc::new(Barrier::new(2));
        let sbar = mbar.clone();
        let slave_be = Arc::new(Mutex::new(DummySlaveReqHandler::new()));
        let (mut master, mut slave) = create_master_slave_pair(slave_be.clone());

        thread::spawn(move || {
            // set_owner()
            slave.handle_request().unwrap();
            assert!(slave_be.lock().unwrap().owned);

            // get/set_features()
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            assert_eq!(
                slave_be.lock().unwrap().acked_features,
                VIRTIO_FEATURES & !0x1
            );

            // get/set_protocol_features()
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            assert_eq!(
                slave_be.lock().unwrap().acked_protocol_features,
                VhostUserProtocolFeatures::all().bits()
            );

            sbar.wait();
        });

        master.set_owner().unwrap();

        // set virtio features
        let features = master.get_features().unwrap();
        assert_eq!(features, VIRTIO_FEATURES);
        master.set_features(VIRTIO_FEATURES & !0x1).unwrap();

        // set vhost protocol features
        let features = master.get_protocol_features().unwrap();
        assert_eq!(features.bits(), VhostUserProtocolFeatures::all().bits());
        master.set_protocol_features(features).unwrap();

        mbar.wait();
    }

    /// Drives a full sequence of master requests against a slave running on its own thread.
    ///
    /// The slave thread calls `handle_request()` once per request the master sends, in the
    /// same order, so the two sequences below must be kept in sync.
    #[test]
    fn test_master_slave_process() {
        let mbar = Arc::new(Barrier::new(2));
        let sbar = mbar.clone();
        let slave_be = Arc::new(Mutex::new(DummySlaveReqHandler::new()));
        let (mut master, mut slave) = create_master_slave_pair(slave_be.clone());

        thread::spawn(move || {
            // set_owner()
            slave.handle_request().unwrap();
            assert!(slave_be.lock().unwrap().owned);

            // get/set_features()
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            assert_eq!(
                slave_be.lock().unwrap().acked_features,
                VIRTIO_FEATURES & !0x1
            );

            // get/set_protocol_features()
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            assert_eq!(
                slave_be.lock().unwrap().acked_protocol_features,
                VhostUserProtocolFeatures::all().bits()
            );

            // get_inflight_fd()
            slave.handle_request().unwrap();
            // set_inflight_fd()
            slave.handle_request().unwrap();

            // get_queue_num()
            slave.handle_request().unwrap();

            // set_mem_table()
            slave.handle_request().unwrap();

            // get/set_config()
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();

            // set_slave_request_fd isn't implemented on Windows.
            #[cfg(unix)]
            {
                // set_slave_request_fd
                slave.handle_request().unwrap();
            }

            // set_vring_enable
            slave.handle_request().unwrap();

            // set_log_base, set_log_fd(): handling these two requests is expected to fail.
            slave.handle_request().unwrap_err();
            slave.handle_request().unwrap_err();

            // set_vring_xxx
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();
            slave.handle_request().unwrap();

            // get_max_mem_slots()
            slave.handle_request().unwrap();

            // add_mem_region()
            slave.handle_request().unwrap();

            // remove_mem_region()
            slave.handle_request().unwrap();

            sbar.wait();
        });

        master.set_owner().unwrap();

        // set virtio features
        let features = master.get_features().unwrap();
        assert_eq!(features, VIRTIO_FEATURES);
        master.set_features(VIRTIO_FEATURES & !0x1).unwrap();

        // set vhost protocol features
        let features = master.get_protocol_features().unwrap();
        assert_eq!(features.bits(), VhostUserProtocolFeatures::all().bits());
        master.set_protocol_features(features).unwrap();

        // Retrieve inflight I/O tracking information
        let (inflight_info, inflight_file) = master
            .get_inflight_fd(&VhostUserInflight {
                num_queues: 2,
                queue_size: 256,
                ..Default::default()
            })
            .unwrap();
        // Set the buffer back to the backend
        master
            .set_inflight_fd(&inflight_info, inflight_file.as_raw_descriptor())
            .unwrap();

        let num = master.get_queue_num().unwrap();
        assert_eq!(num, 2);

        let event = base::Event::new().unwrap();
        let mem = [VhostUserMemoryRegionInfo {
            guest_phys_addr: 0,
            memory_size: 0x10_0000,
            userspace_addr: 0,
            mmap_offset: 0,
            mmap_handle: event.as_raw_descriptor(),
        }];
        master.set_mem_table(&mem).unwrap();

        master
            .set_config(0x100, VhostUserConfigFlags::WRITABLE, &[0xa5u8])
            .unwrap();
        let buf = [0x0u8; 4];
        let (reply_body, reply_payload) = master
            .get_config(0x100, 4, VhostUserConfigFlags::empty(), &buf)
            .unwrap();
        let offset = reply_body.offset;
        assert_eq!(offset, 0x100);
        assert_eq!(reply_payload[0], 0xa5);

        // slave request fds are not implemented on Windows.
        #[cfg(unix)]
        {
            master
                .set_slave_request_fd(&event as &dyn AsRawDescriptor)
                .unwrap();
        }
        master.set_vring_enable(0, true).unwrap();

        // Not implemented yet: sending these two requests succeeds, while the slave thread
        // above expects handling them to fail.
        master
            .set_log_base(0, Some(event.as_raw_descriptor()))
            .unwrap();
        master.set_log_fd(event.as_raw_descriptor()).unwrap();

        master.set_vring_num(0, 256).unwrap();
        master.set_vring_base(0, 0).unwrap();
        let config = VringConfigData {
            queue_max_size: 256,
            queue_size: 128,
            flags: VhostUserVringAddrFlags::VHOST_VRING_F_LOG.bits(),
            desc_table_addr: 0x1000,
            used_ring_addr: 0x2000,
            avail_ring_addr: 0x3000,
            log_addr: Some(0x4000),
        };
        master.set_vring_addr(0, &config).unwrap();
        master.set_vring_call(0, &event).unwrap();
        master.set_vring_kick(0, &event).unwrap();
        master.set_vring_err(0, &event).unwrap();

        let max_mem_slots = master.get_max_mem_slots().unwrap();
        assert_eq!(max_mem_slots, 32);

        let region_file = tempfile().unwrap();
        let region = VhostUserMemoryRegionInfo {
            guest_phys_addr: 0x10_0000,
            memory_size: 0x10_0000,
            userspace_addr: 0,
            mmap_offset: 0,
            mmap_handle: region_file.as_raw_descriptor(),
        };
        master.add_mem_region(&region).unwrap();

        master.remove_mem_region(&region).unwrap();

        mbar.wait();
    }

    /// The `Display` output of an error is the message given in its `#[error]` attribute.
    #[test]
    fn test_error_display() {
        assert_eq!(Error::InvalidParam.to_string(), "invalid parameters");
        assert_eq!(Error::InvalidOperation.to_string(), "invalid operation");
    }

    /// An `EAGAIN` errno converts to `Error::SocketRetry` and keeps its raw OS error code.
    #[test]
    fn test_error_from_base_error() {
        let e: Error = base::Error::new(libc::EAGAIN).into();
        match e {
            Error::SocketRetry(e1) => assert_eq!(e1.raw_os_error().unwrap(), libc::EAGAIN),
            _ => panic!("invalid error code conversion!"),
        }
    }
}
481