1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::cmp;
6 use std::fmt::{self, Display};
7 use std::mem;
8 use std::net::Ipv4Addr;
9 use std::os::unix::io::{AsRawFd, RawFd};
10 use std::sync::atomic::{AtomicUsize, Ordering};
11 use std::sync::Arc;
12 use std::thread;
13
14 use libc::EAGAIN;
15 use net_sys;
16 use net_util::{Error as TapError, MacAddress, TapT};
17 use sys_util::Error as SysError;
18 use sys_util::{error, warn, EventFd, GuestMemory, PollContext, PollToken};
19 use virtio_sys::virtio_net::virtio_net_hdr_v1;
20 use virtio_sys::{vhost, virtio_net};
21
22 use super::{Queue, VirtioDevice, INTERRUPT_STATUS_USED_RING, TYPE_NET};
23
/// The maximum buffer size when segmentation offload is enabled. This
/// includes the 12-byte virtio net header.
/// http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html#x1-1740003
const MAX_BUFFER_SIZE: usize = 65562;
/// Number of descriptors in each virtqueue.
const QUEUE_SIZE: u16 = 256;
/// Sizes for the two queues: first is rx, second is tx (see `activate`).
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE, QUEUE_SIZE];
30
/// Errors that can occur while creating or running the virtio net device.
#[derive(Debug)]
pub enum NetError {
    /// Creating kill eventfd failed.
    CreateKillEventFd(SysError),
    /// Creating PollContext failed.
    CreatePollContext(SysError),
    /// Cloning kill eventfd failed.
    CloneKillEventFd(SysError),
    /// Open tap device failed.
    TapOpen(TapError),
    /// Setting tap IP failed.
    TapSetIp(TapError),
    /// Setting tap netmask failed.
    TapSetNetmask(TapError),
    /// Setting tap mac address failed.
    TapSetMacAddress(TapError),
    /// Setting tap interface offload flags failed.
    TapSetOffload(TapError),
    /// Setting vnet header size failed.
    TapSetVnetHdrSize(TapError),
    /// Enabling tap interface failed.
    TapEnable(TapError),
    /// Validating tap interface failed.
    TapValidate(String),
    /// Error while polling for events.
    PollError(SysError),
}
58
59 impl Display for NetError {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result60 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
61 use self::NetError::*;
62
63 match self {
64 CreateKillEventFd(e) => write!(f, "failed to create kill eventfd: {}", e),
65 CreatePollContext(e) => write!(f, "failed to create poll context: {}", e),
66 CloneKillEventFd(e) => write!(f, "failed to clone kill eventfd: {}", e),
67 TapOpen(e) => write!(f, "failed to open tap device: {}", e),
68 TapSetIp(e) => write!(f, "failed to set tap IP: {}", e),
69 TapSetNetmask(e) => write!(f, "failed to set tap netmask: {}", e),
70 TapSetMacAddress(e) => write!(f, "failed to set tap mac address: {}", e),
71 TapSetOffload(e) => write!(f, "failed to set tap interface offload flags: {}", e),
72 TapSetVnetHdrSize(e) => write!(f, "failed to set vnet header size: {}", e),
73 TapEnable(e) => write!(f, "failed to enable tap interface: {}", e),
74 TapValidate(s) => write!(f, "failed to validate tap interface: {}", s),
75 PollError(e) => write!(f, "error while polling for events: {}", e),
76 }
77 }
78 }
79
/// State for the worker thread that shuttles frames between the tap device
/// and the guest's rx/tx virtqueues.
struct Worker<T: TapT> {
    mem: GuestMemory,
    rx_queue: Queue,
    tx_queue: Queue,
    tap: T,
    // Interrupt status bits shared with the device model; the used-ring bit is
    // set here and re-checked when the resample eventfd fires.
    interrupt_status: Arc<AtomicUsize>,
    interrupt_evt: EventFd,
    interrupt_resample_evt: EventFd,
    // Staging buffer for one frame read from the tap before it is copied into
    // guest memory; sized for the largest offloaded frame plus virtio header.
    rx_buf: [u8; MAX_BUFFER_SIZE],
    // Number of valid bytes currently held in `rx_buf`.
    rx_count: usize,
    // True when a frame sits in `rx_buf` waiting for the driver to make an rx
    // buffer available.
    deferred_rx: bool,
    // TODO(smbarber): http://crbug.com/753630
    // Remove once MRG_RXBUF is supported and this variable is actually used.
    #[allow(dead_code)]
    acked_features: u64,
}
96
97 impl<T> Worker<T>
98 where
99 T: TapT,
100 {
signal_used_queue(&self)101 fn signal_used_queue(&self) {
102 self.interrupt_status
103 .fetch_or(INTERRUPT_STATUS_USED_RING as usize, Ordering::SeqCst);
104 self.interrupt_evt.write(1).unwrap();
105 }
106
107 // Copies a single frame from `self.rx_buf` into the guest. Returns true
108 // if a buffer was used, and false if the frame must be deferred until a buffer
109 // is made available by the driver.
rx_single_frame(&mut self) -> bool110 fn rx_single_frame(&mut self) -> bool {
111 let mut next_desc = self.rx_queue.pop(&self.mem);
112
113 if next_desc.is_none() {
114 return false;
115 }
116
117 // We just checked that the head descriptor exists.
118 let head_index = next_desc.as_ref().unwrap().index;
119 let mut write_count = 0;
120
121 // Copy from frame into buffer, which may span multiple descriptors.
122 loop {
123 match next_desc {
124 Some(desc) => {
125 if !desc.is_write_only() {
126 break;
127 }
128 let limit = cmp::min(write_count + desc.len as usize, self.rx_count);
129 let source_slice = &self.rx_buf[write_count..limit];
130 let write_result = self.mem.write_at_addr(source_slice, desc.addr);
131
132 match write_result {
133 Ok(sz) => {
134 write_count += sz;
135 }
136 Err(e) => {
137 warn!("net: rx: failed to write slice: {}", e);
138 break;
139 }
140 };
141
142 if write_count >= self.rx_count {
143 break;
144 }
145 next_desc = desc.next_descriptor();
146 }
147 None => {
148 warn!(
149 "net: rx: buffer is too small to hold frame of size {}",
150 self.rx_count
151 );
152 break;
153 }
154 }
155 }
156
157 self.rx_queue
158 .add_used(&self.mem, head_index, write_count as u32);
159
160 // Interrupt the guest immediately for received frames to
161 // reduce latency.
162 self.signal_used_queue();
163
164 true
165 }
166
process_rx(&mut self)167 fn process_rx(&mut self) {
168 // Read as many frames as possible.
169 loop {
170 let res = self.tap.read(&mut self.rx_buf);
171 match res {
172 Ok(count) => {
173 self.rx_count = count;
174 if !self.rx_single_frame() {
175 self.deferred_rx = true;
176 break;
177 }
178 }
179 Err(e) => {
180 // The tap device is nonblocking, so any error aside from EAGAIN is
181 // unexpected.
182 if e.raw_os_error().unwrap() != EAGAIN {
183 warn!("net: rx: failed to read tap: {}", e);
184 }
185 break;
186 }
187 }
188 }
189 }
190
process_tx(&mut self)191 fn process_tx(&mut self) {
192 let mut frame = [0u8; MAX_BUFFER_SIZE];
193
194 while let Some(avail_desc) = self.tx_queue.pop(&self.mem) {
195 let head_index = avail_desc.index;
196 let mut next_desc = Some(avail_desc);
197 let mut read_count = 0;
198
199 // Copy buffer from across multiple descriptors.
200 while let Some(desc) = next_desc {
201 if desc.is_write_only() {
202 break;
203 }
204 let limit = cmp::min(read_count + desc.len as usize, frame.len());
205 let read_result = self
206 .mem
207 .read_at_addr(&mut frame[read_count..limit as usize], desc.addr);
208 match read_result {
209 Ok(sz) => {
210 read_count += sz;
211 }
212 Err(e) => {
213 warn!("net: tx: failed to read slice: {}", e);
214 break;
215 }
216 }
217 next_desc = desc.next_descriptor();
218 }
219
220 let write_result = self.tap.write(&frame[..read_count as usize]);
221 match write_result {
222 Ok(_) => {}
223 Err(e) => {
224 warn!("net: tx: error failed to write to tap: {}", e);
225 }
226 };
227
228 self.tx_queue.add_used(&self.mem, head_index, 0);
229 }
230
231 self.signal_used_queue();
232 }
233
run( &mut self, rx_queue_evt: EventFd, tx_queue_evt: EventFd, kill_evt: EventFd, ) -> Result<(), NetError>234 fn run(
235 &mut self,
236 rx_queue_evt: EventFd,
237 tx_queue_evt: EventFd,
238 kill_evt: EventFd,
239 ) -> Result<(), NetError> {
240 #[derive(PollToken)]
241 enum Token {
242 // A frame is available for reading from the tap device to receive in the guest.
243 RxTap,
244 // The guest has made a buffer available to receive a frame into.
245 RxQueue,
246 // The transmit queue has a frame that is ready to send from the guest.
247 TxQueue,
248 // Check if any interrupts need to be re-asserted.
249 InterruptResample,
250 // crosvm has requested the device to shut down.
251 Kill,
252 }
253
254 let poll_ctx: PollContext<Token> = PollContext::new()
255 .and_then(|pc| pc.add(&self.tap, Token::RxTap).and(Ok(pc)))
256 .and_then(|pc| pc.add(&rx_queue_evt, Token::RxQueue).and(Ok(pc)))
257 .and_then(|pc| pc.add(&tx_queue_evt, Token::TxQueue).and(Ok(pc)))
258 .and_then(|pc| {
259 pc.add(&self.interrupt_resample_evt, Token::InterruptResample)
260 .and(Ok(pc))
261 })
262 .and_then(|pc| pc.add(&kill_evt, Token::Kill).and(Ok(pc)))
263 .map_err(NetError::CreatePollContext)?;
264
265 'poll: loop {
266 let events = poll_ctx.wait().map_err(NetError::PollError)?;
267 for event in events.iter_readable() {
268 match event.token() {
269 Token::RxTap => {
270 // Process a deferred frame first if available. Don't read from tap again
271 // until we manage to receive this deferred frame.
272 if self.deferred_rx {
273 if self.rx_single_frame() {
274 self.deferred_rx = false;
275 } else {
276 continue;
277 }
278 }
279 self.process_rx();
280 }
281 Token::RxQueue => {
282 if let Err(e) = rx_queue_evt.read() {
283 error!("net: error reading rx queue EventFd: {}", e);
284 break 'poll;
285 }
286 // There should be a buffer available now to receive the frame into.
287 if self.deferred_rx && self.rx_single_frame() {
288 self.deferred_rx = false;
289 }
290 }
291 Token::TxQueue => {
292 if let Err(e) = tx_queue_evt.read() {
293 error!("net: error reading tx queue EventFd: {}", e);
294 break 'poll;
295 }
296 self.process_tx();
297 }
298 Token::InterruptResample => {
299 let _ = self.interrupt_resample_evt.read();
300 if self.interrupt_status.load(Ordering::SeqCst) != 0 {
301 self.interrupt_evt.write(1).unwrap();
302 }
303 }
304 Token::Kill => break 'poll,
305 }
306 }
307 }
308 Ok(())
309 }
310 }
311
/// Virtio network device backed by a tap interface.
pub struct Net<T: TapT> {
    // Clone of `kill_evt` handed off to the worker thread on activate();
    // becomes None once the worker has claimed it.
    workers_kill_evt: Option<EventFd>,
    // Written on drop to request the worker thread to exit.
    kill_evt: EventFd,
    // Tap device moved into the worker thread on activate(); None afterwards.
    tap: Option<T>,
    // Feature bits offered to the guest.
    avail_features: u64,
    // Feature bits the guest has acknowledged (subset of `avail_features`).
    acked_features: u64,
}
319
320 impl<T> Net<T>
321 where
322 T: TapT,
323 {
324 /// Create a new virtio network device with the given IP address and
325 /// netmask.
new( ip_addr: Ipv4Addr, netmask: Ipv4Addr, mac_addr: MacAddress, ) -> Result<Net<T>, NetError>326 pub fn new(
327 ip_addr: Ipv4Addr,
328 netmask: Ipv4Addr,
329 mac_addr: MacAddress,
330 ) -> Result<Net<T>, NetError> {
331 let tap: T = T::new(true).map_err(NetError::TapOpen)?;
332 tap.set_ip_addr(ip_addr).map_err(NetError::TapSetIp)?;
333 tap.set_netmask(netmask).map_err(NetError::TapSetNetmask)?;
334 tap.set_mac_address(mac_addr)
335 .map_err(NetError::TapSetMacAddress)?;
336
337 tap.enable().map_err(NetError::TapEnable)?;
338
339 Net::from(tap)
340 }
341
342 /// Creates a new virtio network device from a tap device that has already been
343 /// configured.
from(tap: T) -> Result<Net<T>, NetError>344 pub fn from(tap: T) -> Result<Net<T>, NetError> {
345 // This would also validate a tap created by Self::new(), but that's a good thing as it
346 // would ensure that any changes in the creation procedure are matched in the validation.
347 // Plus we still need to set the offload and vnet_hdr_size values.
348 validate_and_configure_tap(&tap)?;
349
350 let avail_features = 1 << virtio_net::VIRTIO_NET_F_GUEST_CSUM
351 | 1 << virtio_net::VIRTIO_NET_F_CSUM
352 | 1 << virtio_net::VIRTIO_NET_F_GUEST_TSO4
353 | 1 << virtio_net::VIRTIO_NET_F_GUEST_UFO
354 | 1 << virtio_net::VIRTIO_NET_F_HOST_TSO4
355 | 1 << virtio_net::VIRTIO_NET_F_HOST_UFO
356 | 1 << vhost::VIRTIO_F_VERSION_1;
357
358 let kill_evt = EventFd::new().map_err(NetError::CreateKillEventFd)?;
359 Ok(Net {
360 workers_kill_evt: Some(kill_evt.try_clone().map_err(NetError::CloneKillEventFd)?),
361 kill_evt,
362 tap: Some(tap),
363 avail_features,
364 acked_features: 0u64,
365 })
366 }
367 }
368
369 // Ensure that the tap interface has the correct flags and sets the offload and VNET header size
370 // to the appropriate values.
validate_and_configure_tap<T: TapT>(tap: &T) -> Result<(), NetError>371 fn validate_and_configure_tap<T: TapT>(tap: &T) -> Result<(), NetError> {
372 let flags = tap.if_flags();
373 let required_flags = [
374 (net_sys::IFF_TAP, "IFF_TAP"),
375 (net_sys::IFF_NO_PI, "IFF_NO_PI"),
376 (net_sys::IFF_VNET_HDR, "IFF_VNET_HDR"),
377 ];
378 let missing_flags = required_flags
379 .iter()
380 .filter_map(
381 |(value, name)| {
382 if value & flags == 0 {
383 Some(name)
384 } else {
385 None
386 }
387 },
388 )
389 .collect::<Vec<_>>();
390
391 if !missing_flags.is_empty() {
392 return Err(NetError::TapValidate(format!(
393 "Missing flags: {:?}",
394 missing_flags
395 )));
396 }
397
398 // Set offload flags to match the virtio features below.
399 tap.set_offload(
400 net_sys::TUN_F_CSUM | net_sys::TUN_F_UFO | net_sys::TUN_F_TSO4 | net_sys::TUN_F_TSO6,
401 )
402 .map_err(NetError::TapSetOffload)?;
403
404 let vnet_hdr_size = mem::size_of::<virtio_net_hdr_v1>() as i32;
405 tap.set_vnet_hdr_size(vnet_hdr_size)
406 .map_err(NetError::TapSetVnetHdrSize)?;
407
408 Ok(())
409 }
410
411 impl<T> Drop for Net<T>
412 where
413 T: TapT,
414 {
drop(&mut self)415 fn drop(&mut self) {
416 // Only kill the child if it claimed its eventfd.
417 if self.workers_kill_evt.is_none() {
418 // Ignore the result because there is nothing we can do about it.
419 let _ = self.kill_evt.write(1);
420 }
421 }
422 }
423
424 impl<T> VirtioDevice for Net<T>
425 where
426 T: 'static + TapT,
427 {
keep_fds(&self) -> Vec<RawFd>428 fn keep_fds(&self) -> Vec<RawFd> {
429 let mut keep_fds = Vec::new();
430
431 if let Some(tap) = &self.tap {
432 keep_fds.push(tap.as_raw_fd());
433 }
434
435 if let Some(workers_kill_evt) = &self.workers_kill_evt {
436 keep_fds.push(workers_kill_evt.as_raw_fd());
437 }
438
439 keep_fds
440 }
441
device_type(&self) -> u32442 fn device_type(&self) -> u32 {
443 TYPE_NET
444 }
445
queue_max_sizes(&self) -> &[u16]446 fn queue_max_sizes(&self) -> &[u16] {
447 QUEUE_SIZES
448 }
449
features(&self) -> u64450 fn features(&self) -> u64 {
451 self.avail_features
452 }
453
ack_features(&mut self, value: u64)454 fn ack_features(&mut self, value: u64) {
455 let mut v = value;
456
457 // Check if the guest is ACK'ing a feature that we didn't claim to have.
458 let unrequested_features = v & !self.avail_features;
459 if unrequested_features != 0 {
460 warn!("net: virtio net got unknown feature ack: {:x}", v);
461
462 // Don't count these features as acked.
463 v &= !unrequested_features;
464 }
465 self.acked_features |= v;
466 }
467
activate( &mut self, mem: GuestMemory, interrupt_evt: EventFd, interrupt_resample_evt: EventFd, status: Arc<AtomicUsize>, mut queues: Vec<Queue>, mut queue_evts: Vec<EventFd>, )468 fn activate(
469 &mut self,
470 mem: GuestMemory,
471 interrupt_evt: EventFd,
472 interrupt_resample_evt: EventFd,
473 status: Arc<AtomicUsize>,
474 mut queues: Vec<Queue>,
475 mut queue_evts: Vec<EventFd>,
476 ) {
477 if queues.len() != 2 || queue_evts.len() != 2 {
478 error!("net: expected 2 queues, got {}", queues.len());
479 return;
480 }
481
482 if let Some(tap) = self.tap.take() {
483 if let Some(kill_evt) = self.workers_kill_evt.take() {
484 let acked_features = self.acked_features;
485 let worker_result =
486 thread::Builder::new()
487 .name("virtio_net".to_string())
488 .spawn(move || {
489 // First queue is rx, second is tx.
490 let rx_queue = queues.remove(0);
491 let tx_queue = queues.remove(0);
492 let mut worker = Worker {
493 mem,
494 rx_queue,
495 tx_queue,
496 tap,
497 interrupt_status: status,
498 interrupt_evt,
499 interrupt_resample_evt,
500 rx_buf: [0u8; MAX_BUFFER_SIZE],
501 rx_count: 0,
502 deferred_rx: false,
503 acked_features,
504 };
505 let rx_queue_evt = queue_evts.remove(0);
506 let tx_queue_evt = queue_evts.remove(0);
507 let result = worker.run(rx_queue_evt, tx_queue_evt, kill_evt);
508 if let Err(e) = result {
509 error!("net worker thread exited with error: {}", e);
510 }
511 });
512
513 if let Err(e) = worker_result {
514 error!("failed to spawn virtio_net worker: {}", e);
515 return;
516 }
517 }
518 }
519 }
520 }
521