1 // Copyright 2019 The Chromium OS Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 use std::sync::Arc; 6 use std::u32; 7 8 use base::{ 9 error, pagesize, AsRawDescriptor, Event, MappedRegion, MemoryMapping, MemoryMappingBuilder, 10 RawDescriptor, Tube, 11 }; 12 use hypervisor::Datamatch; 13 14 use resources::{Alloc, MmioType, SystemAllocator}; 15 16 use vfio_sys::*; 17 use vm_control::{VmIrqRequest, VmIrqResponse, VmMemoryRequest, VmMemoryResponse}; 18 19 use crate::pci::msix::{ 20 MsixConfig, BITS_PER_PBA_ENTRY, MSIX_PBA_ENTRIES_MODULO, MSIX_TABLE_ENTRIES_MODULO, 21 }; 22 23 use crate::pci::pci_device::{Error as PciDeviceError, PciDevice}; 24 use crate::pci::{PciAddress, PciClassCode, PciInterruptPin}; 25 26 use crate::vfio::{VfioDevice, VfioIrqType}; 27 28 const PCI_VENDOR_ID: u32 = 0x0; 29 const INTEL_VENDOR_ID: u16 = 0x8086; 30 const PCI_COMMAND: u32 = 0x4; 31 const PCI_COMMAND_MEMORY: u8 = 0x2; 32 const PCI_BASE_CLASS_CODE: u32 = 0x0B; 33 34 const PCI_INTERRUPT_PIN: u32 = 0x3D; 35 36 struct VfioPciConfig { 37 device: Arc<VfioDevice>, 38 } 39 40 impl VfioPciConfig { new(device: Arc<VfioDevice>) -> Self41 fn new(device: Arc<VfioDevice>) -> Self { 42 VfioPciConfig { device } 43 } 44 45 #[allow(dead_code)] read_config_byte(&self, offset: u32) -> u846 fn read_config_byte(&self, offset: u32) -> u8 { 47 let mut data: [u8; 1] = [0]; 48 self.device 49 .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); 50 51 data[0] 52 } 53 54 #[allow(dead_code)] read_config_word(&self, offset: u32) -> u1655 fn read_config_word(&self, offset: u32) -> u16 { 56 let mut data: [u8; 2] = [0, 0]; 57 self.device 58 .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); 59 60 u16::from_le_bytes(data) 61 } 62 63 #[allow(dead_code)] read_config_dword(&self, offset: u32) -> u3264 fn read_config_dword(&self, offset: u32) -> u32 { 65 let mut data: [u8; 4] = [0, 0, 0, 0]; 66 self.device 67 .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); 68 69 u32::from_le_bytes(data) 70 } 71 72 #[allow(dead_code)] write_config_byte(&self, buf: u8, offset: u32)73 fn write_config_byte(&self, buf: u8, offset: u32) { 74 self.device.region_write( 75 VFIO_PCI_CONFIG_REGION_INDEX, 76 ::std::slice::from_ref(&buf), 77 offset.into(), 78 ) 79 } 80 81 #[allow(dead_code)] write_config_word(&self, buf: u16, offset: u32)82 fn write_config_word(&self, buf: u16, offset: u32) { 83 let data: [u8; 2] = buf.to_le_bytes(); 84 self.device 85 .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into()) 86 } 87 88 #[allow(dead_code)] write_config_dword(&self, buf: u32, offset: u32)89 fn write_config_dword(&self, buf: u32, offset: u32) { 90 let data: [u8; 4] = buf.to_le_bytes(); 91 self.device 92 .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into()) 93 } 94 } 95 96 const PCI_CAPABILITY_LIST: u32 = 0x34; 97 const PCI_CAP_ID_MSI: u8 = 0x05; 98 const PCI_CAP_ID_MSIX: u8 = 0x11; 99 100 // MSI registers 101 const PCI_MSI_NEXT_POINTER: u32 = 0x1; // Next cap pointer 102 const PCI_MSI_FLAGS: u32 = 0x2; // Message Control 103 const PCI_MSI_FLAGS_ENABLE: u16 = 0x0001; // MSI feature enabled 104 const PCI_MSI_FLAGS_64BIT: u16 = 0x0080; // 64-bit addresses allowed 105 const PCI_MSI_FLAGS_MASKBIT: u16 = 0x0100; // Per-vector masking capable 106 const PCI_MSI_ADDRESS_LO: u32 = 0x4; // MSI address lower 32 bits 107 const PCI_MSI_ADDRESS_HI: u32 = 0x8; // MSI address upper 32 bits (if 64 bit allowed) 108 const PCI_MSI_DATA_32: u32 = 0x8; // 16 bits of data for 32-bit message address 109 const PCI_MSI_DATA_64: u32 = 0xC; // 16 bits of date for 64-bit message address 110 111 // MSI length 112 const MSI_LENGTH_32BIT_WITHOUT_MASK: u32 = 0xA; 113 const MSI_LENGTH_32BIT_WITH_MASK: u32 = 0x14; 114 const MSI_LENGTH_64BIT_WITHOUT_MASK: u32 = 0xE; 115 const MSI_LENGTH_64BIT_WITH_MASK: u32 = 0x18; 116 117 enum VfioMsiChange { 118 Disable, 119 Enable, 120 } 121 122 struct VfioMsiCap { 123 offset: u32, 124 is_64bit: bool, 125 mask_cap: bool, 126 ctl: u16, 127 address: u64, 128 data: u16, 129 vm_socket_irq: Tube, 130 irqfd: Option<Event>, 131 gsi: Option<u32>, 132 } 133 134 impl VfioMsiCap { new(config: &VfioPciConfig, msi_cap_start: u32, vm_socket_irq: Tube) -> Self135 fn new(config: &VfioPciConfig, msi_cap_start: u32, vm_socket_irq: Tube) -> Self { 136 let msi_ctl = config.read_config_word(msi_cap_start + PCI_MSI_FLAGS); 137 138 VfioMsiCap { 139 offset: msi_cap_start, 140 is_64bit: (msi_ctl & PCI_MSI_FLAGS_64BIT) != 0, 141 mask_cap: (msi_ctl & PCI_MSI_FLAGS_MASKBIT) != 0, 142 ctl: 0, 143 address: 0, 144 data: 0, 145 vm_socket_irq, 146 irqfd: None, 147 gsi: None, 148 } 149 } 150 is_msi_reg(&self, index: u64, len: usize) -> bool151 fn is_msi_reg(&self, index: u64, len: usize) -> bool { 152 let msi_len: u32 = if self.is_64bit { 153 if self.mask_cap { 154 MSI_LENGTH_64BIT_WITH_MASK 155 } else { 156 MSI_LENGTH_64BIT_WITHOUT_MASK 157 } 158 } else { 159 if self.mask_cap { 160 MSI_LENGTH_32BIT_WITH_MASK 161 } else { 162 MSI_LENGTH_32BIT_WITHOUT_MASK 163 } 164 }; 165 166 index >= self.offset as u64 167 && index + len as u64 <= (self.offset + msi_len) as u64 168 && len as u32 <= msi_len 169 } 170 write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange>171 fn write_msi_reg(&mut self, index: u64, data: &[u8]) -> Option<VfioMsiChange> { 172 let len = data.len(); 173 let offset = index as u32 - self.offset; 174 let mut ret: Option<VfioMsiChange> = None; 175 let old_address = self.address; 176 let old_data = self.data; 177 178 // write msi ctl 179 if len == 2 && offset == PCI_MSI_FLAGS { 180 let was_enabled = self.is_msi_enabled(); 181 let value: [u8; 2] = [data[0], data[1]]; 182 self.ctl = u16::from_le_bytes(value); 183 let is_enabled = self.is_msi_enabled(); 184 if !was_enabled && is_enabled { 185 self.enable(); 186 ret = Some(VfioMsiChange::Enable); 187 } else if was_enabled && !is_enabled { 188 ret = Some(VfioMsiChange::Disable) 189 } 190 } else if len == 4 && offset == PCI_MSI_ADDRESS_LO && !self.is_64bit { 191 //write 32 bit message address 192 let value: [u8; 8] = [data[0], data[1], data[2], data[3], 0, 0, 0, 0]; 193 self.address = u64::from_le_bytes(value); 194 } else if len == 4 && offset == PCI_MSI_ADDRESS_LO && self.is_64bit { 195 // write 64 bit message address low part 196 let value: [u8; 8] = [data[0], data[1], data[2], data[3], 0, 0, 0, 0]; 197 self.address &= !0xffffffff; 198 self.address |= u64::from_le_bytes(value); 199 } else if len == 4 && offset == PCI_MSI_ADDRESS_HI && self.is_64bit { 200 //write 64 bit message address high part 201 let value: [u8; 8] = [0, 0, 0, 0, data[0], data[1], data[2], data[3]]; 202 self.address &= 0xffffffff; 203 self.address |= u64::from_le_bytes(value); 204 } else if len == 8 && offset == PCI_MSI_ADDRESS_LO && self.is_64bit { 205 // write 64 bit message address 206 let value: [u8; 8] = [ 207 data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], 208 ]; 209 self.address = u64::from_le_bytes(value); 210 } else if len == 2 211 && ((offset == PCI_MSI_DATA_32 && !self.is_64bit) 212 || (offset == PCI_MSI_DATA_64 && self.is_64bit)) 213 { 214 // write message data 215 let value: [u8; 2] = [data[0], data[1]]; 216 self.data = u16::from_le_bytes(value); 217 } 218 219 if self.is_msi_enabled() && (old_address != self.address || old_data != self.data) { 220 self.add_msi_route(); 221 } 222 223 ret 224 } 225 is_msi_enabled(&self) -> bool226 fn is_msi_enabled(&self) -> bool { 227 self.ctl & PCI_MSI_FLAGS_ENABLE == PCI_MSI_FLAGS_ENABLE 228 } 229 add_msi_route(&self)230 fn add_msi_route(&self) { 231 let gsi = match self.gsi { 232 Some(g) => g, 233 None => { 234 error!("Add msi route but gsi is none"); 235 return; 236 } 237 }; 238 if let Err(e) = self.vm_socket_irq.send(&VmIrqRequest::AddMsiRoute { 239 gsi, 240 msi_address: self.address, 241 msi_data: self.data.into(), 242 }) { 243 error!("failed to send AddMsiRoute request at {:?}", e); 244 return; 245 } 246 match self.vm_socket_irq.recv() { 247 Ok(VmIrqResponse::Err(e)) => error!("failed to call AddMsiRoute request {:?}", e), 248 Ok(_) => {} 249 Err(e) => error!("failed to receive AddMsiRoute response {:?}", e), 250 } 251 } 252 allocate_one_msi(&mut self)253 fn allocate_one_msi(&mut self) { 254 let irqfd = match self.irqfd.take() { 255 Some(e) => e, 256 None => match Event::new() { 257 Ok(e) => e, 258 Err(e) => { 259 error!("failed to create event: {:?}", e); 260 return; 261 } 262 }, 263 }; 264 265 let request = VmIrqRequest::AllocateOneMsi { irqfd }; 266 let request_result = self.vm_socket_irq.send(&request); 267 268 // Stash the irqfd in self immediately because we used take above. 269 self.irqfd = match request { 270 VmIrqRequest::AllocateOneMsi { irqfd } => Some(irqfd), 271 _ => unreachable!(), 272 }; 273 274 if let Err(e) = request_result { 275 error!("failed to send AllocateOneMsi request: {:?}", e); 276 return; 277 } 278 279 match self.vm_socket_irq.recv() { 280 Ok(VmIrqResponse::AllocateOneMsi { gsi }) => self.gsi = Some(gsi), 281 _ => error!("failed to receive AllocateOneMsi Response"), 282 } 283 } 284 enable(&mut self)285 fn enable(&mut self) { 286 if self.gsi.is_none() || self.irqfd.is_none() { 287 self.allocate_one_msi(); 288 } 289 290 self.add_msi_route(); 291 } 292 get_msi_irqfd(&self) -> Option<&Event>293 fn get_msi_irqfd(&self) -> Option<&Event> { 294 self.irqfd.as_ref() 295 } 296 } 297 298 // MSI-X registers in MSI-X capability 299 const PCI_MSIX_FLAGS: u32 = 0x02; // Message Control 300 const PCI_MSIX_FLAGS_QSIZE: u16 = 0x07FF; // Table size 301 const PCI_MSIX_TABLE: u32 = 0x04; // Table offset 302 const PCI_MSIX_TABLE_BIR: u32 = 0x07; // BAR index 303 const PCI_MSIX_TABLE_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR 304 const PCI_MSIX_PBA: u32 = 0x08; // Pending bit Array offset 305 const PCI_MSIX_PBA_BIR: u32 = 0x07; // BAR index 306 const PCI_MSIX_PBA_OFFSET: u32 = 0xFFFFFFF8; // Offset into specified BAR 307 308 struct VfioMsixCap { 309 config: MsixConfig, 310 offset: u32, 311 table_size: u16, 312 table_pci_bar: u32, 313 table_offset: u64, 314 pba_pci_bar: u32, 315 pba_offset: u64, 316 } 317 318 impl VfioMsixCap { new(config: &VfioPciConfig, msix_cap_start: u32, vm_socket_irq: Tube) -> Self319 fn new(config: &VfioPciConfig, msix_cap_start: u32, vm_socket_irq: Tube) -> Self { 320 let msix_ctl = config.read_config_word(msix_cap_start + PCI_MSIX_FLAGS); 321 let table_size = (msix_ctl & PCI_MSIX_FLAGS_QSIZE) + 1; 322 let table = config.read_config_dword(msix_cap_start + PCI_MSIX_TABLE); 323 let table_pci_bar = table & PCI_MSIX_TABLE_BIR; 324 let table_offset = (table & PCI_MSIX_TABLE_OFFSET) as u64; 325 let pba = config.read_config_dword(msix_cap_start + PCI_MSIX_PBA); 326 let pba_pci_bar = pba & PCI_MSIX_PBA_BIR; 327 let pba_offset = (pba & PCI_MSIX_PBA_OFFSET) as u64; 328 329 VfioMsixCap { 330 config: MsixConfig::new(table_size, vm_socket_irq), 331 offset: msix_cap_start, 332 table_size, 333 table_pci_bar, 334 table_offset, 335 pba_pci_bar, 336 pba_offset, 337 } 338 } 339 340 // only msix control register is writable and need special handle in pci r/w is_msix_control_reg(&self, offset: u32, size: u32) -> bool341 fn is_msix_control_reg(&self, offset: u32, size: u32) -> bool { 342 let control_start = self.offset + PCI_MSIX_FLAGS; 343 let control_end = control_start + 2; 344 345 offset < control_end && offset + size > control_start 346 } 347 read_msix_control(&self, data: &mut u32)348 fn read_msix_control(&self, data: &mut u32) { 349 *data = self.config.read_msix_capability(*data); 350 } 351 write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange>352 fn write_msix_control(&mut self, data: &[u8]) -> Option<VfioMsiChange> { 353 let old_enabled = self.config.enabled(); 354 355 self.config 356 .write_msix_capability(PCI_MSIX_FLAGS.into(), data); 357 358 let new_enabled = self.config.enabled(); 359 if !old_enabled && new_enabled { 360 Some(VfioMsiChange::Enable) 361 } else if old_enabled && !new_enabled { 362 Some(VfioMsiChange::Disable) 363 } else { 364 None 365 } 366 } 367 is_msix_table(&self, bar_index: u32, offset: u64) -> bool368 fn is_msix_table(&self, bar_index: u32, offset: u64) -> bool { 369 let table_size: u64 = (self.table_size * (MSIX_TABLE_ENTRIES_MODULO as u16)).into(); 370 bar_index == self.table_pci_bar 371 && offset >= self.table_offset 372 && offset < self.table_offset + table_size 373 } 374 read_table(&self, offset: u64, data: &mut [u8])375 fn read_table(&self, offset: u64, data: &mut [u8]) { 376 let offset = offset - self.table_offset; 377 self.config.read_msix_table(offset, data); 378 } 379 write_table(&mut self, offset: u64, data: &[u8])380 fn write_table(&mut self, offset: u64, data: &[u8]) { 381 let offset = offset - self.table_offset; 382 self.config.write_msix_table(offset, data); 383 } 384 is_msix_pba(&self, bar_index: u32, offset: u64) -> bool385 fn is_msix_pba(&self, bar_index: u32, offset: u64) -> bool { 386 let pba_size: u64 = (((self.table_size + BITS_PER_PBA_ENTRY as u16 - 1) 387 / BITS_PER_PBA_ENTRY as u16) 388 * MSIX_PBA_ENTRIES_MODULO as u16) as u64; 389 bar_index == self.pba_pci_bar 390 && offset >= self.pba_offset 391 && offset < self.pba_offset + pba_size 392 } 393 read_pba(&self, offset: u64, data: &mut [u8])394 fn read_pba(&self, offset: u64, data: &mut [u8]) { 395 let offset = offset - self.pba_offset; 396 self.config.read_pba_entries(offset, data); 397 } 398 write_pba(&mut self, offset: u64, data: &[u8])399 fn write_pba(&mut self, offset: u64, data: &[u8]) { 400 let offset = offset - self.pba_offset; 401 self.config.write_pba_entries(offset, data); 402 } 403 is_msix_bar(&self, bar_index: u32) -> bool404 fn is_msix_bar(&self, bar_index: u32) -> bool { 405 bar_index == self.table_pci_bar || bar_index == self.pba_pci_bar 406 } 407 get_msix_irqfds(&self) -> Option<Vec<&Event>>408 fn get_msix_irqfds(&self) -> Option<Vec<&Event>> { 409 let mut irqfds = Vec::new(); 410 411 for i in 0..self.table_size { 412 let irqfd = self.config.get_irqfd(i as usize); 413 if let Some(fd) = irqfd { 414 irqfds.push(fd); 415 } else { 416 return None; 417 } 418 } 419 420 Some(irqfds) 421 } 422 } 423 424 struct MmioInfo { 425 bar_index: u32, 426 start: u64, 427 length: u64, 428 } 429 430 struct IoInfo { 431 bar_index: u32, 432 } 433 434 enum DeviceData { 435 IntelGfxData { opregion_index: u32 }, 436 } 437 438 /// Implements the Vfio Pci device, then a pci device is added into vm 439 pub struct VfioPciDevice { 440 device: Arc<VfioDevice>, 441 config: VfioPciConfig, 442 pci_address: Option<PciAddress>, 443 interrupt_evt: Option<Event>, 444 interrupt_resample_evt: Option<Event>, 445 mmio_regions: Vec<MmioInfo>, 446 io_regions: Vec<IoInfo>, 447 msi_cap: Option<VfioMsiCap>, 448 msix_cap: Option<VfioMsixCap>, 449 irq_type: Option<VfioIrqType>, 450 vm_socket_mem: Tube, 451 device_data: Option<DeviceData>, 452 453 // scratch MemoryMapping to avoid unmap beform vm exit 454 mem: Vec<MemoryMapping>, 455 } 456 457 impl VfioPciDevice { 458 /// Constructs a new Vfio Pci device for the give Vfio device new( device: VfioDevice, vfio_device_socket_msi: Tube, vfio_device_socket_msix: Tube, vfio_device_socket_mem: Tube, ) -> Self459 pub fn new( 460 device: VfioDevice, 461 vfio_device_socket_msi: Tube, 462 vfio_device_socket_msix: Tube, 463 vfio_device_socket_mem: Tube, 464 ) -> Self { 465 let dev = Arc::new(device); 466 let config = VfioPciConfig::new(Arc::clone(&dev)); 467 let mut msi_socket = Some(vfio_device_socket_msi); 468 let mut msix_socket = Some(vfio_device_socket_msix); 469 let mut msi_cap: Option<VfioMsiCap> = None; 470 let mut msix_cap: Option<VfioMsixCap> = None; 471 472 let mut cap_next: u32 = config.read_config_byte(PCI_CAPABILITY_LIST).into(); 473 while cap_next != 0 { 474 let cap_id = config.read_config_byte(cap_next); 475 if cap_id == PCI_CAP_ID_MSI { 476 if let Some(msi_socket) = msi_socket.take() { 477 msi_cap = Some(VfioMsiCap::new(&config, cap_next, msi_socket)); 478 } 479 } else if cap_id == PCI_CAP_ID_MSIX { 480 if let Some(msix_socket) = msix_socket.take() { 481 msix_cap = Some(VfioMsixCap::new(&config, cap_next, msix_socket)); 482 } 483 } 484 let offset = cap_next + PCI_MSI_NEXT_POINTER; 485 cap_next = config.read_config_byte(offset).into(); 486 } 487 488 let vendor_id = config.read_config_word(PCI_VENDOR_ID); 489 let class_code = config.read_config_byte(PCI_BASE_CLASS_CODE); 490 491 let is_intel_gfx = vendor_id == INTEL_VENDOR_ID 492 && class_code == PciClassCode::DisplayController.get_register_value(); 493 let device_data = if is_intel_gfx { 494 Some(DeviceData::IntelGfxData { 495 opregion_index: u32::max_value(), 496 }) 497 } else { 498 None 499 }; 500 501 VfioPciDevice { 502 device: dev, 503 config, 504 pci_address: None, 505 interrupt_evt: None, 506 interrupt_resample_evt: None, 507 mmio_regions: Vec::new(), 508 io_regions: Vec::new(), 509 msi_cap, 510 msix_cap, 511 irq_type: None, 512 vm_socket_mem: vfio_device_socket_mem, 513 device_data, 514 mem: Vec::new(), 515 } 516 } 517 is_intel_gfx(&self) -> bool518 fn is_intel_gfx(&self) -> bool { 519 let mut ret = false; 520 521 if let Some(device_data) = &self.device_data { 522 match *device_data { 523 DeviceData::IntelGfxData { .. } => ret = true, 524 } 525 } 526 527 ret 528 } 529 find_region(&self, addr: u64) -> Option<MmioInfo>530 fn find_region(&self, addr: u64) -> Option<MmioInfo> { 531 for mmio_info in self.mmio_regions.iter() { 532 if addr >= mmio_info.start && addr < mmio_info.start + mmio_info.length { 533 return Some(MmioInfo { 534 bar_index: mmio_info.bar_index, 535 start: mmio_info.start, 536 length: mmio_info.length, 537 }); 538 } 539 } 540 541 None 542 } 543 enable_intx(&mut self)544 fn enable_intx(&mut self) { 545 if self.interrupt_evt.is_none() || self.interrupt_resample_evt.is_none() { 546 return; 547 } 548 549 if let Some(ref interrupt_evt) = self.interrupt_evt { 550 let mut fds = Vec::new(); 551 fds.push(interrupt_evt); 552 if let Err(e) = self.device.irq_enable(fds, VFIO_PCI_INTX_IRQ_INDEX) { 553 error!("Intx enable failed: {}", e); 554 return; 555 } 556 if let Some(ref irq_resample_evt) = self.interrupt_resample_evt { 557 if let Err(e) = self.device.irq_mask(VFIO_PCI_INTX_IRQ_INDEX) { 558 error!("Intx mask failed: {}", e); 559 self.disable_intx(); 560 return; 561 } 562 if let Err(e) = self 563 .device 564 .resample_virq_enable(irq_resample_evt, VFIO_PCI_INTX_IRQ_INDEX) 565 { 566 error!("resample enable failed: {}", e); 567 self.disable_intx(); 568 return; 569 } 570 if let Err(e) = self.device.irq_unmask(VFIO_PCI_INTX_IRQ_INDEX) { 571 error!("Intx unmask failed: {}", e); 572 self.disable_intx(); 573 return; 574 } 575 } 576 } 577 578 self.irq_type = Some(VfioIrqType::Intx); 579 } 580 disable_intx(&mut self)581 fn disable_intx(&mut self) { 582 if let Err(e) = self.device.irq_disable(VFIO_PCI_INTX_IRQ_INDEX) { 583 error!("Intx disable failed: {}", e); 584 } 585 self.irq_type = None; 586 } 587 disable_irqs(&mut self)588 fn disable_irqs(&mut self) { 589 match self.irq_type { 590 Some(VfioIrqType::Msi) => self.disable_msi(), 591 Some(VfioIrqType::Msix) => self.disable_msix(), 592 _ => (), 593 } 594 595 // Above disable_msi() or disable_msix() will enable intx again. 596 // so disable_intx here again. 597 if let Some(VfioIrqType::Intx) = self.irq_type { 598 self.disable_intx(); 599 } 600 } 601 enable_msi(&mut self)602 fn enable_msi(&mut self) { 603 self.disable_irqs(); 604 605 let irqfd = match &self.msi_cap { 606 Some(cap) => { 607 if let Some(fd) = cap.get_msi_irqfd() { 608 fd 609 } else { 610 self.enable_intx(); 611 return; 612 } 613 } 614 None => { 615 self.enable_intx(); 616 return; 617 } 618 }; 619 620 let mut fds = Vec::new(); 621 fds.push(irqfd); 622 if let Err(e) = self.device.irq_enable(fds, VFIO_PCI_MSI_IRQ_INDEX) { 623 error!("failed to enable msi: {}", e); 624 self.enable_intx(); 625 return; 626 } 627 628 self.irq_type = Some(VfioIrqType::Msi); 629 } 630 disable_msi(&mut self)631 fn disable_msi(&mut self) { 632 if let Err(e) = self.device.irq_disable(VFIO_PCI_MSI_IRQ_INDEX) { 633 error!("failed to disable msi: {}", e); 634 return; 635 } 636 637 self.enable_intx(); 638 } 639 enable_msix(&mut self)640 fn enable_msix(&mut self) { 641 self.disable_irqs(); 642 643 let irqfds = match &self.msix_cap { 644 Some(cap) => cap.get_msix_irqfds(), 645 None => return, 646 }; 647 648 if let Some(descriptors) = irqfds { 649 if let Err(e) = self.device.irq_enable(descriptors, VFIO_PCI_MSIX_IRQ_INDEX) { 650 error!("failed to enable msix: {}", e); 651 self.enable_intx(); 652 return; 653 } 654 } else { 655 self.enable_intx(); 656 return; 657 } 658 659 self.irq_type = Some(VfioIrqType::Msix); 660 } 661 disable_msix(&mut self)662 fn disable_msix(&mut self) { 663 if let Err(e) = self.device.irq_disable(VFIO_PCI_MSIX_IRQ_INDEX) { 664 error!("failed to disable msix: {}", e); 665 return; 666 } 667 668 self.enable_intx(); 669 } 670 add_bar_mmap(&self, index: u32, bar_addr: u64) -> Vec<MemoryMapping>671 fn add_bar_mmap(&self, index: u32, bar_addr: u64) -> Vec<MemoryMapping> { 672 let mut mem_map: Vec<MemoryMapping> = Vec::new(); 673 if self.device.get_region_flags(index) & VFIO_REGION_INFO_FLAG_MMAP != 0 { 674 // the bar storing msix table and pba couldn't mmap. 675 // these bars should be trapped, so that msix could be emulated. 676 if let Some(msix_cap) = &self.msix_cap { 677 if msix_cap.is_msix_bar(index) { 678 return mem_map; 679 } 680 } 681 682 let mmaps = self.device.get_region_mmap(index); 683 if mmaps.is_empty() { 684 return mem_map; 685 } 686 687 for mmap in mmaps.iter() { 688 let mmap_offset = mmap.offset; 689 let mmap_size = mmap.size; 690 let guest_map_start = bar_addr + mmap_offset; 691 let region_offset = self.device.get_region_offset(index); 692 let offset = region_offset + mmap_offset; 693 let descriptor = match self.device.device_file().try_clone() { 694 Ok(device_file) => device_file.into(), 695 Err(_) => break, 696 }; 697 if self 698 .vm_socket_mem 699 .send(&VmMemoryRequest::RegisterMmapMemory { 700 descriptor, 701 size: mmap_size as usize, 702 offset, 703 gpa: guest_map_start, 704 }) 705 .is_err() 706 { 707 break; 708 } 709 710 let response: VmMemoryResponse = match self.vm_socket_mem.recv() { 711 Ok(res) => res, 712 Err(_) => break, 713 }; 714 match response { 715 VmMemoryResponse::Ok => { 716 // Even if vm has mapped this region, but it is in vm main process, 717 // device process doesn't has this mapping, but vfio_dma_map() need it 718 // in device process, so here map it again. 719 let mmap = match MemoryMappingBuilder::new(mmap_size as usize) 720 .from_file(self.device.device_file()) 721 .offset(offset) 722 .build() 723 { 724 Ok(v) => v, 725 Err(_e) => break, 726 }; 727 let host = (&mmap).as_ptr() as u64; 728 let pgsz = pagesize() as u64; 729 let size = (mmap_size + pgsz - 1) / pgsz * pgsz; 730 // Safe because the given guest_map_start is valid guest bar address. and 731 // the host pointer is correct and valid guaranteed by MemoryMapping interface. 732 // The size will be extened to page size aligned if it is not which is also 733 // safe because VFIO actually maps the BAR with page size aligned size. 734 match unsafe { self.device.vfio_dma_map(guest_map_start, size, host) } { 735 Ok(_) => mem_map.push(mmap), 736 Err(e) => { 737 error!( 738 "{}, index: {}, bar_addr:0x{:x}, host:0x{:x}", 739 e, index, bar_addr, host 740 ); 741 break; 742 } 743 } 744 } 745 _ => break, 746 } 747 } 748 } 749 750 mem_map 751 } 752 enable_bars_mmap(&mut self)753 fn enable_bars_mmap(&mut self) { 754 for mmio_info in self.mmio_regions.iter() { 755 let mut mem_map = self.add_bar_mmap(mmio_info.bar_index, mmio_info.start); 756 self.mem.append(&mut mem_map); 757 } 758 } 759 } 760 761 impl PciDevice for VfioPciDevice { debug_label(&self) -> String762 fn debug_label(&self) -> String { 763 format!("vfio {} device", self.device.device_name()) 764 } 765 allocate_address( &mut self, resources: &mut SystemAllocator, ) -> Result<PciAddress, PciDeviceError>766 fn allocate_address( 767 &mut self, 768 resources: &mut SystemAllocator, 769 ) -> Result<PciAddress, PciDeviceError> { 770 if self.pci_address.is_none() { 771 let address = PciAddress::from_string(self.device.device_name()); 772 if resources.reserve_pci( 773 Alloc::PciBar { 774 bus: address.bus, 775 dev: address.dev, 776 func: address.func, 777 bar: 0, 778 }, 779 self.debug_label(), 780 ) { 781 self.pci_address = Some(address); 782 } 783 } 784 self.pci_address.ok_or(PciDeviceError::PciAllocationFailed) 785 } 786 keep_rds(&self) -> Vec<RawDescriptor>787 fn keep_rds(&self) -> Vec<RawDescriptor> { 788 let mut rds = self.device.keep_rds(); 789 if let Some(ref interrupt_evt) = self.interrupt_evt { 790 rds.push(interrupt_evt.as_raw_descriptor()); 791 } 792 if let Some(ref interrupt_resample_evt) = self.interrupt_resample_evt { 793 rds.push(interrupt_resample_evt.as_raw_descriptor()); 794 } 795 rds.push(self.vm_socket_mem.as_raw_descriptor()); 796 if let Some(msi_cap) = &self.msi_cap { 797 rds.push(msi_cap.vm_socket_irq.as_raw_descriptor()); 798 } 799 if let Some(msix_cap) = &self.msix_cap { 800 rds.push(msix_cap.config.as_raw_descriptor()); 801 } 802 rds 803 } 804 assign_irq( &mut self, irq_evt: Event, irq_resample_evt: Event, irq_num: u32, _irq_pin: PciInterruptPin, )805 fn assign_irq( 806 &mut self, 807 irq_evt: Event, 808 irq_resample_evt: Event, 809 irq_num: u32, 810 _irq_pin: PciInterruptPin, 811 ) { 812 self.config.write_config_byte(irq_num as u8, 0x3C); 813 self.interrupt_evt = Some(irq_evt); 814 self.interrupt_resample_evt = Some(irq_resample_evt); 815 816 // enable INTX 817 if self.config.read_config_byte(PCI_INTERRUPT_PIN) > 0 { 818 self.enable_intx(); 819 } 820 } 821 allocate_io_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<(u64, u64)>, PciDeviceError>822 fn allocate_io_bars( 823 &mut self, 824 resources: &mut SystemAllocator, 825 ) -> Result<Vec<(u64, u64)>, PciDeviceError> { 826 let mut ranges = Vec::new(); 827 let mut i = VFIO_PCI_BAR0_REGION_INDEX; 828 let address = self 829 .pci_address 830 .expect("allocate_address must be called prior to allocate_io_bars"); 831 832 while i <= VFIO_PCI_ROM_REGION_INDEX { 833 let mut low: u32 = 0xffffffff; 834 let offset: u32; 835 if i == VFIO_PCI_ROM_REGION_INDEX { 836 offset = 0x30; 837 } else { 838 offset = 0x10 + i * 4; 839 } 840 self.config.write_config_dword(low, offset); 841 low = self.config.read_config_dword(offset); 842 843 let low_flag = low & 0xf; 844 let is_64bit = low_flag & 0x4 == 0x4; 845 if (low_flag & 0x1 == 0 || i == VFIO_PCI_ROM_REGION_INDEX) && low != 0 { 846 let mut upper: u32 = 0xffffffff; 847 if is_64bit { 848 self.config.write_config_dword(upper, offset + 4); 849 upper = self.config.read_config_dword(offset + 4); 850 } 851 852 low &= 0xffff_fff0; 853 let mut size: u64 = u64::from(upper); 854 size <<= 32; 855 size |= u64::from(low); 856 size = !size + 1; 857 let mmio_type = match is_64bit { 858 false => MmioType::Low, 859 true => MmioType::High, 860 }; 861 let bar_addr = resources 862 .mmio_allocator(mmio_type) 863 .allocate_with_align( 864 size, 865 Alloc::PciBar { 866 bus: address.bus, 867 dev: address.dev, 868 func: address.func, 869 bar: i as u8, 870 }, 871 "vfio_bar".to_string(), 872 size, 873 ) 874 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?; 875 ranges.push((bar_addr, size)); 876 self.mmio_regions.push(MmioInfo { 877 bar_index: i, 878 start: bar_addr, 879 length: size, 880 }); 881 882 low = bar_addr as u32; 883 low |= low_flag; 884 self.config.write_config_dword(low, offset); 885 if is_64bit { 886 upper = (bar_addr >> 32) as u32; 887 self.config.write_config_dword(upper, offset + 4); 888 } 889 } else if low_flag & 0x1 == 0x1 { 890 self.io_regions.push(IoInfo { bar_index: i }); 891 } 892 893 if is_64bit { 894 i += 2; 895 } else { 896 i += 1; 897 } 898 } 899 900 // Quirk, enable igd memory for guest vga arbitrate, otherwise kernel vga arbitrate 901 // driver doesn't claim this vga device, then xorg couldn't boot up. 902 if self.is_intel_gfx() { 903 let mut cmd = self.config.read_config_byte(PCI_COMMAND); 904 cmd |= PCI_COMMAND_MEMORY; 905 self.config.write_config_byte(cmd, PCI_COMMAND); 906 } 907 908 Ok(ranges) 909 } 910 allocate_device_bars( &mut self, resources: &mut SystemAllocator, ) -> Result<Vec<(u64, u64)>, PciDeviceError>911 fn allocate_device_bars( 912 &mut self, 913 resources: &mut SystemAllocator, 914 ) -> Result<Vec<(u64, u64)>, PciDeviceError> { 915 let mut ranges = Vec::new(); 916 917 if !self.is_intel_gfx() { 918 return Ok(ranges); 919 } 920 921 // Make intel gfx's opregion as mmio bar, and allocate a gpa for it 922 // then write this gpa into pci cfg register 923 if let Some((index, size)) = self.device.get_cap_type_info( 924 VFIO_REGION_TYPE_PCI_VENDOR_TYPE | (INTEL_VENDOR_ID as u32), 925 VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, 926 ) { 927 let address = self 928 .pci_address 929 .expect("allocate_address must be called prior to allocate_device_bars"); 930 let bar_addr = resources 931 .mmio_allocator(MmioType::Low) 932 .allocate( 933 size, 934 Alloc::PciBar { 935 bus: address.bus, 936 dev: address.dev, 937 func: address.func, 938 bar: (index * 4) as u8, 939 }, 940 "vfio_bar".to_string(), 941 ) 942 .map_err(|e| PciDeviceError::IoAllocationFailed(size, e))?; 943 ranges.push((bar_addr, size)); 944 self.device_data = Some(DeviceData::IntelGfxData { 945 opregion_index: index, 946 }); 947 948 self.mmio_regions.push(MmioInfo { 949 bar_index: index, 950 start: bar_addr, 951 length: size, 952 }); 953 self.config.write_config_dword(bar_addr as u32, 0xFC); 954 } 955 956 Ok(ranges) 957 } 958 register_device_capabilities(&mut self) -> Result<(), PciDeviceError>959 fn register_device_capabilities(&mut self) -> Result<(), PciDeviceError> { 960 Ok(()) 961 } 962 ioevents(&self) -> Vec<(&Event, u64, Datamatch)>963 fn ioevents(&self) -> Vec<(&Event, u64, Datamatch)> { 964 Vec::new() 965 } 966 read_config_register(&self, reg_idx: usize) -> u32967 fn read_config_register(&self, reg_idx: usize) -> u32 { 968 let reg: u32 = (reg_idx * 4) as u32; 969 970 let mut config = self.config.read_config_dword(reg); 971 972 // Ignore IO bar 973 if (0x10..=0x24).contains(®) { 974 for io_info in self.io_regions.iter() { 975 if io_info.bar_index * 4 + 0x10 == reg { 976 config = 0; 977 } 978 } 979 } else if let Some(msix_cap) = &self.msix_cap { 980 if msix_cap.is_msix_control_reg(reg, 4) { 981 msix_cap.read_msix_control(&mut config); 982 } 983 } 984 985 // Quirk for intel graphic, set stolen memory size to 0 in pci_cfg[0x51] 986 if self.is_intel_gfx() && reg == 0x50 { 987 config &= 0xffff00ff; 988 } 989 990 config 991 } 992 write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8])993 fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { 994 let start = (reg_idx * 4) as u64 + offset; 995 996 let mut msi_change: Option<VfioMsiChange> = None; 997 if let Some(msi_cap) = self.msi_cap.as_mut() { 998 if msi_cap.is_msi_reg(start, data.len()) { 999 msi_change = msi_cap.write_msi_reg(start, data); 1000 } 1001 } 1002 1003 match msi_change { 1004 Some(VfioMsiChange::Enable) => self.enable_msi(), 1005 Some(VfioMsiChange::Disable) => self.disable_msi(), 1006 None => (), 1007 } 1008 1009 msi_change = None; 1010 if let Some(msix_cap) = self.msix_cap.as_mut() { 1011 if msix_cap.is_msix_control_reg(start as u32, data.len() as u32) { 1012 msi_change = msix_cap.write_msix_control(data); 1013 } 1014 } 1015 match msi_change { 1016 Some(VfioMsiChange::Enable) => self.enable_msix(), 1017 Some(VfioMsiChange::Disable) => self.disable_msix(), 1018 None => (), 1019 } 1020 1021 // if guest enable memory access, then enable bar mappable once 1022 if start == PCI_COMMAND as u64 1023 && data.len() == 2 1024 && data[0] & PCI_COMMAND_MEMORY == PCI_COMMAND_MEMORY 1025 && self.mem.is_empty() 1026 { 1027 self.enable_bars_mmap(); 1028 } 1029 1030 self.device 1031 .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, start); 1032 } 1033 read_bar(&mut self, addr: u64, data: &mut [u8])1034 fn read_bar(&mut self, addr: u64, data: &mut [u8]) { 1035 if let Some(mmio_info) = self.find_region(addr) { 1036 let offset = addr - mmio_info.start; 1037 let bar_index = mmio_info.bar_index; 1038 if let Some(msix_cap) = &self.msix_cap { 1039 if msix_cap.is_msix_table(bar_index, offset) { 1040 msix_cap.read_table(offset, data); 1041 return; 1042 } else if msix_cap.is_msix_pba(bar_index, offset) { 1043 msix_cap.read_pba(offset, data); 1044 return; 1045 } 1046 } 1047 self.device.region_read(bar_index, data, offset); 1048 } 1049 } 1050 write_bar(&mut self, addr: u64, data: &[u8])1051 fn write_bar(&mut self, addr: u64, data: &[u8]) { 1052 if let Some(mmio_info) = self.find_region(addr) { 1053 // Ignore igd opregion's write 1054 if let Some(device_data) = &self.device_data { 1055 match *device_data { 1056 DeviceData::IntelGfxData { opregion_index } => { 1057 if opregion_index == mmio_info.bar_index { 1058 return; 1059 } 1060 } 1061 } 1062 } 1063 1064 let offset = addr - mmio_info.start; 1065 let bar_index = mmio_info.bar_index; 1066 1067 if let Some(msix_cap) = self.msix_cap.as_mut() { 1068 if msix_cap.is_msix_table(bar_index, offset) { 1069 msix_cap.write_table(offset, data); 1070 return; 1071 } else if msix_cap.is_msix_pba(bar_index, offset) { 1072 msix_cap.write_pba(offset, data); 1073 return; 1074 } 1075 } 1076 1077 self.device.region_write(bar_index, data, offset); 1078 } 1079 } 1080 } 1081