1 // Copyright 2021 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::fs::read;
6 #[cfg(feature = "direct")]
7 use std::fs::read_to_string;
8 use std::fs::write;
9 use std::fs::File;
10 use std::fs::OpenOptions;
11 use std::os::unix::fs::FileExt;
12 use std::path::Path;
13 use std::path::PathBuf;
14 use std::sync::Arc;
15 use std::thread;
16
17 use anyhow::anyhow;
18 use anyhow::bail;
19 use anyhow::Context;
20 use anyhow::Result;
21 use base::error;
22 #[cfg(feature = "direct")]
23 use base::warn;
24 use base::Tube;
25 use data_model::DataInit;
26 use sync::Mutex;
27 use vm_control::HotPlugDeviceInfo;
28 use vm_control::HotPlugDeviceType;
29 use vm_control::VmRequest;
30 use vm_control::VmResponse;
31
32 use crate::pci::pci_configuration::PciBridgeSubclass;
33 use crate::pci::pci_configuration::CAPABILITY_LIST_HEAD_OFFSET;
34 #[cfg(feature = "direct")]
35 use crate::pci::pci_configuration::CLASS_REG;
36 #[cfg(feature = "direct")]
37 use crate::pci::pci_configuration::CLASS_REG_REVISION_ID_OFFSET;
38 use crate::pci::pci_configuration::HEADER_TYPE_REG;
39 use crate::pci::pci_configuration::PCI_CAP_NEXT_POINTER;
40 use crate::pci::pcie::pci_bridge::PciBridgeBusRange;
41 use crate::pci::pcie::pci_bridge::BR_BUS_NUMBER_REG;
42 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_MASK;
43 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_SHIFT;
44 use crate::pci::pcie::pci_bridge::BR_MEM_LIMIT_MASK;
45 use crate::pci::pcie::pci_bridge::BR_MEM_MINIMUM;
46 use crate::pci::pcie::pci_bridge::BR_MEM_REG;
47 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_64BIT;
48 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_BASE_HIGH_REG;
49 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LIMIT_HIGH_REG;
50 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LOW_REG;
51 use crate::pci::pcie::pci_bridge::BR_WINDOW_ALIGNMENT;
52 use crate::pci::pcie::PcieDevicePortType;
53 use crate::pci::PciCapabilityID;
54 use crate::pci::PciClassCode;
55
56 // Host Pci device's sysfs config file
57 struct PciHostConfig {
58 config_file: File,
59 }
60
61 impl PciHostConfig {
62 // Create a new host pci device's sysfs config file
new(host_sysfs_path: &Path) -> Result<Self>63 fn new(host_sysfs_path: &Path) -> Result<Self> {
64 let mut config_path = PathBuf::new();
65 config_path.push(host_sysfs_path);
66 config_path.push("config");
67 let f = OpenOptions::new()
68 .write(true)
69 .read(true)
70 .open(config_path.as_path())
71 .with_context(|| format!("failed to open: {}", config_path.display()))?;
72 Ok(PciHostConfig { config_file: f })
73 }
74
75 // Read host pci device's config register
read_config<T: DataInit>(&self, offset: u64) -> T76 fn read_config<T: DataInit>(&self, offset: u64) -> T {
77 let length = std::mem::size_of::<T>();
78 let mut buf = vec![0u8; length];
79 if offset % length as u64 != 0 {
80 error!(
81 "read_config, offset {} isn't aligned to length {}",
82 offset, length
83 );
84 } else if let Err(e) = self.config_file.read_exact_at(&mut buf, offset) {
85 error!("failed to read host sysfs config: {}", e);
86 }
87
88 T::from_slice(&buf)
89 .copied()
90 .expect("failed to convert host sysfs config data from slice")
91 }
92
93 // write host pci device's config register
94 #[allow(dead_code)]
write_config(&self, offset: u64, data: &[u8])95 fn write_config(&self, offset: u64, data: &[u8]) {
96 if offset % data.len() as u64 != 0 {
97 error!(
98 "write_config, offset {} isn't aligned to length {}",
99 offset,
100 data.len()
101 );
102 return;
103 }
104 if let Err(e) = self.config_file.write_all_at(data, offset) {
105 error!("failed to write host sysfs config: {}", e);
106 }
107 }
108 }
109
110 // Find all the added pcie devices
visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()>111 fn visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()> {
112 // Each pci device has a sysfs directory
113 if !dir.is_dir() {
114 bail!("{} isn't directory", dir.display());
115 }
116 // Loop device sysfs subdirectory
117 let entries = dir
118 .read_dir()
119 .with_context(|| format!("failed to read dir {}", dir.display()))?;
120 let mut devices = Vec::new();
121 for entry in entries {
122 let sub_dir = match entry {
123 Ok(sub) => sub,
124 _ => continue,
125 };
126
127 if !sub_dir.path().is_dir() {
128 continue;
129 }
130
131 let name = sub_dir
132 .file_name()
133 .into_string()
134 .map_err(|_| anyhow!("failed to get dir name"))?;
135 // Child pci device has name format 0000:xx:xx.x, length is 12
136 if name.len() != 12 || !name.starts_with("0000:") {
137 continue;
138 }
139 let child_path = dir.join(name);
140 devices.push(child_path);
141 }
142 devices.reverse();
143 let mut iter = devices.iter().peekable();
144 while let Some(device) = iter.next() {
145 let class_path = device.join("class");
146 let class_id = read(class_path.as_path())
147 .with_context(|| format!("failed to read {}", class_path.display()))?;
148 let hp_interrupt = iter.peek().is_none();
149 if !class_id.starts_with("0x0604".as_bytes()) {
150 // If the device isn't pci bridge, this is a pcie endpoint device
151 children.push(HotPlugDeviceInfo {
152 device_type: HotPlugDeviceType::EndPoint,
153 path: device.to_path_buf(),
154 hp_interrupt,
155 });
156 // No need to look further
157 return Ok(());
158 } else {
159 // Find the pci express cap to get the port type of the pcie bridge
160 let host_config = PciHostConfig::new(device)?;
161 let mut cap_pointer: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
162 while cap_pointer != 0x0 {
163 let cap_id: u8 = host_config.read_config(cap_pointer as u64);
164 if cap_id == PciCapabilityID::PciExpress as u8 {
165 break;
166 }
167 cap_pointer = host_config.read_config(cap_pointer as u64 + 0x1);
168 }
169 if cap_pointer == 0x0 {
170 bail!(
171 "Failed to get pcie express capability for {}",
172 device.display()
173 );
174 }
175 let express_cap_reg: u16 = host_config.read_config(cap_pointer as u64 + 0x2);
176 match (express_cap_reg & 0xf0) >> 4 {
177 x if x == PcieDevicePortType::UpstreamPort as u16 => {
178 children.push(HotPlugDeviceInfo {
179 device_type: HotPlugDeviceType::UpstreamPort,
180 path: device.to_path_buf(),
181 hp_interrupt,
182 })
183 }
184 x if x == PcieDevicePortType::DownstreamPort as u16 => {
185 children.push(HotPlugDeviceInfo {
186 device_type: HotPlugDeviceType::DownstreamPort,
187 path: device.to_path_buf(),
188 hp_interrupt,
189 })
190 }
191 _ => (),
192 }
193 }
194 }
195 for device in devices.iter() {
196 visit_children(device.as_path(), children)?;
197 }
198 Ok(())
199 }
200
201 struct HotplugWorker {
202 host_name: String,
203 }
204
205 impl HotplugWorker {
run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()>206 fn run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()> {
207 let mut host_sysfs = PathBuf::new();
208 host_sysfs.push("/sys/bus/pci/devices/");
209 host_sysfs.push(self.host_name.clone());
210 let rescan_path = host_sysfs.join("rescan");
211 // Let pcie root port rescan to find the added or removed children devices
212 write(rescan_path.as_path(), "1")
213 .with_context(|| format!("failed to write {}", rescan_path.display()))?;
214
215 // If child device existed, but code run here again, this means host has a
216 // hotplug out event, after the above rescan, host should find the removed
217 // child device, and host vfio-pci kernel driver should notify crosvm vfio-pci
218 // devie such hotplug out event, so nothing is needed to do here, just return
219 // it now.
220 let mut child_exist = child_exist.lock();
221 if *child_exist {
222 return Ok(());
223 }
224
225 // Probe the new added pcie endpoint devices
226 let mut children: Vec<HotPlugDeviceInfo> = Vec::new();
227 visit_children(host_sysfs.as_path(), &mut children)?;
228
229 // Without reverse children, physical larger BDF device is at the top, it will be
230 // added into guest first with smaller virtual function number, so physical smaller
231 // BDF device has larger virtual function number, phyiscal larger BDF device has
232 // smaller virtual function number. During hotplug out process, host pcie root port
233 // driver remove physical smaller BDF pcie endpoint device first, so host vfio-pci
234 // driver send plug out event first for smaller BDF device and wait for this device
235 // removed from crosvm, when crosvm receives this plug out event, crosvm will remove
236 // all the children devices, crosvm remove smaller virtual function number device
237 // first, this isn't the target device which host vfio-pci driver is waiting for.
238 // Host vfio-pci driver holds a lock when it is waiting, when crosvm remove another
239 // device throgh vfio-pci which try to get the same lock, so deadlock happens in
240 // host kernel.
241 //
242 // In order to fix the deadlock, children is reversed, so physical smaller BDF
243 // device has smaller virtual function number, and it will have the same order
244 // between host kernel and crosvm during hotplug out process.
245 children.reverse();
246 while let Some(child) = children.pop() {
247 if let HotPlugDeviceType::EndPoint = child.device_type {
248 // In order to bind device to vfio-pci driver, get device VID and DID
249 let vendor_path = child.path.join("vendor");
250 let vendor_id = read(vendor_path.as_path())
251 .with_context(|| format!("failed to read {}", vendor_path.display()))?;
252 // Remove the first two elements 0x
253 let prefix: &str = "0x";
254 let vendor = match vendor_id.strip_prefix(prefix.as_bytes()) {
255 Some(v) => v.to_vec(),
256 None => vendor_id,
257 };
258 let device_path = child.path.join("device");
259 let device_id = read(device_path.as_path())
260 .with_context(|| format!("failed to read {}", device_path.display()))?;
261 // Remove the first two elements 0x
262 let device = match device_id.strip_prefix(prefix.as_bytes()) {
263 Some(d) => d.to_vec(),
264 None => device_id,
265 };
266 let new_id = vec![
267 String::from_utf8_lossy(&vendor),
268 String::from_utf8_lossy(&device),
269 ]
270 .join(" ");
271 if Path::new("/sys/bus/pci/drivers/vfio-pci-pm/new_id").exists() {
272 let _ = write("/sys/bus/pci/drivers/vfio-pci-pm/new_id", &new_id);
273 }
274 // This is normal - either the kernel doesn't support vfio-pci-pm driver,
275 // or the device failed to attach to vfio-pci-pm driver (most likely due to
276 // lack of power management capability).
277 if !child.path.join("driver/unbind").exists() {
278 write("/sys/bus/pci/drivers/vfio-pci/new_id", &new_id).with_context(|| {
279 format!("failed to write {} into vfio-pci/new_id", new_id)
280 })?;
281 }
282 }
283 // Request to hotplug the new added pcie device into guest
284 let request = VmRequest::HotPlugCommand {
285 device: child.clone(),
286 add: true,
287 };
288 let vm_socket = vm_socket.lock();
289 vm_socket
290 .send(&request)
291 .with_context(|| format!("failed to send hotplug request for {:?}", child))?;
292 let response = vm_socket
293 .recv::<VmResponse>()
294 .with_context(|| format!("failed to receive hotplug response for {:?}", child))?;
295 match response {
296 VmResponse::Ok => {}
297 _ => bail!("unexpected hotplug response: {response}"),
298 };
299 if !*child_exist {
300 *child_exist = true;
301 }
302 }
303
304 Ok(())
305 }
306 }
307
// Byte offsets into the host device's config file, passed to
// `PciHostConfig::read_config`.

// Read as a `u16` by `PcieHostPort::read_device_id`.
const PCI_CONFIG_DEVICE_ID: u64 = 0x02;
// Read as a `u8` and compared against `PciClassCode::BridgeDevice`.
const PCI_BASE_CLASS_CODE: u64 = 0x0B;
// Read as a `u8` and compared against `PciBridgeSubclass::PciToPciBridge`.
const PCI_SUB_CLASS_CODE: u64 = 0x0A;
311
312 /// Pcie root port device has a corresponding host pcie root port.
313 pub struct PcieHostPort {
314 host_config: PciHostConfig,
315 host_name: String,
316 hotplug_in_process: Arc<Mutex<bool>>,
317 hotplug_child_exist: Arc<Mutex<bool>>,
318 vm_socket: Arc<Mutex<Tube>>,
319 #[cfg(feature = "direct")]
320 sysfs_path: Option<PathBuf>,
321 #[cfg(feature = "direct")]
322 header_type_reg: Option<u32>,
323 }
324
325 impl PcieHostPort {
326 /// Create PcieHostPort, host_syfsfs_patch specify host pcie port
327 /// sysfs path.
new(host_sysfs_path: &Path, socket: Tube) -> Result<Self>328 pub fn new(host_sysfs_path: &Path, socket: Tube) -> Result<Self> {
329 let host_config = PciHostConfig::new(host_sysfs_path)?;
330 let host_name = host_sysfs_path
331 .file_name()
332 .unwrap()
333 .to_str()
334 .unwrap()
335 .to_owned();
336 let base_class: u8 = host_config.read_config(PCI_BASE_CLASS_CODE);
337 if base_class != PciClassCode::BridgeDevice.get_register_value() {
338 return Err(anyhow!("host {} isn't bridge", host_name));
339 }
340 let sub_class: u8 = host_config.read_config(PCI_SUB_CLASS_CODE);
341 if sub_class != PciBridgeSubclass::PciToPciBridge as u8 {
342 return Err(anyhow!("host {} isn't pci to pci bridge", host_name));
343 }
344
345 let mut pcie_cap_reg: u8 = 0;
346
347 let mut cap_next: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
348 let mut counter: u16 = 0;
349 while cap_next != 0 && counter < 256 {
350 let cap_id: u8 = host_config.read_config(cap_next.into());
351 if cap_id == PciCapabilityID::PciExpress as u8 {
352 pcie_cap_reg = cap_next;
353 break;
354 }
355 let offset = cap_next as u64 + PCI_CAP_NEXT_POINTER as u64;
356 cap_next = host_config.read_config(offset);
357 counter += 1;
358 }
359
360 if pcie_cap_reg == 0 {
361 return Err(anyhow!("host {} isn't pcie device", host_name));
362 }
363
364 #[cfg(feature = "direct")]
365 let (sysfs_path, header_type_reg) =
366 match PcieHostPort::coordinated_pm(host_sysfs_path, true) {
367 Ok(_) => {
368 // Cache the dword at offset 0x0c (cacheline size, latency timer,
369 // header type, BIST).
370 // When using the "direct" feature, this dword can be accessed for
371 // device power state. Directly accessing a device's physical PCI
372 // config space in D3cold state causes a hang. We treat the cacheline
373 // size, latency timer and header type field as immutable in the
374 // guest.
375 let reg: u32 = host_config.read_config((HEADER_TYPE_REG as u64) * 4);
376 (Some(host_sysfs_path.to_path_buf()), Some(reg))
377 }
378 Err(e) => {
379 warn!("coordinated_pm not supported: {}", e);
380 (None, None)
381 }
382 };
383
384 Ok(PcieHostPort {
385 host_config,
386 host_name,
387 hotplug_in_process: Arc::new(Mutex::new(false)),
388 hotplug_child_exist: Arc::new(Mutex::new(false)),
389 vm_socket: Arc::new(Mutex::new(socket)),
390 #[cfg(feature = "direct")]
391 sysfs_path,
392 #[cfg(feature = "direct")]
393 header_type_reg,
394 })
395 }
396
get_bus_range(&self) -> PciBridgeBusRange397 pub fn get_bus_range(&self) -> PciBridgeBusRange {
398 let bus_num: u32 = self.host_config.read_config((BR_BUS_NUMBER_REG * 4) as u64);
399 let primary = (bus_num & 0xFF) as u8;
400 let secondary = ((bus_num >> 8) & 0xFF) as u8;
401 let subordinate = ((bus_num >> 16) & 0xFF) as u8;
402
403 PciBridgeBusRange {
404 primary,
405 secondary,
406 subordinate,
407 }
408 }
409
read_device_id(&self) -> u16410 pub fn read_device_id(&self) -> u16 {
411 self.host_config.read_config::<u16>(PCI_CONFIG_DEVICE_ID)
412 }
413
host_name(&self) -> String414 pub fn host_name(&self) -> String {
415 self.host_name.clone()
416 }
417
read_config(&self, reg_idx: usize, data: &mut u32)418 pub fn read_config(&self, reg_idx: usize, data: &mut u32) {
419 if reg_idx == HEADER_TYPE_REG {
420 #[cfg(feature = "direct")]
421 if let Some(header_type_reg) = self.header_type_reg {
422 let mut v = header_type_reg.to_le_bytes();
423 // HACK
424 // Reads from the "BIST" register are interpreted as device
425 // PCI power state
426 v[3] = self.power_state().unwrap_or_else(|e| {
427 error!("Failed to get device power state: {}", e);
428 5 // unknown state
429 });
430 *data = u32::from_le_bytes(v);
431 return;
432 }
433 *data = self.host_config.read_config((HEADER_TYPE_REG as u64) * 4)
434 }
435 }
436
437 #[allow(unused_variables)]
write_config(&mut self, reg_idx: usize, offset: u64, data: &[u8])438 pub fn write_config(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
439 #[cfg(feature = "direct")]
440 if self.sysfs_path.is_some()
441 && reg_idx == CLASS_REG
442 && offset == CLASS_REG_REVISION_ID_OFFSET as u64
443 && data.len() == 1
444 {
445 // HACK
446 // Byte writes to the "Revision ID" register are interpreted as PM
447 // op calls
448 if let Err(e) = self.op_call(data[0]) {
449 error!("Failed to perform op call: {}", e);
450 }
451 }
452 }
453
get_bridge_window_size(&self) -> (u64, u64)454 pub fn get_bridge_window_size(&self) -> (u64, u64) {
455 let br_memory: u32 = self.host_config.read_config(BR_MEM_REG as u64 * 4);
456 let mem_base = (br_memory & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
457 let mem_limit = br_memory & BR_MEM_LIMIT_MASK;
458 let mem_size = if mem_limit > mem_base {
459 (mem_limit - mem_base) as u64 + BR_WINDOW_ALIGNMENT
460 } else {
461 BR_MEM_MINIMUM
462 };
463 let br_pref_mem_low: u32 = self.host_config.read_config(BR_PREF_MEM_LOW_REG as u64 * 4);
464 let pref_mem_base_low = (br_pref_mem_low & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
465 let pref_mem_limit_low = br_pref_mem_low & BR_MEM_LIMIT_MASK;
466 let mut pref_mem_base: u64 = pref_mem_base_low as u64;
467 let mut pref_mem_limit: u64 = pref_mem_limit_low as u64;
468 if br_pref_mem_low & BR_PREF_MEM_64BIT == BR_PREF_MEM_64BIT {
469 // 64bit prefetch memory
470 let pref_mem_base_high: u32 = self
471 .host_config
472 .read_config(BR_PREF_MEM_BASE_HIGH_REG as u64 * 4);
473 let pref_mem_limit_high: u32 = self
474 .host_config
475 .read_config(BR_PREF_MEM_LIMIT_HIGH_REG as u64 * 4);
476 pref_mem_base = ((pref_mem_base_high as u64) << 32) | (pref_mem_base_low as u64);
477 pref_mem_limit = ((pref_mem_limit_high as u64) << 32) | (pref_mem_limit_low as u64);
478 }
479 let pref_mem_size = if pref_mem_limit > pref_mem_base {
480 pref_mem_limit - pref_mem_base + BR_WINDOW_ALIGNMENT
481 } else {
482 BR_MEM_MINIMUM
483 };
484
485 (mem_size, pref_mem_size)
486 }
487
hotplug_probe(&mut self)488 pub fn hotplug_probe(&mut self) {
489 if *self.hotplug_in_process.lock() {
490 return;
491 }
492
493 let hotplug_process = self.hotplug_in_process.clone();
494 let child_exist = self.hotplug_child_exist.clone();
495 let socket = self.vm_socket.clone();
496 let name = self.host_name.clone();
497 let _ = thread::Builder::new()
498 .name("pcie_hotplug".to_string())
499 .spawn(move || {
500 let mut hotplug = hotplug_process.lock();
501 *hotplug = true;
502 let hotplug_worker = HotplugWorker { host_name: name };
503 let _ = hotplug_worker.run(socket, child_exist);
504 *hotplug = false;
505 });
506 }
507
hot_unplug(&mut self)508 pub fn hot_unplug(&mut self) {
509 *self.hotplug_child_exist.lock() = false;
510 }
511
512 #[cfg(feature = "direct")]
coordinated_pm(host_sysfs_path: &Path, enter: bool) -> Result<()>513 fn coordinated_pm(host_sysfs_path: &Path, enter: bool) -> Result<()> {
514 let path = Path::new(host_sysfs_path).join("power/coordinated");
515 write(&path, if enter { "enter\n" } else { "exit\n" })
516 .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
517 }
518
519 #[cfg(feature = "direct")]
power_state(&self) -> Result<u8>520 fn power_state(&self) -> Result<u8> {
521 let path = Path::new(&self.sysfs_path.as_ref().unwrap()).join("power_state");
522 let state = read_to_string(&path)
523 .with_context(|| format!("Failed to read from {}", path.to_string_lossy()))?;
524 match state.as_str() {
525 "D0\n" => Ok(0),
526 "D1\n" => Ok(1),
527 "D2\n" => Ok(2),
528 "D3hot\n" => Ok(3),
529 "D3cold\n" => Ok(4),
530 "unknown\n" => Ok(5),
531 _ => Err(std::io::Error::new(
532 std::io::ErrorKind::InvalidData,
533 "invalid state",
534 ))?,
535 }
536 }
537
538 #[cfg(feature = "direct")]
op_call(&self, id: u8) -> Result<()>539 fn op_call(&self, id: u8) -> Result<()> {
540 let path = Path::new(self.sysfs_path.as_ref().unwrap()).join("power/op_call");
541 write(&path, &[id])
542 .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
543 }
544 }
545
#[cfg(feature = "direct")]
impl Drop for PcieHostPort {
    /// Leaves coordinated PM again if it was entered when the port was
    /// created. A failure to do so is ignored.
    fn drop(&mut self) {
        if let Some(path) = self.sysfs_path.as_deref() {
            let _ = PcieHostPort::coordinated_pm(path, false);
        }
    }
}
554