1 // Copyright 2021 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::fs::read;
6 #[cfg(feature = "direct")]
7 use std::fs::read_to_string;
8 use std::fs::write;
9 use std::fs::File;
10 use std::fs::OpenOptions;
11 use std::os::unix::fs::FileExt;
12 use std::path::Path;
13 use std::path::PathBuf;
14 use std::sync::Arc;
15 use std::thread;
16 
17 use anyhow::anyhow;
18 use anyhow::bail;
19 use anyhow::Context;
20 use anyhow::Result;
21 use base::error;
22 #[cfg(feature = "direct")]
23 use base::warn;
24 use base::Tube;
25 use data_model::DataInit;
26 use sync::Mutex;
27 use vm_control::HotPlugDeviceInfo;
28 use vm_control::HotPlugDeviceType;
29 use vm_control::VmRequest;
30 use vm_control::VmResponse;
31 
32 use crate::pci::pci_configuration::PciBridgeSubclass;
33 use crate::pci::pci_configuration::CAPABILITY_LIST_HEAD_OFFSET;
34 #[cfg(feature = "direct")]
35 use crate::pci::pci_configuration::CLASS_REG;
36 #[cfg(feature = "direct")]
37 use crate::pci::pci_configuration::CLASS_REG_REVISION_ID_OFFSET;
38 use crate::pci::pci_configuration::HEADER_TYPE_REG;
39 use crate::pci::pci_configuration::PCI_CAP_NEXT_POINTER;
40 use crate::pci::pcie::pci_bridge::PciBridgeBusRange;
41 use crate::pci::pcie::pci_bridge::BR_BUS_NUMBER_REG;
42 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_MASK;
43 use crate::pci::pcie::pci_bridge::BR_MEM_BASE_SHIFT;
44 use crate::pci::pcie::pci_bridge::BR_MEM_LIMIT_MASK;
45 use crate::pci::pcie::pci_bridge::BR_MEM_MINIMUM;
46 use crate::pci::pcie::pci_bridge::BR_MEM_REG;
47 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_64BIT;
48 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_BASE_HIGH_REG;
49 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LIMIT_HIGH_REG;
50 use crate::pci::pcie::pci_bridge::BR_PREF_MEM_LOW_REG;
51 use crate::pci::pcie::pci_bridge::BR_WINDOW_ALIGNMENT;
52 use crate::pci::pcie::PcieDevicePortType;
53 use crate::pci::PciCapabilityID;
54 use crate::pci::PciClassCode;
55 
// Host Pci device's sysfs config file
//
// Wraps the `config` file found in a host pci device's sysfs directory; the
// methods on this type read and write the device's config registers through
// that file.
struct PciHostConfig {
    // Handle to the sysfs `config` file, opened for both reading and writing.
    config_file: File,
}
60 
61 impl PciHostConfig {
62     // Create a new host pci device's sysfs config file
new(host_sysfs_path: &Path) -> Result<Self>63     fn new(host_sysfs_path: &Path) -> Result<Self> {
64         let mut config_path = PathBuf::new();
65         config_path.push(host_sysfs_path);
66         config_path.push("config");
67         let f = OpenOptions::new()
68             .write(true)
69             .read(true)
70             .open(config_path.as_path())
71             .with_context(|| format!("failed to open: {}", config_path.display()))?;
72         Ok(PciHostConfig { config_file: f })
73     }
74 
75     // Read host pci device's config register
read_config<T: DataInit>(&self, offset: u64) -> T76     fn read_config<T: DataInit>(&self, offset: u64) -> T {
77         let length = std::mem::size_of::<T>();
78         let mut buf = vec![0u8; length];
79         if offset % length as u64 != 0 {
80             error!(
81                 "read_config, offset {} isn't aligned to length {}",
82                 offset, length
83             );
84         } else if let Err(e) = self.config_file.read_exact_at(&mut buf, offset) {
85             error!("failed to read host sysfs config: {}", e);
86         }
87 
88         T::from_slice(&buf)
89             .copied()
90             .expect("failed to convert host sysfs config data from slice")
91     }
92 
93     // write host pci device's config register
94     #[allow(dead_code)]
write_config(&self, offset: u64, data: &[u8])95     fn write_config(&self, offset: u64, data: &[u8]) {
96         if offset % data.len() as u64 != 0 {
97             error!(
98                 "write_config, offset {} isn't aligned to length {}",
99                 offset,
100                 data.len()
101             );
102             return;
103         }
104         if let Err(e) = self.config_file.write_all_at(data, offset) {
105             error!("failed to write host sysfs config: {}", e);
106         }
107     }
108 }
109 
110 // Find all the added pcie devices
visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()>111 fn visit_children(dir: &Path, children: &mut Vec<HotPlugDeviceInfo>) -> Result<()> {
112     // Each pci device has a sysfs directory
113     if !dir.is_dir() {
114         bail!("{} isn't directory", dir.display());
115     }
116     // Loop device sysfs subdirectory
117     let entries = dir
118         .read_dir()
119         .with_context(|| format!("failed to read dir {}", dir.display()))?;
120     let mut devices = Vec::new();
121     for entry in entries {
122         let sub_dir = match entry {
123             Ok(sub) => sub,
124             _ => continue,
125         };
126 
127         if !sub_dir.path().is_dir() {
128             continue;
129         }
130 
131         let name = sub_dir
132             .file_name()
133             .into_string()
134             .map_err(|_| anyhow!("failed to get dir name"))?;
135         // Child pci device has name format 0000:xx:xx.x, length is 12
136         if name.len() != 12 || !name.starts_with("0000:") {
137             continue;
138         }
139         let child_path = dir.join(name);
140         devices.push(child_path);
141     }
142     devices.reverse();
143     let mut iter = devices.iter().peekable();
144     while let Some(device) = iter.next() {
145         let class_path = device.join("class");
146         let class_id = read(class_path.as_path())
147             .with_context(|| format!("failed to read {}", class_path.display()))?;
148         let hp_interrupt = iter.peek().is_none();
149         if !class_id.starts_with("0x0604".as_bytes()) {
150             // If the device isn't pci bridge, this is a pcie endpoint device
151             children.push(HotPlugDeviceInfo {
152                 device_type: HotPlugDeviceType::EndPoint,
153                 path: device.to_path_buf(),
154                 hp_interrupt,
155             });
156             // No need to look further
157             return Ok(());
158         } else {
159             // Find the pci express cap to get the port type of the pcie bridge
160             let host_config = PciHostConfig::new(device)?;
161             let mut cap_pointer: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
162             while cap_pointer != 0x0 {
163                 let cap_id: u8 = host_config.read_config(cap_pointer as u64);
164                 if cap_id == PciCapabilityID::PciExpress as u8 {
165                     break;
166                 }
167                 cap_pointer = host_config.read_config(cap_pointer as u64 + 0x1);
168             }
169             if cap_pointer == 0x0 {
170                 bail!(
171                     "Failed to get pcie express capability for {}",
172                     device.display()
173                 );
174             }
175             let express_cap_reg: u16 = host_config.read_config(cap_pointer as u64 + 0x2);
176             match (express_cap_reg & 0xf0) >> 4 {
177                 x if x == PcieDevicePortType::UpstreamPort as u16 => {
178                     children.push(HotPlugDeviceInfo {
179                         device_type: HotPlugDeviceType::UpstreamPort,
180                         path: device.to_path_buf(),
181                         hp_interrupt,
182                     })
183                 }
184                 x if x == PcieDevicePortType::DownstreamPort as u16 => {
185                     children.push(HotPlugDeviceInfo {
186                         device_type: HotPlugDeviceType::DownstreamPort,
187                         path: device.to_path_buf(),
188                         hp_interrupt,
189                     })
190                 }
191                 _ => (),
192             }
193         }
194     }
195     for device in devices.iter() {
196         visit_children(device.as_path(), children)?;
197     }
198     Ok(())
199 }
200 
// Worker which probes a host pcie port for hotplugged children.
struct HotplugWorker {
    // Name of the host port's directory below /sys/bus/pci/devices/
    host_name: String,
}
204 
205 impl HotplugWorker {
run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()>206     fn run(&self, vm_socket: Arc<Mutex<Tube>>, child_exist: Arc<Mutex<bool>>) -> Result<()> {
207         let mut host_sysfs = PathBuf::new();
208         host_sysfs.push("/sys/bus/pci/devices/");
209         host_sysfs.push(self.host_name.clone());
210         let rescan_path = host_sysfs.join("rescan");
211         // Let pcie root port rescan to find the added or removed children devices
212         write(rescan_path.as_path(), "1")
213             .with_context(|| format!("failed to write {}", rescan_path.display()))?;
214 
215         // If child device existed, but code run here again, this means host has a
216         // hotplug out event, after the above rescan, host should find the removed
217         // child device, and host vfio-pci kernel driver should notify crosvm vfio-pci
218         // devie such hotplug out event, so nothing is needed to do here, just return
219         // it now.
220         let mut child_exist = child_exist.lock();
221         if *child_exist {
222             return Ok(());
223         }
224 
225         // Probe the new added pcie endpoint devices
226         let mut children: Vec<HotPlugDeviceInfo> = Vec::new();
227         visit_children(host_sysfs.as_path(), &mut children)?;
228 
229         // Without reverse children, physical larger BDF device is at the top, it will be
230         // added into guest first with smaller virtual function number, so physical smaller
231         // BDF device has larger virtual function number, phyiscal larger BDF device has
232         // smaller virtual function number. During hotplug out process, host pcie root port
233         // driver remove physical smaller BDF pcie endpoint device first, so host vfio-pci
234         // driver send plug out event first for smaller BDF device and wait for this device
235         // removed from crosvm, when crosvm receives this plug out event, crosvm will remove
236         // all the children devices, crosvm remove smaller virtual function number device
237         // first, this isn't the target device which host vfio-pci driver is waiting for.
238         // Host vfio-pci driver holds a lock when it is waiting, when crosvm remove another
239         // device throgh vfio-pci which try to get the same lock, so deadlock happens in
240         // host kernel.
241         //
242         // In order to fix the deadlock, children is reversed, so physical smaller BDF
243         // device has smaller virtual function number, and it will have the same order
244         // between host kernel and crosvm during hotplug out process.
245         children.reverse();
246         while let Some(child) = children.pop() {
247             if let HotPlugDeviceType::EndPoint = child.device_type {
248                 // In order to bind device to vfio-pci driver, get device VID and DID
249                 let vendor_path = child.path.join("vendor");
250                 let vendor_id = read(vendor_path.as_path())
251                     .with_context(|| format!("failed to read {}", vendor_path.display()))?;
252                 // Remove the first two elements 0x
253                 let prefix: &str = "0x";
254                 let vendor = match vendor_id.strip_prefix(prefix.as_bytes()) {
255                     Some(v) => v.to_vec(),
256                     None => vendor_id,
257                 };
258                 let device_path = child.path.join("device");
259                 let device_id = read(device_path.as_path())
260                     .with_context(|| format!("failed to read {}", device_path.display()))?;
261                 // Remove the first two elements 0x
262                 let device = match device_id.strip_prefix(prefix.as_bytes()) {
263                     Some(d) => d.to_vec(),
264                     None => device_id,
265                 };
266                 let new_id = vec![
267                     String::from_utf8_lossy(&vendor),
268                     String::from_utf8_lossy(&device),
269                 ]
270                 .join(" ");
271                 if Path::new("/sys/bus/pci/drivers/vfio-pci-pm/new_id").exists() {
272                     let _ = write("/sys/bus/pci/drivers/vfio-pci-pm/new_id", &new_id);
273                 }
274                 // This is normal - either the kernel doesn't support vfio-pci-pm driver,
275                 // or the device failed to attach to vfio-pci-pm driver (most likely due to
276                 // lack of power management capability).
277                 if !child.path.join("driver/unbind").exists() {
278                     write("/sys/bus/pci/drivers/vfio-pci/new_id", &new_id).with_context(|| {
279                         format!("failed to write {} into vfio-pci/new_id", new_id)
280                     })?;
281                 }
282             }
283             // Request to hotplug the new added pcie device into guest
284             let request = VmRequest::HotPlugCommand {
285                 device: child.clone(),
286                 add: true,
287             };
288             let vm_socket = vm_socket.lock();
289             vm_socket
290                 .send(&request)
291                 .with_context(|| format!("failed to send hotplug request for {:?}", child))?;
292             let response = vm_socket
293                 .recv::<VmResponse>()
294                 .with_context(|| format!("failed to receive hotplug response for {:?}", child))?;
295             match response {
296                 VmResponse::Ok => {}
297                 _ => bail!("unexpected hotplug response: {response}"),
298             };
299             if !*child_exist {
300                 *child_exist = true;
301             }
302         }
303 
304         Ok(())
305     }
306 }
307 
// Byte offsets into the host device's config file, used when validating the
// host port and when reading its device id.
// Device id, read as u16
const PCI_CONFIG_DEVICE_ID: u64 = 0x02;
// Base class code, read as u8
const PCI_BASE_CLASS_CODE: u64 = 0x0B;
// Sub class code, read as u8
const PCI_SUB_CLASS_CODE: u64 = 0x0A;
311 
/// Pcie root port device has a corresponding host pcie root port.
pub struct PcieHostPort {
    /// Host port's sysfs config file.
    host_config: PciHostConfig,
    /// Last component of the host port's sysfs path.
    host_name: String,
    /// Set by the hotplug worker thread while it is probing.
    hotplug_in_process: Arc<Mutex<bool>>,
    /// Set once a child got hotplugged into the guest, cleared by `hot_unplug`.
    hotplug_child_exist: Arc<Mutex<bool>>,
    /// Tube handed to the hotplug worker for sending hotplug requests.
    vm_socket: Arc<Mutex<Tube>>,
    /// Host port's sysfs path, only set when coordinated pm could be entered.
    #[cfg(feature = "direct")]
    sysfs_path: Option<PathBuf>,
    /// Config dword at `HEADER_TYPE_REG` cached at creation, only set when
    /// coordinated pm could be entered.
    #[cfg(feature = "direct")]
    header_type_reg: Option<u32>,
}
324 
325 impl PcieHostPort {
326     /// Create PcieHostPort, host_syfsfs_patch specify host pcie port
327     /// sysfs path.
new(host_sysfs_path: &Path, socket: Tube) -> Result<Self>328     pub fn new(host_sysfs_path: &Path, socket: Tube) -> Result<Self> {
329         let host_config = PciHostConfig::new(host_sysfs_path)?;
330         let host_name = host_sysfs_path
331             .file_name()
332             .unwrap()
333             .to_str()
334             .unwrap()
335             .to_owned();
336         let base_class: u8 = host_config.read_config(PCI_BASE_CLASS_CODE);
337         if base_class != PciClassCode::BridgeDevice.get_register_value() {
338             return Err(anyhow!("host {} isn't bridge", host_name));
339         }
340         let sub_class: u8 = host_config.read_config(PCI_SUB_CLASS_CODE);
341         if sub_class != PciBridgeSubclass::PciToPciBridge as u8 {
342             return Err(anyhow!("host {} isn't pci to pci bridge", host_name));
343         }
344 
345         let mut pcie_cap_reg: u8 = 0;
346 
347         let mut cap_next: u8 = host_config.read_config(CAPABILITY_LIST_HEAD_OFFSET as u64);
348         let mut counter: u16 = 0;
349         while cap_next != 0 && counter < 256 {
350             let cap_id: u8 = host_config.read_config(cap_next.into());
351             if cap_id == PciCapabilityID::PciExpress as u8 {
352                 pcie_cap_reg = cap_next;
353                 break;
354             }
355             let offset = cap_next as u64 + PCI_CAP_NEXT_POINTER as u64;
356             cap_next = host_config.read_config(offset);
357             counter += 1;
358         }
359 
360         if pcie_cap_reg == 0 {
361             return Err(anyhow!("host {} isn't pcie device", host_name));
362         }
363 
364         #[cfg(feature = "direct")]
365         let (sysfs_path, header_type_reg) =
366             match PcieHostPort::coordinated_pm(host_sysfs_path, true) {
367                 Ok(_) => {
368                     // Cache the dword at offset 0x0c (cacheline size, latency timer,
369                     // header type, BIST).
370                     // When using the "direct" feature, this dword can be accessed for
371                     // device power state. Directly accessing a device's physical PCI
372                     // config space in D3cold state causes a hang. We treat the cacheline
373                     // size, latency timer and header type field as immutable in the
374                     // guest.
375                     let reg: u32 = host_config.read_config((HEADER_TYPE_REG as u64) * 4);
376                     (Some(host_sysfs_path.to_path_buf()), Some(reg))
377                 }
378                 Err(e) => {
379                     warn!("coordinated_pm not supported: {}", e);
380                     (None, None)
381                 }
382             };
383 
384         Ok(PcieHostPort {
385             host_config,
386             host_name,
387             hotplug_in_process: Arc::new(Mutex::new(false)),
388             hotplug_child_exist: Arc::new(Mutex::new(false)),
389             vm_socket: Arc::new(Mutex::new(socket)),
390             #[cfg(feature = "direct")]
391             sysfs_path,
392             #[cfg(feature = "direct")]
393             header_type_reg,
394         })
395     }
396 
get_bus_range(&self) -> PciBridgeBusRange397     pub fn get_bus_range(&self) -> PciBridgeBusRange {
398         let bus_num: u32 = self.host_config.read_config((BR_BUS_NUMBER_REG * 4) as u64);
399         let primary = (bus_num & 0xFF) as u8;
400         let secondary = ((bus_num >> 8) & 0xFF) as u8;
401         let subordinate = ((bus_num >> 16) & 0xFF) as u8;
402 
403         PciBridgeBusRange {
404             primary,
405             secondary,
406             subordinate,
407         }
408     }
409 
read_device_id(&self) -> u16410     pub fn read_device_id(&self) -> u16 {
411         self.host_config.read_config::<u16>(PCI_CONFIG_DEVICE_ID)
412     }
413 
host_name(&self) -> String414     pub fn host_name(&self) -> String {
415         self.host_name.clone()
416     }
417 
read_config(&self, reg_idx: usize, data: &mut u32)418     pub fn read_config(&self, reg_idx: usize, data: &mut u32) {
419         if reg_idx == HEADER_TYPE_REG {
420             #[cfg(feature = "direct")]
421             if let Some(header_type_reg) = self.header_type_reg {
422                 let mut v = header_type_reg.to_le_bytes();
423                 // HACK
424                 // Reads from the "BIST" register are interpreted as device
425                 // PCI power state
426                 v[3] = self.power_state().unwrap_or_else(|e| {
427                     error!("Failed to get device power state: {}", e);
428                     5 // unknown state
429                 });
430                 *data = u32::from_le_bytes(v);
431                 return;
432             }
433             *data = self.host_config.read_config((HEADER_TYPE_REG as u64) * 4)
434         }
435     }
436 
437     #[allow(unused_variables)]
write_config(&mut self, reg_idx: usize, offset: u64, data: &[u8])438     pub fn write_config(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
439         #[cfg(feature = "direct")]
440         if self.sysfs_path.is_some()
441             && reg_idx == CLASS_REG
442             && offset == CLASS_REG_REVISION_ID_OFFSET as u64
443             && data.len() == 1
444         {
445             // HACK
446             // Byte writes to the "Revision ID" register are interpreted as PM
447             // op calls
448             if let Err(e) = self.op_call(data[0]) {
449                 error!("Failed to perform op call: {}", e);
450             }
451         }
452     }
453 
get_bridge_window_size(&self) -> (u64, u64)454     pub fn get_bridge_window_size(&self) -> (u64, u64) {
455         let br_memory: u32 = self.host_config.read_config(BR_MEM_REG as u64 * 4);
456         let mem_base = (br_memory & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
457         let mem_limit = br_memory & BR_MEM_LIMIT_MASK;
458         let mem_size = if mem_limit > mem_base {
459             (mem_limit - mem_base) as u64 + BR_WINDOW_ALIGNMENT
460         } else {
461             BR_MEM_MINIMUM
462         };
463         let br_pref_mem_low: u32 = self.host_config.read_config(BR_PREF_MEM_LOW_REG as u64 * 4);
464         let pref_mem_base_low = (br_pref_mem_low & BR_MEM_BASE_MASK) << BR_MEM_BASE_SHIFT;
465         let pref_mem_limit_low = br_pref_mem_low & BR_MEM_LIMIT_MASK;
466         let mut pref_mem_base: u64 = pref_mem_base_low as u64;
467         let mut pref_mem_limit: u64 = pref_mem_limit_low as u64;
468         if br_pref_mem_low & BR_PREF_MEM_64BIT == BR_PREF_MEM_64BIT {
469             // 64bit prefetch memory
470             let pref_mem_base_high: u32 = self
471                 .host_config
472                 .read_config(BR_PREF_MEM_BASE_HIGH_REG as u64 * 4);
473             let pref_mem_limit_high: u32 = self
474                 .host_config
475                 .read_config(BR_PREF_MEM_LIMIT_HIGH_REG as u64 * 4);
476             pref_mem_base = ((pref_mem_base_high as u64) << 32) | (pref_mem_base_low as u64);
477             pref_mem_limit = ((pref_mem_limit_high as u64) << 32) | (pref_mem_limit_low as u64);
478         }
479         let pref_mem_size = if pref_mem_limit > pref_mem_base {
480             pref_mem_limit - pref_mem_base + BR_WINDOW_ALIGNMENT
481         } else {
482             BR_MEM_MINIMUM
483         };
484 
485         (mem_size, pref_mem_size)
486     }
487 
hotplug_probe(&mut self)488     pub fn hotplug_probe(&mut self) {
489         if *self.hotplug_in_process.lock() {
490             return;
491         }
492 
493         let hotplug_process = self.hotplug_in_process.clone();
494         let child_exist = self.hotplug_child_exist.clone();
495         let socket = self.vm_socket.clone();
496         let name = self.host_name.clone();
497         let _ = thread::Builder::new()
498             .name("pcie_hotplug".to_string())
499             .spawn(move || {
500                 let mut hotplug = hotplug_process.lock();
501                 *hotplug = true;
502                 let hotplug_worker = HotplugWorker { host_name: name };
503                 let _ = hotplug_worker.run(socket, child_exist);
504                 *hotplug = false;
505             });
506     }
507 
hot_unplug(&mut self)508     pub fn hot_unplug(&mut self) {
509         *self.hotplug_child_exist.lock() = false;
510     }
511 
512     #[cfg(feature = "direct")]
coordinated_pm(host_sysfs_path: &Path, enter: bool) -> Result<()>513     fn coordinated_pm(host_sysfs_path: &Path, enter: bool) -> Result<()> {
514         let path = Path::new(host_sysfs_path).join("power/coordinated");
515         write(&path, if enter { "enter\n" } else { "exit\n" })
516             .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
517     }
518 
519     #[cfg(feature = "direct")]
power_state(&self) -> Result<u8>520     fn power_state(&self) -> Result<u8> {
521         let path = Path::new(&self.sysfs_path.as_ref().unwrap()).join("power_state");
522         let state = read_to_string(&path)
523             .with_context(|| format!("Failed to read from {}", path.to_string_lossy()))?;
524         match state.as_str() {
525             "D0\n" => Ok(0),
526             "D1\n" => Ok(1),
527             "D2\n" => Ok(2),
528             "D3hot\n" => Ok(3),
529             "D3cold\n" => Ok(4),
530             "unknown\n" => Ok(5),
531             _ => Err(std::io::Error::new(
532                 std::io::ErrorKind::InvalidData,
533                 "invalid state",
534             ))?,
535         }
536     }
537 
538     #[cfg(feature = "direct")]
op_call(&self, id: u8) -> Result<()>539     fn op_call(&self, id: u8) -> Result<()> {
540         let path = Path::new(self.sysfs_path.as_ref().unwrap()).join("power/op_call");
541         write(&path, &[id])
542             .with_context(|| format!("Failed to write to {}", path.to_string_lossy()))
543     }
544 }
545 
#[cfg(feature = "direct")]
impl Drop for PcieHostPort {
    // Leave coordinated pm again if a sysfs path was recorded; a failure to
    // do so is ignored.
    fn drop(&mut self) {
        if let Some(sysfs_path) = self.sysfs_path.as_deref() {
            let _ = PcieHostPort::coordinated_pm(sysfs_path, false);
        }
    }
}
554