1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::{mem, result};
6
7 use base::{self, warn};
8 use hypervisor::{Fpu, Register, Regs, Sregs, VcpuX86_64, Vm};
9 use remain::sorted;
10 use thiserror::Error;
11 use vm_memory::{GuestAddress, GuestMemory};
12
13 use crate::gdt;
14
15 #[sorted]
16 #[derive(Error, Debug)]
17 pub enum Error {
18 /// Failed to configure the FPU.
19 #[error("failed to configure the FPU: {0}")]
20 FpuIoctlFailed(base::Error),
21 /// Failed to get sregs for this cpu.
22 #[error("failed to get sregs for this cpu: {0}")]
23 GetSRegsIoctlFailed(base::Error),
24 /// Setting up msrs failed.
25 #[error("setting up msrs failed: {0}")]
26 MsrIoctlFailed(base::Error),
27 /// Failed to set sregs for this cpu.
28 #[error("failed to set sregs for this cpu: {0}")]
29 SetSRegsIoctlFailed(base::Error),
30 /// Failed to set base registers for this cpu.
31 #[error("failed to set base registers for this cpu: {0}")]
32 SettingRegistersIoctl(base::Error),
33 /// Writing the GDT to RAM failed.
34 #[error("writing the GDT to RAM failed")]
35 WriteGDTFailure,
36 /// Writing the IDT to RAM failed.
37 #[error("writing the IDT to RAM failed")]
38 WriteIDTFailure,
39 /// Writing PDE to RAM failed.
40 #[error("writing PDE to RAM failed")]
41 WritePDEAddress,
42 /// Writing PDPTE to RAM failed.
43 #[error("writing PDPTE to RAM failed")]
44 WritePDPTEAddress,
45 /// Writing PML4 to RAM failed.
46 #[error("writing PML4 to RAM failed")]
47 WritePML4Address,
48 }
49
50 pub type Result<T> = result::Result<T, Error>;
51
// Memory type encodings used in the MTRR type fields.
const MTRR_MEMTYPE_UC: u8 = 0; // uncacheable
const MTRR_MEMTYPE_WB: u8 = 6; // write-back
// Bit 11 of a variable-range mask MSR: the range is valid.
const MTRR_VAR_VALID: u64 = 1 << 11;
// Bit 11 of the MTRR default-type MSR: MTRRs are enabled.
const MTRR_ENABLE: u64 = 1 << 11;
// MSR numbers of the first variable-range base/mask pair; pair `n` lives at
// these numbers plus `2 * n`.
const MTRR_PHYS_BASE_MSR: u32 = 0x200;
const MTRR_PHYS_MASK_MSR: u32 = 0x201;
// Selects the variable-range count from the value read out of MSR_MTRRcap.
const VAR_MTRR_NUM_MASK: u64 = 0xFF;
59
// Returns the value of the highest set bit in a 64-bit value, i.e. the largest
// power of two that is less than or equal to `data`.
//
// Returns 0 when `data` is 0: there is no set bit, and the shift amount
// `64 - leading_zeros - 1` would otherwise underflow.
fn get_power_of_two(data: u64) -> u64 {
    if data == 0 {
        return 0;
    }
    // `leading_zeros()` is at most 63 here, so the shift amount is in 0..=63.
    1 << (63 - data.leading_zeros())
}
65
66 // Returns the max length which suitable for mtrr setting based on the
67 // specified (base, len)
get_max_len(base: u64, len: u64) -> u6468 fn get_max_len(base: u64, len: u64) -> u64 {
69 let mut ret = get_power_of_two(len);
70
71 while base % ret != 0 {
72 ret >>= 1;
73 }
74
75 ret
76 }
77
78 // For the specified (Base, Len), returns (base, len) pair which could be
79 // set into mtrr register. mtrr requires: the base-address alignment value can't be
80 // less than its length
get_mtrr_pairs(base: u64, len: u64) -> Vec<(u64, u64)>81 fn get_mtrr_pairs(base: u64, len: u64) -> Vec<(u64, u64)> {
82 let mut vecs = Vec::new();
83
84 let mut remains = len;
85 let mut new = base;
86 while remains != 0 {
87 let max = get_max_len(new, remains);
88 vecs.push((new, max));
89 remains -= max;
90 new += max;
91 }
92
93 vecs
94 }
95
append_mtrr_entries( vm: &dyn Vm, vpu: &dyn VcpuX86_64, pci_start: u64, entries: &mut Vec<Register>, )96 fn append_mtrr_entries(
97 vm: &dyn Vm,
98 vpu: &dyn VcpuX86_64,
99 pci_start: u64,
100 entries: &mut Vec<Register>,
101 ) {
102 // Get VAR MTRR num from MSR_MTRRcap
103 let mut msrs = vec![Register {
104 id: crate::msr_index::MSR_MTRRcap,
105 ..Default::default()
106 }];
107 if vpu.get_msrs(&mut msrs).is_err() {
108 warn!("get msrs fail, guest with pass through device may be very slow");
109 return;
110 }
111 let var_num = msrs[0].value & VAR_MTRR_NUM_MASK;
112
113 // Set pci_start .. 4G as UC
114 // all others are set to default WB
115 let pci_len = (1 << 32) - pci_start;
116 let vecs = get_mtrr_pairs(pci_start, pci_len);
117 if vecs.len() as u64 > var_num {
118 warn!(
119 "mtrr fail for pci mmio, please check pci_start addr,
120 guest with pass through device may be very slow"
121 );
122 return;
123 }
124
125 let phys_mask: u64 = (1 << vm.get_guest_phys_addr_bits()) - 1;
126 for (idx, (base, len)) in vecs.iter().enumerate() {
127 let reg_idx = idx as u32 * 2;
128 entries.push(Register {
129 id: MTRR_PHYS_BASE_MSR + reg_idx,
130 value: base | MTRR_MEMTYPE_UC as u64,
131 });
132 let mask: u64 = len.wrapping_neg() & phys_mask | MTRR_VAR_VALID;
133 entries.push(Register {
134 id: MTRR_PHYS_MASK_MSR + reg_idx,
135 value: mask,
136 });
137 }
138 // Disable fixed MTRRs and enable variable MTRRs, set default type as WB
139 entries.push(Register {
140 id: crate::msr_index::MSR_MTRRdefType,
141 value: MTRR_ENABLE | MTRR_MEMTYPE_WB as u64,
142 });
143 }
144
create_msr_entries(vm: &dyn Vm, vcpu: &dyn VcpuX86_64, pci_start: u64) -> Vec<Register>145 fn create_msr_entries(vm: &dyn Vm, vcpu: &dyn VcpuX86_64, pci_start: u64) -> Vec<Register> {
146 let mut entries = vec![
147 Register {
148 id: crate::msr_index::MSR_IA32_SYSENTER_CS,
149 value: 0x0,
150 },
151 Register {
152 id: crate::msr_index::MSR_IA32_SYSENTER_ESP,
153 value: 0x0,
154 },
155 Register {
156 id: crate::msr_index::MSR_IA32_SYSENTER_EIP,
157 value: 0x0,
158 },
159 // x86_64 specific msrs, we only run on x86_64 not x86
160 Register {
161 id: crate::msr_index::MSR_STAR,
162 value: 0x0,
163 },
164 Register {
165 id: crate::msr_index::MSR_CSTAR,
166 value: 0x0,
167 },
168 Register {
169 id: crate::msr_index::MSR_KERNEL_GS_BASE,
170 value: 0x0,
171 },
172 Register {
173 id: crate::msr_index::MSR_SYSCALL_MASK,
174 value: 0x0,
175 },
176 Register {
177 id: crate::msr_index::MSR_LSTAR,
178 value: 0x0,
179 },
180 // end of x86_64 specific code
181 Register {
182 id: crate::msr_index::MSR_IA32_TSC,
183 value: 0x0,
184 },
185 Register {
186 id: crate::msr_index::MSR_IA32_MISC_ENABLE,
187 value: crate::msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64,
188 },
189 ];
190 append_mtrr_entries(vm, vcpu, pci_start, &mut entries);
191 entries
192 }
193
194 /// Configure Model specific registers for x86
195 ///
196 /// # Arguments
197 ///
198 /// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
setup_msrs(vm: &dyn Vm, vcpu: &dyn VcpuX86_64, pci_start: u64) -> Result<()>199 pub fn setup_msrs(vm: &dyn Vm, vcpu: &dyn VcpuX86_64, pci_start: u64) -> Result<()> {
200 let msrs = create_msr_entries(vm, vcpu, pci_start);
201 vcpu.set_msrs(&msrs).map_err(Error::MsrIoctlFailed)
202 }
203
204 /// Configure FPU registers for x86
205 ///
206 /// # Arguments
207 ///
208 /// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
setup_fpu(vcpu: &dyn VcpuX86_64) -> Result<()>209 pub fn setup_fpu(vcpu: &dyn VcpuX86_64) -> Result<()> {
210 let fpu = Fpu {
211 fcw: 0x37f,
212 mxcsr: 0x1f80,
213 ..Default::default()
214 };
215
216 vcpu.set_fpu(&fpu).map_err(Error::FpuIoctlFailed)
217 }
218
219 /// Configure base registers for x86
220 ///
221 /// # Arguments
222 ///
223 /// * `vcpu` - Structure for the vcpu that holds the vcpu fd.
224 /// * `boot_ip` - Starting instruction pointer.
225 /// * `boot_sp` - Starting stack pointer.
226 /// * `boot_si` - Must point to zero page address per Linux ABI.
setup_regs(vcpu: &dyn VcpuX86_64, boot_ip: u64, boot_sp: u64, boot_si: u64) -> Result<()>227 pub fn setup_regs(vcpu: &dyn VcpuX86_64, boot_ip: u64, boot_sp: u64, boot_si: u64) -> Result<()> {
228 let regs = Regs {
229 rflags: 0x0000000000000002u64,
230 rip: boot_ip,
231 rsp: boot_sp,
232 rbp: boot_sp,
233 rsi: boot_si,
234 ..Default::default()
235 };
236
237 vcpu.set_regs(®s).map_err(Error::SettingRegistersIoctl)
238 }
239
// CR0 bits: protected mode enable and paging enable.
const X86_CR0_PE: u64 = 1 << 0;
const X86_CR0_PG: u64 = 1 << 31;
// CR4 bit: physical address extension.
const X86_CR4_PAE: u64 = 1 << 5;

// EFER bits: long mode enable and long mode active.
const EFER_LME: u64 = 1 << 8;
const EFER_LMA: u64 = 1 << 10;

// Guest physical addresses the boot GDT and IDT are written to.
const BOOT_GDT_OFFSET: u64 = 0x1500;
const BOOT_IDT_OFFSET: u64 = 0x1520;

// Number of entries in the boot GDT.
const BOOT_GDT_MAX: usize = 4;
251
write_gdt_table(table: &[u64], guest_mem: &GuestMemory) -> Result<()>252 fn write_gdt_table(table: &[u64], guest_mem: &GuestMemory) -> Result<()> {
253 let boot_gdt_addr = GuestAddress(BOOT_GDT_OFFSET);
254 for (index, entry) in table.iter().enumerate() {
255 let addr = guest_mem
256 .checked_offset(boot_gdt_addr, (index * mem::size_of::<u64>()) as u64)
257 .ok_or(Error::WriteGDTFailure)?;
258 guest_mem
259 .write_obj_at_addr(*entry, addr)
260 .map_err(|_| Error::WriteGDTFailure)?;
261 }
262 Ok(())
263 }
264
write_idt_value(val: u64, guest_mem: &GuestMemory) -> Result<()>265 fn write_idt_value(val: u64, guest_mem: &GuestMemory) -> Result<()> {
266 let boot_idt_addr = GuestAddress(BOOT_IDT_OFFSET);
267 guest_mem
268 .write_obj_at_addr(val, boot_idt_addr)
269 .map_err(|_| Error::WriteIDTFailure)
270 }
271
configure_segments_and_sregs(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()>272 fn configure_segments_and_sregs(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()> {
273 let gdt_table: [u64; BOOT_GDT_MAX as usize] = [
274 gdt::gdt_entry(0, 0, 0), // NULL
275 gdt::gdt_entry(0xa09b, 0, 0xfffff), // CODE
276 gdt::gdt_entry(0xc093, 0, 0xfffff), // DATA
277 gdt::gdt_entry(0x808b, 0, 0xfffff), // TSS
278 ];
279
280 let code_seg = gdt::segment_from_gdt(gdt_table[1], 1);
281 let data_seg = gdt::segment_from_gdt(gdt_table[2], 2);
282 let tss_seg = gdt::segment_from_gdt(gdt_table[3], 3);
283
284 // Write segments
285 write_gdt_table(&gdt_table[..], mem)?;
286 sregs.gdt.base = BOOT_GDT_OFFSET as u64;
287 sregs.gdt.limit = mem::size_of_val(&gdt_table) as u16 - 1;
288
289 write_idt_value(0, mem)?;
290 sregs.idt.base = BOOT_IDT_OFFSET as u64;
291 sregs.idt.limit = mem::size_of::<u64>() as u16 - 1;
292
293 sregs.cs = code_seg;
294 sregs.ds = data_seg;
295 sregs.es = data_seg;
296 sregs.fs = data_seg;
297 sregs.gs = data_seg;
298 sregs.ss = data_seg;
299 sregs.tr = tss_seg;
300
301 /* 64-bit protected mode */
302 sregs.cr0 |= X86_CR0_PE;
303 sregs.efer |= EFER_LME;
304
305 Ok(())
306 }
307
setup_page_tables(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()>308 fn setup_page_tables(mem: &GuestMemory, sregs: &mut Sregs) -> Result<()> {
309 // Puts PML4 right after zero page but aligned to 4k.
310 let boot_pml4_addr = GuestAddress(0x9000);
311 let boot_pdpte_addr = GuestAddress(0xa000);
312 let boot_pde_addr = GuestAddress(0xb000);
313
314 // Entry covering VA [0..512GB)
315 mem.write_obj_at_addr(boot_pdpte_addr.offset() as u64 | 0x03, boot_pml4_addr)
316 .map_err(|_| Error::WritePML4Address)?;
317
318 // Entry covering VA [0..1GB)
319 mem.write_obj_at_addr(boot_pde_addr.offset() as u64 | 0x03, boot_pdpte_addr)
320 .map_err(|_| Error::WritePDPTEAddress)?;
321
322 // 512 2MB entries together covering VA [0..1GB). Note we are assuming
323 // CPU supports 2MB pages (/proc/cpuinfo has 'pse'). All modern CPUs do.
324 for i in 0..512 {
325 mem.write_obj_at_addr((i << 21) + 0x83u64, boot_pde_addr.unchecked_add(i * 8))
326 .map_err(|_| Error::WritePDEAddress)?;
327 }
328 sregs.cr3 = boot_pml4_addr.offset() as u64;
329 sregs.cr4 |= X86_CR4_PAE;
330 sregs.cr0 |= X86_CR0_PG;
331 sregs.efer |= EFER_LMA; // Long mode is active. Must be auto-enabled with CR0_PG.
332 Ok(())
333 }
334
335 /// Configures the segment registers and system page tables for a given CPU.
336 ///
337 /// # Arguments
338 ///
339 /// * `mem` - The memory that will be passed to the guest.
340 /// * `vcpu` - The VCPU to configure registers on.
setup_sregs(mem: &GuestMemory, vcpu: &dyn VcpuX86_64) -> Result<()>341 pub fn setup_sregs(mem: &GuestMemory, vcpu: &dyn VcpuX86_64) -> Result<()> {
342 let mut sregs = vcpu.get_sregs().map_err(Error::GetSRegsIoctlFailed)?;
343
344 configure_segments_and_sregs(mem, &mut sregs)?;
345 setup_page_tables(mem, &mut sregs)?; // TODO(dgreid) - Can this be done once per system instead?
346
347 vcpu.set_sregs(&sregs).map_err(Error::SetSRegsIoctlFailed)?;
348
349 Ok(())
350 }
351
#[cfg(test)]
mod tests {
    use super::*;
    use vm_memory::{GuestAddress, GuestMemory};

    /// Guest memory with a single 0x10000-byte region starting at address 0,
    /// which covers every address the functions under test write to.
    fn create_guest_mem() -> GuestMemory {
        GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap()
    }

    /// Reads the u64 stored at guest address `offset`.
    fn read_u64(gm: &GuestMemory, offset: u64) -> u64 {
        gm.read_obj_from_addr(GuestAddress(offset)).unwrap()
    }

    #[test]
    fn segments_and_sregs() {
        let mut sregs = Default::default();
        let gm = create_guest_mem();
        configure_segments_and_sregs(&gm, &mut sregs).unwrap();

        // GDT contents, one 8-byte descriptor per slot.
        let expected_gdt: [u64; 4] = [
            0x0,
            0xaf9b000000ffff,
            0xcf93000000ffff,
            0x8f8b000000ffff,
        ];
        for (slot, want) in expected_gdt.iter().enumerate() {
            assert_eq!(*want, read_u64(&gm, BOOT_GDT_OFFSET + 8 * slot as u64));
        }
        assert_eq!(0x0, read_u64(&gm, BOOT_IDT_OFFSET));

        // Spot checks of the segment registers derived from the GDT.
        assert_eq!(0, sregs.cs.base);
        assert_eq!(0xfffff, sregs.ds.limit);
        assert_eq!(0x10, sregs.es.selector);
        assert_eq!(1, sregs.fs.present);
        assert_eq!(1, sregs.gs.g);
        assert_eq!(0, sregs.ss.avl);
        assert_eq!(0, sregs.tr.base);
        assert_eq!(0xfffff, sregs.tr.limit);
        assert_eq!(0, sregs.tr.avl);

        // Starting from default sregs, only these bits are set.
        assert_eq!(X86_CR0_PE, sregs.cr0);
        assert_eq!(EFER_LME, sregs.efer);
    }

    #[test]
    fn page_tables() {
        let mut sregs = Default::default();
        let gm = create_guest_mem();
        setup_page_tables(&gm, &mut sregs).unwrap();

        // PML4 -> PDPT -> page directory chain.
        assert_eq!(0xa003, read_u64(&gm, 0x9000));
        assert_eq!(0xb003, read_u64(&gm, 0xa000));

        // 512 page directory entries, each mapping one 2MB page.
        for idx in 0..512u64 {
            let want = (idx << 21) + 0x83u64;
            assert_eq!(want, read_u64(&gm, 0xb000 + idx * 8));
        }

        assert_eq!(0x9000, sregs.cr3);
        assert_eq!(X86_CR4_PAE, sregs.cr4);
        assert_eq!(X86_CR0_PG, sregs.cr0);
    }
}
408