// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>

/*
 * The device memory usable by the workloads running in the VM is cached
 * and exposed as a 64b device BAR (comprising the BAR4 and BAR5 regions)
 * to the VM and is represented as usemem.
 * Moreover, the VM GPU device driver needs a non-cacheable region to
 * support the MIG feature. This region is also exposed as a 64b BAR
 * (comprising the BAR2 and BAR3 regions) and represented as resmem.
 */
#define RESMEM_REGION_INDEX	VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX	VFIO_PCI_BAR4_REGION_INDEX

/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE	SZ_512M

#define DVSEC_BITMAP_OFFSET	0xA
#define MIG_SUPPORTED_WITH_CACHED_RESMEM	BIT(0)

#define GPU_CAP_DVSEC_REGISTER	3

/*
 * The state of the two device memory regions - resmem and usemem - is
 * saved as struct mem_region.
 */
struct mem_region {
	phys_addr_t memphys;    /* Base physical address of the region */
	size_t memlength;       /* Region size */
	size_t bar_size;        /* Reported region BAR size */
	__le64 bar_val;         /* Emulated BAR offset registers */
	union {
		void *memaddr;
		void __iomem *ioaddr;
	};                      /* Base virtual address of the region */
};

struct nvgrace_gpu_pci_core_device {
	struct vfio_pci_core_device core_device;
	/* Cached and usable memory for the VM. */
	struct mem_region usemem;
	/* Non-cached memory carved out from the end of device memory */
	struct mem_region resmem;
	/* Lock to control device memory kernel mapping */
	struct mutex remap_lock;
	bool has_mig_hw_bug;
};

static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	nvdev->resmem.bar_val = 0;
	nvdev->usemem.bar_val = 0;
}

/* Choose the structure corresponding to the fake BAR with a given index. */
static struct mem_region *
nvgrace_gpu_memregion(int index,
		      struct nvgrace_gpu_pci_core_device *nvdev)
{
	if (index == USEMEM_REGION_INDEX)
		return &nvdev->usemem;

	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
		return &nvdev->resmem;

	return NULL;
}

static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (nvdev->usemem.memlength) {
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		mutex_init(&nvdev->remap_lock);
	}

	vfio_pci_core_finish_enable(vdev);

	return 0;
}

static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	/* Unmap the mapping to the device memory cached region */
	if (nvdev->usemem.memaddr) {
		memunmap(nvdev->usemem.memaddr);
		nvdev->usemem.memaddr = NULL;
	}

	/* Unmap the mapping to the device memory non-cached region */
	if (nvdev->resmem.ioaddr) {
		iounmap(nvdev->resmem.ioaddr);
		nvdev->resmem.ioaddr = NULL;
	}

	mutex_destroy(&nvdev->remap_lock);

	vfio_pci_core_close_device(core_vdev);
}

static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
			    struct vm_area_struct *vma)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	struct mem_region *memregion;
	unsigned long start_pfn;
	u64 req_len, pgoff, end;
	unsigned int index;
	int ret = 0;

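	/*
	 * The vfio-pci mmap offset encodes the region index in its upper
	 * bits (see VFIO_PCI_INDEX_TO_OFFSET()). vm_pgoff is expressed in
	 * pages, so shifting by (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)
	 * recovers the index, and the remaining low bits are the page
	 * offset into that region.
	 */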
	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return vfio_pci_core_mmap(core_vdev, vma);

	/*
	 * Request to mmap the BAR. Map to the CPU accessible memory on the
	 * GPU using the memory information gathered from the system ACPI
	 * tables.
	 */
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
	    check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
		return -EOVERFLOW;

	/*
	 * Check that the mapping request does not go beyond the available
	 * device memory size.
	 */
	if (end > memregion->memlength)
		return -EINVAL;

	/*
	 * The carved out region of the device memory needs the NORMAL_NC
	 * property. Communicate as such to the hypervisor.
	 */
	if (index == RESMEM_REGION_INDEX) {
		/*
		 * The nvgrace-gpu module has no issues with uncontained
		 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
		 * set to communicate to KVM that mapping this range as
		 * NORMAL_NC at stage 2 is acceptable. This opens up guest
		 * usage of NORMAL_NC for this mapping.
		 */
		vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);

		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
	}

	/*
	 * Perform a PFN map to the memory and back the device BAR by the
	 * GPU memory.
	 *
	 * The available GPU memory size may not be power-of-2 aligned. The
	 * remainder is only backed by vfio_device_ops read/write handlers.
	 *
	 * During device reset, the GPU is safely disconnected from the CPU,
	 * and accesses to the BAR return immediately, preventing a machine
	 * check.
	 */
	ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
			      req_len, vma->vm_page_prot);
	if (ret)
		return ret;

	vma->vm_pgoff = start_pfn;

	return 0;
}

static long
nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
				  unsigned long arg)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct vfio_region_info_cap_sparse_mmap *sparse;
	struct vfio_region_info info;
	struct mem_region *memregion;
	u32 size;
	int ret;

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	/*
	 * Request to determine the BAR region information. Send the
	 * GPU memory information.
	 */
	memregion = nvgrace_gpu_memregion(info.index, nvdev);
	if (!memregion)
		return vfio_pci_core_ioctl(core_vdev,
					   VFIO_DEVICE_GET_REGION_INFO, arg);

	size = struct_size(sparse, areas, 1);

	/*
	 * Setup for sparse mapping for the device memory. Only the
	 * available device memory on the hardware is shown as a
	 * mappable region.
	 */
	sparse = kzalloc(size, GFP_KERNEL);
	if (!sparse)
		return -ENOMEM;

	sparse->nr_areas = 1;
	sparse->areas[0].offset = 0;
	sparse->areas[0].size = memregion->memlength;
	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
	sparse->header.version = 1;

	ret = vfio_info_add_capability(&caps, &sparse->header, size);
	kfree(sparse);
	if (ret)
		return ret;

	info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
	/*
	 * The region memory size may not be power-of-2 aligned. Since the
	 * memory is exposed as a BAR, report the size rounded up to the
	 * next power-of-2.
	 */
	info.size = memregion->bar_size;
	info.flags = VFIO_REGION_INFO_FLAG_READ |
		     VFIO_REGION_INFO_FLAG_WRITE |
		     VFIO_REGION_INFO_FLAG_MMAP;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user((void __user *)arg +
					 sizeof(info), caps.buf,
					 caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(info);
		}
		kfree(caps.buf);
	}
	return copy_to_user((void __user *)arg, &info, minsz) ?
	       -EFAULT : 0;
}

static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
			      unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case VFIO_DEVICE_GET_REGION_INFO:
		return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
	case VFIO_DEVICE_IOEVENTFD:
		return -ENOTTY;
	case VFIO_DEVICE_RESET:
		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
		fallthrough;
	default:
		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
	}
}

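/*
 * Mask the emulated BAR value the way a real BAR register behaves on read:
 * the low bits covered by the (power-of-2) BAR size always read back as
 * zero and the memory type/prefetch flags are reported in the low bits.
 * This is what lets the guest size the fake BAR by writing ~0 to the
 * register and reading it back.
 */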
static __le64
nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
{
	u64 tmp_val;

	tmp_val = le64_to_cpu(val64);
	tmp_val &= ~(bar_size - 1);
	tmp_val |= flags;

	return cpu_to_le64(tmp_val);
}

/*
 * Both the usable (usemem) and the reserved (resmem) device memory regions
 * are exposed as 64b fake device BARs in the VM. These fake BARs must
 * respond to the accesses on their respective PCI config space offsets.
 *
 * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
 * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
 */
static ssize_t
nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
			    char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	__le64 val64;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;
	int ret;

	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
	if (ret < 0)
		return ret;

	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(val64),
						&copy_offset, &copy_count,
						&register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count,
						     PCI_BASE_ADDRESS_4,
						     sizeof(val64),
						     &copy_offset, &copy_count,
						     &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
						   PCI_BASE_ADDRESS_MEM_TYPE_64 |
						   PCI_BASE_ADDRESS_MEM_PREFETCH,
						   memregion->bar_val);
		if (copy_to_user(buf + copy_offset,
				 (void *)&val64 + register_offset, copy_count)) {
			/*
			 * The position has been incremented in
			 * vfio_pci_core_read. Reset the offset back to the
			 * starting position.
			 */
			*ppos -= count;
			return -EFAULT;
		}
	}

	return count;
}

static ssize_t
nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
			     const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion = NULL;
	size_t register_offset;
	loff_t copy_offset;
	size_t copy_count;

	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
						sizeof(u64), &copy_offset,
						&copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
	else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
						     sizeof(u64), &copy_offset,
						     &copy_count, &register_offset))
		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);

	if (memregion) {
		if (copy_from_user((void *)&memregion->bar_val + register_offset,
				   buf + copy_offset, copy_count))
			return -EFAULT;
		*ppos += copy_count;
		return copy_count;
	}

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

/*
 * Ad hoc map the device memory in the module kernel VA space. Primarily needed
 * as vfio does not require the userspace driver to only perform accesses through
 * mmaps of the vfio-pci BAR regions and such accesses should be supported using
 * vfio_device_ops read/write implementations.
 *
 * The usemem region is cacheable memory and hence is memremapped.
 * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
 */
static int
nvgrace_gpu_map_device_mem(int index,
			   struct nvgrace_gpu_pci_core_device *nvdev)
{
	struct mem_region *memregion;
	int ret = 0;

	memregion = nvgrace_gpu_memregion(index, nvdev);
	if (!memregion)
		return -EINVAL;

	mutex_lock(&nvdev->remap_lock);

	if (memregion->memaddr)
		goto unlock;

	if (index == USEMEM_REGION_INDEX)
		memregion->memaddr = memremap(memregion->memphys,
					      memregion->memlength,
					      MEMREMAP_WB);
	else
		memregion->ioaddr = ioremap_wc(memregion->memphys,
					       memregion->memlength);

	if (!memregion->memaddr)
		ret = -ENOMEM;

unlock:
	mutex_unlock(&nvdev->remap_lock);

	return ret;
}

/*
 * Read the data from the device memory (mapped either through ioremap
 * or memremap) into the user buffer.
 */
static int
nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
			 char __user *buf, size_t mem_count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	/*
	 * Handle read on the BAR regions. Map to the target device memory
	 * physical address and copy to the requested read buffer.
	 */
	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_to_user(buf,
				 (u8 *)nvdev->usemem.memaddr + offset,
				 mem_count))
			ret = -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It synthesizes ~0 on such read. So there is
		 * no need to check or support the disablement/enablement of
		 * BAR through PCI_COMMAND config space register. Pass
		 * test_mem flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     buf, offset, mem_count,
					     0, 0, false);
	}

	return ret;
}

/*
 * Read count bytes from the device memory at an offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes
 * the size to a power-of-2 (reported) when exposing to a user space driver.
 *
 * Reads starting beyond the reported size generate -EINVAL; reads extending
 * beyond the actual device size are filled with ~0; reads extending beyond
 * the reported size are truncated.
 */
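/*
 * For illustration, with a hypothetical memlength of 0x17E0000000 (~95.5 GB)
 * and a reported bar_size of 0x2000000000 (128 GB): a 16-byte read at offset
 * 0x17DFFFFFF8 returns 8 bytes of device memory followed by 8 bytes of 0xFF,
 * while a read at offset 0x1FFFFFFFF8, within the reported size but past the
 * device memory, returns only 0xFF bytes.
 */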
static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		     char __user *buf, size_t count, loff_t *ppos)
{
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct mem_region *memregion;
	size_t mem_count, i;
	u8 val = 0xFF;
	int ret;

	/* No need to do NULL check as caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the read request beyond the reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes are actually read from the device memory.
	 * Read requests beyond the actual device memory size are filled with
	 * ~0, while those beyond the reported size are truncated.
	 */
	if (offset >= memregion->memlength)
		mem_count = 0;
	else
		mem_count = min(count, memregion->memlength - (size_t)offset);

	ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
	if (ret)
		return ret;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. A read to an offset beyond the device
	 * memory size is filled with ~0.
	 */
	for (i = mem_count; i < count; i++) {
		ret = put_user(val, (unsigned char __user *)(buf + i));
		if (ret)
			return ret;
	}

	*ppos += count;
	return count;
}

static ssize_t
nvgrace_gpu_read(struct vfio_device *core_vdev,
		 char __user *buf, size_t count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);

	if (nvgrace_gpu_memregion(index, nvdev))
		return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_read(core_vdev, buf, count, ppos);
}

/*
 * Write the data to the device memory (mapped either through ioremap
 * or memremap) from the user buffer.
 */
static int
nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
			  const char __user *buf, size_t mem_count,
			  loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret;

	if (!mem_count)
		return 0;

	ret = nvgrace_gpu_map_device_mem(index, nvdev);
	if (ret)
		return ret;

	if (index == USEMEM_REGION_INDEX) {
		if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
				   buf, mem_count))
			return -EFAULT;
	} else {
		/*
		 * The hardware ensures that the system does not crash when
		 * the device memory is accessed with the memory enable
		 * turned off. It drops such writes. So there is no need to
		 * check or support the disablement/enablement of BAR
		 * through PCI_COMMAND config space register. Pass test_mem
		 * flag as false.
		 */
		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
					     nvdev->resmem.ioaddr,
					     (char __user *)buf, pos, mem_count,
					     0, 0, true);
	}

	return ret;
}

/*
 * Write count bytes to the device memory at a given offset. The actual device
 * memory size (available) may not be a power-of-2. So the driver fakes the
 * size to a power-of-2 (reported) when exposing to a user space driver.
 *
 * Writes extending beyond the reported size are truncated; writes starting
 * beyond the reported size generate -EINVAL.
 */
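/*
 * Note that a write landing wholly between the end of the available device
 * memory and the reported BAR size is acknowledged (*ppos advances and count
 * is returned) but the data is silently dropped.
 */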
static ssize_t
nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
		      size_t count, loff_t *ppos, const char __user *buf)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
	struct mem_region *memregion;
	size_t mem_count;
	int ret = 0;

	/* No need to do NULL check as caller does. */
	memregion = nvgrace_gpu_memregion(index, nvdev);

	if (offset >= memregion->bar_size)
		return -EINVAL;

	/* Clip short the write request beyond the reported BAR size */
	count = min(count, memregion->bar_size - (size_t)offset);

	/*
	 * Determine how many bytes are actually written to the device memory.
	 * Do not write to an offset beyond the available size.
	 */
	if (offset >= memregion->memlength)
		goto exitfn;

	/*
	 * Only the device memory present on the hardware is mapped, which may
	 * not be power-of-2 aligned. Drop access outside the available device
	 * memory on the hardware.
	 */
	mem_count = min(count, memregion->memlength - (size_t)offset);

	ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
	if (ret)
		return ret;

exitfn:
	*ppos += count;
	return count;
}

static ssize_t
nvgrace_gpu_write(struct vfio_device *core_vdev,
		  const char __user *buf, size_t count, loff_t *ppos)
{
	struct nvgrace_gpu_pci_core_device *nvdev =
		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
			     core_device.vdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);

	if (nvgrace_gpu_memregion(index, nvdev))
		return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);

	return vfio_pci_core_write(core_vdev, buf, count, ppos);
}

static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
	.name = "nvgrace-gpu-vfio-pci",
	.init = vfio_pci_core_init_dev,
	.release = vfio_pci_core_release_dev,
	.open_device = nvgrace_gpu_open_device,
	.close_device = nvgrace_gpu_close_device,
	.ioctl = nvgrace_gpu_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = nvgrace_gpu_read,
	.write = nvgrace_gpu_write,
	.mmap = nvgrace_gpu_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
	.name = "nvgrace-gpu-vfio-pci-core",
	.init = vfio_pci_core_init_dev,
	.release = vfio_pci_core_release_dev,
	.open_device = nvgrace_gpu_open_device,
	.close_device = vfio_pci_core_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int
nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
				  u64 *pmemphys, u64 *pmemlength)
{
	int ret;

	/*
	 * The memory information is present in the system ACPI tables as DSD
	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
	 */
	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
				       pmemphys);
	if (ret)
		return ret;

	if (*pmemphys > type_max(phys_addr_t))
		return -EOVERFLOW;

	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
				       pmemlength);
	if (ret)
		return ret;

	if (*pmemlength > type_max(size_t))
		return -EOVERFLOW;

	/*
	 * If the C2C link is not up due to an error, the coherent device
	 * memory size is returned as 0. Fail in such case.
	 */
	if (*pmemlength == 0)
		return -ENOMEM;

	return ret;
}

static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
			      struct nvgrace_gpu_pci_core_device *nvdev,
			      u64 memphys, u64 memlength)
{
	int ret = 0;
	u64 resmem_size = 0;

	/*
	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
	 * region to support the MIG feature owing to a hardware bug. Since the
	 * device memory is mapped as NORMAL cached, carve out a region from the end
	 * with a different NORMAL_NC property (called reserved memory and
	 * represented as resmem). This region is then exposed as a 64b BAR
	 * (region 2 and 3) to the VM, while exposing the rest (termed as usable
	 * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
	 *
	 *               devmem (memlength)
	 * |-------------------------------------------------|
	 * |                                                 |
	 * usemem.memphys                              resmem.memphys
	 *
	 * This hardware bug is fixed on the Grace Blackwell platforms and the
	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
	 * Thus on systems with the hardware fix, there is no need to partition
	 * the GPU device memory and the entire memory is usable and mapped as
	 * NORMAL cached (i.e. resmem size is 0).
	 */
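	/*
	 * For illustration, with hypothetical numbers on a system that has
	 * the MIG bug: memlength = 96832 MiB and resmem_size = 1 GiB gives
	 * usemem.memlength = round_down(96832 MiB - 1024 MiB, 512 MiB)
	 *                  = 95744 MiB,
	 * resmem.memphys   = memphys + 95744 MiB, and
	 * resmem.memlength = 96832 MiB - 95744 MiB = 1088 MiB.
	 * The reported BAR sizes are then the next powers of two:
	 * 128 GiB for usemem and 2 GiB for resmem.
	 */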
	if (nvdev->has_mig_hw_bug)
		resmem_size = SZ_1G;

	nvdev->usemem.memphys = memphys;

	/*
	 * The device memory exposed to the VM is added to the kernel by the
	 * VM driver module in chunks of memory block size. Note that only the
	 * usable memory (usemem) is added to the kernel for usage by the VM
	 * workloads.
	 */
	if (check_sub_overflow(memlength, resmem_size,
			       &nvdev->usemem.memlength)) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The usemem region is exposed as a 64b BAR composed of region 4 and 5.
	 * Calculate and save the BAR size for the region.
	 */
	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);

	/*
	 * If the hardware has the fix for MIG, there is no requirement
	 * for splitting the device memory to create RESMEM. The entire
	 * device memory is usable and will be USEMEM. Return here for
	 * such case.
	 */
	if (!nvdev->has_mig_hw_bug)
		goto done;

	/*
	 * When the device memory is split to work around the MIG bug on
	 * Grace Hopper, the USEMEM part of the device memory has to be
	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
	 * GPU FW and VFIO driver. The VM device driver is also aware of it
	 * and makes use of the value for its calculation to determine USEMEM
	 * size. Note that the device memory may not be 512M aligned.
	 */
	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
					     MEMBLK_SIZE);
	if (nvdev->usemem.memlength == 0) {
		ret = -EINVAL;
		goto done;
	}

	if ((check_add_overflow(nvdev->usemem.memphys,
				nvdev->usemem.memlength,
				&nvdev->resmem.memphys)) ||
	    (check_sub_overflow(memlength, nvdev->usemem.memlength,
				&nvdev->resmem.memlength))) {
		ret = -EOVERFLOW;
		goto done;
	}

	/*
	 * The resmem region is exposed as a 64b BAR composed of region 2 and 3
	 * for Grace Hopper. Calculate and save the BAR size for the region.
	 */
	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
	return ret;
}

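/*
 * Determine whether the GPU needs the resmem carve-out. GPUs with the MIG
 * hardware fix advertise it through the MIG_SUPPORTED_WITH_CACHED_RESMEM
 * bit in the NVIDIA vendor-specific DVSEC capability; if the capability is
 * absent or the bit is clear, assume the bug is present.
 */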
static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
{
	int pcie_dvsec;
	u16 dvsec_ctrl16;

	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
					       GPU_CAP_DVSEC_REGISTER);

	if (pcie_dvsec) {
		pci_read_config_word(pdev,
				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
				     &dvsec_ctrl16);

		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
			return false;
	}

	return true;
}

static int nvgrace_gpu_probe(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
	struct nvgrace_gpu_pci_core_device *nvdev;
	u64 memphys, memlength;
	int ret;

	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
	if (!ret)
		ops = &nvgrace_gpu_pci_ops;

	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
				  &pdev->dev, ops);
	if (IS_ERR(nvdev))
		return PTR_ERR(nvdev);

	dev_set_drvdata(&pdev->dev, &nvdev->core_device);

	if (ops == &nvgrace_gpu_pci_ops) {
		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);

		/*
		 * Device memory properties are identified in the host ACPI
		 * table. Set the nvgrace_gpu_pci_core_device structure.
		 */
		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
						    memphys, memlength);
		if (ret)
			goto out_put_vdev;
	}

	ret = vfio_pci_core_register_device(&nvdev->core_device);
	if (ret)
		goto out_put_vdev;

	return ret;

out_put_vdev:
	vfio_put_device(&nvdev->core_device.vdev);
	return ret;
}

static void nvgrace_gpu_remove(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	vfio_pci_core_unregister_device(core_device);
	vfio_put_device(&core_device->vdev);
}

static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
	/* GH200 120GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
	/* GH200 480GB */
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
	{}
};

MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);

static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = nvgrace_gpu_vfio_pci_table,
	.probe = nvgrace_gpu_probe,
	.remove = nvgrace_gpu_remove,
	.err_handler = &vfio_pci_core_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(nvgrace_gpu_vfio_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");