1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/sizes.h>
7 #include <linux/vfio_pci_core.h>
8 
9 /*
10  * The device memory usable by the workloads running in the VM is cached
11  * and exposed to the VM as a 64b device BAR (comprising the BAR4 and
12  * BAR5 regions), represented as usemem.
13  * Moreover, the VM GPU device driver needs a non-cacheable region to
14  * support the MIG feature. This region is also exposed as a 64b BAR
15  * (comprising the BAR2 and BAR3 regions) and represented as resmem.
16  */
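/*
 * Illustrative sketch (not part of the driver): per standard PCI semantics,
 * a 64b memory BAR spans two consecutive 32-bit BAR registers, so a guest or
 * VMM reading the emulated usemem BAR assembles the address from the BAR4
 * (low) and BAR5 (high) dwords. assemble_usemem_bar() below is a
 * hypothetical helper, shown only to clarify the pairing and kept out of
 * the build.
 */
#if 0
static u64 assemble_usemem_bar(u32 bar4_lo, u32 bar5_hi)
{
	/* The low dword also carries the memory type/prefetch flag bits. */
	return ((u64)bar5_hi << 32) | (bar4_lo & PCI_BASE_ADDRESS_MEM_MASK);
}
#endif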
17 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
18 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
19 
20 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
21 #define MEMBLK_SIZE SZ_512M
22 
23 #define DVSEC_BITMAP_OFFSET 0xA
24 #define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
25 
26 #define GPU_CAP_DVSEC_REGISTER 3
27 
28 /*
29  * The state of the two device memory regions - resmem and usemem - is
30  * saved as struct mem_region.
31  */
32 struct mem_region {
33 	phys_addr_t memphys;    /* Base physical address of the region */
34 	size_t memlength;       /* Region size */
35 	size_t bar_size;        /* Reported region BAR size */
36 	__le64 bar_val;         /* Emulated BAR offset registers */
37 	union {
38 		void *memaddr;
39 		void __iomem *ioaddr;
40 	};                      /* Base virtual address of the region */
41 };
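/*
 * A minimal sketch with hypothetical numbers: memlength holds the actual
 * device memory size while bar_size holds the power-of-2 size reported as
 * the fake BAR. The gap between the two is only serviced by the read/write
 * handlers and is never mmapped. Kept out of the build.
 */
#if 0
static void nvgrace_gpu_bar_size_example(void)
{
	size_t memlength = 0x1790000000;	/* hypothetical, ~94.25 GB */
	size_t bar_size = roundup_pow_of_two(memlength);	/* 0x2000000000 */
}
#endif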
42 
43 struct nvgrace_gpu_pci_core_device {
44 	struct vfio_pci_core_device core_device;
45 	/* Cached and usable memory for the VM. */
46 	struct mem_region usemem;
47 	/* Non-cached memory carved out from the end of the device memory */
48 	struct mem_region resmem;
49 	/* Lock to control device memory kernel mapping */
50 	struct mutex remap_lock;
51 	bool has_mig_hw_bug;
52 };
53 
54 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
55 {
56 	struct nvgrace_gpu_pci_core_device *nvdev =
57 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
58 			     core_device.vdev);
59 
60 	nvdev->resmem.bar_val = 0;
61 	nvdev->usemem.bar_val = 0;
62 }
63 
64 /* Choose the structure corresponding to the fake BAR with a given index. */
65 static struct mem_region *
66 nvgrace_gpu_memregion(int index,
67 		      struct nvgrace_gpu_pci_core_device *nvdev)
68 {
69 	if (index == USEMEM_REGION_INDEX)
70 		return &nvdev->usemem;
71 
72 	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
73 		return &nvdev->resmem;
74 
75 	return NULL;
76 }
77 
78 static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
79 {
80 	struct vfio_pci_core_device *vdev =
81 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
82 	struct nvgrace_gpu_pci_core_device *nvdev =
83 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
84 			     core_device.vdev);
85 	int ret;
86 
87 	ret = vfio_pci_core_enable(vdev);
88 	if (ret)
89 		return ret;
90 
91 	if (nvdev->usemem.memlength) {
92 		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
93 		mutex_init(&nvdev->remap_lock);
94 	}
95 
96 	vfio_pci_core_finish_enable(vdev);
97 
98 	return 0;
99 }
100 
101 static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
102 {
103 	struct nvgrace_gpu_pci_core_device *nvdev =
104 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
105 			     core_device.vdev);
106 
107 	/* Unmap the kernel mapping of the cached device memory region */
108 	if (nvdev->usemem.memaddr) {
109 		memunmap(nvdev->usemem.memaddr);
110 		nvdev->usemem.memaddr = NULL;
111 	}
112 
113 	/* Unmap the kernel mapping of the non-cached device memory region */
114 	if (nvdev->resmem.ioaddr) {
115 		iounmap(nvdev->resmem.ioaddr);
116 		nvdev->resmem.ioaddr = NULL;
117 	}
118 
119 	mutex_destroy(&nvdev->remap_lock);
120 
121 	vfio_pci_core_close_device(core_vdev);
122 }
123 
124 static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
125 			    struct vm_area_struct *vma)
126 {
127 	struct nvgrace_gpu_pci_core_device *nvdev =
128 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
129 			     core_device.vdev);
130 	struct mem_region *memregion;
131 	unsigned long start_pfn;
132 	u64 req_len, pgoff, end;
133 	unsigned int index;
134 	int ret = 0;
135 
136 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
137 
138 	memregion = nvgrace_gpu_memregion(index, nvdev);
139 	if (!memregion)
140 		return vfio_pci_core_mmap(core_vdev, vma);
141 
142 	/*
143 	 * Request to mmap the BAR. Map to the CPU accessible memory on the
144 	 * GPU using the memory information gathered from the system ACPI
145 	 * tables.
146 	 */
147 	pgoff = vma->vm_pgoff &
148 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
149 
150 	if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
151 	    check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
152 	    check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
153 		return -EOVERFLOW;
154 
155 	/*
156 	 * Check that the mapping request does not go beyond available device
157 	 * memory size
158 	 */
159 	if (end > memregion->memlength)
160 		return -EINVAL;
161 
162 	/*
163 	 * The carved out region of the device memory needs the NORMAL_NC
164 	 * property. Communicate as such to the hypervisor.
165 	 */
166 	if (index == RESMEM_REGION_INDEX) {
167 		/*
168 		 * The nvgrace-gpu module has no issues with uncontained
169 		 * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is
170 		 * set to communicate to the KVM to S2 map as NORMAL_NC.
171 		 * This opens up guest usage of NORMAL_NC for this mapping.
172 		 */
173 		vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED);
174 
175 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
176 	}
177 
178 	/*
179 	 * Perform a PFN map to the memory so that the device BAR is backed
180 	 * by the GPU memory.
181 	 *
182 	 * The available GPU memory size may not be power-of-2 aligned. The
183 	 * remainder is only backed by vfio_device_ops read/write handlers.
184 	 *
185 	 * During device reset, the GPU is safely disconnected from the CPU
186 	 * and accesses to the BAR return immediately, preventing a machine
187 	 * check.
188 	 */
189 	ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
190 			      req_len, vma->vm_page_prot);
191 	if (ret)
192 		return ret;
193 
194 	vma->vm_pgoff = start_pfn;
195 
196 	return 0;
197 }
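#if 0
/*
 * Userspace-side sketch (not kernel code), assuming only the standard VFIO
 * uAPI: mmap the usemem fake BAR through the vfio device fd. The
 * region_offset/area_offset/area_size values would come from
 * VFIO_DEVICE_GET_REGION_INFO and its sparse-mmap capability (see the
 * sketch after the region-info ioctl handler below). Kept out of the build.
 */
#include <sys/mman.h>
#include <linux/vfio.h>

static void *map_usemem_area(int device_fd, __u64 region_offset,
			     __u64 area_offset, __u64 area_size)
{
	return mmap(NULL, area_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    device_fd, region_offset + area_offset);
}
#endif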
198 
199 static long
200 nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
201 				  unsigned long arg)
202 {
203 	struct nvgrace_gpu_pci_core_device *nvdev =
204 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
205 			     core_device.vdev);
206 	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
207 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
208 	struct vfio_region_info_cap_sparse_mmap *sparse;
209 	struct vfio_region_info info;
210 	struct mem_region *memregion;
211 	u32 size;
212 	int ret;
213 
214 	if (copy_from_user(&info, (void __user *)arg, minsz))
215 		return -EFAULT;
216 
217 	if (info.argsz < minsz)
218 		return -EINVAL;
219 
220 	/*
221 	 * Request to determine the BAR region information. Send the
222 	 * GPU memory information.
223 	 */
224 	memregion = nvgrace_gpu_memregion(info.index, nvdev);
225 	if (!memregion)
226 		return vfio_pci_core_ioctl(core_vdev,
227 					   VFIO_DEVICE_GET_REGION_INFO, arg);
228 
229 	size = struct_size(sparse, areas, 1);
230 
231 	/*
232 	 * Set up the sparse mapping for the device memory. Only the
233 	 * available device memory on the hardware is shown as a
234 	 * mappable region.
235 	 */
236 	sparse = kzalloc(size, GFP_KERNEL);
237 	if (!sparse)
238 		return -ENOMEM;
239 
240 	sparse->nr_areas = 1;
241 	sparse->areas[0].offset = 0;
242 	sparse->areas[0].size = memregion->memlength;
243 	sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
244 	sparse->header.version = 1;
245 
246 	ret = vfio_info_add_capability(&caps, &sparse->header, size);
247 	kfree(sparse);
248 	if (ret)
249 		return ret;
250 
251 	info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
252 	/*
253 	 * The region memory size may not be a power-of-2. Since the memory
254 	 * is exposed as a BAR, round the reported size up to the next
255 	 * power-of-2.
256 	 */
257 	info.size = memregion->bar_size;
258 	info.flags = VFIO_REGION_INFO_FLAG_READ |
259 		     VFIO_REGION_INFO_FLAG_WRITE |
260 		     VFIO_REGION_INFO_FLAG_MMAP;
261 
262 	if (caps.size) {
263 		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
264 		if (info.argsz < sizeof(info) + caps.size) {
265 			info.argsz = sizeof(info) + caps.size;
266 			info.cap_offset = 0;
267 		} else {
268 			vfio_info_cap_shift(&caps, sizeof(info));
269 			if (copy_to_user((void __user *)arg +
270 					 sizeof(info), caps.buf,
271 					 caps.size)) {
272 				kfree(caps.buf);
273 				return -EFAULT;
274 			}
275 			info.cap_offset = sizeof(info);
276 		}
277 		kfree(caps.buf);
278 	}
279 	return copy_to_user((void __user *)arg, &info, minsz) ?
280 			    -EFAULT : 0;
281 }
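#if 0
/*
 * Userspace-side sketch (not kernel code), assuming only the standard VFIO
 * uAPI: query the usemem region and walk the capability chain to find the
 * sparse-mmap area, which covers only the actual device memory rather than
 * the full power-of-2 BAR size. Kept out of the build.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int get_usemem_sparse_area(int device_fd, __u64 *offset, __u64 *size)
{
	struct vfio_region_info hdr = {
		.argsz = sizeof(hdr),
		.index = VFIO_PCI_BAR4_REGION_INDEX,
	};
	struct vfio_region_info *info;
	struct vfio_info_cap_header *cap;

	/* The first call reports the argsz needed to hold the capabilities. */
	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &hdr))
		return -1;

	info = calloc(1, hdr.argsz);
	if (!info)
		return -1;
	info->argsz = hdr.argsz;
	info->index = VFIO_PCI_BAR4_REGION_INDEX;

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, info) ||
	    !(info->flags & VFIO_REGION_INFO_FLAG_CAPS) || !info->cap_offset)
		goto err;

	for (cap = (void *)info + info->cap_offset; ;
	     cap = (void *)info + cap->next) {
		if (cap->id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) {
			struct vfio_region_info_cap_sparse_mmap *sparse =
				(void *)cap;

			*offset = sparse->areas[0].offset;
			*size = sparse->areas[0].size;
			free(info);
			return 0;
		}
		if (!cap->next)
			break;
	}
err:
	free(info);
	return -1;
}
#endif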
282 
283 static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
284 			      unsigned int cmd, unsigned long arg)
285 {
286 	switch (cmd) {
287 	case VFIO_DEVICE_GET_REGION_INFO:
288 		return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
289 	case VFIO_DEVICE_IOEVENTFD:
290 		return -ENOTTY;
291 	case VFIO_DEVICE_RESET:
292 		nvgrace_gpu_init_fake_bar_emu_regs(core_vdev);
293 		fallthrough;
294 	default:
295 		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
296 	}
297 }
298 
299 static __le64
300 nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64)
301 {
302 	u64 tmp_val;
303 
304 	tmp_val = le64_to_cpu(val64);
305 	tmp_val &= ~(bar_size - 1);
306 	tmp_val |= flags;
307 
308 	return cpu_to_le64(tmp_val);
309 }
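/*
 * Worked example with hypothetical numbers: with bar_size = 0x2000000000
 * (128 GB) and a guest-programmed bar_val of 0x4000001234, the offset bits
 * inside the BAR aperture are masked off and the type bits merged in, so
 * the value read back is 0x4000000000 | PCI_BASE_ADDRESS_MEM_TYPE_64 |
 * PCI_BASE_ADDRESS_MEM_PREFETCH, matching how a real BAR register behaves.
 */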
310 
311 /*
312  * Both the usable (usemem) and the reserved (resmem) device memory regions
313  * are exposed as 64b fake device BARs in the VM. These fake BARs must
314  * respond to the accesses on their respective PCI config space offsets.
315  *
316  * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3.
317  * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5.
318  */
319 static ssize_t
320 nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev,
321 			    char __user *buf, size_t count, loff_t *ppos)
322 {
323 	struct nvgrace_gpu_pci_core_device *nvdev =
324 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
325 			     core_device.vdev);
326 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
327 	struct mem_region *memregion = NULL;
328 	__le64 val64;
329 	size_t register_offset;
330 	loff_t copy_offset;
331 	size_t copy_count;
332 	int ret;
333 
334 	ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
335 	if (ret < 0)
336 		return ret;
337 
338 	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
339 						sizeof(val64),
340 						&copy_offset, &copy_count,
341 						&register_offset))
342 		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
343 	else if (vfio_pci_core_range_intersect_range(pos, count,
344 						     PCI_BASE_ADDRESS_4,
345 						     sizeof(val64),
346 						     &copy_offset, &copy_count,
347 						     &register_offset))
348 		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
349 
350 	if (memregion) {
351 		val64 = nvgrace_gpu_get_read_value(memregion->bar_size,
352 						   PCI_BASE_ADDRESS_MEM_TYPE_64 |
353 						   PCI_BASE_ADDRESS_MEM_PREFETCH,
354 						   memregion->bar_val);
355 		if (copy_to_user(buf + copy_offset,
356 				 (void *)&val64 + register_offset, copy_count)) {
357 			/*
358 			 * The position has been incremented in
359 			 * vfio_pci_core_read. Reset the offset back to the
360 			 * starting position.
361 			 */
362 			*ppos -= count;
363 			return -EFAULT;
364 		}
365 	}
366 
367 	return count;
368 }
369 
370 static ssize_t
371 nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev,
372 			     const char __user *buf, size_t count, loff_t *ppos)
373 {
374 	struct nvgrace_gpu_pci_core_device *nvdev =
375 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
376 			     core_device.vdev);
377 	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
378 	struct mem_region *memregion = NULL;
379 	size_t register_offset;
380 	loff_t copy_offset;
381 	size_t copy_count;
382 
383 	if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2,
384 						sizeof(u64), &copy_offset,
385 						&copy_count, &register_offset))
386 		memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev);
387 	else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4,
388 						     sizeof(u64), &copy_offset,
389 						     &copy_count, &register_offset))
390 		memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev);
391 
392 	if (memregion) {
393 		if (copy_from_user((void *)&memregion->bar_val + register_offset,
394 				   buf + copy_offset, copy_count))
395 			return -EFAULT;
396 		*ppos += copy_count;
397 		return copy_count;
398 	}
399 
400 	return vfio_pci_core_write(core_vdev, buf, count, ppos);
401 }
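#if 0
/*
 * Userspace-side sketch (not kernel code), assuming standard PCI sizing
 * semantics and the VFIO config region: the emulated BAR registers above
 * are sized exactly like real BARs, by writing all-ones and reading back.
 * cfg_off is the offset of VFIO_PCI_CONFIG_REGION_INDEX reported by
 * VFIO_DEVICE_GET_REGION_INFO. Kept out of the build.
 */
#include <unistd.h>
#include <linux/types.h>
#include <linux/pci_regs.h>

static __u64 size_usemem_bar(int device_fd, off_t cfg_off)
{
	__u32 ones = ~0U, lo, hi;
	__u64 bar;

	pwrite(device_fd, &ones, 4, cfg_off + PCI_BASE_ADDRESS_4);
	pwrite(device_fd, &ones, 4, cfg_off + PCI_BASE_ADDRESS_5);
	pread(device_fd, &lo, 4, cfg_off + PCI_BASE_ADDRESS_4);
	pread(device_fd, &hi, 4, cfg_off + PCI_BASE_ADDRESS_5);

	bar = ((__u64)hi << 32) | (lo & PCI_BASE_ADDRESS_MEM_MASK);

	/* The BAR reads back with the bits below bar_size forced to zero. */
	return ~bar + 1;
}
#endif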
402 
403 /*
404  * Map the device memory into the kernel VA space on demand. This is needed
405  * because vfio does not require the userspace driver to access the device
406  * only through mmaps of the vfio-pci BAR regions; such accesses must also be
407  * supported through the vfio_device_ops read/write implementations.
408  *
409  * The usemem region is cacheable memory and is hence memremapped.
410  * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC).
411  */
412 static int
413 nvgrace_gpu_map_device_mem(int index,
414 			   struct nvgrace_gpu_pci_core_device *nvdev)
415 {
416 	struct mem_region *memregion;
417 	int ret = 0;
418 
419 	memregion = nvgrace_gpu_memregion(index, nvdev);
420 	if (!memregion)
421 		return -EINVAL;
422 
423 	mutex_lock(&nvdev->remap_lock);
424 
425 	if (memregion->memaddr)
426 		goto unlock;
427 
428 	if (index == USEMEM_REGION_INDEX)
429 		memregion->memaddr = memremap(memregion->memphys,
430 					      memregion->memlength,
431 					      MEMREMAP_WB);
432 	else
433 		memregion->ioaddr = ioremap_wc(memregion->memphys,
434 					       memregion->memlength);
435 
436 	if (!memregion->memaddr)
437 		ret = -ENOMEM;
438 
439 unlock:
440 	mutex_unlock(&nvdev->remap_lock);
441 
442 	return ret;
443 }
444 
445 /*
446  * Read the data from the device memory (mapped either through ioremap
447  * or memremap) into the user buffer.
448  */
449 static int
450 nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev,
451 			 char __user *buf, size_t mem_count, loff_t *ppos)
452 {
453 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
454 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
455 	int ret;
456 
457 	if (!mem_count)
458 		return 0;
459 
460 	/*
461 	 * Handle read on the BAR regions. Map to the target device memory
462 	 * physical address and copy to the requested read buffer.
463 	 */
464 	ret = nvgrace_gpu_map_device_mem(index, nvdev);
465 	if (ret)
466 		return ret;
467 
468 	if (index == USEMEM_REGION_INDEX) {
469 		if (copy_to_user(buf,
470 				 (u8 *)nvdev->usemem.memaddr + offset,
471 				 mem_count))
472 			ret = -EFAULT;
473 	} else {
474 		/*
475 		 * The hardware ensures that the system does not crash when
476 		 * the device memory is accessed with the memory enable
477 	 * turned off. It synthesizes ~0 on such reads. So there is
478 		 * no need to check or support the disablement/enablement of
479 		 * BAR through PCI_COMMAND config space register. Pass
480 		 * test_mem flag as false.
481 		 */
482 		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
483 					     nvdev->resmem.ioaddr,
484 					     buf, offset, mem_count,
485 					     0, 0, false);
486 	}
487 
488 	return ret;
489 }
490 
491 /*
492  * Read count bytes from the device memory at an offset. The actual device
493  * memory size (available) may not be a power-of-2. So the driver fakes
494  * the size to a power-of-2 (reported) when exposing to a user space driver.
495  *
496  * Reads starting beyond the reported size generate -EINVAL; reads extending
497  * beyond the actual device memory size are filled with ~0; reads extending
498  * beyond the reported size are truncated.
499  */
500 static ssize_t
501 nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,
502 		     char __user *buf, size_t count, loff_t *ppos)
503 {
504 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
505 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
506 	struct mem_region *memregion;
507 	size_t mem_count, i;
508 	u8 val = 0xFF;
509 	int ret;
510 
511 	/* No need for a NULL check as the caller has already verified the region. */
512 	memregion = nvgrace_gpu_memregion(index, nvdev);
513 
514 	if (offset >= memregion->bar_size)
515 		return -EINVAL;
516 
517 	/* Truncate the read request at the reported BAR size */
518 	count = min(count, memregion->bar_size - (size_t)offset);
519 
520 	/*
521 	 * Determine how many bytes are actually read from the device memory.
522 	 * Reads beyond the actual device memory size are filled with ~0,
523 	 * while those beyond the reported size are skipped.
524 	 */
525 	if (offset >= memregion->memlength)
526 		mem_count = 0;
527 	else
528 		mem_count = min(count, memregion->memlength - (size_t)offset);
529 
530 	ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
531 	if (ret)
532 		return ret;
533 
534 	/*
535 	 * Only the device memory present on the hardware is mapped, which may
536 	 * not be power-of-2 aligned. A read to an offset beyond the device memory
537 	 * size is filled with ~0.
538 	 */
539 	for (i = mem_count; i < count; i++) {
540 		ret = put_user(val, (unsigned char __user *)(buf + i));
541 		if (ret)
542 			return ret;
543 	}
544 
545 	*ppos += count;
546 	return count;
547 }
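/*
 * Worked example with hypothetical numbers: with memregion->memlength =
 * 0x1790000000 and memregion->bar_size = 0x2000000000, a 16-byte read at
 * offset 0x178ffffff8 returns 8 bytes of device memory followed by 8 bytes
 * of 0xFF, while a read starting at or beyond 0x2000000000 fails with
 * -EINVAL.
 */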
548 
549 static ssize_t
550 nvgrace_gpu_read(struct vfio_device *core_vdev,
551 		 char __user *buf, size_t count, loff_t *ppos)
552 {
553 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
554 	struct nvgrace_gpu_pci_core_device *nvdev =
555 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
556 			     core_device.vdev);
557 
558 	if (nvgrace_gpu_memregion(index, nvdev))
559 		return nvgrace_gpu_read_mem(nvdev, buf, count, ppos);
560 
561 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
562 		return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos);
563 
564 	return vfio_pci_core_read(core_vdev, buf, count, ppos);
565 }
566 
567 /*
568  * Write the data to the device memory (mapped either through ioremap
569  * or memremap) from the user buffer.
570  */
571 static int
572 nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev,
573 			  const char __user *buf, size_t mem_count,
574 			  loff_t *ppos)
575 {
576 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
577 	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
578 	int ret;
579 
580 	if (!mem_count)
581 		return 0;
582 
583 	ret = nvgrace_gpu_map_device_mem(index, nvdev);
584 	if (ret)
585 		return ret;
586 
587 	if (index == USEMEM_REGION_INDEX) {
588 		if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos,
589 				   buf, mem_count))
590 			return -EFAULT;
591 	} else {
592 		/*
593 		 * The hardware ensures that the system does not crash when
594 		 * the device memory is accessed with the memory enable
595 		 * turned off. It drops such writes. So there is no need to
596 		 * check or support the disablement/enablement of BAR
597 		 * through PCI_COMMAND config space register. Pass test_mem
598 		 * flag as false.
599 		 */
600 		ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false,
601 					     nvdev->resmem.ioaddr,
602 					     (char __user *)buf, pos, mem_count,
603 					     0, 0, true);
604 	}
605 
606 	return ret;
607 }
608 
609 /*
610  * Write count bytes to the device memory at a given offset. The actual device
611  * memory size (available) may not be a power-of-2. So the driver fakes the
612  * size to a power-of-2 (reported) when exposing to a user space driver.
613  *
614  * Writes extending beyond the reported size are truncated; writes starting
615  * beyond the reported size generate -EINVAL.
616  */
617 static ssize_t
618 nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev,
619 		      size_t count, loff_t *ppos, const char __user *buf)
620 {
621 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
622 	u64 offset = *ppos & VFIO_PCI_OFFSET_MASK;
623 	struct mem_region *memregion;
624 	size_t mem_count;
625 	int ret = 0;
626 
627 	/* No need for a NULL check as the caller has already verified the region. */
628 	memregion = nvgrace_gpu_memregion(index, nvdev);
629 
630 	if (offset >= memregion->bar_size)
631 		return -EINVAL;
632 
633 	/* Truncate the write request at the reported BAR size */
634 	count = min(count, memregion->bar_size - (size_t)offset);
635 
636 	/*
637 	 * Determine how many bytes are actually written to the device memory.
638 	 * Do not write beyond the available device memory size.
639 	 */
640 	if (offset >= memregion->memlength)
641 		goto exitfn;
642 
643 	/*
644 	 * Only the device memory present on the hardware is mapped, which may
645 	 * not be power-of-2 aligned. Drop access outside the available device
646 	 * memory on the hardware.
647 	 */
648 	mem_count = min(count, memregion->memlength - (size_t)offset);
649 
650 	ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
651 	if (ret)
652 		return ret;
653 
654 exitfn:
655 	*ppos += count;
656 	return count;
657 }
658 
659 static ssize_t
660 nvgrace_gpu_write(struct vfio_device *core_vdev,
661 		  const char __user *buf, size_t count, loff_t *ppos)
662 {
663 	struct nvgrace_gpu_pci_core_device *nvdev =
664 		container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
665 			     core_device.vdev);
666 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
667 
668 	if (nvgrace_gpu_memregion(index, nvdev))
669 		return nvgrace_gpu_write_mem(nvdev, count, ppos, buf);
670 
671 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
672 		return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos);
673 
674 	return vfio_pci_core_write(core_vdev, buf, count, ppos);
675 }
676 
677 static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
678 	.name		= "nvgrace-gpu-vfio-pci",
679 	.init		= vfio_pci_core_init_dev,
680 	.release	= vfio_pci_core_release_dev,
681 	.open_device	= nvgrace_gpu_open_device,
682 	.close_device	= nvgrace_gpu_close_device,
683 	.ioctl		= nvgrace_gpu_ioctl,
684 	.device_feature	= vfio_pci_core_ioctl_feature,
685 	.read		= nvgrace_gpu_read,
686 	.write		= nvgrace_gpu_write,
687 	.mmap		= nvgrace_gpu_mmap,
688 	.request	= vfio_pci_core_request,
689 	.match		= vfio_pci_core_match,
690 	.bind_iommufd	= vfio_iommufd_physical_bind,
691 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
692 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
693 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
694 };
695 
696 static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
697 	.name		= "nvgrace-gpu-vfio-pci-core",
698 	.init		= vfio_pci_core_init_dev,
699 	.release	= vfio_pci_core_release_dev,
700 	.open_device	= nvgrace_gpu_open_device,
701 	.close_device	= vfio_pci_core_close_device,
702 	.ioctl		= vfio_pci_core_ioctl,
703 	.device_feature	= vfio_pci_core_ioctl_feature,
704 	.read		= vfio_pci_core_read,
705 	.write		= vfio_pci_core_write,
706 	.mmap		= vfio_pci_core_mmap,
707 	.request	= vfio_pci_core_request,
708 	.match		= vfio_pci_core_match,
709 	.bind_iommufd	= vfio_iommufd_physical_bind,
710 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
711 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
712 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
713 };
714 
715 static int
716 nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
717 				  u64 *pmemphys, u64 *pmemlength)
718 {
719 	int ret;
720 
721 	/*
722 	 * The memory information is present in the system ACPI tables as DSD
723 	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
724 	 */
725 	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa",
726 				       pmemphys);
727 	if (ret)
728 		return ret;
729 
730 	if (*pmemphys > type_max(phys_addr_t))
731 		return -EOVERFLOW;
732 
733 	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
734 				       pmemlength);
735 	if (ret)
736 		return ret;
737 
738 	if (*pmemlength > type_max(size_t))
739 		return -EOVERFLOW;
740 
741 	/*
742 	 * If the C2C link is not up due to an error, the coherent device
743 	 * memory size is returned as 0. Fail in such case.
744 	 */
745 	if (*pmemlength == 0)
746 		return -ENOMEM;
747 
748 	return ret;
749 }
750 
751 static int
752 nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
753 			      struct nvgrace_gpu_pci_core_device *nvdev,
754 			      u64 memphys, u64 memlength)
755 {
756 	int ret = 0;
757 	u64 resmem_size = 0;
758 
759 	/*
760 	 * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
761 	 * region to support the MIG feature owing to a hardware bug. Since the
762 	 * device memory is mapped as NORMAL cached, carve out a region from the end
763 	 * with the NORMAL_NC property instead (called reserved memory and
764 	 * represented as resmem). This region is then exposed as a 64b BAR
765 	 * (regions 2 and 3) to the VM, while the rest (termed usable memory and
766 	 * represented as usemem) is exposed as a cacheable 64b BAR (regions 4 and 5).
767 	 *
768 	 *               devmem (memlength)
769 	 * |-------------------------------------------------|
770 	 * |                                           |
771 	 * usemem.memphys                              resmem.memphys
772 	 *
773 	 * This hardware bug is fixed on the Grace Blackwell platforms and the
774 	 * presence of the bug can be determined through nvdev->has_mig_hw_bug.
775 	 * Thus on systems with the hardware fix, there is no need to partition
776 	 * the GPU device memory and the entire memory is usable and mapped as
777 	 * NORMAL cached (i.e. resmem size is 0).
778 	 */
779 	if (nvdev->has_mig_hw_bug)
780 		resmem_size = SZ_1G;
781 
782 	nvdev->usemem.memphys = memphys;
783 
784 	/*
785 	 * The device memory exposed to the VM is added to the kernel by the
786 	 * VM driver module in chunks of memory block size. Note that only the
787 	 * usable memory (usemem) is added to the kernel for usage by the VM
788 	 * workloads.
789 	 */
790 	if (check_sub_overflow(memlength, resmem_size,
791 			       &nvdev->usemem.memlength)) {
792 		ret = -EOVERFLOW;
793 		goto done;
794 	}
795 
796 	/*
797 	 * The usemem region is exposed as a 64b BAR composed of regions 4 and 5.
798 	 * Calculate and save the BAR size for the region.
799 	 */
800 	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
801 
802 	/*
803 	 * If the hardware has the fix for MIG, there is no requirement
804 	 * for splitting the device memory to create RESMEM. The entire
805 	 * device memory is usable and will be USEMEM. Return here for
806 	 * such case.
807 	 */
808 	if (!nvdev->has_mig_hw_bug)
809 		goto done;
810 
811 	/*
812 	 * When the device memory is split to work around the MIG bug on
813 	 * Grace Hopper, the USEMEM part of the device memory has to be
814 	 * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
815 	 * GPU FW and VFIO driver. The VM device driver is also aware of it
816 	 * and makes use of the value in its calculation to determine the
817 	 * USEMEM size. Note that the device memory may not be 512M aligned.
818 	 */
819 	nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
820 					     MEMBLK_SIZE);
821 	if (nvdev->usemem.memlength == 0) {
822 		ret = -EINVAL;
823 		goto done;
824 	}
825 
826 	if ((check_add_overflow(nvdev->usemem.memphys,
827 				nvdev->usemem.memlength,
828 				&nvdev->resmem.memphys)) ||
829 	    (check_sub_overflow(memlength, nvdev->usemem.memlength,
830 				&nvdev->resmem.memlength))) {
831 		ret = -EOVERFLOW;
832 		goto done;
833 	}
834 
835 	/*
836 	 * The resmem region is exposed as a 64b BAR composed of regions 2 and 3
837 	 * for Grace Hopper. Calculate and save the BAR size for the region.
838 	 */
839 	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
840 done:
841 	return ret;
842 }
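/*
 * Worked example with hypothetical numbers for a system with the MIG
 * hardware bug: with memlength = 0x1794000000, usemem.memlength becomes
 * round_down(0x1794000000 - SZ_1G, MEMBLK_SIZE) = 0x1740000000, so
 * resmem.memphys = memphys + 0x1740000000 and resmem.memlength =
 * 0x54000000. The reported BAR sizes round up to 0x2000000000 (usemem)
 * and 0x80000000 (resmem).
 */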
843 
844 static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
845 {
846 	int pcie_dvsec;
847 	u16 dvsec_ctrl16;
848 
849 	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
850 					       GPU_CAP_DVSEC_REGISTER);
851 
852 	if (pcie_dvsec) {
853 		pci_read_config_word(pdev,
854 				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
855 				     &dvsec_ctrl16);
856 
857 		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
858 			return false;
859 	}
860 
861 	return true;
862 }
863 
864 static int nvgrace_gpu_probe(struct pci_dev *pdev,
865 			     const struct pci_device_id *id)
866 {
867 	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
868 	struct nvgrace_gpu_pci_core_device *nvdev;
869 	u64 memphys, memlength;
870 	int ret;
871 
872 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
873 	if (!ret)
874 		ops = &nvgrace_gpu_pci_ops;
875 
876 	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
877 				  &pdev->dev, ops);
878 	if (IS_ERR(nvdev))
879 		return PTR_ERR(nvdev);
880 
881 	dev_set_drvdata(&pdev->dev, &nvdev->core_device);
882 
883 	if (ops == &nvgrace_gpu_pci_ops) {
884 		nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
885 
886 		/*
887 		 * Device memory properties are identified in the host ACPI
888 		 * table. Initialize the nvgrace_gpu_pci_core_device structure.
889 		 */
890 		ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
891 						    memphys, memlength);
892 		if (ret)
893 			goto out_put_vdev;
894 	}
895 
896 	ret = vfio_pci_core_register_device(&nvdev->core_device);
897 	if (ret)
898 		goto out_put_vdev;
899 
900 	return ret;
901 
902 out_put_vdev:
903 	vfio_put_device(&nvdev->core_device.vdev);
904 	return ret;
905 }
906 
907 static void nvgrace_gpu_remove(struct pci_dev *pdev)
908 {
909 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
910 
911 	vfio_pci_core_unregister_device(core_device);
912 	vfio_put_device(&core_device->vdev);
913 }
914 
915 static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
916 	/* GH200 120GB */
917 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
918 	/* GH200 480GB */
919 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
920 	{}
921 };
922 
923 MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table);
924 
925 static struct pci_driver nvgrace_gpu_vfio_pci_driver = {
926 	.name = KBUILD_MODNAME,
927 	.id_table = nvgrace_gpu_vfio_pci_table,
928 	.probe = nvgrace_gpu_probe,
929 	.remove = nvgrace_gpu_remove,
930 	.err_handler = &vfio_pci_core_err_handlers,
931 	.driver_managed_dma = true,
932 };
933 
934 module_pci_driver(nvgrace_gpu_vfio_pci_driver);
935 
936 MODULE_LICENSE("GPL");
937 MODULE_AUTHOR("Ankit Agrawal <ankita@nvidia.com>");
938 MODULE_AUTHOR("Aniket Agashe <aniketa@nvidia.com>");
939 MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
940