// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT

use compiler::bindings::*;
use nak_bindings::*;
use nv_push_rs::Push as NvPush;
use nvidia_headers::classes::cla0c0::mthd as cla0c0;
use nvidia_headers::classes::clb1c0::mthd as clb1c0;
use nvidia_headers::classes::clb1c0::MAXWELL_COMPUTE_B;
use nvidia_headers::classes::clc3c0::mthd as clc3c0;
use nvidia_headers::classes::clc3c0::VOLTA_COMPUTE_A;
use nvidia_headers::classes::clc6c0::mthd as clc6c0;
use nvidia_headers::classes::clc6c0::AMPERE_COMPUTE_A;

use std::io;
use std::ptr;
use std::ptr::NonNull;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Mutex;

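// Returns true if the given DRM device is a PCI device with NVIDIA's vendor
// ID.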
unsafe fn is_nvidia_device(dev: drmDevicePtr) -> bool {
    match (*dev).bustype as u32 {
        DRM_BUS_PCI => {
            let pci = &*(*dev).deviceinfo.pci;
            pci.vendor_id == (NVIDIA_VENDOR_ID as u16)
        }
        _ => false,
    }
}

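// Layout of constant buffer 0 as filled out by Runner::run_raw(): the GPU
// address of the data buffer (split into low/high 32-bit halves), the
// per-invocation stride, and the total number of invocations.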
#[repr(C)]
pub struct CB0 {
    pub data_addr_lo: u32,
    pub data_addr_hi: u32,
    pub data_stride: u32,
    pub invocations: u32,
}

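// A GART buffer object that is CPU-mapped at `map` and bound in the GPU VA
// space at `addr` for its whole lifetime.  Addresses come from the Runner's
// simple bump allocator (`next_addr`) and are never recycled.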
struct BO<'a> {
    run: &'a Runner,
    bo: NonNull<nouveau_ws_bo>,
    pub addr: u64,
    pub map: *mut std::os::raw::c_void,
}

impl<'a> BO<'a> {
    fn new(run: &'a Runner, size: u64) -> io::Result<BO<'a>> {
        let size = size.next_multiple_of(4096);

        let mut map: *mut std::os::raw::c_void = std::ptr::null_mut();
        let bo = unsafe {
            nouveau_ws_bo_new_mapped(
                run.dev.as_ptr(),
                size,
                0, // align
                NOUVEAU_WS_BO_GART,
                NOUVEAU_WS_BO_RDWR,
                ptr::from_mut(&mut map),
            )
        };
        let Some(bo) = NonNull::new(bo) else {
            return Err(io::Error::last_os_error());
        };
        assert!(!map.is_null());

        let addr = run.next_addr.fetch_add(size, Ordering::Relaxed);
        assert!(addr % 4096 == 0);

        unsafe {
            nouveau_ws_bo_bind_vma(
                run.dev.as_ptr(),
                bo.as_ptr(),
                addr,
                size,
                0, // bo_offset
                0, // pte_kind
            );
        }

        Ok(BO { run, bo, addr, map })
    }
}

impl Drop for BO<'_> {
    fn drop(&mut self) {
        unsafe {
            nouveau_ws_bo_unbind_vma(
                self.run.dev.as_ptr(),
                self.addr,
                self.bo.as_ref().size,
            );
            nouveau_ws_bo_destroy(self.bo.as_ptr());
        }
    }
}

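// A minimal compute-only nouveau context for running shaders on the hardware.
//
// `syncobj` is a DRM timeline syncobj and `sync_value` is the last timeline
// point requested, which serializes submissions.  `next_addr` is a bump
// allocator for GPU virtual addresses; it starts at 64 KiB, leaving the lowest
// pages of the VA space unused.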
pub struct Runner {
    dev: NonNull<nouveau_ws_device>,
    ctx: NonNull<nouveau_ws_context>,
    syncobj: u32,
    sync_value: Mutex<u64>,
    next_addr: AtomicU64,
}

impl Runner {
    pub fn new(dev_id: Option<usize>) -> Runner {
        unsafe {
            let mut drm_devices: [drmDevicePtr; 16] = std::mem::zeroed();
            let num_drm_devices = drmGetDevices(
                drm_devices.as_mut_ptr(),
                drm_devices.len().try_into().unwrap(),
            );

            assert!(num_drm_devices >= 0, "Failed to enumerate DRM devices");
            let num_drm_devices: usize = num_drm_devices.try_into().unwrap();

            let drm_dev = if let Some(dev_id) = dev_id {
                assert!(dev_id < num_drm_devices, "Unknown device {dev_id}");
                assert!(
                    is_nvidia_device(drm_devices[dev_id]),
                    "Device {dev_id} is not an NVIDIA device",
                );
                drm_devices[dev_id]
            } else {
                *drm_devices
                    .iter()
                    .find(|dev| is_nvidia_device(**dev))
                    .expect("Failed to find an NVIDIA device")
            };

            let dev = nouveau_ws_device_new(drm_dev);
            let dev =
                NonNull::new(dev).expect("Failed to create nouveau device");

            drmFreeDevices(
                drm_devices.as_mut_ptr(),
                num_drm_devices.try_into().unwrap(),
            );

            let mut ctx: *mut nouveau_ws_context = std::ptr::null_mut();
            let err = nouveau_ws_context_create(
                dev.as_ptr(),
                NOUVEAU_WS_ENGINE_COMPUTE,
                &mut ctx,
            );
            assert!(err == 0, "Failed to create nouveau context");
            let ctx = NonNull::new(ctx).unwrap();

            let mut syncobj = 0_u32;
            let err = drmSyncobjCreate(dev.as_ref().fd, 0, &mut syncobj);
            assert!(err == 0, "Failed to create syncobj");

            Runner {
                dev,
                ctx,
                syncobj,
                sync_value: Mutex::new(0),
                next_addr: AtomicU64::new(1 << 16),
            }
        }
    }

    pub fn dev_info(&self) -> &nv_device_info {
        unsafe { &self.dev.as_ref().info }
    }

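    // Submits a single push buffer of `len` bytes at GPU address `addr` on the
    // compute channel, waits for completion via the timeline syncobj, then
    // submits an empty exec so any error from the first submission is reported
    // back.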
    fn exec(&self, addr: u64, len: u16) -> io::Result<()> {
        let sync_value = unsafe {
            let mut sync_value = self.sync_value.lock().unwrap();
            *sync_value += 1;

            let push = drm_nouveau_exec_push {
                va: addr,
                va_len: len.into(),
                flags: 0,
            };
            let sig = drm_nouveau_sync {
                flags: DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ,
                handle: self.syncobj,
                timeline_value: *sync_value,
            };
            let exec = drm_nouveau_exec {
                channel: self.ctx.as_ref().channel as u32,
                wait_count: 0,
                wait_ptr: 0,
                push_count: 1,
                push_ptr: &push as *const _ as u64,
                sig_count: 1,
                sig_ptr: &sig as *const _ as u64,
            };
            let err = drmIoctl(
                self.dev.as_ref().fd,
                DRM_RS_IOCTL_NOUVEAU_EXEC.into(),
                &exec as *const _ as *mut std::os::raw::c_void,
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }
            *sync_value
        };
        // The close of this unsafe { } drops the lock

        unsafe {
            let err = drmSyncobjTimelineWait(
                self.dev.as_ref().fd,
                &self.syncobj as *const _ as *mut _,
                &sync_value as *const _ as *mut _,
                1,        // num_handles
                i64::MAX, // timeout_nsec
                DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
                std::ptr::null_mut(),
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }

            // Exec again to check for errors
            let mut exec = drm_nouveau_exec {
                channel: self.ctx.as_ref().channel as u32,
                wait_count: 0,
                wait_ptr: 0,
                push_count: 0,
                push_ptr: 0,
                sig_count: 0,
                sig_ptr: 0,
            };
            let err = drmIoctl(
                self.dev.as_ref().fd,
                DRM_RS_IOCTL_NOUVEAU_EXEC.into(),
                ptr::from_mut(&mut exec).cast(),
            );
            if err != 0 {
                return Err(io::Error::last_os_error());
            }
        }

        Ok(())
    }

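    // Runs a compute shader on the hardware.  A single BO is laid out as:
    // push buffer, QMD, shader code, cb0, then `data_size` bytes of caller
    // data.  The caller's data is copied in before the dispatch and copied
    // back out afterwards (even if the dispatch fails), and cb0 tells the
    // shader where to find it.
    //
    // Safety: `data` must be valid for reads and writes of `data_size` bytes.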
    pub unsafe fn run_raw(
        &self,
        shader: &nak_shader_bin,
        invocations: u32,
        data_stride: u32,
        data: *mut std::os::raw::c_void,
        data_size: usize,
    ) -> io::Result<()> {
        assert!(shader.info.stage == MESA_SHADER_COMPUTE);
        let cs_info = &shader.info.__bindgen_anon_1.cs;
        assert!(cs_info.local_size[1] == 1 && cs_info.local_size[2] == 1);
        let local_size = cs_info.local_size[0];

        // Compute the needed size of the buffer
        let mut size = 0_usize;

        const MAX_PUSH_DW: usize = 256;
        let push_offset = size;
        size = push_offset + 4 * MAX_PUSH_DW;

        const QMD_SIZE: usize = 64 * 4;
        let qmd_offset = size.next_multiple_of(0x100);
        size = qmd_offset + 4 * QMD_SIZE;

        let shader_offset = size.next_multiple_of(0x80);
        size = shader_offset + usize::try_from(shader.code_size).unwrap();

        let cb0_offset = size.next_multiple_of(256);
        size = cb0_offset + std::mem::size_of::<CB0>();

        let data_offset = size.next_multiple_of(256);
        size = data_offset + data_size;

        let bo = BO::new(self, size.try_into().unwrap())?;

        // Copy the data from the caller into our BO
        let data_addr = bo.addr + u64::try_from(data_offset).unwrap();
        let data_map = bo.map.byte_offset(data_offset.try_into().unwrap());
        if data_size > 0 {
            std::ptr::copy(data, data_map, data_size);
        }

        // Fill out cb0
        let cb0_addr = bo.addr + u64::try_from(cb0_offset).unwrap();
        let cb0_map = bo.map.byte_offset(cb0_offset.try_into().unwrap());
        cb0_map.cast::<CB0>().write(CB0 {
            data_addr_lo: data_addr as u32,
            data_addr_hi: (data_addr >> 32) as u32,
            data_stride,
            invocations,
        });

        // Upload the shader
        let shader_addr = bo.addr + u64::try_from(shader_offset).unwrap();
        let shader_map = bo.map.byte_offset(shader_offset.try_into().unwrap());
        std::ptr::copy(
            shader.code,
            shader_map,
            shader.code_size.try_into().unwrap(),
        );

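        // The QMD (Queue Meta Data) is the descriptor the compute engine reads
        // to launch a dispatch: grid size, constant buffer bindings, shared
        // memory size, and the shader address all live there.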
        // Populate and upload the QMD
        let mut qmd_cbufs: [nak_qmd_cbuf; 8] = unsafe { std::mem::zeroed() };
        qmd_cbufs[0] = nak_qmd_cbuf {
            index: 0,
            size: std::mem::size_of::<CB0>()
                .next_multiple_of(256)
                .try_into()
                .unwrap(),
            addr: cb0_addr,
        };
        let qmd_info = nak_qmd_info {
            // Pre-Volta, we set the program region to the start of the bo
            addr: if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
                shader_offset.try_into().unwrap()
            } else {
                shader_addr
            },
            smem_size: 0,
            smem_max: 48 * 1024,
            global_size: [invocations.div_ceil(local_size.into()), 1, 1],
            num_cbufs: 1,
            cbufs: qmd_cbufs,
        };

        let qmd_addr = bo.addr + u64::try_from(qmd_offset).unwrap();
        let qmd_map = bo.map.byte_offset(qmd_offset.try_into().unwrap());
        nak_fill_qmd(
            self.dev_info(),
            &shader.info,
            &qmd_info,
            qmd_map,
            QMD_SIZE,
        );

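        // The push buffer binds the compute class, points the program region
        // at the BO on pre-Volta parts, sets up the shared/local memory
        // windows, and finally launches the QMD with SEND_PCAS.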
        // Fill out the pushbuf
        let mut p = NvPush::new();

        p.push_method(cla0c0::SetObject {
            class_id: self.dev_info().cls_compute.into(),
            engine_id: 0,
        });
        if self.dev_info().cls_compute < VOLTA_COMPUTE_A {
            p.push_method(cla0c0::SetProgramRegionA {
                address_upper: (bo.addr >> 32) as u32,
            });
            p.push_method(cla0c0::SetProgramRegionB {
                address_lower: bo.addr as u32,
            });
        }

        let smem_base_addr = 0xfe000000_u32;
        let lmem_base_addr = 0xff000000_u32;
        if self.dev_info().cls_compute >= VOLTA_COMPUTE_A {
            p.push_method(clc3c0::SetShaderSharedMemoryWindowA {
                base_address_upper: 0,
            });
            p.push_method(clc3c0::SetShaderSharedMemoryWindowB {
                base_address: smem_base_addr,
            });

            p.push_method(clc3c0::SetShaderLocalMemoryWindowA {
                base_address_upper: 0,
            });
            p.push_method(clc3c0::SetShaderLocalMemoryWindowB {
                base_address: lmem_base_addr,
            });
        } else {
            p.push_method(cla0c0::SetShaderSharedMemoryWindow {
                base_address: smem_base_addr,
            });
            p.push_method(cla0c0::SetShaderLocalMemoryWindow {
                base_address: lmem_base_addr,
            });
        }

        if self.dev_info().cls_compute >= MAXWELL_COMPUTE_B {
            p.push_method(clb1c0::InvalidateSkedCaches { v: 0 });
        }

        p.push_method(cla0c0::SendPcasA {
            qmd_address_shifted8: (qmd_addr >> 8) as u32,
        });
        if self.dev_info().cls_compute >= AMPERE_COMPUTE_A {
            p.push_method(clc6c0::SendSignalingPcas2B {
                pcas_action: clc6c0::SendSignalingPcas2BPcasAction::InvalidateCopySchedule,
            });
        } else {
            p.push_method(cla0c0::SendSignalingPcasB {
                invalidate: true,
                schedule: true,
            });
        }

        let push_addr = bo.addr + u64::try_from(push_offset).unwrap();
        let push_map = bo.map.byte_offset(push_offset.try_into().unwrap());
        std::ptr::copy(p.as_ptr(), push_map.cast(), p.len());

        let res = self.exec(push_addr, (p.len() * 4).try_into().unwrap());

        // Always copy the data back to the caller, even if exec fails
        let data_map = bo.map.byte_offset(data_offset.try_into().unwrap());
        if data_size > 0 {
            std::ptr::copy(data_map, data, data_size);
        }

        res
    }

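    // Safe wrapper around run_raw(): runs `shader` with one invocation per
    // element of `data`, using size_of::<T>() as the data stride.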
    pub fn run<T>(
        &self,
        shader: &nak_shader_bin,
        data: &mut [T],
    ) -> io::Result<()> {
        unsafe {
            let stride = std::mem::size_of::<T>();
            self.run_raw(
                shader,
                data.len().try_into().unwrap(),
                stride.try_into().unwrap(),
                data.as_mut_ptr().cast(),
                data.len() * stride,
            )
        }
    }
}

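// SAFETY: Runner only uses its raw device/context pointers internally, and the
// mutable submission state is guarded by the Mutex and the atomic VA counter,
// so sharing a Runner across threads is assumed to be sound here.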
unsafe impl Sync for Runner {}
unsafe impl Send for Runner {}