use crate::api::event::create_and_queue;
use crate::api::icd::*;
use crate::api::util::*;
use crate::core::device::*;
use crate::core::event::*;
use crate::core::kernel::*;
use crate::core::memory::*;
use crate::core::program::*;
use crate::core::queue::*;

use mesa_rust_util::ptr::*;
use mesa_rust_util::string::*;
use rusticl_opencl_gen::*;
use rusticl_proc_macros::cl_entrypoint;
use rusticl_proc_macros::cl_info_entrypoint;

use std::cmp;
use std::ffi::CStr;
use std::mem;
use std::os::raw::c_void;
use std::ptr;
use std::slice;
use std::sync::Arc;

#[cl_info_entrypoint(clGetKernelInfo)]
unsafe impl CLInfo<cl_kernel_info> for cl_kernel {
    fn query(&self, q: cl_kernel_info, v: CLInfoValue) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;
        match q {
            CL_KERNEL_ATTRIBUTES => v.write::<&str>(&kernel.kernel_info.attributes_string),
            CL_KERNEL_CONTEXT => {
                let ptr = Arc::as_ptr(&kernel.prog.context);
                v.write::<cl_context>(cl_context::from_ptr(ptr))
            }
            CL_KERNEL_FUNCTION_NAME => v.write::<&str>(&kernel.name),
            CL_KERNEL_NUM_ARGS => v.write::<cl_uint>(kernel.kernel_info.args.len() as cl_uint),
            CL_KERNEL_PROGRAM => {
                let ptr = Arc::as_ptr(&kernel.prog);
                v.write::<cl_program>(cl_program::from_ptr(ptr))
            }
            CL_KERNEL_REFERENCE_COUNT => v.write::<cl_uint>(Kernel::refcnt(*self)?),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

#[cl_info_entrypoint(clGetKernelArgInfo)]
unsafe impl CLInfoObj<cl_kernel_arg_info, cl_uint> for cl_kernel {
    fn query(&self, idx: cl_uint, q: cl_kernel_arg_info, v: CLInfoValue) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
        if idx as usize >= kernel.kernel_info.args.len() {
            return Err(CL_INVALID_ARG_INDEX);
        }

        match q {
            CL_KERNEL_ARG_ACCESS_QUALIFIER => {
                v.write::<cl_kernel_arg_access_qualifier>(kernel.access_qualifier(idx))
            }
            CL_KERNEL_ARG_ADDRESS_QUALIFIER => {
                v.write::<cl_kernel_arg_address_qualifier>(kernel.address_qualifier(idx))
            }
            CL_KERNEL_ARG_NAME => v.write::<&CStr>(
                kernel
                    .arg_name(idx)
                    .ok_or(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)?,
            ),
            CL_KERNEL_ARG_TYPE_NAME => v.write::<&CStr>(
                kernel
                    .arg_type_name(idx)
                    .ok_or(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)?,
            ),
            CL_KERNEL_ARG_TYPE_QUALIFIER => {
                v.write::<cl_kernel_arg_type_qualifier>(kernel.type_qualifier(idx))
            }
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

#[cl_info_entrypoint(clGetKernelWorkGroupInfo)]
unsafe impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
    fn query(
        &self,
        dev: cl_device_id,
        q: cl_kernel_work_group_info,
        v: CLInfoValue,
    ) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated with kernel.
        let dev = if dev.is_null() {
            if kernel.prog.devs.len() > 1 {
                return Err(CL_INVALID_DEVICE);
            } else {
                kernel.prog.devs[0]
            }
        } else {
            Device::ref_from_raw(dev)?
        };

        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
        if !kernel.prog.devs.contains(&dev) {
            return Err(CL_INVALID_DEVICE);
        }

        match q {
            CL_KERNEL_COMPILE_WORK_GROUP_SIZE => v.write::<[usize; 3]>(kernel.work_group_size()),
            CL_KERNEL_LOCAL_MEM_SIZE => v.write::<cl_ulong>(kernel.local_mem_size(dev)),
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
                v.write::<usize>(kernel.preferred_simd_size(dev))
            }
            CL_KERNEL_PRIVATE_MEM_SIZE => v.write::<cl_ulong>(kernel.priv_mem_size(dev)),
            CL_KERNEL_WORK_GROUP_SIZE => v.write::<usize>(kernel.max_threads_per_block(dev)),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

#[cl_info_entrypoint(clGetKernelSubGroupInfo)]
unsafe impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void, usize)>
    for cl_kernel
{
    fn query(
        &self,
        (dev, input_value_size, input_value, output_value_size): (
            cl_device_id,
            usize,
            *const c_void,
            usize,
        ),
        q: cl_kernel_sub_group_info,
        v: CLInfoValue,
    ) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
        // with kernel.
        let dev = if dev.is_null() {
            if kernel.prog.devs.len() > 1 {
                return Err(CL_INVALID_DEVICE);
            } else {
                kernel.prog.devs[0]
            }
        } else {
            Device::ref_from_raw(dev)?
        };

        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
        if !kernel.prog.devs.contains(&dev) {
            return Err(CL_INVALID_DEVICE);
        }

        // CL_INVALID_OPERATION if device does not support subgroups.
        if !dev.subgroups_supported() {
            return Err(CL_INVALID_OPERATION);
        }

        let usize_byte = mem::size_of::<usize>();
        // First convert input_value into a typed slice; the expected layout depends on param_name.
        let input: &[usize] = match q {
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
                // CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                // CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by
                // input_value_size is not valid or if input_value is NULL.
                if ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size) {
                    return Err(CL_INVALID_VALUE);
                }
                // SAFETY: we verified the size as best as possible, with the rest we trust the client
                unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) }
            }
            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
                // CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
                // and the size in bytes specified by input_value_size is not valid or if
                // input_value is NULL.
                if input_value_size != usize_byte || input_value.is_null() {
                    return Err(CL_INVALID_VALUE);
                }
                // SAFETY: we trust the client here
                unsafe { slice::from_raw_parts(input_value.cast(), 1) }
            }
            _ => &[],
        };

        match q {
            CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
                v.write::<usize>(kernel.subgroups_for_block(dev, input))
            }
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => {
                v.write::<usize>(kernel.subgroup_size_for_block(dev, input))
            }
            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
                let subgroups = input[0];
                let mut res = vec![0; 3];

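                // Search the kernel's supported subgroup sizes for a 1D block that
                // yields exactly the requested subgroup count; if none fits within
                // the device limits, report a local size of all zeros.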
                for subgroup_size in kernel.subgroup_sizes(dev) {
                    let threads = subgroups * subgroup_size;

                    if threads > dev.max_threads_per_block() {
                        continue;
                    }

                    let block = [threads, 1, 1];
                    let real_subgroups = kernel.subgroups_for_block(dev, &block);

                    if real_subgroups == subgroups {
                        res = block.to_vec();
                        break;
                    }
                }

                res.truncate(output_value_size / usize_byte);
                v.write::<Vec<usize>>(res)
            }
            CL_KERNEL_MAX_NUM_SUB_GROUPS => {
                let threads = kernel.max_threads_per_block(dev);
                let max_groups = dev.max_subgroups();

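                // The count is maximized by the smallest supported subgroup size,
                // but it can never exceed the device-wide subgroup limit.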
                let mut result = 0;
                for sgs in kernel.subgroup_sizes(dev) {
                    result = cmp::max(result, threads / sgs);
                    result = cmp::min(result, max_groups as usize);
                }
                v.write::<usize>(result)
            }
            CL_KERNEL_COMPILE_NUM_SUB_GROUPS => v.write::<usize>(kernel.num_subgroups()),
            CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => v.write::<usize>(kernel.subgroup_size()),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

const ZERO_ARR: [usize; 3] = [0; 3];

/// # Safety
///
/// `arr` must either be NULL or point to an array of at least `work_dim` elements.
unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -> &'a [usize] {
    if !arr.is_null() {
        unsafe { slice::from_raw_parts(arr, work_dim as usize) }
    } else {
        &ZERO_ARR
    }
}

/// # Safety
///
/// `arr` must either be NULL or point to an array of at least `work_dim` elements.
unsafe fn kernel_work_arr_mut<'a>(arr: *mut usize, work_dim: cl_uint) -> Option<&'a mut [usize]> {
    if !arr.is_null() {
        unsafe { Some(slice::from_raw_parts_mut(arr, work_dim as usize)) }
    } else {
        None
    }
}

#[cl_entrypoint(clCreateKernel)]
fn create_kernel(
    program: cl_program,
    kernel_name: *const ::std::os::raw::c_char,
) -> CLResult<cl_kernel> {
    let p = Program::arc_from_raw(program)?;
    let name = c_string_to_string(kernel_name);

    // CL_INVALID_VALUE if kernel_name is NULL.
    if kernel_name.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    let build = p.build_info();
    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for program.
    if build.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_NAME if kernel_name is not found in program.
    if !build.kernels().contains(&name) {
        return Err(CL_INVALID_KERNEL_NAME);
    }

    // CL_INVALID_KERNEL_DEFINITION if the function definition for __kernel function given by
    // kernel_name such as the number of arguments, the argument types are not the same for all
    // devices for which the program executable has been built.
    if !p.has_unique_kernel_signatures(&name) {
        return Err(CL_INVALID_KERNEL_DEFINITION);
    }

    Ok(Kernel::new(name, Arc::clone(&p), &build).into_cl())
}

#[cl_entrypoint(clRetainKernel)]
fn retain_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::retain(kernel)
}

#[cl_entrypoint(clReleaseKernel)]
fn release_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::release(kernel)
}

#[cl_entrypoint(clCreateKernelsInProgram)]
fn create_kernels_in_program(
    program: cl_program,
    num_kernels: cl_uint,
    kernels: *mut cl_kernel,
    num_kernels_ret: *mut cl_uint,
) -> CLResult<()> {
    let p = Program::arc_from_raw(program)?;
    let build = p.build_info();

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for any device in
    // program.
    if build.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_VALUE if kernels is not NULL and num_kernels is less than the number of kernels
    // in program.
    if !kernels.is_null() && build.kernels().len() > num_kernels as usize {
        return Err(CL_INVALID_VALUE);
    }

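    // Shadows the num_kernels parameter, which has served its purpose in the
    // validation above; from here on it counts the kernels actually created.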
    let mut num_kernels = 0;
    for name in build.kernels() {
        // Kernel objects are not created for any __kernel functions in program that do not have the
        // same function definition across all devices for which a program executable has been
        // successfully built.
        if !p.has_unique_kernel_signatures(name) {
            continue;
        }

        if !kernels.is_null() {
            // we just assume the client isn't stupid
            unsafe {
                kernels
                    .add(num_kernels as usize)
                    .write(Kernel::new(name.clone(), p.clone(), &build).into_cl());
            }
        }
        num_kernels += 1;
    }
    num_kernels_ret.write_checked(num_kernels);
    Ok(())
}

#[cl_entrypoint(clSetKernelArg)]
fn set_kernel_arg(
    kernel: cl_kernel,
    arg_index: cl_uint,
    arg_size: usize,
    arg_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let k = Kernel::ref_from_raw(kernel)?;
    let arg_index = arg_index as usize;

    // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
    if let Some(arg) = k.kernel_info.args.get(arg_index) {
        // CL_INVALID_ARG_SIZE if arg_size does not match the size of the data type for an argument
        // that is not a memory object or if the argument is a memory object and
        // arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the
        // local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler).
        match arg.kind {
            KernelArgType::MemLocal => {
                if arg_size == 0 {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
            KernelArgType::MemGlobal
            | KernelArgType::MemConstant
            | KernelArgType::Image
            | KernelArgType::RWImage
            | KernelArgType::Texture => {
                if arg_size != std::mem::size_of::<cl_mem>() {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }

            KernelArgType::Sampler => {
                if arg_size != std::mem::size_of::<cl_sampler>() {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }

            KernelArgType::Constant(size) => {
                if size as usize != arg_size {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
        }

        // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
        match arg.kind {
            // If the argument is declared with the local qualifier, the arg_value entry must be
            // NULL.
            KernelArgType::MemLocal => {
                if !arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            // If the argument is of type sampler_t, the arg_value entry must be a pointer to the
            // sampler object.
            KernelArgType::Constant(_) | KernelArgType::Sampler => {
                if arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            _ => {}
        };

        // Now build the actual argument value; arguments the compiler eliminated (`dead`) don't carry one.
        let arg = unsafe {
            if arg.dead {
                KernelArgValue::None
            } else {
                match arg.kind {
                    KernelArgType::Constant(_) => KernelArgValue::Constant(
                        slice::from_raw_parts(arg_value.cast(), arg_size).to_vec(),
                    ),
                    KernelArgType::MemConstant | KernelArgType::MemGlobal => {
                        let ptr: *const cl_mem = arg_value.cast();
                        if ptr.is_null() || (*ptr).is_null() {
                            KernelArgValue::None
                        } else {
                            let buffer = Buffer::arc_from_raw(*ptr)?;
                            KernelArgValue::Buffer(Arc::downgrade(&buffer))
                        }
                    }
                    KernelArgType::MemLocal => KernelArgValue::LocalMem(arg_size),
                    KernelArgType::Image | KernelArgType::RWImage | KernelArgType::Texture => {
                        let img: *const cl_mem = arg_value.cast();
                        let img = Image::arc_from_raw(*img)?;
                        KernelArgValue::Image(Arc::downgrade(&img))
                    }
                    KernelArgType::Sampler => {
                        let ptr: *const cl_sampler = arg_value.cast();
                        KernelArgValue::Sampler(Sampler::arc_from_raw(*ptr)?)
                    }
                }
            }
        };
        k.set_kernel_arg(arg_index, arg)
    } else {
        Err(CL_INVALID_ARG_INDEX)
    }

    //• CL_INVALID_DEVICE_QUEUE for an argument declared to be of type queue_t when the specified arg_value is not a valid device queue object. This error code is missing before version 2.0.
    //• CL_INVALID_ARG_VALUE if the argument is an image declared with the read_only qualifier and arg_value refers to an image object created with cl_mem_flags of CL_MEM_WRITE_ONLY or if the image argument is declared with the write_only qualifier and arg_value refers to an image object created with cl_mem_flags of CL_MEM_READ_ONLY.
    //• CL_MAX_SIZE_RESTRICTION_EXCEEDED if the size in bytes of the memory object (if the argument is a memory object) or arg_size (if the argument is declared with local qualifier) exceeds a language-specified maximum size restriction for this argument, such as the MaxByteOffset SPIR-V decoration. This error code is missing before version 2.2.
}

#[cl_entrypoint(clSetKernelArgSVMPointer)]
fn set_kernel_arg_svm_pointer(
    kernel: cl_kernel,
    arg_index: cl_uint,
    arg_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let kernel = Kernel::ref_from_raw(kernel)?;
    let arg_index = arg_index as usize;
    let arg_value = arg_value as usize;

    if !kernel.has_svm_devs() {
        return Err(CL_INVALID_OPERATION);
    }

    if let Some(arg) = kernel.kernel_info.args.get(arg_index) {
        if !matches!(
            arg.kind,
            KernelArgType::MemConstant | KernelArgType::MemGlobal
        ) {
            return Err(CL_INVALID_ARG_INDEX);
        }

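        // The SVM pointer itself is the argument value, so store it by value in
        // native byte order instead of dereferencing it.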
        let arg_value = KernelArgValue::Constant(arg_value.to_ne_bytes().to_vec());
        kernel.set_kernel_arg(arg_index, arg_value)
    } else {
        Err(CL_INVALID_ARG_INDEX)
    }

    // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
}

#[cl_entrypoint(clSetKernelExecInfo)]
fn set_kernel_exec_info(
    kernel: cl_kernel,
    param_name: cl_kernel_exec_info,
    param_value_size: usize,
    param_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let k = Kernel::ref_from_raw(kernel)?;

    // CL_INVALID_OPERATION if no devices in the context associated with kernel support SVM.
    if !k.prog.devs.iter().any(|dev| dev.svm_supported()) {
        return Err(CL_INVALID_OPERATION);
    }

    // CL_INVALID_VALUE ... if param_value is NULL
    if param_value.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    // CL_INVALID_VALUE ... if the size specified by param_value_size is not valid.
    match param_name {
        CL_KERNEL_EXEC_INFO_SVM_PTRS | CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM => {
            // it's a list of pointers
            if param_value_size % mem::size_of::<*const c_void>() != 0 {
                return Err(CL_INVALID_VALUE);
            }
        }
        CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM
        | CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM => {
            if param_value_size != mem::size_of::<cl_bool>() {
                return Err(CL_INVALID_VALUE);
            }
        }
        // CL_INVALID_VALUE if param_name is not valid
        _ => return Err(CL_INVALID_VALUE),
    }

    Ok(())

    // CL_INVALID_OPERATION if param_name is CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM and param_value is CL_TRUE but no devices in context associated with kernel support fine-grain system SVM allocations.
}

#[cl_entrypoint(clEnqueueNDRangeKernel)]
fn enqueue_ndrange_kernel(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    work_dim: cl_uint,
    global_work_offset: *const usize,
    global_work_size: *const usize,
    local_work_size: *const usize,
    num_events_in_wait_list: cl_uint,
    event_wait_list: *const cl_event,
    event: *mut cl_event,
) -> CLResult<()> {
    let q = Queue::arc_from_raw(command_queue)?;
    let k = Kernel::arc_from_raw(kernel)?;
    let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;

    // CL_INVALID_CONTEXT if context associated with command_queue and kernel are not the same
    if q.context != k.prog.context {
        return Err(CL_INVALID_CONTEXT);
    }

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
    // for device associated with command_queue.
    if k.prog.status(q.device) != CL_BUILD_SUCCESS as cl_build_status {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_ARGS if the kernel argument values have not been specified.
    if k.arg_values().iter().any(|v| v.is_none()) {
        return Err(CL_INVALID_KERNEL_ARGS);
    }

    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
    if work_dim == 0 || work_dim > q.device.max_grid_dimensions() {
        return Err(CL_INVALID_WORK_DIMENSION);
    }

    // We assume the application gets it right and doesn't pass shorter arrays than actually needed.
    let global_work_size = unsafe { kernel_work_arr_or_default(global_work_size, work_dim) };
    let local_work_size = unsafe { kernel_work_arr_or_default(local_work_size, work_dim) };
    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };

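    // The largest value representable by the device's size_t: all ones in
    // address_bits bits.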
    let device_bits = q.device.address_bits();
    let device_max = u64::MAX >> (u64::BITS - device_bits);

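    // Total work-group size as the product over all dimensions; it collapses to 0
    // when local_work_size is NULL (ZERO_ARR), which skips the size check below.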
    let mut threads = 1;
    for i in 0..work_dim as usize {
        let lws = local_work_size[i];
        let gws = global_work_size[i];
        let gwo = global_work_offset[i];

        threads *= lws;

        // CL_INVALID_WORK_ITEM_SIZE if the number of work-items specified in any of
        // local_work_size[0], … local_work_size[work_dim - 1] is greater than the corresponding
        // values specified by
        // CL_DEVICE_MAX_WORK_ITEM_SIZES[0], …, CL_DEVICE_MAX_WORK_ITEM_SIZES[work_dim - 1].
        if lws > q.device.max_block_sizes()[i] {
            return Err(CL_INVALID_WORK_ITEM_SIZE);
        }

        // CL_INVALID_WORK_GROUP_SIZE if the work-group size must be uniform and the
        // local_work_size is not NULL, [...] if the global_work_size is not evenly divisible by
        // the local_work_size.
        if lws != 0 && gws % lws != 0 {
            return Err(CL_INVALID_WORK_GROUP_SIZE);
        }

        // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not match the
        // required work-group size for kernel in the program source.
        if lws != 0 && k.work_group_size()[i] != 0 && lws != k.work_group_size()[i] {
            return Err(CL_INVALID_WORK_GROUP_SIZE);
        }

        // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size[0], …
        // global_work_size[work_dim - 1] exceed the maximum value representable by size_t on
        // the device on which the kernel-instance will be enqueued.
        if gws as u64 > device_max {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }

        // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size + the
        // corresponding values in global_work_offset for any dimensions is greater than the
        // maximum value representable by size_t on the device on which the kernel-instance
        // will be enqueued
        if u64::checked_add(gws as u64, gwo as u64)
            .filter(|&x| x <= device_max)
            .is_none()
        {
            return Err(CL_INVALID_GLOBAL_OFFSET);
        }
    }

    // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the total number of work-items
    // in the work-group computed as local_work_size[0] × … local_work_size[work_dim - 1] is greater
    // than the value specified by CL_KERNEL_WORK_GROUP_SIZE in the Kernel Object Device Queries
    // table.
    if threads != 0 && threads > k.max_threads_per_block(q.device) {
        return Err(CL_INVALID_WORK_GROUP_SIZE);
    }

    // If global_work_size is NULL, or the value in any passed dimension is 0 then the kernel
    // command will trivially succeed after its event dependencies are satisfied and subsequently
    // update its completion event.
    let cb: EventSig = if global_work_size.contains(&0) {
        Box::new(|_, _| Ok(()))
    } else {
        k.launch(
            &q,
            work_dim,
            local_work_size,
            global_work_size,
            global_work_offset,
        )?
    };

    create_and_queue(q, CL_COMMAND_NDRANGE_KERNEL, evs, event, false, cb)

    //• CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not consistent with the required number of sub-groups for kernel in the program source.
    //• CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is specified as the value for an argument that is a buffer object and the offset specified when the sub-buffer object is created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated with queue. This error code
    //• CL_INVALID_IMAGE_SIZE if an image object is specified as an argument value and the image dimensions (image width, height, specified or compute row and/or slice pitch) are not supported by device associated with queue.
    //• CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is specified as an argument value and the image format (image channel order and data type) is not supported by device associated with queue.
    //• CL_OUT_OF_RESOURCES if there is a failure to queue the execution instance of kernel on the command-queue because of insufficient resources needed to execute the kernel. For example, the explicitly specified local_work_size causes a failure to execute the kernel because of insufficient resources such as registers or local memory. Another example would be the number of read-only image args used in kernel exceed the CL_DEVICE_MAX_READ_IMAGE_ARGS value for device or the number of write-only and read-write image args used in kernel exceed the CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS value for device or the number of samplers used in kernel exceed CL_DEVICE_MAX_SAMPLERS for device.
    //• CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for data store associated with image or buffer objects specified as arguments to kernel.
    //• CL_INVALID_OPERATION if SVM pointers are passed as arguments to a kernel and the device does not support SVM or if system pointers are passed as arguments to a kernel and/or stored inside SVM allocations passed as kernel arguments and the device does not support fine grain system SVM allocations.
}

#[cl_entrypoint(clEnqueueTask)]
fn enqueue_task(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    num_events_in_wait_list: cl_uint,
    event_wait_list: *const cl_event,
    event: *mut cl_event,
) -> CLResult<()> {
    // clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim set to 1,
    // global_work_offset set to NULL, global_work_size[0] set to 1, and local_work_size[0] set to
    // 1.
    enqueue_ndrange_kernel(
        command_queue,
        kernel,
        1,
        ptr::null(),
        [1, 1, 1].as_ptr(),
        [1, 0, 0].as_ptr(),
        num_events_in_wait_list,
        event_wait_list,
        event,
    )
}

#[cl_entrypoint(clCloneKernel)]
fn clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel> {
    let k = Kernel::ref_from_raw(source_kernel)?;
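    // clCloneKernel copies all of the source kernel's state, including any
    // argument values set so far; Kernel's Clone impl takes care of that.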
    Ok(Arc::new(k.clone()).into_cl())
}

#[cl_entrypoint(clGetKernelSuggestedLocalWorkSizeKHR)]
fn get_kernel_suggested_local_work_size_khr(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    work_dim: cl_uint,
    global_work_offset: *const usize,
    global_work_size: *const usize,
    suggested_local_work_size: *mut usize,
) -> CLResult<()> {
    // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified in
    // global_work_size are 0.
    if global_work_size.is_null() {
        return Err(CL_INVALID_GLOBAL_WORK_SIZE);
    }

    if global_work_offset.is_null() {
        return Err(CL_INVALID_GLOBAL_OFFSET);
    }

    // CL_INVALID_VALUE if suggested_local_work_size is NULL.
    if suggested_local_work_size.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    // CL_INVALID_COMMAND_QUEUE if command_queue is not a valid host command-queue.
    let queue = Queue::ref_from_raw(command_queue)?;

    // CL_INVALID_KERNEL if kernel is not a valid kernel object.
    let kernel = Kernel::ref_from_raw(kernel)?;

    // CL_INVALID_CONTEXT if the context associated with kernel is not the same as the context
    // associated with command_queue.
    if queue.context != kernel.prog.context {
        return Err(CL_INVALID_CONTEXT);
    }

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
    // for kernel for the device associated with command_queue.
    if kernel.prog.status(queue.device) != CL_BUILD_SUCCESS as cl_build_status {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_ARGS if all argument values for kernel have not been set.
    if kernel.arg_values().iter().any(|v| v.is_none()) {
        return Err(CL_INVALID_KERNEL_ARGS);
    }

    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
    if work_dim == 0 || work_dim > queue.device.max_grid_dimensions() {
        return Err(CL_INVALID_WORK_DIMENSION);
    }

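    // suggest_local_size takes the global size mutably, so make an owned copy of
    // the caller's read-only array.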
    let mut global_work_size =
        unsafe { kernel_work_arr_or_default(global_work_size, work_dim).to_vec() };

    let suggested_local_work_size = unsafe {
        kernel_work_arr_mut(suggested_local_work_size, work_dim).ok_or(CL_INVALID_VALUE)?
    };

    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };

    let device_bits = queue.device.address_bits();
    let device_max = u64::MAX >> (u64::BITS - device_bits);
    for i in 0..work_dim as usize {
        let gws = global_work_size[i];
        let gwo = global_work_offset[i];

        // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified
        // in global_work_size are 0.
        if gws == 0 {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }
        // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size exceed the
        // maximum value representable by size_t on the device associated with command_queue.
        if gws as u64 > device_max {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }
        // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the
        // corresponding value in global_work_offset for dimension exceeds the maximum value
        // representable by size_t on the device associated with command_queue.
        if u64::checked_add(gws as u64, gwo as u64)
            .filter(|&x| x <= device_max)
            .is_none()
        {
            return Err(CL_INVALID_GLOBAL_OFFSET);
        }
    }

    kernel.suggest_local_size(
        queue.device,
        work_dim as usize,
        &mut global_work_size,
        suggested_local_work_size,
    );

    Ok(())

    // CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is set as an argument to kernel and the offset specified when the sub-buffer object was created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN for the device associated with command_queue.
    // CL_INVALID_IMAGE_SIZE if an image object is set as an argument to kernel and the image dimensions are not supported by device associated with command_queue.
    // CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is set as an argument to kernel and the image format is not supported by the device associated with command_queue.
    // CL_INVALID_OPERATION if an SVM pointer is set as an argument to kernel and the device associated with command_queue does not support SVM or the required SVM capabilities for the SVM pointer.
    // CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL implementation on the device.
    // CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
}
791