use crate::api::event::create_and_queue;
use crate::api::icd::*;
use crate::api::util::*;
use crate::core::device::*;
use crate::core::event::*;
use crate::core::kernel::*;
use crate::core::memory::*;
use crate::core::program::*;
use crate::core::queue::*;

use mesa_rust_util::ptr::*;
use mesa_rust_util::string::*;
use rusticl_opencl_gen::*;
use rusticl_proc_macros::cl_entrypoint;
use rusticl_proc_macros::cl_info_entrypoint;

use std::cmp;
use std::ffi::CStr;
use std::mem;
use std::os::raw::c_void;
use std::ptr;
use std::slice;
use std::sync::Arc;

#[cl_info_entrypoint(clGetKernelInfo)]
unsafe impl CLInfo<cl_kernel_info> for cl_kernel {
    fn query(&self, q: cl_kernel_info, v: CLInfoValue) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;
        match q {
            CL_KERNEL_ATTRIBUTES => v.write::<&str>(&kernel.kernel_info.attributes_string),
            CL_KERNEL_CONTEXT => {
                let ptr = Arc::as_ptr(&kernel.prog.context);
                v.write::<cl_context>(cl_context::from_ptr(ptr))
            }
            CL_KERNEL_FUNCTION_NAME => v.write::<&str>(&kernel.name),
            CL_KERNEL_NUM_ARGS => v.write::<cl_uint>(kernel.kernel_info.args.len() as cl_uint),
            CL_KERNEL_PROGRAM => {
                let ptr = Arc::as_ptr(&kernel.prog);
                v.write::<cl_program>(cl_program::from_ptr(ptr))
            }
            CL_KERNEL_REFERENCE_COUNT => v.write::<cl_uint>(Kernel::refcnt(*self)?),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

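// A hedged sketch of how a client usually drives the generated clGetKernelInfo
// entrypoint (the size-query-then-fetch pattern; a valid `kernel` is assumed
// and error handling is elided):
//
//     let mut num_args: cl_uint = 0;
//     clGetKernelInfo(
//         kernel,
//         CL_KERNEL_NUM_ARGS,
//         mem::size_of::<cl_uint>(),
//         &mut num_args as *mut cl_uint as *mut c_void,
//         ptr::null_mut(),
//     );
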
#[cl_info_entrypoint(clGetKernelArgInfo)]
unsafe impl CLInfoObj<cl_kernel_arg_info, cl_uint> for cl_kernel {
    fn query(&self, idx: cl_uint, q: cl_kernel_arg_info, v: CLInfoValue) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
        if idx as usize >= kernel.kernel_info.args.len() {
            return Err(CL_INVALID_ARG_INDEX);
        }

        match q {
            CL_KERNEL_ARG_ACCESS_QUALIFIER => {
                v.write::<cl_kernel_arg_access_qualifier>(kernel.access_qualifier(idx))
            }
            CL_KERNEL_ARG_ADDRESS_QUALIFIER => {
                v.write::<cl_kernel_arg_address_qualifier>(kernel.address_qualifier(idx))
            }
            CL_KERNEL_ARG_NAME => v.write::<&CStr>(
                kernel
                    .arg_name(idx)
                    .ok_or(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)?,
            ),
            CL_KERNEL_ARG_TYPE_NAME => v.write::<&CStr>(
                kernel
                    .arg_type_name(idx)
                    .ok_or(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)?,
            ),
            CL_KERNEL_ARG_TYPE_QUALIFIER => {
                v.write::<cl_kernel_arg_type_qualifier>(kernel.type_qualifier(idx))
            }
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

#[cl_info_entrypoint(clGetKernelWorkGroupInfo)]
unsafe impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
    fn query(
        &self,
        dev: cl_device_id,
        q: cl_kernel_work_group_info,
        v: CLInfoValue,
    ) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
        // with kernel.
        let dev = if dev.is_null() {
            if kernel.prog.devs.len() > 1 {
                return Err(CL_INVALID_DEVICE);
            } else {
                kernel.prog.devs[0]
            }
        } else {
            Device::ref_from_raw(dev)?
        };

        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
        if !kernel.prog.devs.contains(&dev) {
            return Err(CL_INVALID_DEVICE);
        }

        match q {
            CL_KERNEL_COMPILE_WORK_GROUP_SIZE => v.write::<[usize; 3]>(kernel.work_group_size()),
            CL_KERNEL_LOCAL_MEM_SIZE => v.write::<cl_ulong>(kernel.local_mem_size(dev)),
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
                v.write::<usize>(kernel.preferred_simd_size(dev))
            }
            CL_KERNEL_PRIVATE_MEM_SIZE => v.write::<cl_ulong>(kernel.priv_mem_size(dev)),
            CL_KERNEL_WORK_GROUP_SIZE => v.write::<usize>(kernel.max_threads_per_block(dev)),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

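// A hedged usage sketch (a valid `kernel` and `dev` are assumed; errors elided):
//
//     let mut wgs: usize = 0;
//     clGetKernelWorkGroupInfo(
//         kernel,
//         dev,
//         CL_KERNEL_WORK_GROUP_SIZE,
//         mem::size_of::<usize>(),
//         &mut wgs as *mut usize as *mut c_void,
//         ptr::null_mut(),
//     );
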
#[cl_info_entrypoint(clGetKernelSubGroupInfo)]
unsafe impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void, usize)>
    for cl_kernel
{
    fn query(
        &self,
        (dev, input_value_size, input_value, output_value_size): (
            cl_device_id,
            usize,
            *const c_void,
            usize,
        ),
        q: cl_kernel_sub_group_info,
        v: CLInfoValue,
    ) -> CLResult<CLInfoRes> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
        // with kernel.
        let dev = if dev.is_null() {
            if kernel.prog.devs.len() > 1 {
                return Err(CL_INVALID_DEVICE);
            } else {
                kernel.prog.devs[0]
            }
        } else {
            Device::ref_from_raw(dev)?
        };

        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
        if !kernel.prog.devs.contains(&dev) {
            return Err(CL_INVALID_DEVICE);
        }

        // CL_INVALID_OPERATION if device does not support subgroups.
        if !dev.subgroups_supported() {
            return Err(CL_INVALID_OPERATION);
        }

        let usize_byte = mem::size_of::<usize>();
        // first we have to convert the input to a proper thing
        let input: &[usize] = match q {
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
                // CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                // CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by
                // input_value_size is not valid or if input_value is NULL.
                if ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size) {
                    return Err(CL_INVALID_VALUE);
                }
                // SAFETY: we verified the size as best as possible, with the rest we trust the client
                unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) }
            }
            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
                // CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
                // and the size in bytes specified by input_value_size is not valid or if
                // input_value is NULL.
                if input_value_size != usize_byte || input_value.is_null() {
                    return Err(CL_INVALID_VALUE);
                }
                // SAFETY: we trust the client here
                unsafe { slice::from_raw_parts(input_value.cast(), 1) }
            }
            _ => &[],
        };

        match q {
            CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
                v.write::<usize>(kernel.subgroups_for_block(dev, input))
            }
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => {
                v.write::<usize>(kernel.subgroup_size_for_block(dev, input))
            }
            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
                let subgroups = input[0];
                let mut res = vec![0; 3];

                for subgroup_size in kernel.subgroup_sizes(dev) {
                    let threads = subgroups * subgroup_size;

                    if threads > dev.max_threads_per_block() {
                        continue;
                    }

                    let block = [threads, 1, 1];
                    let real_subgroups = kernel.subgroups_for_block(dev, &block);

                    if real_subgroups == subgroups {
                        res = block.to_vec();
                        break;
                    }
                }

                res.truncate(output_value_size / usize_byte);
                v.write::<Vec<usize>>(res)
            }
            CL_KERNEL_MAX_NUM_SUB_GROUPS => {
                let threads = kernel.max_threads_per_block(dev);
                let max_groups = dev.max_subgroups();

                let mut result = 0;
                for sgs in kernel.subgroup_sizes(dev) {
                    result = cmp::max(result, threads / sgs);
                    result = cmp::min(result, max_groups as usize);
                }
                v.write::<usize>(result)
            }
            CL_KERNEL_COMPILE_NUM_SUB_GROUPS => v.write::<usize>(kernel.num_subgroups()),
            CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => v.write::<usize>(kernel.subgroup_size()),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => Err(CL_INVALID_VALUE),
        }
    }
}

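// A hedged sketch of a subgroup query: input_value carries the candidate
// ND-range local size and the result is a single usize (a valid `kernel` and
// `dev` are assumed):
//
//     let local = [64usize, 1, 1];
//     let mut sg_size: usize = 0;
//     clGetKernelSubGroupInfo(
//         kernel,
//         dev,
//         CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
//         mem::size_of_val(&local),
//         local.as_ptr().cast(),
//         mem::size_of::<usize>(),
//         &mut sg_size as *mut usize as *mut c_void,
//         ptr::null_mut(),
//     );
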
const ZERO_ARR: [usize; 3] = [0; 3];

/// # Safety
///
/// This function is only safe when called on an array of `work_dim` length
unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -> &'a [usize] {
    if !arr.is_null() {
        unsafe { slice::from_raw_parts(arr, work_dim as usize) }
    } else {
        &ZERO_ARR
    }
}

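// A small behavior note (hedged): a null `arr` falls back to the shared
// ZERO_ARR, so the returned slice is always indexable for i < work_dim:
//
//     let gws = unsafe { kernel_work_arr_or_default(ptr::null(), 2) };
//     assert_eq!(gws[0], 0); // zero in every dimension
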
/// # Safety
///
/// This function is only safe when called on an array of `work_dim` length
unsafe fn kernel_work_arr_mut<'a>(arr: *mut usize, work_dim: cl_uint) -> Option<&'a mut [usize]> {
    if !arr.is_null() {
        unsafe { Some(slice::from_raw_parts_mut(arr, work_dim as usize)) }
    } else {
        None
    }
}

#[cl_entrypoint(clCreateKernel)]
fn create_kernel(
    program: cl_program,
    kernel_name: *const ::std::os::raw::c_char,
) -> CLResult<cl_kernel> {
    let p = Program::arc_from_raw(program)?;

    // CL_INVALID_VALUE if kernel_name is NULL.
    if kernel_name.is_null() {
        return Err(CL_INVALID_VALUE);
    }
    let name = c_string_to_string(kernel_name);

    let build = p.build_info();
    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for program.
    if build.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_NAME if kernel_name is not found in program.
    if !build.kernels().contains(&name) {
        return Err(CL_INVALID_KERNEL_NAME);
    }

    // CL_INVALID_KERNEL_DEFINITION if the function definition for the __kernel function given by
    // kernel_name (such as the number of arguments or the argument types) is not the same for all
    // devices for which the program executable has been built.
    if !p.has_unique_kernel_signatures(&name) {
        return Err(CL_INVALID_KERNEL_DEFINITION);
    }

    Ok(Kernel::new(name, Arc::clone(&p), &build).into_cl())
}

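// A hedged sketch of driving this entrypoint directly (the ICD layer normally
// dispatches here from C; `program` is assumed to hold a built executable and
// "vec_add" is a hypothetical kernel name):
//
//     let name = c"vec_add";
//     let kernel: cl_kernel = create_kernel(program, name.as_ptr())?;
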
#[cl_entrypoint(clRetainKernel)]
fn retain_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::retain(kernel)
}

#[cl_entrypoint(clReleaseKernel)]
fn release_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::release(kernel)
}

#[cl_entrypoint(clCreateKernelsInProgram)]
fn create_kernels_in_program(
    program: cl_program,
    num_kernels: cl_uint,
    kernels: *mut cl_kernel,
    num_kernels_ret: *mut cl_uint,
) -> CLResult<()> {
    let p = Program::arc_from_raw(program)?;
    let build = p.build_info();

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for any device
    // in program.
    if build.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_VALUE if kernels is not NULL and num_kernels is less than the number of kernels
    // in program.
    if !kernels.is_null() && build.kernels().len() > num_kernels as usize {
        return Err(CL_INVALID_VALUE);
    }

    let mut num_kernels = 0;
    for name in build.kernels() {
        // Kernel objects are not created for any __kernel functions in program that do not have
        // the same function definition across all devices for which a program executable has been
        // successfully built.
        if !p.has_unique_kernel_signatures(name) {
            continue;
        }

        if !kernels.is_null() {
            // we just assume the client isn't stupid
            unsafe {
                kernels
                    .add(num_kernels as usize)
                    .write(Kernel::new(name.clone(), p.clone(), &build).into_cl());
            }
        }
        num_kernels += 1;
    }
    num_kernels_ret.write_checked(num_kernels);
    Ok(())
}

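// A hedged sketch of the usual two-call pattern for this entrypoint (query the
// count first, then fetch that many kernel objects):
//
//     let mut n: cl_uint = 0;
//     create_kernels_in_program(program, 0, ptr::null_mut(), &mut n)?;
//     let mut kernels: Vec<cl_kernel> = vec![ptr::null_mut(); n as usize];
//     create_kernels_in_program(program, n, kernels.as_mut_ptr(), ptr::null_mut())?;
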
#[cl_entrypoint(clSetKernelArg)]
fn set_kernel_arg(
    kernel: cl_kernel,
    arg_index: cl_uint,
    arg_size: usize,
    arg_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let k = Kernel::ref_from_raw(kernel)?;
    let arg_index = arg_index as usize;

    // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
    if let Some(arg) = k.kernel_info.args.get(arg_index) {
        // CL_INVALID_ARG_SIZE if arg_size does not match the size of the data type for an argument
        // that is not a memory object or if the argument is a memory object and
        // arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the
        // local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler).
        match arg.kind {
            KernelArgType::MemLocal => {
                if arg_size == 0 {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
            KernelArgType::MemGlobal
            | KernelArgType::MemConstant
            | KernelArgType::Image
            | KernelArgType::RWImage
            | KernelArgType::Texture => {
                if arg_size != std::mem::size_of::<cl_mem>() {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }

            KernelArgType::Sampler => {
                if arg_size != std::mem::size_of::<cl_sampler>() {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }

            KernelArgType::Constant(size) => {
                if size as usize != arg_size {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
        }

        // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
        match arg.kind {
            // If the argument is declared with the local qualifier, the arg_value entry must be
            // NULL.
            KernelArgType::MemLocal => {
                if !arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            // If the argument is of type sampler_t, the arg_value entry must be a pointer to the
            // sampler object.
            KernelArgType::Constant(_) | KernelArgType::Sampler => {
                if arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            _ => {}
        };

        // let's create the arg now
        let arg = unsafe {
            if arg.dead {
                KernelArgValue::None
            } else {
                match arg.kind {
                    KernelArgType::Constant(_) => KernelArgValue::Constant(
                        slice::from_raw_parts(arg_value.cast(), arg_size).to_vec(),
                    ),
                    KernelArgType::MemConstant | KernelArgType::MemGlobal => {
                        let ptr: *const cl_mem = arg_value.cast();
                        if ptr.is_null() || (*ptr).is_null() {
                            KernelArgValue::None
                        } else {
                            let buffer = Buffer::arc_from_raw(*ptr)?;
                            KernelArgValue::Buffer(Arc::downgrade(&buffer))
                        }
                    }
                    KernelArgType::MemLocal => KernelArgValue::LocalMem(arg_size),
                    KernelArgType::Image | KernelArgType::RWImage | KernelArgType::Texture => {
                        let img: *const cl_mem = arg_value.cast();
                        let img = Image::arc_from_raw(*img)?;
                        KernelArgValue::Image(Arc::downgrade(&img))
                    }
                    KernelArgType::Sampler => {
                        let ptr: *const cl_sampler = arg_value.cast();
                        KernelArgValue::Sampler(Sampler::arc_from_raw(*ptr)?)
                    }
                }
            }
        };
        k.set_kernel_arg(arg_index, arg)
    } else {
        Err(CL_INVALID_ARG_INDEX)
    }

    // CL_INVALID_DEVICE_QUEUE for an argument declared to be of type queue_t when the specified
    // arg_value is not a valid device queue object. This error code is missing before version 2.0.
    //
    // CL_INVALID_ARG_VALUE if the argument is an image declared with the read_only qualifier and
    // arg_value refers to an image object created with cl_mem_flags of CL_MEM_WRITE_ONLY or if the
    // image argument is declared with the write_only qualifier and arg_value refers to an image
    // object created with cl_mem_flags of CL_MEM_READ_ONLY.
    //
    // CL_MAX_SIZE_RESTRICTION_EXCEEDED if the size in bytes of the memory object (if the argument
    // is a memory object) or arg_size (if the argument is declared with local qualifier) exceeds
    // a language-specified maximum size restriction for this argument, such as the MaxByteOffset
    // SPIR-V decoration. This error code is missing before version 2.2.
}

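// A hedged sketch of setting two arguments, a global buffer followed by a
// __local allocation (a valid `kernel` and `buf: cl_mem` are assumed):
//
//     set_kernel_arg(kernel, 0, mem::size_of::<cl_mem>(), &buf as *const cl_mem as *const c_void)?;
//     set_kernel_arg(kernel, 1, 1024, ptr::null())?; // 1 KiB of __local scratch
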
#[cl_entrypoint(clSetKernelArgSVMPointer)]
fn set_kernel_arg_svm_pointer(
    kernel: cl_kernel,
    arg_index: cl_uint,
    arg_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let kernel = Kernel::ref_from_raw(kernel)?;
    let arg_index = arg_index as usize;
    let arg_value = arg_value as usize;

    // reject the call if no device associated with the kernel supports SVM
    if !kernel.has_svm_devs() {
        return Err(CL_INVALID_OPERATION);
    }

    if let Some(arg) = kernel.kernel_info.args.get(arg_index) {
        if !matches!(
            arg.kind,
            KernelArgType::MemConstant | KernelArgType::MemGlobal
        ) {
            return Err(CL_INVALID_ARG_INDEX);
        }

        let arg_value = KernelArgValue::Constant(arg_value.to_ne_bytes().to_vec());
        kernel.set_kernel_arg(arg_index, arg_value)
    } else {
        Err(CL_INVALID_ARG_INDEX)
    }

    // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
}

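// A hedged sketch: SVM arguments are set by pointer value and carry no size
// (assumes `svm_ptr: *const c_void` came from a prior SVM allocation):
//
//     set_kernel_arg_svm_pointer(kernel, 0, svm_ptr)?;
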
#[cl_entrypoint(clSetKernelExecInfo)]
fn set_kernel_exec_info(
    kernel: cl_kernel,
    param_name: cl_kernel_exec_info,
    param_value_size: usize,
    param_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let k = Kernel::ref_from_raw(kernel)?;

    // CL_INVALID_OPERATION if no devices in the context associated with kernel support SVM.
    if !k.prog.devs.iter().any(|dev| dev.svm_supported()) {
        return Err(CL_INVALID_OPERATION);
    }

    // CL_INVALID_VALUE ... if param_value is NULL
    if param_value.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    // CL_INVALID_VALUE ... if the size specified by param_value_size is not valid.
    match param_name {
        CL_KERNEL_EXEC_INFO_SVM_PTRS | CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM => {
            // it's a list of pointers
            if param_value_size % mem::size_of::<*const c_void>() != 0 {
                return Err(CL_INVALID_VALUE);
            }
        }
        CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM
        | CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM => {
            if param_value_size != mem::size_of::<cl_bool>() {
                return Err(CL_INVALID_VALUE);
            }
        }
        // CL_INVALID_VALUE if param_name is not valid
        _ => return Err(CL_INVALID_VALUE),
    }

    Ok(())

    // CL_INVALID_OPERATION if param_name is CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM and
    // param_value is CL_TRUE but no devices in context associated with kernel support fine-grain
    // system SVM allocations.
}

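// A hedged sketch of flagging SVM allocations that a kernel may access
// indirectly (`svm_ptr` assumed valid):
//
//     let ptrs = [svm_ptr];
//     set_kernel_exec_info(
//         kernel,
//         CL_KERNEL_EXEC_INFO_SVM_PTRS,
//         mem::size_of_val(&ptrs),
//         ptrs.as_ptr().cast(),
//     )?;
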
#[cl_entrypoint(clEnqueueNDRangeKernel)]
fn enqueue_ndrange_kernel(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    work_dim: cl_uint,
    global_work_offset: *const usize,
    global_work_size: *const usize,
    local_work_size: *const usize,
    num_events_in_wait_list: cl_uint,
    event_wait_list: *const cl_event,
    event: *mut cl_event,
) -> CLResult<()> {
    let q = Queue::arc_from_raw(command_queue)?;
    let k = Kernel::arc_from_raw(kernel)?;
    let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;

    // CL_INVALID_CONTEXT if context associated with command_queue and kernel are not the same
    if q.context != k.prog.context {
        return Err(CL_INVALID_CONTEXT);
    }

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
    // for device associated with command_queue.
    if k.prog.status(q.device) != CL_BUILD_SUCCESS as cl_build_status {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_ARGS if the kernel argument values have not been specified.
    if k.arg_values().iter().any(|v| v.is_none()) {
        return Err(CL_INVALID_KERNEL_ARGS);
    }

    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
    if work_dim == 0 || work_dim > q.device.max_grid_dimensions() {
        return Err(CL_INVALID_WORK_DIMENSION);
    }

    // we assume the application gets it right and doesn't pass shorter arrays than actually
    // needed.
    let global_work_size = unsafe { kernel_work_arr_or_default(global_work_size, work_dim) };
    let local_work_size = unsafe { kernel_work_arr_or_default(local_work_size, work_dim) };
    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };

    let device_bits = q.device.address_bits();
    let device_max = u64::MAX >> (u64::BITS - device_bits);

    let mut threads = 1;
    for i in 0..work_dim as usize {
        let lws = local_work_size[i];
        let gws = global_work_size[i];
        let gwo = global_work_offset[i];

        threads *= lws;

        // CL_INVALID_WORK_ITEM_SIZE if the number of work-items specified in any of
        // local_work_size[0], … local_work_size[work_dim - 1] is greater than the corresponding
        // values specified by
        // CL_DEVICE_MAX_WORK_ITEM_SIZES[0], …, CL_DEVICE_MAX_WORK_ITEM_SIZES[work_dim - 1].
        if lws > q.device.max_block_sizes()[i] {
            return Err(CL_INVALID_WORK_ITEM_SIZE);
        }

        // CL_INVALID_WORK_GROUP_SIZE if the work-group size must be uniform and the
        // local_work_size is not NULL, [...] if the global_work_size is not evenly divisible by
        // the local_work_size.
        if lws != 0 && gws % lws != 0 {
            return Err(CL_INVALID_WORK_GROUP_SIZE);
        }

        // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not match the
        // required work-group size for kernel in the program source.
        if lws != 0 && k.work_group_size()[i] != 0 && lws != k.work_group_size()[i] {
            return Err(CL_INVALID_WORK_GROUP_SIZE);
        }

        // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size[0], …
        // global_work_size[work_dim - 1] exceed the maximum value representable by size_t on
        // the device on which the kernel-instance will be enqueued.
        if gws as u64 > device_max {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }

        // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size + the
        // corresponding values in global_work_offset for any dimensions is greater than the
        // maximum value representable by size_t on the device on which the kernel-instance
        // will be enqueued
        if u64::checked_add(gws as u64, gwo as u64)
            .filter(|&x| x <= device_max)
            .is_none()
        {
            return Err(CL_INVALID_GLOBAL_OFFSET);
        }
    }

    // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the total number of
    // work-items in the work-group computed as
    // local_work_size[0] × … local_work_size[work_dim - 1] is greater than the value specified by
    // CL_KERNEL_WORK_GROUP_SIZE in the Kernel Object Device Queries table.
    if threads != 0 && threads > k.max_threads_per_block(q.device) {
        return Err(CL_INVALID_WORK_GROUP_SIZE);
    }

    // If global_work_size is NULL, or the value in any passed dimension is 0 then the kernel
    // command will trivially succeed after its event dependencies are satisfied and subsequently
    // update its completion event.
    let cb: EventSig = if global_work_size.contains(&0) {
        Box::new(|_, _| Ok(()))
    } else {
        k.launch(
            &q,
            work_dim,
            local_work_size,
            global_work_size,
            global_work_offset,
        )?
    };

    create_and_queue(q, CL_COMMAND_NDRANGE_KERNEL, evs, event, false, cb)

    // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not consistent with the
    // required number of sub-groups for kernel in the program source.
    //
    // CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is specified as the value for an
    // argument that is a buffer object and the offset specified when the sub-buffer object is
    // created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated with
    // queue. This error code
    //
    // CL_INVALID_IMAGE_SIZE if an image object is specified as an argument value and the image
    // dimensions (image width, height, specified or compute row and/or slice pitch) are not
    // supported by device associated with queue.
    //
    // CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is specified as an argument value and the
    // image format (image channel order and data type) is not supported by device associated with
    // queue.
    //
    // CL_OUT_OF_RESOURCES if there is a failure to queue the execution instance of kernel on the
    // command-queue because of insufficient resources needed to execute the kernel. For example,
    // the explicitly specified local_work_size causes a failure to execute the kernel because of
    // insufficient resources such as registers or local memory. Another example would be the
    // number of read-only image args used in kernel exceed the CL_DEVICE_MAX_READ_IMAGE_ARGS
    // value for device or the number of write-only and read-write image args used in kernel
    // exceed the CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS value for device or the number of samplers
    // used in kernel exceed CL_DEVICE_MAX_SAMPLERS for device.
    //
    // CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for data store
    // associated with image or buffer objects specified as arguments to kernel.
    //
    // CL_INVALID_OPERATION if SVM pointers are passed as arguments to a kernel and the device
    // does not support SVM or if system pointers are passed as arguments to a kernel and/or
    // stored inside SVM allocations passed as kernel arguments and the device does not support
    // fine grain system SVM allocations.
}

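// A hedged end-to-end sketch of a 1-D launch (a valid `queue` and `kernel`
// with all arguments set are assumed; the local size is left to the driver):
//
//     let gws = [4096usize, 1, 1];
//     enqueue_ndrange_kernel(
//         queue,
//         kernel,
//         1,
//         ptr::null(),     // no global offset
//         gws.as_ptr(),
//         ptr::null(),     // let the implementation pick a local size
//         0,
//         ptr::null(),
//         ptr::null_mut(), // no completion event requested
//     )?;
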
#[cl_entrypoint(clEnqueueTask)]
fn enqueue_task(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    num_events_in_wait_list: cl_uint,
    event_wait_list: *const cl_event,
    event: *mut cl_event,
) -> CLResult<()> {
    // clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim set to 1,
    // global_work_offset set to NULL, global_work_size[0] set to 1, and local_work_size[0] set to
    // 1.
    enqueue_ndrange_kernel(
        command_queue,
        kernel,
        1,
        ptr::null(),
        [1, 1, 1].as_ptr(),
        [1, 0, 0].as_ptr(),
        num_events_in_wait_list,
        event_wait_list,
        event,
    )
}

#[cl_entrypoint(clCloneKernel)]
fn clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel> {
    let k = Kernel::ref_from_raw(source_kernel)?;
    Ok(Arc::new(k.clone()).into_cl())
}

#[cl_entrypoint(clGetKernelSuggestedLocalWorkSizeKHR)]
fn get_kernel_suggested_local_work_size_khr(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    work_dim: cl_uint,
    global_work_offset: *const usize,
    global_work_size: *const usize,
    suggested_local_work_size: *mut usize,
) -> CLResult<()> {
    // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values specified in
    // global_work_size are 0.
    if global_work_size.is_null() {
        return Err(CL_INVALID_GLOBAL_WORK_SIZE);
    }

    if global_work_offset.is_null() {
        return Err(CL_INVALID_GLOBAL_OFFSET);
    }

    // CL_INVALID_VALUE if suggested_local_work_size is NULL.
    if suggested_local_work_size.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    // CL_INVALID_COMMAND_QUEUE if command_queue is not a valid host command-queue.
    let queue = Queue::ref_from_raw(command_queue)?;

    // CL_INVALID_KERNEL if kernel is not a valid kernel object.
    let kernel = Kernel::ref_from_raw(kernel)?;

    // CL_INVALID_CONTEXT if the context associated with kernel is not the same as the context
    // associated with command_queue.
    if queue.context != kernel.prog.context {
        return Err(CL_INVALID_CONTEXT);
    }

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
    // for kernel for the device associated with command_queue.
    if kernel.prog.status(queue.device) != CL_BUILD_SUCCESS as cl_build_status {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_ARGS if all argument values for kernel have not been set.
    if kernel.arg_values().iter().any(|v| v.is_none()) {
        return Err(CL_INVALID_KERNEL_ARGS);
    }

    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
    if work_dim == 0 || work_dim > queue.device.max_grid_dimensions() {
        return Err(CL_INVALID_WORK_DIMENSION);
    }

    let mut global_work_size =
        unsafe { kernel_work_arr_or_default(global_work_size, work_dim).to_vec() };

    let suggested_local_work_size = unsafe {
        kernel_work_arr_mut(suggested_local_work_size, work_dim).ok_or(CL_INVALID_VALUE)?
    };

    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };

    let device_bits = queue.device.address_bits();
    let device_max = u64::MAX >> (u64::BITS - device_bits);
    for i in 0..work_dim as usize {
        let gws = global_work_size[i];
        let gwo = global_work_offset[i];

        // CL_INVALID_GLOBAL_WORK_SIZE if global_work_size is NULL or if any of the values
        // specified in global_work_size are 0.
        if gws == 0 {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }
        // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size exceed
        // the maximum value representable by size_t on the device associated with command_queue.
        if gws as u64 > device_max {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }
        // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size plus the
        // corresponding value in global_work_offset for any dimension exceeds the maximum value
        // representable by size_t on the device associated with command_queue.
        if u64::checked_add(gws as u64, gwo as u64)
            .filter(|&x| x <= device_max)
            .is_none()
        {
            return Err(CL_INVALID_GLOBAL_OFFSET);
        }
    }

    kernel.suggest_local_size(
        queue.device,
        work_dim as usize,
        &mut global_work_size,
        suggested_local_work_size,
    );

    Ok(())

    // CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is set as an argument to kernel and
    // the offset specified when the sub-buffer object was created is not aligned to
    // CL_DEVICE_MEM_BASE_ADDR_ALIGN for the device associated with command_queue.
    //
    // CL_INVALID_IMAGE_SIZE if an image object is set as an argument to kernel and the image
    // dimensions are not supported by device associated with command_queue.
    //
    // CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is set as an argument to kernel and the
    // image format is not supported by the device associated with command_queue.
    //
    // CL_INVALID_OPERATION if an SVM pointer is set as an argument to kernel and the device
    // associated with command_queue does not support SVM or the required SVM capabilities for the
    // SVM pointer.
    //
    // CL_OUT_OF_RESOURCES if there is a failure to allocate resources required by the OpenCL
    // implementation on the device.
    //
    // CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL
    // implementation on the host.
}

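// A hedged sketch of querying a suggested local size for a 1-D range (a valid
// `queue` and `kernel` with all arguments set are assumed):
//
//     let gws = [1usize << 20, 1, 1];
//     let gwo = [0usize; 3];
//     let mut lws = [0usize; 3];
//     get_kernel_suggested_local_work_size_khr(
//         queue,
//         kernel,
//         1,
//         gwo.as_ptr(),
//         gws.as_ptr(),
//         lws.as_mut_ptr(),
//     )?;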