use crate::api::event::create_and_queue;
use crate::api::icd::*;
use crate::api::util::*;
use crate::core::device::*;
use crate::core::event::*;
use crate::core::kernel::*;
use crate::core::memory::*;
use crate::core::program::*;
use crate::core::queue::*;

use mesa_rust_util::ptr::*;
use mesa_rust_util::string::*;
use rusticl_opencl_gen::*;
use rusticl_proc_macros::cl_entrypoint;
use rusticl_proc_macros::cl_info_entrypoint;

use std::cmp;
use std::mem::{self, MaybeUninit};
use std::os::raw::c_void;
use std::ptr;
use std::slice;
use std::sync::Arc;

#[cl_info_entrypoint(cl_get_kernel_info)]
impl CLInfo<cl_kernel_info> for cl_kernel {
    fn query(&self, q: cl_kernel_info, _: &[u8]) -> CLResult<Vec<MaybeUninit<u8>>> {
        let kernel = Kernel::ref_from_raw(*self)?;
        Ok(match q {
            CL_KERNEL_ATTRIBUTES => cl_prop::<&str>(&kernel.kernel_info.attributes_string),
            CL_KERNEL_CONTEXT => {
                let ptr = Arc::as_ptr(&kernel.prog.context);
                cl_prop::<cl_context>(cl_context::from_ptr(ptr))
            }
            CL_KERNEL_FUNCTION_NAME => cl_prop::<&str>(&kernel.name),
            CL_KERNEL_NUM_ARGS => cl_prop::<cl_uint>(kernel.kernel_info.args.len() as cl_uint),
            CL_KERNEL_PROGRAM => {
                let ptr = Arc::as_ptr(&kernel.prog);
                cl_prop::<cl_program>(cl_program::from_ptr(ptr))
            }
            CL_KERNEL_REFERENCE_COUNT => cl_prop::<cl_uint>(Kernel::refcnt(*self)?),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
    }
}

#[cl_info_entrypoint(cl_get_kernel_arg_info)]
impl CLInfoObj<cl_kernel_arg_info, cl_uint> for cl_kernel {
    fn query(&self, idx: cl_uint, q: cl_kernel_arg_info) -> CLResult<Vec<MaybeUninit<u8>>> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
        if idx as usize >= kernel.kernel_info.args.len() {
            return Err(CL_INVALID_ARG_INDEX);
        }

        Ok(match q {
            CL_KERNEL_ARG_ACCESS_QUALIFIER => {
                cl_prop::<cl_kernel_arg_access_qualifier>(kernel.access_qualifier(idx))
            }
            CL_KERNEL_ARG_ADDRESS_QUALIFIER => {
                cl_prop::<cl_kernel_arg_address_qualifier>(kernel.address_qualifier(idx))
            }
            CL_KERNEL_ARG_NAME => cl_prop::<&str>(kernel.arg_name(idx)),
            CL_KERNEL_ARG_TYPE_NAME => cl_prop::<&str>(kernel.arg_type_name(idx)),
            CL_KERNEL_ARG_TYPE_QUALIFIER => {
                cl_prop::<cl_kernel_arg_type_qualifier>(kernel.type_qualifier(idx))
            }
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
    }
}

#[cl_info_entrypoint(cl_get_kernel_work_group_info)]
impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
    fn query(
        &self,
        dev: cl_device_id,
        q: cl_kernel_work_group_info,
    ) -> CLResult<Vec<MaybeUninit<u8>>> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
        // with kernel.
        let dev = if dev.is_null() {
            if kernel.prog.devs.len() > 1 {
                return Err(CL_INVALID_DEVICE);
            } else {
                kernel.prog.devs[0]
            }
        } else {
            Device::ref_from_raw(dev)?
        };

        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
        if !kernel.prog.devs.contains(&dev) {
            return Err(CL_INVALID_DEVICE);
        }

        Ok(match q {
            CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size()),
            CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(dev)),
            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
                cl_prop::<usize>(kernel.preferred_simd_size(dev))
            }
            CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(dev)),
            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(dev)),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
    }
}

impl CLInfoObj<cl_kernel_sub_group_info, (cl_device_id, usize, *const c_void, usize)>
    for cl_kernel
{
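    // The extra payload is the (device, input_value_size, input_value, param_value_size)
    // tuple that a clGetKernelSubGroupInfo call supplies.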
    fn query(
        &self,
        (dev, input_value_size, input_value, output_value_size): (
            cl_device_id,
            usize,
            *const c_void,
            usize,
        ),
        q: cl_kernel_sub_group_info,
    ) -> CLResult<Vec<MaybeUninit<u8>>> {
        let kernel = Kernel::ref_from_raw(*self)?;

        // CL_INVALID_DEVICE [..] if device is NULL but there is more than one device associated
        // with kernel.
        let dev = if dev.is_null() {
            if kernel.prog.devs.len() > 1 {
                return Err(CL_INVALID_DEVICE);
            } else {
                kernel.prog.devs[0]
            }
        } else {
            Device::ref_from_raw(dev)?
        };

        // CL_INVALID_DEVICE if device is not in the list of devices associated with kernel
        if !kernel.prog.devs.contains(&dev) {
            return Err(CL_INVALID_DEVICE);
        }

        // CL_INVALID_OPERATION if device does not support subgroups.
        if !dev.subgroups_supported() {
            return Err(CL_INVALID_OPERATION);
        }

        let usize_byte = mem::size_of::<usize>();
        // first we have to convert the input to a proper thing
        let input: &[usize] = match q {
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE | CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
                // CL_INVALID_VALUE if param_name is CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                // CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE or ... and the size in bytes specified by
                // input_value_size is not valid or if input_value is NULL.
                if input_value.is_null()
                    || ![usize_byte, 2 * usize_byte, 3 * usize_byte].contains(&input_value_size)
                {
                    return Err(CL_INVALID_VALUE);
                }
                // SAFETY: we verified the pointer and size as best as possible, with the rest we
                // trust the client
                unsafe { slice::from_raw_parts(input_value.cast(), input_value_size / usize_byte) }
            }
            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
                // CL_INVALID_VALUE if param_name is ... CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT
                // and the size in bytes specified by input_value_size is not valid or if
                // input_value is NULL.
                if input_value_size != usize_byte || input_value.is_null() {
                    return Err(CL_INVALID_VALUE);
                }
                // SAFETY: we trust the client here
                unsafe { slice::from_raw_parts(input_value.cast(), 1) }
            }
            _ => &[],
        };

        Ok(match q {
            CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE => {
                cl_prop::<usize>(kernel.subgroups_for_block(dev, input))
            }
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE => {
                cl_prop::<usize>(kernel.subgroup_size_for_block(dev, input))
            }
            CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT => {
                let subgroups = input[0];
                let mut res = vec![0; 3];
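                // Search the supported subgroup sizes for a 1-D work-group that yields exactly
                // the requested number of subgroups; if none fits, [0; 3] is reported.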
                for subgroup_size in kernel.subgroup_sizes(dev) {
                    let threads = subgroups * subgroup_size;

                    if threads > dev.max_threads_per_block() {
                        continue;
                    }

                    let block = [threads, 1, 1];
                    let real_subgroups = kernel.subgroups_for_block(dev, &block);

                    if real_subgroups == subgroups {
                        res = block.to_vec();
                        break;
                    }
                }

                res.truncate(output_value_size / usize_byte);
                cl_prop::<Vec<usize>>(res)
            }
            CL_KERNEL_MAX_NUM_SUB_GROUPS => {
                let threads = kernel.max_threads_per_block(dev);
                let max_groups = dev.max_subgroups();

                let mut result = 0;
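                // The highest subgroup count is achieved with the smallest supported subgroup
                // size, clamped to the device limit.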
                for sgs in kernel.subgroup_sizes(dev) {
                    result = cmp::max(result, threads / sgs);
                    result = cmp::min(result, max_groups as usize);
                }
                cl_prop::<usize>(result)
            }
            CL_KERNEL_COMPILE_NUM_SUB_GROUPS => cl_prop::<usize>(kernel.num_subgroups()),
            CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL => cl_prop::<usize>(kernel.subgroup_size()),
            // CL_INVALID_VALUE if param_name is not one of the supported values
            _ => return Err(CL_INVALID_VALUE),
        })
    }
}

const ZERO_ARR: [usize; 3] = [0; 3];

/// # Safety
///
/// This function is only safe when called on an array of `work_dim` length
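/// (or on a null pointer, in which case a zeroed default array is returned).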
unsafe fn kernel_work_arr_or_default<'a>(arr: *const usize, work_dim: cl_uint) -> &'a [usize] {
    if !arr.is_null() {
        unsafe { slice::from_raw_parts(arr, work_dim as usize) }
    } else {
        &ZERO_ARR
    }
}

#[cl_entrypoint]
fn create_kernel(
    program: cl_program,
    kernel_name: *const ::std::os::raw::c_char,
) -> CLResult<cl_kernel> {
    let p = Program::arc_from_raw(program)?;
    let name = c_string_to_string(kernel_name);

    // CL_INVALID_VALUE if kernel_name is NULL.
    if kernel_name.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for program.
    if p.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_NAME if kernel_name is not found in program.
    if !p.kernels().contains(&name) {
        return Err(CL_INVALID_KERNEL_NAME);
    }

    // CL_INVALID_KERNEL_DEFINITION if the function definition for __kernel function given by
    // kernel_name such as the number of arguments, the argument types are not the same for all
    // devices for which the program executable has been built.
    if p.kernel_signatures(&name).len() != 1 {
        return Err(CL_INVALID_KERNEL_DEFINITION);
    }

    Ok(Kernel::new(name, p).into_cl())
}

#[cl_entrypoint]
fn retain_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::retain(kernel)
}

#[cl_entrypoint]
fn release_kernel(kernel: cl_kernel) -> CLResult<()> {
    Kernel::release(kernel)
}

#[cl_entrypoint]
fn create_kernels_in_program(
    program: cl_program,
    num_kernels: cl_uint,
    kernels: *mut cl_kernel,
    num_kernels_ret: *mut cl_uint,
) -> CLResult<()> {
    let p = Program::arc_from_raw(program)?;

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built executable for any device in
    // program.
    if p.kernels().is_empty() {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_VALUE if kernels is not NULL and num_kernels is less than the number of kernels
    // in program.
    if !kernels.is_null() && p.kernels().len() > num_kernels as usize {
        return Err(CL_INVALID_VALUE);
    }

    let mut num_kernels = 0;
    for name in p.kernels() {
        // Kernel objects are not created for any __kernel functions in program that do not have
        // the same function definition across all devices for which a program executable has been
        // successfully built.
        if p.kernel_signatures(&name).len() != 1 {
            continue;
        }

        if !kernels.is_null() {
            // we just assume the client isn't stupid
            unsafe {
                kernels
                    .add(num_kernels as usize)
                    .write(Kernel::new(name, p.clone()).into_cl());
            }
        }
        num_kernels += 1;
    }
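    // num_kernels_ret may be NULL; write_checked is a no-op for null pointers.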
    num_kernels_ret.write_checked(num_kernels);
    Ok(())
}

#[cl_entrypoint]
fn set_kernel_arg(
    kernel: cl_kernel,
    arg_index: cl_uint,
    arg_size: usize,
    arg_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let k = Kernel::ref_from_raw(kernel)?;
    let arg_index = arg_index as usize;

    // CL_INVALID_ARG_INDEX if arg_index is not a valid argument index.
    if let Some(arg) = k.kernel_info.args.get(arg_index) {
        // CL_INVALID_ARG_SIZE if arg_size does not match the size of the data type for an argument
        // that is not a memory object or if the argument is a memory object and
        // arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the
        // local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler).
        match arg.kind {
            KernelArgType::MemLocal => {
                if arg_size == 0 {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
            KernelArgType::MemGlobal
            | KernelArgType::MemConstant
            | KernelArgType::Image
            | KernelArgType::RWImage
            | KernelArgType::Texture => {
                if arg_size != std::mem::size_of::<cl_mem>() {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
            _ => {
                if arg.size != arg_size {
                    return Err(CL_INVALID_ARG_SIZE);
                }
            }
        }

        // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
        match arg.kind {
            // If the argument is declared with the local qualifier, the arg_value entry must be
            // NULL.
            KernelArgType::MemLocal => {
                if !arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            // If the argument is of type sampler_t, the arg_value entry must be a pointer to the
            // sampler object.
            KernelArgType::Constant | KernelArgType::Sampler => {
                if arg_value.is_null() {
                    return Err(CL_INVALID_ARG_VALUE);
                }
            }
            _ => {}
        };

        // let's create the arg now
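        // Arguments marked dead (presumably eliminated during compilation) don't need a value.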
        let arg = unsafe {
            if arg.dead {
                KernelArgValue::None
            } else {
                match arg.kind {
                    KernelArgType::Constant => KernelArgValue::Constant(
                        slice::from_raw_parts(arg_value.cast(), arg_size).to_vec(),
                    ),
                    KernelArgType::MemConstant | KernelArgType::MemGlobal => {
                        let ptr: *const cl_mem = arg_value.cast();
                        if ptr.is_null() || (*ptr).is_null() {
                            KernelArgValue::None
                        } else {
                            KernelArgValue::Buffer(Buffer::arc_from_raw(*ptr)?)
                        }
                    }
                    KernelArgType::MemLocal => KernelArgValue::LocalMem(arg_size),
                    KernelArgType::Image | KernelArgType::RWImage | KernelArgType::Texture => {
                        let img: *const cl_mem = arg_value.cast();
                        KernelArgValue::Image(Image::arc_from_raw(*img)?)
                    }
                    KernelArgType::Sampler => {
                        let ptr: *const cl_sampler = arg_value.cast();
                        KernelArgValue::Sampler(Sampler::arc_from_raw(*ptr)?)
                    }
                }
            }
        };
        k.set_kernel_arg(arg_index, arg)
    } else {
        Err(CL_INVALID_ARG_INDEX)
    }

    //• CL_INVALID_DEVICE_QUEUE for an argument declared to be of type queue_t when the specified
    //  arg_value is not a valid device queue object. This error code is missing before version 2.0.
    //• CL_INVALID_ARG_VALUE if the argument is an image declared with the read_only qualifier and
    //  arg_value refers to an image object created with cl_mem_flags of CL_MEM_WRITE_ONLY or if
    //  the image argument is declared with the write_only qualifier and arg_value refers to an
    //  image object created with cl_mem_flags of CL_MEM_READ_ONLY.
    //• CL_MAX_SIZE_RESTRICTION_EXCEEDED if the size in bytes of the memory object (if the argument
    //  is a memory object) or arg_size (if the argument is declared with local qualifier) exceeds
    //  a language-specified maximum size restriction for this argument, such as the MaxByteOffset
    //  SPIR-V decoration. This error code is missing before version 2.2.
}

#[cl_entrypoint]
fn set_kernel_arg_svm_pointer(
    kernel: cl_kernel,
    arg_index: cl_uint,
    arg_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let kernel = Kernel::ref_from_raw(kernel)?;
    let arg_index = arg_index as usize;
    let arg_value = arg_value as usize;

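    // An SVM pointer can only be meaningful if at least one device in the context supports SVM.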
    if !kernel.has_svm_devs() {
        return Err(CL_INVALID_OPERATION);
    }

    if let Some(arg) = kernel.kernel_info.args.get(arg_index) {
        if !matches!(
            arg.kind,
            KernelArgType::MemConstant | KernelArgType::MemGlobal
        ) {
            return Err(CL_INVALID_ARG_INDEX);
        }

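        // The SVM pointer is passed to the kernel by value, so store the raw address in native
        // byte order.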
        let arg_value = KernelArgValue::Constant(arg_value.to_ne_bytes().to_vec());
        kernel.set_kernel_arg(arg_index, arg_value)
    } else {
        Err(CL_INVALID_ARG_INDEX)
    }

    // CL_INVALID_ARG_VALUE if arg_value specified is not a valid value.
}

#[cl_entrypoint]
fn set_kernel_exec_info(
    kernel: cl_kernel,
    param_name: cl_kernel_exec_info,
    param_value_size: usize,
    param_value: *const ::std::os::raw::c_void,
) -> CLResult<()> {
    let k = Kernel::ref_from_raw(kernel)?;

    // CL_INVALID_OPERATION if no devices in the context associated with kernel support SVM.
    if !k.prog.devs.iter().any(|dev| dev.svm_supported()) {
        return Err(CL_INVALID_OPERATION);
    }

    // CL_INVALID_VALUE ... if param_value is NULL
    if param_value.is_null() {
        return Err(CL_INVALID_VALUE);
    }

    // CL_INVALID_VALUE ... if the size specified by param_value_size is not valid.
    match param_name {
        CL_KERNEL_EXEC_INFO_SVM_PTRS | CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM => {
            // it's a list of pointers
            if param_value_size % mem::size_of::<*const c_void>() != 0 {
                return Err(CL_INVALID_VALUE);
            }
        }
        CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM
        | CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM => {
            if param_value_size != mem::size_of::<cl_bool>() {
                return Err(CL_INVALID_VALUE);
            }
        }
        // CL_INVALID_VALUE if param_name is not valid
        _ => return Err(CL_INVALID_VALUE),
    }

    Ok(())

    // CL_INVALID_OPERATION if param_name is CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM and
    // param_value is CL_TRUE but no devices in context associated with kernel support fine-grain
    // system SVM allocations.
}

#[cl_entrypoint]
fn enqueue_ndrange_kernel(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    work_dim: cl_uint,
    global_work_offset: *const usize,
    global_work_size: *const usize,
    local_work_size: *const usize,
    num_events_in_wait_list: cl_uint,
    event_wait_list: *const cl_event,
    event: *mut cl_event,
) -> CLResult<()> {
    let q = Queue::arc_from_raw(command_queue)?;
    let k = Kernel::arc_from_raw(kernel)?;
    let evs = event_list_from_cl(&q, num_events_in_wait_list, event_wait_list)?;

    // CL_INVALID_CONTEXT if context associated with command_queue and kernel are not the same
    if q.context != k.prog.context {
        return Err(CL_INVALID_CONTEXT);
    }

    // CL_INVALID_PROGRAM_EXECUTABLE if there is no successfully built program executable available
    // for device associated with command_queue.
    if k.prog.status(q.device) != CL_BUILD_SUCCESS as cl_build_status {
        return Err(CL_INVALID_PROGRAM_EXECUTABLE);
    }

    // CL_INVALID_KERNEL_ARGS if the kernel argument values have not been specified.
    if k.arg_values().iter().any(|v| v.is_none()) {
        return Err(CL_INVALID_KERNEL_ARGS);
    }

    // CL_INVALID_WORK_DIMENSION if work_dim is not a valid value (i.e. a value between 1 and
    // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS).
    if work_dim == 0 || work_dim > q.device.max_grid_dimensions() {
        return Err(CL_INVALID_WORK_DIMENSION);
    }

    // we assume the application gets it right and doesn't pass shorter arrays than actually
    // needed.
    let global_work_size = unsafe { kernel_work_arr_or_default(global_work_size, work_dim) };
    let local_work_size = unsafe { kernel_work_arr_or_default(local_work_size, work_dim) };
    let global_work_offset = unsafe { kernel_work_arr_or_default(global_work_offset, work_dim) };

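    // The largest value representable in the device's size_t, which may be narrower than the
    // host's 64 bits.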
    let device_bits = q.device.address_bits();
    let device_max = u64::MAX >> (u64::BITS - device_bits);

    let mut threads = 1;
    for i in 0..work_dim as usize {
        let lws = local_work_size[i];
        let gws = global_work_size[i];
        let gwo = global_work_offset[i];

        threads *= lws;

        // CL_INVALID_WORK_ITEM_SIZE if the number of work-items specified in any of
        // local_work_size[0], … local_work_size[work_dim - 1] is greater than the corresponding
        // values specified by
        // CL_DEVICE_MAX_WORK_ITEM_SIZES[0], …, CL_DEVICE_MAX_WORK_ITEM_SIZES[work_dim - 1].
        if lws > q.device.max_block_sizes()[i] {
            return Err(CL_INVALID_WORK_ITEM_SIZE);
        }

        // CL_INVALID_WORK_GROUP_SIZE if the work-group size must be uniform and the
        // local_work_size is not NULL, [...] if the global_work_size is not evenly divisible by
        // the local_work_size.
        if lws != 0 && gws % lws != 0 {
            return Err(CL_INVALID_WORK_GROUP_SIZE);
        }

        // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and does not match the
        // required work-group size for kernel in the program source.
        if lws != 0 && k.work_group_size()[i] != 0 && lws != k.work_group_size()[i] {
            return Err(CL_INVALID_WORK_GROUP_SIZE);
        }

        // CL_INVALID_GLOBAL_WORK_SIZE if any of the values specified in global_work_size[0], …
        // global_work_size[work_dim - 1] exceed the maximum value representable by size_t on
        // the device on which the kernel-instance will be enqueued.
        if gws as u64 > device_max {
            return Err(CL_INVALID_GLOBAL_WORK_SIZE);
        }

        // CL_INVALID_GLOBAL_OFFSET if the value specified in global_work_size + the
        // corresponding values in global_work_offset for any dimensions is greater than the
        // maximum value representable by size_t on the device on which the kernel-instance
        // will be enqueued
        if u64::checked_add(gws as u64, gwo as u64)
            .filter(|&x| x <= device_max)
            .is_none()
        {
            return Err(CL_INVALID_GLOBAL_OFFSET);
        }
    }

    // CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the total number of
    // work-items in the work-group computed as local_work_size[0] × … local_work_size[work_dim -
    // 1] is greater than the value specified by CL_KERNEL_WORK_GROUP_SIZE in the Kernel Object
    // Device Queries table.
    if threads != 0 && threads > k.max_threads_per_block(q.device) {
        return Err(CL_INVALID_WORK_GROUP_SIZE);
    }

    // If global_work_size is NULL, or the value in any passed dimension is 0 then the kernel
    // command will trivially succeed after its event dependencies are satisfied and subsequently
    // update its completion event.
    let cb: EventSig = if global_work_size.contains(&0) {
        Box::new(|_, _| Ok(()))
    } else {
        k.launch(
            &q,
            work_dim,
            local_work_size,
            global_work_size,
            global_work_offset,
        )?
    };

    create_and_queue(q, CL_COMMAND_NDRANGE_KERNEL, evs, event, false, cb)

    //• CL_INVALID_WORK_GROUP_SIZE if local_work_size is specified and is not consistent with the
    //  required number of sub-groups for kernel in the program source.
    //• CL_MISALIGNED_SUB_BUFFER_OFFSET if a sub-buffer object is specified as the value for an
    //  argument that is a buffer object and the offset specified when the sub-buffer object is
    //  created is not aligned to CL_DEVICE_MEM_BASE_ADDR_ALIGN value for device associated with
    //  queue. This error code is missing before version 1.1.
    //• CL_INVALID_IMAGE_SIZE if an image object is specified as an argument value and the image
    //  dimensions (image width, height, specified or compute row and/or slice pitch) are not
    //  supported by device associated with queue.
    //• CL_IMAGE_FORMAT_NOT_SUPPORTED if an image object is specified as an argument value and the
    //  image format (image channel order and data type) is not supported by device associated
    //  with queue.
    //• CL_OUT_OF_RESOURCES if there is a failure to queue the execution instance of kernel on the
    //  command-queue because of insufficient resources needed to execute the kernel. For example,
    //  the explicitly specified local_work_size causes a failure to execute the kernel because of
    //  insufficient resources such as registers or local memory. Another example would be the
    //  number of read-only image args used in kernel exceed the CL_DEVICE_MAX_READ_IMAGE_ARGS
    //  value for device or the number of write-only and read-write image args used in kernel
    //  exceed the CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS value for device or the number of samplers
    //  used in kernel exceed CL_DEVICE_MAX_SAMPLERS for device.
    //• CL_MEM_OBJECT_ALLOCATION_FAILURE if there is a failure to allocate memory for data store
    //  associated with image or buffer objects specified as arguments to kernel.
    //• CL_INVALID_OPERATION if SVM pointers are passed as arguments to a kernel and the device
    //  does not support SVM or if system pointers are passed as arguments to a kernel and/or
    //  stored inside SVM allocations passed as kernel arguments and the device does not support
    //  fine grain system SVM allocations.
}

#[cl_entrypoint]
fn enqueue_task(
    command_queue: cl_command_queue,
    kernel: cl_kernel,
    num_events_in_wait_list: cl_uint,
    event_wait_list: *const cl_event,
    event: *mut cl_event,
) -> CLResult<()> {
    // clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim set to 1,
    // global_work_offset set to NULL, global_work_size[0] set to 1, and local_work_size[0] set to
    // 1.
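    // Only element 0 of the size arrays is read, as work_dim is 1.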
    enqueue_ndrange_kernel(
        command_queue,
        kernel,
        1,
        ptr::null(),
        [1, 1, 1].as_ptr(),
        [1, 0, 0].as_ptr(),
        num_events_in_wait_list,
        event_wait_list,
        event,
    )
}

#[cl_entrypoint]
fn clone_kernel(source_kernel: cl_kernel) -> CLResult<cl_kernel> {
    let k = Kernel::ref_from_raw(source_kernel)?;
    Ok(Arc::new(k.clone()).into_cl())
}