;; bash$ cat device_execution_overloading.cl
;; void device_kernel_with_local_args(__local float* ptr0, __local float* ptr1) {
;;   *ptr0 = 0;
;;   *ptr1 = 1;
;; }
;;
;; void device_kernel(__global float* ptr) {
;;   *ptr = 3;
;; }
;;
;; __kernel void host_kernel(uint size, __global float* ptr) {
;;   void(^block_with_local)(__local void*, __local void*) = ^(__local void* ptr0, __local void* ptr1){
;;     device_kernel_with_local_args(ptr0, ptr1);
;;   };
;;
;;   void(^block)(void) = ^{
;;     device_kernel(ptr);
;;   };
;;
;;   uint wgSize = get_kernel_work_group_size(block_with_local);
;;   uint prefMul = get_kernel_preferred_work_group_size_multiple(block_with_local);
;;   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1),
;;                  0, NULL, NULL, block_with_local, size, wgSize * prefMul);
;;
;;   wgSize = get_kernel_work_group_size(block);
;;   prefMul = get_kernel_preferred_work_group_size_multiple(block);
;;   enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange_1D(1),
;;                  0, NULL, NULL, block);
;; }
;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -O0 -cl-std=CL2.0 -triple spir64-unknonw-unknown -include $PATH_TO_GEN/lib/clang/3.6.1/include/opencl-20.h -emit-llvm device_execution_overloading.cl -o device_execution_overloading.ll

;; Test overloading of device execution built-ins is OK after translation from SPIR-V

; The commands below assemble this module, translate it to SPIR-V, translate
; the SPIR-V back (-r), and match the disassembled result against the
; FileCheck patterns placed further down in this file.
; RUN: llvm-as %s -o %t.bc
; RUN: llvm-spirv %t.bc -o %t.spv
; RUN: llvm-spirv -r %t.spv -o %t.bc
; RUN: llvm-dis < %t.bc | FileCheck %s

; ModuleID = 'device_execution_overloading.cl'
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
; NOTE(review): the vendor field is spelled "unknonw" both here and in the
; clang command quoted above; it is left untouched -- confirm whether intended.
target triple = "spir64-unknonw-unknown"

%opencl.block = type opaque
%struct.ndrange_t = type { i32, [3 x i64], [3 x i64], [3 x i64] }
%opencl.queue_t = type opaque
%opencl.clk_event_t = type opaque

; Stores 0.0 through %ptr0 and 1.0 through %ptr1 (both addrspace(3) pointers).
; Function Attrs: nounwind
define spir_func void @device_kernel_with_local_args(float addrspace(3)* %ptr0, float addrspace(3)* %ptr1) #0 {
entry:
  %ptr0.addr = alloca float addrspace(3)*, align 8
  %ptr1.addr = alloca float addrspace(3)*, align 8
  store float addrspace(3)* %ptr0, float addrspace(3)** %ptr0.addr, align 8
  store float addrspace(3)* %ptr1, float addrspace(3)** %ptr1.addr, align 8
  %0 = load float addrspace(3)*, float addrspace(3)** %ptr0.addr, align 8
  store float 0.000000e+00, float addrspace(3)* %0, align 4
  %1 = load float addrspace(3)*, float addrspace(3)** %ptr1.addr, align 8
  store float 1.000000e+00, float addrspace(3)* %1, align 4
  ret void
}

; Stores 3.0 through the addrspace(1) pointer %ptr.
; Function Attrs: nounwind
define spir_func void @device_kernel(float addrspace(1)* %ptr) #0 {
entry:
  %ptr.addr = alloca float addrspace(1)*, align 8
  store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8
  %0 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8
  store float 3.000000e+00, float addrspace(1)* %0, align 4
  ret void
}

; Expected names of the built-in overloads that take the block with two
; addrspace(3) pointer parameters (the variadic enqueue_kernel form).
; CHECK: @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE
; CHECK: @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE
; CHECK: @_Z14enqueue_kernel9ocl_queue{{.*}}9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz

; Expected names of the built-in overloads that take the void(void) block.
; CHECK: @_Z26get_kernel_work_group_sizeU13block_pointerFvvE
; CHECK: @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE
; CHECK: @_Z14enqueue_kernel9ocl_queue{{.*}}9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE

; Kernel entry point. Binds two blocks via @spir_block_bind, then for each
; block calls the work-group-size query, the preferred-multiple query and
; enqueue_kernel, using the overload matching that block's signature.
; Function Attrs: nounwind
define spir_kernel void @host_kernel(i32 %size, float addrspace(1)* %ptr) #0 {
entry:
  %size.addr = alloca i32, align 4
  %ptr.addr = alloca float addrspace(1)*, align 8
  %block_with_local = alloca %opencl.block*, align 8
  %block = alloca %opencl.block*, align 8
  %captured = alloca <{ float addrspace(1)* }>, align 8
  %wgSize = alloca i32, align 4
  %prefMul = alloca i32, align 4
  %agg.tmp = alloca %struct.ndrange_t, align 8
  %agg.tmp8 = alloca %struct.ndrange_t, align 8
  store i32 %size, i32* %size.addr, align 4
  store float addrspace(1)* %ptr, float addrspace(1)** %ptr.addr, align 8
  ; First block: bound with a null context pointer and zero for both i32 arguments.
  %0 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*, i8 addrspace(3)*, i8 addrspace(3)*)* @__host_kernel_block_invoke to i8*), i32 0, i32 0, i8* null)
  store %opencl.block* %0, %opencl.block** %block_with_local, align 8
  ; Second block: %ptr is copied into %captured, which is passed as the context.
  %block.captured = getelementptr inbounds <{float addrspace(1)* }>, <{ float addrspace(1)* }>* %captured, i32 0, i32 0
  %1 = load float addrspace(1)*, float addrspace(1)** %ptr.addr, align 8
  store float addrspace(1)* %1, float addrspace(1)** %block.captured, align 8
  %2 = bitcast <{ float addrspace(1)* }>* %captured to i8*
  %3 = call %opencl.block* @spir_block_bind(i8* bitcast (void (i8*)* @__host_kernel_block_invoke_2 to i8*), i32 8, i32 8, i8* %2)
  store %opencl.block* %3, %opencl.block** %block, align 8
  ; Queries and enqueue for %block_with_local.
  %4 = load %opencl.block*, %opencl.block** %block_with_local, align 8
  %call = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block* %4)
  store i32 %call, i32* %wgSize, align 4
  %5 = load %opencl.block*, %opencl.block** %block_with_local, align 8
  %call2 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block* %5)
  store i32 %call2, i32* %prefMul, align 4
  %call3 = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp, i64 1)
  %6 = load %opencl.block*, %opencl.block** %block_with_local, align 8
  %7 = load i32, i32* %size.addr, align 4
  %8 = load i32, i32* %wgSize, align 4
  %9 = load i32, i32* %prefMul, align 4
  %mul = mul i32 %8, %9
  ; The two trailing i32 operands are %size and %wgSize * %prefMul, as in the
  ; OpenCL source quoted at the top of the file.
  %call4 = call spir_func i32 (%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...)* @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t* %call3, i32 241, %struct.ndrange_t* byval %agg.tmp, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %6, i32 %7, i32 %mul)
  ; Queries and enqueue for %block (the void(void) block).
  %10 = load %opencl.block*, %opencl.block** %block, align 8
  %call5 = call spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block* %10)
  store i32 %call5, i32* %wgSize, align 4
  %11 = load %opencl.block*, %opencl.block** %block, align 8
  %call6 = call spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block* %11)
  store i32 %call6, i32* %prefMul, align 4
  %call7 = call spir_func %opencl.queue_t* @_Z17get_default_queuev()
  call spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret %agg.tmp8, i64 1)
  %12 = load %opencl.block*, %opencl.block** %block, align 8
  %call9 = call spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t* %call7, i32 241, %struct.ndrange_t* byval %agg.tmp8, i32 0, %opencl.clk_event_t** null, %opencl.clk_event_t** null, %opencl.block* %12)
  ret void
}

; Invoke function bound to %block_with_local: casts both i8 addrspace(3)
; arguments to float addrspace(3)* and forwards them to
; @device_kernel_with_local_args.
; Function Attrs: nounwind
define internal spir_func void @__host_kernel_block_invoke(i8* %.block_descriptor, i8 addrspace(3)* %ptr0, i8 addrspace(3)* %ptr1) #0 {
entry:
  %.block_descriptor.addr = alloca i8*, align 8
  %ptr0.addr = alloca i8 addrspace(3)*, align 8
  %ptr1.addr = alloca i8 addrspace(3)*, align 8
  %block.addr = alloca <{}>*, align 8
  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
  ; %0 is loaded but not used by any later instruction in this function.
  %0 = load i8*, i8** %.block_descriptor.addr
  store i8 addrspace(3)* %ptr0, i8 addrspace(3)** %ptr0.addr, align 8
  store i8 addrspace(3)* %ptr1, i8 addrspace(3)** %ptr1.addr, align 8
  %block = bitcast i8* %.block_descriptor to <{}>*
  store <{}>* %block, <{}>** %block.addr, align 8
  %1 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr0.addr, align 8
  %2 = bitcast i8 addrspace(3)* %1 to float addrspace(3)*
  %3 = load i8 addrspace(3)*, i8 addrspace(3)** %ptr1.addr, align 8
  %4 = bitcast i8 addrspace(3)* %3 to float addrspace(3)*
  call spir_func void @device_kernel_with_local_args(float addrspace(3)* %2, float addrspace(3)* %4)
  ret void
}

declare %opencl.block* @spir_block_bind(i8*, i32, i32, i8*)

; Invoke function bound to %block: reads the float addrspace(1)* stored in the
; structure that %.block_descriptor points to and passes it to @device_kernel.
; Function Attrs: nounwind
define internal spir_func void @__host_kernel_block_invoke_2(i8* %.block_descriptor) #0 {
entry:
  %.block_descriptor.addr = alloca i8*, align 8
  %block.addr = alloca <{ float addrspace(1)* }>*, align 8
  store i8* %.block_descriptor, i8** %.block_descriptor.addr, align 8
  ; %0 is loaded but not used by any later instruction in this function.
  %0 = load i8*, i8** %.block_descriptor.addr
  %block = bitcast i8* %.block_descriptor to <{ float addrspace(1)* }>*
  store <{ float addrspace(1)* }>* %block, <{ float addrspace(1)* }>** %block.addr, align 8
  %block.capture.addr = getelementptr inbounds <{ float addrspace(1)* }>, <{ float addrspace(1)* }>* %block, i32 0, i32 0
  %1 = load float addrspace(1)*, float addrspace(1)** %block.capture.addr, align 8
  call spir_func void @device_kernel(float addrspace(1)* %1)
  ret void
}

; Declarations of the built-ins called from @host_kernel; the first three are
; the overloads used with %block_with_local, the last three those used with %block.
declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvPU3AS3vzE(%opencl.block*) #1

declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvPU3AS3vzE(%opencl.block*) #1

declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvPU3AS3vzEjz(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*, i32, ...) #1

declare spir_func %opencl.queue_t* @_Z17get_default_queuev() #1

declare spir_func void @_Z10ndrange_1Dm(%struct.ndrange_t* sret, i64) #1

declare spir_func i32 @_Z26get_kernel_work_group_sizeU13block_pointerFvvE(%opencl.block*) #1

declare spir_func i32 @_Z45get_kernel_preferred_work_group_size_multipleU13block_pointerFvvE(%opencl.block*) #1

declare spir_func i32 @_Z14enqueue_kernel9ocl_queuei9ndrange_tjPK12ocl_clkeventP12ocl_clkeventU13block_pointerFvvE(%opencl.queue_t*, i32, %struct.ndrange_t* byval, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, %opencl.block*) #1

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

!opencl.kernels = !{!0}
!opencl.enable.FP_CONTRACT = !{}
!opencl.spir.version = !{!6}
!opencl.ocl.version = !{!7}
!opencl.used.extensions = !{!8}
!opencl.used.optional.core.features = !{!8}
!opencl.compiler.options = !{!8}

!0 = !{void (i32, float addrspace(1)*)* @host_kernel, !1, !2, !3, !4, !5}
!1 = !{!"kernel_arg_addr_space", i32 0, i32 1}
!2 = !{!"kernel_arg_access_qual", !"none", !"none"}
!3 = !{!"kernel_arg_type", !"uint", !"float*"}
!4 = !{!"kernel_arg_base_type", !"uint", !"float*"}
!5 = !{!"kernel_arg_type_qual", !"", !""}
!6 = !{i32 1, i32 2}
!7 = !{i32 2, i32 0}
!8 = !{}