1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <stdio.h>
17 #include <string.h>
18 #include "harness/testHarness.h"
19 #include "harness/typeWrappers.h"
20
21 #include <vector>
22
23 #include "procs.h"
24 #include "utils.h"
25 #include <time.h>
26
27
28 #ifdef CL_VERSION_2_0
29 extern int gWimpyMode;
// Device-side OpenCL C source: every work-item of the parent kernel enqueues
// one child 1D NDRange per entry of glob_size_arr (global size only, no local
// size or offset).  Each child work-item atomically increments val[gid % len]
// so the host can verify the total number of child work-items that executed.
// res[tid] is set to -1 (and the kernel returns early) on any enqueue failure.
// loc_size_arr / ofs_arr are unused here; kept so all helpers share one
// argument list (see kernel_arg args[] in test_enqueue_ndrange).
static const char* helper_ndrange_1d_glo[] =
{
NL, "void block_fn(int len, __global atomic_uint* val)"
NL, "{"
NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
NL, "}"
NL, ""
NL, "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)"
NL, "{"
NL, "  size_t tid = get_global_id(0);"
NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
NL, ""
NL, "  for(int i = 0; i < n; i++)"
NL, "  {"
NL, "    ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);"
NL, "    int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
NL, "  }"
NL, "}"
NL
};
51
// Device-side OpenCL C source: like helper_ndrange_1d_glo but sweeps every
// (global, local) size pair via ndrange_1D(global, local).  Pairs where the
// local size exceeds the global size are skipped — the host-side reference
// generator generate_reference_1D_local applies the same filter.
static const char* helper_ndrange_1d_loc[] =
{
NL, "void block_fn(int len, __global atomic_uint* val)"
NL, "{"
NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
NL, "}"
NL, ""
NL, "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)"
NL, "{"
NL, "  size_t tid = get_global_id(0);"
NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
NL, ""
NL, "  for(int k = 0; k < n; k++)"
NL, "  {"
NL, "    for(int i = 0; i < n; i++)"
NL, "    {"
NL, "      if (glob_size_arr[i] >= loc_size_arr[k])"
NL, "      {"
NL, "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i], loc_size_arr[k]);"
NL, "        int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
NL, "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
NL, "      }"
NL, "    }"
NL, "  }"
NL, "}"
NL
};
79
// Device-side OpenCL C source: sweeps every (offset, global, local) triple via
// ndrange_1D(offset, global, local).  The child block folds the offset into
// the counter index ((offset + linear id) % len), matching the host-side
// reference generator generate_reference_1D_offset.
static const char* helper_ndrange_1d_ofs[] =
{
NL, "void block_fn(int len, __global atomic_uint* val)"
NL, "{"
NL, "  atomic_fetch_add_explicit(&val[(get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
NL, "}"
NL, ""
NL, "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)"
NL, "{"
NL, "  size_t tid = get_global_id(0);"
NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
NL, ""
NL, "  for(int l = 0; l < n; l++)"
NL, "  {"
NL, "    for(int k = 0; k < n; k++)"
NL, "    {"
NL, "      for(int i = 0; i < n; i++)"
NL, "      {"
NL, "        if (glob_size_arr[i] >= loc_size_arr[k])"
NL, "        {"
NL, "          ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], loc_size_arr[k]);"
NL, "          int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
NL, "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
NL, "        }"
NL, "      }"
NL, "    }"
NL, "  }"
NL, "}"
NL
};
110
111 static const char* helper_ndrange_2d_glo[] =
112 {
113 NL, "void block_fn(int len, __global atomic_uint* val)"
114 NL, "{"
115 NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
116 NL, "}"
117 NL, ""
118 NL, "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)"
119 NL, "{"
120 NL, " size_t tid = get_global_id(0);"
121 NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
122 NL, ""
123 NL, " for(int i = 0; i < n; i++)"
124 NL, " {"
125 NL, " size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };"
126 NL, " ndrange_t ndrange = ndrange_2D(glob_size);"
127 NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
128 NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
129 NL, " }"
130 NL, "}"
131 NL
132 };
133
134 static const char* helper_ndrange_2d_loc[] =
135 {
136 NL, "void block_fn(int len, __global atomic_uint* val)"
137 NL, "{"
138 NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
139 NL, "}"
140 NL, ""
141 NL, "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)"
142 NL, "{"
143 NL, " size_t tid = get_global_id(0);"
144 NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
145 NL, ""
146 NL, " for(int k = 0; k < n; k++)"
147 NL, " {"
148 NL, " for(int i = 0; i < n; i++)"
149 NL, " {"
150 NL, " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])"
151 NL, " {"
152 NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };"
153 NL, " size_t loc_size[] = { 1, loc_size_arr[k] };"
154 NL, ""
155 NL, " ndrange_t ndrange = ndrange_2D(glob_size, loc_size);"
156 NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
157 NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
158 NL, " }"
159 NL, " }"
160 NL, " }"
161 NL, "}"
162 NL
163 };
164
165
166 static const char* helper_ndrange_2d_ofs[] =
167 {
168 NL, "void block_fn(int len, __global atomic_uint* val)"
169 NL, "{"
170 NL, " atomic_fetch_add_explicit(&val[(get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
171 NL, "}"
172 NL, ""
173 NL, "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)"
174 NL, "{"
175 NL, " size_t tid = get_global_id(0);"
176 NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
177 NL, ""
178 NL, " for(int l = 0; l < n; l++)"
179 NL, " {"
180 NL, " for(int k = 0; k < n; k++)"
181 NL, " {"
182 NL, " for(int i = 0; i < n; i++)"
183 NL, " {"
184 NL, " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])"
185 NL, " {"
186 NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n]};"
187 NL, " size_t loc_size[] = { 1, loc_size_arr[k] };"
188 NL, " size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };"
189 NL, ""
190 NL, " ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);"
191 NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
192 NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
193 NL, " }"
194 NL, " }"
195 NL, " }"
196 NL, " }"
197 NL, "}"
198 NL
199 };
200
201
202 static const char* helper_ndrange_3d_glo[] =
203 {
204 NL, "void block_fn(int len, __global atomic_uint* val)"
205 NL, "{"
206 NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
207 NL, "}"
208 NL, ""
209 NL, "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)"
210 NL, "{"
211 NL, " size_t tid = get_global_id(0);"
212 NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
213 NL, ""
214 NL, " for(int i = 0; i < n; i++)"
215 NL, " {"
216 NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
217 NL, " if (global_work_size <= (len * len))"
218 NL, " {"
219 NL, " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };"
220 NL, " ndrange_t ndrange = ndrange_3D(glob_size);"
221 NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
222 NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
223 NL, " }"
224 NL, " }"
225 NL, "}"
226 NL
227 };
228
229
230 static const char* helper_ndrange_3d_loc[] =
231 {
232 NL, "void block_fn(int len, __global atomic_uint* val)"
233 NL, "{"
234 NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
235 NL, "}"
236 NL, ""
237 NL, "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)"
238 NL, "{"
239 NL, " size_t tid = get_global_id(0);"
240 NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
241 NL, ""
242 NL, " for(int k = 0; k < n; k++)"
243 NL, " {"
244 NL, " for(int i = 0; i < n; i++)"
245 NL, " {"
246 NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
247 NL, " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))"
248 NL, " {"
249 NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };"
250 NL, " size_t loc_size[] = { 1, 1, loc_size_arr[k] };"
251 NL, " ndrange_t ndrange = ndrange_3D(glob_size,loc_size);"
252 NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
253 NL, " "
254 NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
255 NL, " }"
256 NL, " }"
257 NL, " }"
258 NL, "}"
259 NL
260 };
261
262 static const char* helper_ndrange_3d_ofs[] =
263 {
264 NL, "void block_fn(int len, __global atomic_uint* val)"
265 NL, "{"
266 NL, " atomic_fetch_add_explicit(&val[(get_global_offset(2) * get_global_size(0) * get_global_size(1) + get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
267 NL, "}"
268 NL, ""
269 NL, "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)"
270 NL, "{"
271 NL, " size_t tid = get_global_id(0);"
272 NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
273 NL, ""
274 NL, " for(int l = 0; l < n; l++)"
275 NL, " {"
276 NL, " for(int k = 0; k < n; k++)"
277 NL, " {"
278 NL, " for(int i = 0; i < n; i++)"
279 NL, " {"
280 NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
281 NL, " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))"
282 NL, " {"
283 NL, " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n]};"
284 NL, " size_t loc_size[3] = { 1, 1, loc_size_arr[k] };"
285 NL, " size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l + 2) % n] };"
286 NL, " ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);"
287 NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
288 NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
289 NL, " }"
290 NL, " }"
291 NL, " }"
292 NL, " }"
293 NL, "}"
294 NL
295 };
296
// Table of helper kernels to run.  Per entry: kernel source, NDRange
// dimensionality, whether the helper passes a local size, and whether it
// passes a global offset — the last three select the matching host-side
// reference generator in check_kernel_results().
static const kernel_src_dim_check sources_ndrange_Xd[] =
{
    { KERNEL(helper_ndrange_1d_glo), 1, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_1d_loc), 1, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_1d_ofs), 1, CL_TRUE, CL_TRUE},
    { KERNEL(helper_ndrange_2d_glo), 2, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_2d_loc), 2, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_2d_ofs), 2, CL_TRUE, CL_TRUE},
    { KERNEL(helper_ndrange_3d_glo), 3, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_3d_loc), 3, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_3d_ofs), 3, CL_TRUE, CL_TRUE},
};
static const size_t num_kernels_ndrange_Xd = arr_size(sources_ndrange_Xd);
310
check_kernel_results(cl_int * results,cl_int len)311 static int check_kernel_results(cl_int* results, cl_int len)
312 {
313 for(cl_int i = 0; i < len; ++i)
314 {
315 if(results[i] != 0) return i;
316 }
317 return -1;
318 }
319
generate_reference_1D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr)320 void generate_reference_1D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr)
321 {
322 for (size_t g = 0; g < glob_size_arr.size(); ++g)
323 {
324 for (size_t w = 0; w < glob_size_arr[g]; ++w)
325 {
326 ++reference_results[w];
327 }
328 }
329 }
330
generate_reference_1D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr)331 void generate_reference_1D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr)
332 {
333 for (size_t g = 0; g < glob_size_arr.size(); ++g)
334 {
335 for (size_t l = 0; l < loc_size_arr.size(); ++l)
336 {
337 if (glob_size_arr[g] >= loc_size_arr[l])
338 {
339 for (size_t w = 0; w < glob_size_arr[g]; ++w)
340 {
341 ++reference_results[w];
342 }
343 }
344 }
345 }
346 }
347
generate_reference_1D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)348 void generate_reference_1D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
349 {
350 for (size_t g = 0; g < glob_size_arr.size(); ++g)
351 {
352 for (size_t l = 0; l < loc_size_arr.size(); ++l)
353 {
354 if (glob_size_arr[g] >= loc_size_arr[l])
355 {
356 for (size_t o = 0; o < offset.size(); ++o)
357 {
358 for (size_t w = 0; w < glob_size_arr[g]; ++w)
359 {
360 ++reference_results[(offset[o] + w) % len];
361 }
362 }
363 }
364 }
365 }
366 }
367
generate_reference_2D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)368 void generate_reference_2D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
369 {
370 for (size_t g = 0; g < glob_size_arr.size(); ++g)
371 {
372 for (size_t h = 0; h < glob_size_arr[(g + 1) % glob_size_arr.size()]; ++h)
373 {
374 for (size_t w = 0; w < glob_size_arr[g]; ++w)
375 {
376 ++reference_results[(h * glob_size_arr[g] + w) % len];
377 }
378 }
379 }
380 }
381
generate_reference_2D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)382 void generate_reference_2D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
383 {
384 size_t n = glob_size_arr.size();
385 for (size_t g = 0; g < glob_size_arr.size(); ++g)
386 {
387 for (size_t l = 0; l < loc_size_arr.size(); ++l)
388 {
389 if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
390 {
391 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
392 {
393 for (size_t w = 0; w < glob_size_arr[g]; ++w)
394 {
395 ++reference_results[(h * glob_size_arr[g] + w) % len];
396 }
397 }
398 }
399 }
400 }
401 }
402
generate_reference_2D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)403 void generate_reference_2D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
404 {
405 size_t n = glob_size_arr.size();
406 for (size_t g = 0; g < glob_size_arr.size(); ++g)
407 {
408 for (size_t l = 0; l < loc_size_arr.size(); ++l)
409 {
410 if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
411 {
412 for (size_t o = 0; o < offset.size(); ++o)
413 {
414 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
415 {
416 for (size_t w = 0; w < glob_size_arr[g]; ++w)
417 {
418 ++reference_results[(glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + h * glob_size_arr[g] + w) % len];
419 }
420 }
421 }
422 }
423 }
424 }
425 }
426
generate_reference_3D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)427 void generate_reference_3D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
428 {
429 size_t n = glob_size_arr.size();
430 for (size_t g = 0; g < glob_size_arr.size(); ++g)
431 {
432 size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
433 if(global_work_size <= (len * len))
434 {
435 for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
436 {
437 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
438 {
439 for (size_t w = 0; w < glob_size_arr[g]; ++w)
440 {
441 ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
442 }
443 }
444 }
445 }
446 }
447 }
448
generate_reference_3D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)449 void generate_reference_3D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
450 {
451 size_t n = glob_size_arr.size();
452 for (size_t g = 0; g < glob_size_arr.size(); ++g)
453 {
454 for (size_t l = 0; l < loc_size_arr.size(); ++l)
455 {
456 size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
457 if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
458 {
459 for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
460 {
461 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
462 {
463 for (size_t w = 0; w < glob_size_arr[g]; ++w)
464 {
465 ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
466 }
467 }
468 }
469 }
470 }
471 }
472 }
473
generate_reference_3D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)474 void generate_reference_3D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
475 {
476 size_t n = glob_size_arr.size();
477 for (size_t g = 0; g < glob_size_arr.size(); ++g)
478 {
479 for (size_t l = 0; l < loc_size_arr.size(); ++l)
480 {
481 size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
482 if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
483 {
484 for (size_t o = 0; o < offset.size(); ++o)
485 {
486 for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
487 {
488 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
489 {
490 for (size_t w = 0; w < glob_size_arr[g]; ++w)
491 {
492 ++reference_results[(glob_size_arr[g] * glob_size_arr[(g + 1) % n] * offset[(o + 2) % n] + glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
493 }
494 }
495 }
496 }
497 }
498 }
499 }
500 }
501
check_kernel_results(cl_int * results,cl_int len,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_int dim,cl_bool use_local,cl_bool use_offset)502 static int check_kernel_results(cl_int* results, cl_int len, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_int dim, cl_bool use_local, cl_bool use_offset)
503 {
504 std::vector<cl_int> reference_results(len, 0);
505 switch (dim)
506 {
507 case 1:
508 if (use_local == CL_FALSE)
509 {
510 generate_reference_1D(reference_results, glob_size_arr);
511 }
512 else if(use_local == CL_TRUE && use_offset == CL_FALSE)
513 {
514 generate_reference_1D_local(reference_results, glob_size_arr, loc_size_arr);
515 }
516 else
517 {
518 generate_reference_1D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
519 }
520 break;
521 case 2:
522 if (use_local == CL_FALSE)
523 {
524 generate_reference_2D(reference_results, glob_size_arr, len);
525 }
526 else if (use_local == CL_TRUE && use_offset == CL_FALSE)
527 {
528 generate_reference_2D_local(reference_results, glob_size_arr, loc_size_arr, len);
529 }
530 else
531 {
532 generate_reference_2D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
533 }
534 break;
535 case 3:
536 if (use_local == CL_FALSE)
537 {
538 generate_reference_3D(reference_results, glob_size_arr, len);
539 }
540 else if (use_local == CL_TRUE && use_offset == CL_FALSE)
541 {
542 generate_reference_3D_local(reference_results, glob_size_arr, loc_size_arr, len);
543 }
544 else
545 {
546 generate_reference_3D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
547 }
548 break;
549 default:
550 return 0;
551 break;
552 }
553
554 for (cl_int i = 0; i < len; ++i)
555 {
556 if (results[i] != reference_results[i])
557 {
558 log_error("ERROR: Kernel returned %d vs. expected %d\n", results[i], reference_results[i]);
559 return i;
560 }
561 }
562
563 return -1;
564 }
565
test_enqueue_ndrange(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)566 int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
567 {
568 MTdata d;
569 cl_uint i;
570 cl_int err_ret, res = 0;
571 clCommandQueueWrapper dev_queue;
572 cl_int k, kernel_results[MAX_GWS] = { 0 };
573
574 size_t ret_len;
575 cl_uint max_queues = 1;
576 cl_uint maxQueueSize = 0;
577
578 d = init_genrand(gRandomSeed);
579
580 err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0);
581 test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed");
582
583 err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(max_queues), &max_queues, &ret_len);
584 test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_ON_DEVICE_QUEUES) failed");
585
586 size_t max_local_size = 1;
587 err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len);
588 test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");
589
590 cl_queue_properties queue_prop_def[] =
591 {
592 CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
593 CL_QUEUE_SIZE, maxQueueSize,
594 0
595 };
596
597 dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret);
598 test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed");
599
600 max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size;
601 if(gWimpyMode)
602 {
603 max_local_size = MIN(8, max_local_size);
604 }
605
606 cl_uint num = 10;
607 cl_uint global_work_size = max_local_size * 2;
608 std::vector<cl_uint> glob_size_arr(num);
609 std::vector<cl_uint> loc_size_arr(num);
610 std::vector<cl_uint> ofs_arr(num);
611 std::vector<cl_int> glob_results(global_work_size, 0);
612
613 glob_size_arr[0] = 1;
614 glob_size_arr[1] = global_work_size;
615 loc_size_arr[0] = 1;
616 loc_size_arr[1] = max_local_size;
617 ofs_arr[0] = 0;
618 ofs_arr[1] = 1;
619
620 for(i = 2; i < num; ++i)
621 {
622 glob_size_arr[i] = genrand_int32(d) % global_work_size;
623 glob_size_arr[i] = glob_size_arr[i] ? glob_size_arr[i]: 1;
624 loc_size_arr[i] = genrand_int32(d) % max_local_size;
625 loc_size_arr[i] = loc_size_arr[i] ? loc_size_arr[i]: 1;
626 ofs_arr[i] = genrand_int32(d) % global_work_size;
627 }
628
629 // check ndrange_dX functions
630 size_t failCnt = 0;
631 for(i = 0; i < num_kernels_ndrange_Xd; ++i)
632 {
633 if (!gKernelName.empty() && gKernelName != sources_ndrange_Xd[i].src.kernel_name)
634 continue;
635
636 clMemWrapper mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, glob_size_arr.size() * sizeof(cl_uint), &glob_size_arr[0], &err_ret);
637 test_error(err_ret, "clCreateBuffer() failed");
638 clMemWrapper mem2 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, loc_size_arr.size() * sizeof(cl_uint), &loc_size_arr[0], &err_ret);
639 test_error(err_ret, "clCreateBuffer() failed");
640 clMemWrapper mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, glob_results.size() * sizeof(cl_int), &glob_results[0], &err_ret);
641 test_error(err_ret, "clCreateBuffer() failed");
642 clMemWrapper mem4 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, ofs_arr.size() * sizeof(cl_uint), &ofs_arr[0], &err_ret);
643 test_error(err_ret, "clCreateBuffer() failed");
644
645 kernel_arg args[] =
646 {
647 { sizeof(cl_uint), &num },
648 { sizeof(cl_uint), &global_work_size },
649 { sizeof(cl_mem), &mem1 },
650 { sizeof(cl_mem), &mem2 },
651 { sizeof(cl_mem), &mem3 },
652 { sizeof(cl_mem), &mem4 },
653 };
654
655 log_info("Running '%s' kernel (%d of %d) ...\n", sources_ndrange_Xd[i].src.kernel_name, i + 1, num_kernels_ndrange_Xd);
656 err_ret = run_single_kernel_args(context, queue, sources_ndrange_Xd[i].src.lines, sources_ndrange_Xd[i].src.num_lines, sources_ndrange_Xd[i].src.kernel_name, kernel_results, sizeof(kernel_results), arr_size(args), args);
657
658 cl_int *ptr = (cl_int *)clEnqueueMapBuffer(queue, mem3, CL_TRUE, CL_MAP_READ, 0, glob_results.size() * sizeof(cl_int), 0, 0, 0, &err_ret);
659 test_error(err_ret, "clEnqueueMapBuffer() failed");
660
661 if(check_error(err_ret, "'%s' kernel execution failed", sources_ndrange_Xd[i].src.kernel_name)) { ++failCnt; res = -1; }
662 else if((k = check_kernel_results(kernel_results, arr_size(kernel_results))) >= 0 && check_error(-1, "'%s' kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, kernel_results[k])) res = -1;
663 else if((k = check_kernel_results(ptr, global_work_size, glob_size_arr, loc_size_arr, ofs_arr, sources_ndrange_Xd[i].dim, sources_ndrange_Xd[i].localSize, sources_ndrange_Xd[i].offset)) >= 0 && check_error(-1, "'%s' global kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, glob_results[k])) res = -1;
664 else log_info("'%s' kernel is OK.\n", sources_ndrange_Xd[i].src.kernel_name);
665
666 err_ret = clEnqueueUnmapMemObject(queue, mem3, ptr, 0, 0, 0);
667 test_error(err_ret, "clEnqueueUnmapMemObject() failed");
668
669 }
670
671 if (failCnt > 0)
672 {
673 log_error("ERROR: %d of %d kernels failed.\n", failCnt, num_kernels_ndrange_Xd);
674 }
675
676 return res;
677 }
678
679
680 #endif
681
682