• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <stdio.h>
17 #include <string.h>
18 #include "harness/testHarness.h"
19 #include "harness/typeWrappers.h"
20 
21 #include <vector>
22 
23 #include "procs.h"
24 #include "utils.h"
25 #include <time.h>
26 
27 
28 #ifdef CL_VERSION_2_0
29 extern int gWimpyMode;
30 static const char* helper_ndrange_1d_glo[] =
31 {
32     NL, "void block_fn(int len, __global atomic_uint* val)"
33     NL, "{"
34     NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
35     NL, "}"
36     NL, ""
37     NL, "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val,  __global uint* ofs_arr)"
38     NL, "{"
39     NL, "  size_t tid = get_global_id(0);"
40     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
41     NL, ""
42     NL, "  for(int i = 0; i < n; i++)"
43     NL, "  {"
44     NL, "    ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);"
45     NL, "    int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
46     NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
47     NL, "  }"
48     NL, "}"
49     NL
50 };
51 
52 static const char* helper_ndrange_1d_loc[] =
53 {
54     NL, "void block_fn(int len, __global atomic_uint* val)"
55     NL, "{"
56     NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
57     NL, "}"
58     NL, ""
59     NL, "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val,  __global uint* ofs_arr)"
60     NL, "{"
61     NL, "  size_t tid = get_global_id(0);"
62     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
63     NL, ""
64     NL, "  for(int k = 0; k < n; k++)"
65     NL, "  {"
66     NL, "    for(int i = 0; i < n; i++)"
67     NL, "    {"
68     NL, "      if (glob_size_arr[i] >= loc_size_arr[k])"
69     NL, "      {"
70     NL, "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i], loc_size_arr[k]);"
71     NL, "        int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
72     NL, "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
73     NL, "      }"
74     NL, "    }"
75     NL, "  }"
76     NL, "}"
77     NL
78 };
79 
80 static const char* helper_ndrange_1d_ofs[] =
81 {
82     NL, "void block_fn(int len, __global atomic_uint* val)"
83     NL, "{"
84     NL, "  atomic_fetch_add_explicit(&val[(get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
85     NL, "}"
86     NL, ""
87     NL, "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val,  __global uint* ofs_arr)"
88     NL, "{"
89     NL, "  size_t tid = get_global_id(0);"
90     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
91     NL, ""
92     NL, "  for(int l = 0; l < n; l++)"
93     NL, "  {"
94     NL, "    for(int k = 0; k < n; k++)"
95     NL, "    {"
96     NL, "      for(int i = 0; i < n; i++)"
97     NL, "      {"
98     NL, "        if (glob_size_arr[i] >= loc_size_arr[k])"
99     NL, "        {"
100     NL, "          ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], loc_size_arr[k]);"
101     NL, "          int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
102     NL, "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
103     NL, "        }"
104     NL, "      }"
105     NL, "    }"
106     NL, "  }"
107     NL, "}"
108     NL
109 };
110 
111 static const char* helper_ndrange_2d_glo[] =
112 {
113     NL, "void block_fn(int len, __global atomic_uint* val)"
114     NL, "{"
115     NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
116     NL, "}"
117     NL, ""
118     NL, "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
119     NL, "{"
120     NL, "  size_t tid = get_global_id(0);"
121     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
122     NL, ""
123     NL, "  for(int i = 0; i < n; i++)"
124     NL, "  {"
125     NL, "    size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };"
126     NL, "    ndrange_t ndrange = ndrange_2D(glob_size);"
127     NL, "    int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
128     NL, "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
129     NL, "  }"
130     NL, "}"
131     NL
132 };
133 
134 static const char* helper_ndrange_2d_loc[] =
135 {
136     NL, "void block_fn(int len, __global atomic_uint* val)"
137     NL, "{"
138     NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
139     NL, "}"
140     NL, ""
141     NL, "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
142     NL, "{"
143     NL, "  size_t tid = get_global_id(0);"
144     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
145     NL, ""
146     NL, "  for(int k = 0; k < n; k++)"
147     NL, "  {"
148     NL, "    for(int i = 0; i < n; i++)"
149     NL, "    {"
150     NL, "      if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])"
151     NL, "      {"
152     NL, "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };"
153     NL, "        size_t loc_size[] = { 1, loc_size_arr[k] };"
154     NL, ""
155     NL, "        ndrange_t ndrange = ndrange_2D(glob_size, loc_size);"
156     NL, "        int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
157     NL, "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
158     NL, "      }"
159     NL, "    }"
160     NL, "  }"
161     NL, "}"
162     NL
163 };
164 
165 
166 static const char* helper_ndrange_2d_ofs[] =
167 {
168     NL, "void block_fn(int len, __global atomic_uint* val)"
169     NL, "{"
170     NL, "  atomic_fetch_add_explicit(&val[(get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
171     NL, "}"
172     NL, ""
173     NL, "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
174     NL, "{"
175     NL, "  size_t tid = get_global_id(0);"
176     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
177     NL, ""
178     NL, "  for(int l = 0; l < n; l++)"
179     NL, "  {"
180     NL, "    for(int k = 0; k < n; k++)"
181     NL, "    {"
182     NL, "      for(int i = 0; i < n; i++)"
183     NL, "      {"
184     NL, "        if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])"
185     NL, "        {"
186     NL, "          size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n]};"
187     NL, "          size_t loc_size[] = { 1, loc_size_arr[k] };"
188     NL, "          size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };"
189     NL, ""
190     NL, "          ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);"
191     NL, "          int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
192     NL, "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
193     NL, "        }"
194     NL, "      }"
195     NL, "    }"
196     NL, "  }"
197     NL, "}"
198     NL
199 };
200 
201 
202 static const char* helper_ndrange_3d_glo[] =
203 {
204     NL, "void block_fn(int len, __global atomic_uint* val)"
205     NL, "{"
206     NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
207     NL, "}"
208     NL, ""
209     NL, "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
210     NL, "{"
211     NL, "  size_t tid = get_global_id(0);"
212     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
213     NL, ""
214     NL, "  for(int i = 0; i < n; i++)"
215     NL, "  {"
216     NL, "    uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
217     NL, "    if (global_work_size <= (len * len))"
218     NL, "    {"
219     NL, "      size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };"
220     NL, "      ndrange_t ndrange = ndrange_3D(glob_size);"
221     NL, "      int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
222     NL, "      if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
223     NL, "    }"
224     NL, "  }"
225     NL, "}"
226     NL
227 };
228 
229 
230 static const char* helper_ndrange_3d_loc[] =
231 {
232     NL, "void block_fn(int len, __global atomic_uint* val)"
233     NL, "{"
234     NL, "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);"
235     NL, "}"
236     NL, ""
237     NL, "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
238     NL, "{"
239     NL, "  size_t tid = get_global_id(0);"
240     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
241     NL, ""
242     NL, "  for(int k = 0; k < n; k++)"
243     NL, "  {"
244     NL, "    for(int i = 0; i < n; i++)"
245     NL, "    {"
246     NL, "      uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
247     NL, "      if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))"
248     NL, "      {"
249     NL, "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };"
250     NL, "        size_t loc_size[] = { 1, 1, loc_size_arr[k] };"
251     NL, "        ndrange_t ndrange = ndrange_3D(glob_size,loc_size);"
252     NL, "        int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
253     NL, "      "
254     NL, "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
255     NL, "      }"
256     NL, "    }"
257     NL, "  }"
258     NL, "}"
259     NL
260 };
261 
262 static const char* helper_ndrange_3d_ofs[] =
263 {
264     NL, "void block_fn(int len, __global atomic_uint* val)"
265     NL, "{"
266     NL, "  atomic_fetch_add_explicit(&val[(get_global_offset(2) * get_global_size(0) * get_global_size(1) + get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);"
267     NL, "}"
268     NL, ""
269     NL, "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val,  __global uint* ofs_arr)"
270     NL, "{"
271     NL, "  size_t tid = get_global_id(0);"
272     NL, "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };"
273     NL, ""
274     NL, "  for(int l = 0; l < n; l++)"
275     NL, "  {"
276     NL, "    for(int k = 0; k < n; k++)"
277     NL, "    {"
278     NL, "      for(int i = 0; i < n; i++)"
279     NL, "      {"
280     NL, "        uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];"
281     NL, "        if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))"
282     NL, "        {"
283     NL, "          size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n]};"
284     NL, "          size_t loc_size[3] = { 1, 1, loc_size_arr[k] };"
285     NL, "          size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l + 2) % n] };"
286     NL, "          ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);"
287     NL, "          int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);"
288     NL, "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }"
289     NL, "        }"
290     NL, "      }"
291     NL, "    }"
292     NL, "  }"
293     NL, "}"
294     NL
295 };
296 
297 static const kernel_src_dim_check sources_ndrange_Xd[] =
298 {
299     { KERNEL(helper_ndrange_1d_glo), 1, CL_FALSE, CL_FALSE},
300     { KERNEL(helper_ndrange_1d_loc), 1, CL_TRUE, CL_FALSE},
301     { KERNEL(helper_ndrange_1d_ofs), 1, CL_TRUE, CL_TRUE},
302     { KERNEL(helper_ndrange_2d_glo), 2, CL_FALSE, CL_FALSE},
303     { KERNEL(helper_ndrange_2d_loc), 2, CL_TRUE, CL_FALSE},
304     { KERNEL(helper_ndrange_2d_ofs), 2, CL_TRUE, CL_TRUE},
305     { KERNEL(helper_ndrange_3d_glo), 3, CL_FALSE, CL_FALSE},
306     { KERNEL(helper_ndrange_3d_loc), 3, CL_TRUE, CL_FALSE},
307     { KERNEL(helper_ndrange_3d_ofs), 3, CL_TRUE, CL_TRUE},
308 };
309 static const size_t num_kernels_ndrange_Xd = arr_size(sources_ndrange_Xd);
310 
check_kernel_results(cl_int * results,cl_int len)311 static int check_kernel_results(cl_int* results, cl_int len)
312 {
313     for(cl_int i = 0; i < len; ++i)
314     {
315         if(results[i] != 0) return i;
316     }
317     return -1;
318 }
319 
generate_reference_1D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr)320 void generate_reference_1D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr)
321 {
322     for (size_t g = 0; g < glob_size_arr.size(); ++g)
323     {
324         for (size_t w = 0; w < glob_size_arr[g]; ++w)
325         {
326             ++reference_results[w];
327         }
328     }
329 }
330 
generate_reference_1D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr)331 void generate_reference_1D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr)
332 {
333     for (size_t g = 0; g < glob_size_arr.size(); ++g)
334     {
335         for (size_t l = 0; l < loc_size_arr.size(); ++l)
336         {
337             if (glob_size_arr[g] >= loc_size_arr[l])
338             {
339                 for (size_t w = 0; w < glob_size_arr[g]; ++w)
340                 {
341                     ++reference_results[w];
342                 }
343             }
344         }
345     }
346 }
347 
generate_reference_1D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)348 void generate_reference_1D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
349 {
350     for (size_t g = 0; g < glob_size_arr.size(); ++g)
351     {
352         for (size_t l = 0; l < loc_size_arr.size(); ++l)
353         {
354             if (glob_size_arr[g] >= loc_size_arr[l])
355             {
356                 for (size_t o = 0; o < offset.size(); ++o)
357                 {
358                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
359                     {
360                         ++reference_results[(offset[o] + w) % len];
361                     }
362                 }
363             }
364         }
365     }
366 }
367 
generate_reference_2D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)368 void generate_reference_2D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
369 {
370     for (size_t g = 0; g < glob_size_arr.size(); ++g)
371     {
372         for (size_t h = 0; h < glob_size_arr[(g + 1) % glob_size_arr.size()]; ++h)
373         {
374             for (size_t w = 0; w < glob_size_arr[g]; ++w)
375             {
376                 ++reference_results[(h * glob_size_arr[g] + w) % len];
377             }
378         }
379     }
380 }
381 
generate_reference_2D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)382 void generate_reference_2D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
383 {
384     size_t n = glob_size_arr.size();
385     for (size_t g = 0; g < glob_size_arr.size(); ++g)
386     {
387         for (size_t l = 0; l < loc_size_arr.size(); ++l)
388         {
389             if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
390             {
391                 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
392                 {
393                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
394                     {
395                         ++reference_results[(h * glob_size_arr[g] + w) % len];
396                     }
397                 }
398             }
399         }
400     }
401 }
402 
generate_reference_2D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)403 void generate_reference_2D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
404 {
405     size_t n = glob_size_arr.size();
406     for (size_t g = 0; g < glob_size_arr.size(); ++g)
407     {
408         for (size_t l = 0; l < loc_size_arr.size(); ++l)
409         {
410             if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
411             {
412                 for (size_t o = 0; o < offset.size(); ++o)
413                 {
414                     for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
415                     {
416                         for (size_t w = 0; w < glob_size_arr[g]; ++w)
417                         {
418                             ++reference_results[(glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + h * glob_size_arr[g] + w) % len];
419                         }
420                     }
421                 }
422             }
423         }
424     }
425 }
426 
generate_reference_3D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)427 void generate_reference_3D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
428 {
429     size_t n = glob_size_arr.size();
430     for (size_t g = 0; g < glob_size_arr.size(); ++g)
431     {
432         size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
433         if(global_work_size <= (len * len))
434         {
435             for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
436             {
437                 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
438                 {
439                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
440                     {
441                         ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
442                     }
443                 }
444             }
445         }
446     }
447 }
448 
generate_reference_3D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)449 void generate_reference_3D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
450 {
451     size_t n = glob_size_arr.size();
452     for (size_t g = 0; g < glob_size_arr.size(); ++g)
453     {
454         for (size_t l = 0; l < loc_size_arr.size(); ++l)
455         {
456             size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
457             if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
458             {
459                 for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
460                 {
461                     for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
462                     {
463                         for (size_t w = 0; w < glob_size_arr[g]; ++w)
464                         {
465                             ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
466                         }
467                     }
468                 }
469             }
470         }
471     }
472 }
473 
generate_reference_3D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)474 void generate_reference_3D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
475 {
476     size_t n = glob_size_arr.size();
477     for (size_t g = 0; g < glob_size_arr.size(); ++g)
478     {
479         for (size_t l = 0; l < loc_size_arr.size(); ++l)
480         {
481             size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
482             if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
483             {
484                 for (size_t o = 0; o < offset.size(); ++o)
485                 {
486                     for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
487                     {
488                         for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
489                         {
490                             for (size_t w = 0; w < glob_size_arr[g]; ++w)
491                             {
492                                 ++reference_results[(glob_size_arr[g] * glob_size_arr[(g + 1) % n] * offset[(o + 2) % n] + glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
493                             }
494                         }
495                     }
496                 }
497             }
498         }
499     }
500 }
501 
check_kernel_results(cl_int * results,cl_int len,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_int dim,cl_bool use_local,cl_bool use_offset)502 static int check_kernel_results(cl_int* results, cl_int len, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_int dim, cl_bool use_local, cl_bool use_offset)
503 {
504     std::vector<cl_int> reference_results(len, 0);
505     switch (dim)
506     {
507     case 1:
508         if (use_local == CL_FALSE)
509         {
510             generate_reference_1D(reference_results, glob_size_arr);
511         }
512         else if(use_local == CL_TRUE && use_offset == CL_FALSE)
513         {
514             generate_reference_1D_local(reference_results, glob_size_arr, loc_size_arr);
515         }
516         else
517         {
518             generate_reference_1D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
519         }
520         break;
521     case 2:
522         if (use_local == CL_FALSE)
523         {
524             generate_reference_2D(reference_results, glob_size_arr, len);
525         }
526         else if (use_local == CL_TRUE && use_offset == CL_FALSE)
527         {
528             generate_reference_2D_local(reference_results, glob_size_arr, loc_size_arr, len);
529         }
530         else
531         {
532             generate_reference_2D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
533         }
534         break;
535     case 3:
536         if (use_local == CL_FALSE)
537         {
538             generate_reference_3D(reference_results, glob_size_arr, len);
539         }
540         else if (use_local == CL_TRUE && use_offset == CL_FALSE)
541         {
542             generate_reference_3D_local(reference_results, glob_size_arr, loc_size_arr, len);
543         }
544         else
545         {
546             generate_reference_3D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
547         }
548         break;
549     default:
550         return 0;
551         break;
552     }
553 
554     for (cl_int i = 0; i < len; ++i)
555     {
556         if (results[i] != reference_results[i])
557         {
558             log_error("ERROR: Kernel returned %d vs. expected %d\n", results[i], reference_results[i]);
559             return i;
560         }
561     }
562 
563     return -1;
564 }
565 
test_enqueue_ndrange(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)566 int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
567 {
568     MTdata d;
569     cl_uint i;
570     cl_int err_ret, res = 0;
571     clCommandQueueWrapper dev_queue;
572     cl_int k, kernel_results[MAX_GWS] = { 0 };
573 
574     size_t ret_len;
575     cl_uint max_queues = 1;
576     cl_uint maxQueueSize = 0;
577 
578     d = init_genrand(gRandomSeed);
579 
580     err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0);
581     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed");
582 
583     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(max_queues), &max_queues, &ret_len);
584     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_ON_DEVICE_QUEUES) failed");
585 
586     size_t max_local_size = 1;
587     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len);
588     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");
589 
590     cl_queue_properties queue_prop_def[] =
591     {
592         CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
593         CL_QUEUE_SIZE, maxQueueSize,
594         0
595     };
596 
597     dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret);
598     test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed");
599 
600     max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size;
601     if(gWimpyMode)
602     {
603         max_local_size = MIN(8, max_local_size);
604     }
605 
606     cl_uint num = 10;
607     cl_uint global_work_size = max_local_size * 2;
608     std::vector<cl_uint> glob_size_arr(num);
609     std::vector<cl_uint> loc_size_arr(num);
610     std::vector<cl_uint> ofs_arr(num);
611     std::vector<cl_int> glob_results(global_work_size, 0);
612 
613     glob_size_arr[0] = 1;
614     glob_size_arr[1] = global_work_size;
615     loc_size_arr[0] = 1;
616     loc_size_arr[1] = max_local_size;
617     ofs_arr[0] = 0;
618     ofs_arr[1] = 1;
619 
620     for(i = 2; i < num; ++i)
621     {
622         glob_size_arr[i] = genrand_int32(d) % global_work_size;
623         glob_size_arr[i] = glob_size_arr[i] ? glob_size_arr[i]: 1;
624         loc_size_arr[i] = genrand_int32(d) % max_local_size;
625         loc_size_arr[i] = loc_size_arr[i] ? loc_size_arr[i]: 1;
626         ofs_arr[i] = genrand_int32(d) % global_work_size;
627     }
628 
629     // check ndrange_dX functions
630     size_t failCnt = 0;
631     for(i = 0; i < num_kernels_ndrange_Xd; ++i)
632     {
633         if (!gKernelName.empty() && gKernelName != sources_ndrange_Xd[i].src.kernel_name)
634             continue;
635 
636         clMemWrapper mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, glob_size_arr.size() * sizeof(cl_uint), &glob_size_arr[0], &err_ret);
637         test_error(err_ret, "clCreateBuffer() failed");
638         clMemWrapper mem2 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, loc_size_arr.size() * sizeof(cl_uint), &loc_size_arr[0], &err_ret);
639         test_error(err_ret, "clCreateBuffer() failed");
640         clMemWrapper mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, glob_results.size() * sizeof(cl_int), &glob_results[0], &err_ret);
641         test_error(err_ret, "clCreateBuffer() failed");
642         clMemWrapper mem4 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, ofs_arr.size() * sizeof(cl_uint), &ofs_arr[0], &err_ret);
643         test_error(err_ret, "clCreateBuffer() failed");
644 
645         kernel_arg args[] =
646         {
647             { sizeof(cl_uint), &num },
648             { sizeof(cl_uint), &global_work_size },
649             { sizeof(cl_mem), &mem1 },
650             { sizeof(cl_mem), &mem2 },
651             { sizeof(cl_mem), &mem3 },
652             { sizeof(cl_mem), &mem4 },
653         };
654 
655         log_info("Running '%s' kernel (%d of %d) ...\n",  sources_ndrange_Xd[i].src.kernel_name, i + 1, num_kernels_ndrange_Xd);
656         err_ret = run_single_kernel_args(context, queue, sources_ndrange_Xd[i].src.lines, sources_ndrange_Xd[i].src.num_lines, sources_ndrange_Xd[i].src.kernel_name, kernel_results, sizeof(kernel_results), arr_size(args), args);
657 
658         cl_int *ptr = (cl_int *)clEnqueueMapBuffer(queue, mem3, CL_TRUE, CL_MAP_READ, 0, glob_results.size() * sizeof(cl_int), 0, 0, 0, &err_ret);
659         test_error(err_ret, "clEnqueueMapBuffer() failed");
660 
661         if(check_error(err_ret, "'%s' kernel execution failed", sources_ndrange_Xd[i].src.kernel_name)) { ++failCnt; res = -1; }
662         else if((k = check_kernel_results(kernel_results, arr_size(kernel_results))) >= 0 && check_error(-1, "'%s' kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, kernel_results[k])) res = -1;
663         else if((k = check_kernel_results(ptr, global_work_size, glob_size_arr, loc_size_arr, ofs_arr, sources_ndrange_Xd[i].dim, sources_ndrange_Xd[i].localSize, sources_ndrange_Xd[i].offset)) >= 0 && check_error(-1, "'%s' global kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, glob_results[k])) res = -1;
664         else log_info("'%s' kernel is OK.\n", sources_ndrange_Xd[i].src.kernel_name);
665 
666         err_ret = clEnqueueUnmapMemObject(queue, mem3, ptr, 0, 0, 0);
667         test_error(err_ret, "clEnqueueUnmapMemObject() failed");
668 
669     }
670 
671     if (failCnt > 0)
672     {
673         log_error("ERROR: %d of %d kernels failed.\n", failCnt, num_kernels_ndrange_Xd);
674     }
675 
676     return res;
677 }
678 
679 
680 #endif
681 
682