• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 
23 #include "procs.h"
24 
25 
26 const char *wg_broadcast_1D_kernel_code =
27 "__kernel void test_wg_broadcast_1D(global float *input, global float *output)\n"
28 "{\n"
29 "    int  tid = get_global_id(0);\n"
30 "\n"
31 "    float result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
32 "    output[tid] = result;\n"
33 "}\n";
34 
35 const char *wg_broadcast_2D_kernel_code =
36 "__kernel void test_wg_broadcast_2D(global float *input, global float *output)\n"
37 "{\n"
38 "    size_t tid_x = get_global_id(0);\n"
39 "    size_t tid_y = get_global_id(1);\n"
40 "    size_t x = get_group_id(0) % get_local_size(0);\n"
41 "    size_t y = get_group_id(1) % get_local_size(1);\n"
42 "\n"
43 "    size_t indx = (tid_y * get_global_size(0)) + tid_x;\n"
44 "    float result = work_group_broadcast(input[indx], x, y);\n"
45 "    output[indx] = result;\n"
46 "}\n";
47 
48 const char *wg_broadcast_3D_kernel_code =
49 "__kernel void test_wg_broadcast_3D(global float *input, global float *output)\n"
50 "{\n"
51 "    size_t tid_x = get_global_id(0);\n"
52 "    size_t tid_y = get_global_id(1);\n"
53 "    size_t tid_z = get_global_id(2);\n"
54 "    size_t x = get_group_id(0) % get_local_size(0);\n"
55 "    size_t y = get_group_id(1) % get_local_size(1);\n"
56 "    size_t z = get_group_id(2) % get_local_size(2);\n"
57 "\n"
58 "    size_t indx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
59 "    float result = work_group_broadcast(input[indx], x, y, z);\n"
60 "    output[indx] = result;\n"
61 "}\n";
62 
63 static int
verify_wg_broadcast_1D(float * inptr,float * outptr,size_t n,size_t wg_size)64 verify_wg_broadcast_1D(float *inptr, float *outptr, size_t n, size_t wg_size)
65 {
66     size_t     i, j;
67     size_t     group_id;
68 
69     for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
70     {
71         int local_size = (n-i) > wg_size ? wg_size : (n-i);
72         float broadcast_result = inptr[i + (group_id % local_size)];
73         for (j=0; j<local_size; j++)
74         {
75             if ( broadcast_result != outptr[i+j] )
76             {
77                 log_info("work_group_broadcast: Error at %u: expected = %f, got = %f\n", i+j, broadcast_result, outptr[i+j]);
78                 return -1;
79             }
80         }
81     }
82 
83     return 0;
84 }
85 
86 static int
verify_wg_broadcast_2D(float * inptr,float * outptr,size_t nx,size_t ny,size_t wg_size_x,size_t wg_size_y)87 verify_wg_broadcast_2D(float *inptr, float *outptr, size_t nx, size_t ny, size_t wg_size_x, size_t wg_size_y)
88 {
89     size_t i, j, _i, _j;
90     size_t group_id_x, group_id_y;
91 
92     for (i=0,group_id_y=0; i<ny; i+=wg_size_y,group_id_y++)
93     {
94         size_t y = group_id_y % wg_size_y;
95         size_t local_size_y = (ny-i) > wg_size_y ? wg_size_y : (ny-i);
96         for (_i=0; _i < local_size_y; _i++)
97         {
98             for (j=0,group_id_x=0; j<nx; j+=wg_size_x,group_id_x++)
99             {
100                 size_t x = group_id_x % wg_size_x;
101                 size_t local_size_x = (nx-j) > wg_size_x ? wg_size_x : (nx-j);
102                 float  broadcast_result = inptr[(i + y) * nx + (j + x)];
103                 for (_j=0; _j < local_size_x; _j++)
104                 {
105                     size_t indx = (i + _i) * nx + (j + _j);
106                     if ( broadcast_result != outptr[indx] )
107                     {
108                         log_info("work_group_broadcast: Error at (%u, %u): expected = %f, got = %f\n", j+_j, i+_i, broadcast_result, outptr[indx]);
109                         return -1;
110                     }
111                 }
112             }
113         }
114     }
115 
116     return 0;
117 }
118 
119 static int
verify_wg_broadcast_3D(float * inptr,float * outptr,size_t nx,size_t ny,size_t nz,size_t wg_size_x,size_t wg_size_y,size_t wg_size_z)120 verify_wg_broadcast_3D(float *inptr, float *outptr, size_t nx, size_t ny, size_t nz, size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
121 {
122     size_t i, j, k, _i, _j, _k;
123     size_t group_id_x, group_id_y, group_id_z;
124 
125     for (i=0,group_id_z=0; i<nz; i+=wg_size_z,group_id_z++)
126     {
127         size_t z = group_id_z % wg_size_z;
128         size_t local_size_z = (nz-i) > wg_size_z ? wg_size_z : (nz-i);
129         for (_i=0; _i < local_size_z; _i++)
130         {
131             for (j=0,group_id_y=0; j<ny; j+=wg_size_y,group_id_y++)
132             {
133                 size_t y = group_id_y % wg_size_y;
134                 size_t local_size_y = (ny-j) > wg_size_y ? wg_size_y : (ny-j);
135                 for (_j=0; _j < local_size_y; _j++)
136                 {
137                     for (k=0,group_id_x=0; k<nx; k+=wg_size_x,group_id_x++)
138                     {
139                         size_t x = group_id_x % wg_size_x;
140                         size_t local_size_x = (nx-k) > wg_size_x ? wg_size_x : (nx-k);
141                         float  broadcast_result = inptr[(i + z) * ny * nz + (j + y) * nx + (k + x)];
142                         for (_k=0; _k < local_size_x; _k++)
143                         {
144                             size_t indx = (i + _i) * ny * nx + (j + _j) * nx + (k + _k);
145                             if ( broadcast_result != outptr[indx] )
146                             {
147                                 log_info("work_group_broadcast: Error at (%u, %u, %u): expected = %f, got = %f\n", k+_k, j+_j, i+_i, broadcast_result, outptr[indx]);
148                                 return -1;
149                             }
150                         }
151                     }
152                 }
153             }
154         }
155     }
156 
157     return 0;
158 }
159 
160 
161 int
test_work_group_broadcast_1D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)162 test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
163 {
164     cl_mem       streams[2];
165     cl_float     *input_ptr[1], *p;
166     cl_float     *output_ptr;
167     cl_program   program;
168     cl_kernel    kernel;
169     void         *values[2];
170     size_t       globalsize[1];
171     size_t       wg_size[1];
172     size_t       num_elements;
173     int          err;
174     int          i;
175     MTdata       d;
176 
177     err = create_single_kernel_helper(context, &program, &kernel, 1,
178                                       &wg_broadcast_1D_kernel_code,
179                                       "test_wg_broadcast_1D");
180     if (err)
181         return -1;
182 
183     // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
184     err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
185     test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
186 
187     num_elements = n_elems;
188 
189     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
190     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
191     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
192                                 sizeof(cl_float) * num_elements, NULL, NULL);
193     if (!streams[0])
194     {
195         log_error("clCreateBuffer failed\n");
196         return -1;
197     }
198 
199     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
200                                 sizeof(cl_float) * num_elements, NULL, NULL);
201     if (!streams[1])
202     {
203         log_error("clCreateBuffer failed\n");
204         return -1;
205     }
206 
207     p = input_ptr[0];
208     d = init_genrand( gRandomSeed );
209     for (i=0; i<num_elements; i++)
210     {
211         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
212     }
213     free_mtdata(d); d = NULL;
214 
215     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
216     if (err != CL_SUCCESS)
217     {
218         log_error("clWriteArray failed\n");
219         return -1;
220     }
221 
222     values[0] = streams[0];
223     values[1] = streams[1];
224     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
225     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
226     if (err != CL_SUCCESS)
227     {
228         log_error("clSetKernelArgs failed\n");
229         return -1;
230     }
231 
232     // Line below is troublesome...
233     globalsize[0] = (size_t)n_elems;
234     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalsize, wg_size, 0, NULL, NULL );
235     if (err != CL_SUCCESS)
236     {
237         log_error("clEnqueueNDRangeKernel failed\n");
238         return -1;
239     }
240 
241     cl_uint dead = 0xdeaddead;
242     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
243     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
244     if (err != CL_SUCCESS)
245     {
246         log_error("clEnqueueReadBuffer failed\n");
247         return -1;
248     }
249 
250     if (verify_wg_broadcast_1D(input_ptr[0], output_ptr, num_elements, wg_size[0]))
251     {
252         log_error("work_group_broadcast_1D test failed\n");
253         return -1;
254     }
255     log_info("work_group_broadcast_1D test passed\n");
256 
257     clReleaseMemObject(streams[0]);
258     clReleaseMemObject(streams[1]);
259     clReleaseKernel(kernel);
260     clReleaseProgram(program);
261     free(input_ptr[0]);
262     free(output_ptr);
263 
264     return err;
265 }
266 
267 
268 int
test_work_group_broadcast_2D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)269 test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
270 {
271     cl_mem       streams[2];
272     cl_float     *input_ptr[1], *p;
273     cl_float     *output_ptr;
274     cl_program   program;
275     cl_kernel    kernel;
276     void         *values[2];
277     size_t       globalsize[2];
278     size_t       localsize[2];
279     size_t       wg_size[1];
280     size_t       num_workgroups;
281     size_t       num_elements;
282     int          err;
283     int          i;
284     MTdata       d;
285 
286     err = create_single_kernel_helper(context, &program, &kernel, 1,
287                                       &wg_broadcast_2D_kernel_code,
288                                       "test_wg_broadcast_2D");
289     if (err)
290         return -1;
291 
292     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
293     if (err)
294         return -1;
295 
296     if (wg_size[0] >= 256)
297     {
298         localsize[0] = localsize[1] = 16;
299     }
300     else if (wg_size[0] >=64)
301     {
302         localsize[0] = localsize[1] = 8;
303     }
304     else if (wg_size[0] >= 16)
305     {
306         localsize[0] = localsize[1] = 4;
307     }
308     else
309     {
310         localsize[0] = localsize[1] = 1;
311     }
312 
313     num_workgroups = MAX(n_elems/wg_size[0], 16);
314     globalsize[0] = num_workgroups * localsize[0];
315     globalsize[1] = num_workgroups * localsize[1];
316     num_elements = globalsize[0] * globalsize[1];
317 
318     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
319     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
320     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
321                                 sizeof(cl_float) * num_elements, NULL, NULL);
322     if (!streams[0])
323     {
324         log_error("clCreateBuffer failed\n");
325         return -1;
326     }
327 
328     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
329                                 sizeof(cl_float) * num_elements, NULL, NULL);
330     if (!streams[1])
331     {
332         log_error("clCreateBuffer failed\n");
333         return -1;
334     }
335 
336     p = input_ptr[0];
337     d = init_genrand( gRandomSeed );
338     for (i=0; i<num_elements; i++)
339     {
340         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
341     }
342     free_mtdata(d); d = NULL;
343 
344     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
345     if (err != CL_SUCCESS)
346     {
347         log_error("clWriteArray failed\n");
348         return -1;
349     }
350 
351     values[0] = streams[0];
352     values[1] = streams[1];
353     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
354     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
355     if (err != CL_SUCCESS)
356     {
357         log_error("clSetKernelArgs failed\n");
358         return -1;
359     }
360 
361     err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, globalsize, localsize, 0, NULL, NULL );
362     if (err != CL_SUCCESS)
363     {
364         log_error("clEnqueueNDRangeKernel failed\n");
365         return -1;
366     }
367 
368     cl_uint dead = 0xdeaddead;
369     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
370     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
371     if (err != CL_SUCCESS)
372     {
373         log_error("clEnqueueReadBuffer failed\n");
374         return -1;
375     }
376 
377     if (verify_wg_broadcast_2D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], localsize[0], localsize[1]))
378     {
379         log_error("work_group_broadcast_2D test failed\n");
380         return -1;
381     }
382     log_info("work_group_broadcast_2D test passed\n");
383 
384     clReleaseMemObject(streams[0]);
385     clReleaseMemObject(streams[1]);
386     clReleaseKernel(kernel);
387     clReleaseProgram(program);
388     free(input_ptr[0]);
389     free(output_ptr);
390 
391     return err;
392 }
393 
394 
395 int
test_work_group_broadcast_3D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)396 test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
397 {
398     cl_mem       streams[2];
399     cl_float     *input_ptr[1], *p;
400     cl_float     *output_ptr;
401     cl_program   program;
402     cl_kernel    kernel;
403     void         *values[2];
404     size_t       globalsize[3];
405     size_t       localsize[3];
406     size_t       wg_size[1];
407     size_t       num_workgroups;
408     size_t       num_elements;
409     int          err;
410     int          i;
411     MTdata       d;
412 
413     err = create_single_kernel_helper(context, &program, &kernel, 1,
414                                       &wg_broadcast_3D_kernel_code,
415                                       "test_wg_broadcast_3D");
416     if (err)
417         return -1;
418 
419     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
420     if (err)
421         return -1;
422 
423     if (wg_size[0] >=512)
424     {
425         localsize[0] = localsize[1] = localsize[2] = 8;
426     }
427     else if (wg_size[0] >= 64)
428     {
429         localsize[0] = localsize[1] = localsize[2] = 4;
430     }
431     else if (wg_size[0] >= 8)
432     {
433         localsize[0] = localsize[1] = localsize[2] = 2;
434     }
435     else
436     {
437         localsize[0] = localsize[1] = localsize[2] = 1;
438     }
439 
440     num_workgroups = MAX(n_elems/wg_size[0], 8);
441     globalsize[0] = num_workgroups * localsize[0];
442     globalsize[1] = num_workgroups * localsize[1];
443     globalsize[2] = num_workgroups * localsize[2];
444     num_elements = globalsize[0] * globalsize[1] * globalsize[2];
445 
446     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
447     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
448     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
449                                 sizeof(cl_float) * num_elements, NULL, NULL);
450     if (!streams[0])
451     {
452         log_error("clCreateBuffer failed\n");
453         return -1;
454     }
455 
456     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
457                                 sizeof(cl_float) * num_elements, NULL, NULL);
458     if (!streams[1])
459     {
460         log_error("clCreateBuffer failed\n");
461         return -1;
462     }
463 
464     p = input_ptr[0];
465     d = init_genrand( gRandomSeed );
466     for (i=0; i<num_elements; i++)
467     {
468         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
469     }
470     free_mtdata(d); d = NULL;
471 
472     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
473     if (err != CL_SUCCESS)
474     {
475         log_error("clWriteArray failed\n");
476         return -1;
477     }
478 
479     values[0] = streams[0];
480     values[1] = streams[1];
481     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
482     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
483     if (err != CL_SUCCESS)
484     {
485         log_error("clSetKernelArgs failed\n");
486         return -1;
487     }
488 
489     err = clEnqueueNDRangeKernel( queue, kernel, 3, NULL, globalsize, localsize, 0, NULL, NULL );
490     if (err != CL_SUCCESS)
491     {
492         log_error("clEnqueueNDRangeKernel failed\n");
493         return -1;
494     }
495 
496     cl_uint dead = 0xdeaddead;
497     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
498     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
499     if (err != CL_SUCCESS)
500     {
501         log_error("clEnqueueReadBuffer failed\n");
502         return -1;
503     }
504 
505     if (verify_wg_broadcast_3D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], globalsize[2], localsize[0], localsize[1], localsize[2]))
506     {
507         log_error("work_group_broadcast_3D test failed\n");
508         return -1;
509     }
510     log_info("work_group_broadcast_3D test passed\n");
511 
512     clReleaseMemObject(streams[0]);
513     clReleaseMemObject(streams[1]);
514     clReleaseKernel(kernel);
515     clReleaseProgram(program);
516     free(input_ptr[0]);
517     free(output_ptr);
518 
519     return err;
520 }
521 
522 
523 int
test_work_group_broadcast(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)524 test_work_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
525 {
526     int err;
527 
528     err = test_work_group_broadcast_1D(device, context, queue, n_elems);
529     if (err) return err;
530     err = test_work_group_broadcast_2D(device, context, queue, n_elems);
531     if (err) return err;
532     return err;
533 }
534 
535 
536