• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 
23 #include <algorithm>
24 
25 #include "procs.h"
26 
27 
28 const char *wg_broadcast_1D_kernel_code =
29 "__kernel void test_wg_broadcast_1D(global float *input, global float *output)\n"
30 "{\n"
31 "    int  tid = get_global_id(0);\n"
32 "\n"
33 "    float result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
34 "    output[tid] = result;\n"
35 "}\n";
36 
37 const char *wg_broadcast_2D_kernel_code =
38 "__kernel void test_wg_broadcast_2D(global float *input, global float *output)\n"
39 "{\n"
40 "    size_t tid_x = get_global_id(0);\n"
41 "    size_t tid_y = get_global_id(1);\n"
42 "    size_t x = get_group_id(0) % get_local_size(0);\n"
43 "    size_t y = get_group_id(1) % get_local_size(1);\n"
44 "\n"
45 "    size_t indx = (tid_y * get_global_size(0)) + tid_x;\n"
46 "    float result = work_group_broadcast(input[indx], x, y);\n"
47 "    output[indx] = result;\n"
48 "}\n";
49 
50 const char *wg_broadcast_3D_kernel_code =
51 "__kernel void test_wg_broadcast_3D(global float *input, global float *output)\n"
52 "{\n"
53 "    size_t tid_x = get_global_id(0);\n"
54 "    size_t tid_y = get_global_id(1);\n"
55 "    size_t tid_z = get_global_id(2);\n"
56 "    size_t x = get_group_id(0) % get_local_size(0);\n"
57 "    size_t y = get_group_id(1) % get_local_size(1);\n"
58 "    size_t z = get_group_id(2) % get_local_size(2);\n"
59 "\n"
60 "    size_t indx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
61 "    float result = work_group_broadcast(input[indx], x, y, z);\n"
62 "    output[indx] = result;\n"
63 "}\n";
64 
65 static int
verify_wg_broadcast_1D(float * inptr,float * outptr,size_t n,size_t wg_size)66 verify_wg_broadcast_1D(float *inptr, float *outptr, size_t n, size_t wg_size)
67 {
68     size_t     i, j;
69     size_t     group_id;
70 
71     for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
72     {
73         int local_size = (n-i) > wg_size ? wg_size : (n-i);
74         float broadcast_result = inptr[i + (group_id % local_size)];
75         for (j=0; j<local_size; j++)
76         {
77             if ( broadcast_result != outptr[i+j] )
78             {
79                 log_info("work_group_broadcast: Error at %u: expected = %f, got = %f\n", i+j, broadcast_result, outptr[i+j]);
80                 return -1;
81             }
82         }
83     }
84 
85     return 0;
86 }
87 
88 static int
verify_wg_broadcast_2D(float * inptr,float * outptr,size_t nx,size_t ny,size_t wg_size_x,size_t wg_size_y)89 verify_wg_broadcast_2D(float *inptr, float *outptr, size_t nx, size_t ny, size_t wg_size_x, size_t wg_size_y)
90 {
91     size_t i, j, _i, _j;
92     size_t group_id_x, group_id_y;
93 
94     for (i=0,group_id_y=0; i<ny; i+=wg_size_y,group_id_y++)
95     {
96         size_t y = group_id_y % wg_size_y;
97         size_t local_size_y = (ny-i) > wg_size_y ? wg_size_y : (ny-i);
98         for (_i=0; _i < local_size_y; _i++)
99         {
100             for (j=0,group_id_x=0; j<nx; j+=wg_size_x,group_id_x++)
101             {
102                 size_t x = group_id_x % wg_size_x;
103                 size_t local_size_x = (nx-j) > wg_size_x ? wg_size_x : (nx-j);
104                 float  broadcast_result = inptr[(i + y) * nx + (j + x)];
105                 for (_j=0; _j < local_size_x; _j++)
106                 {
107                     size_t indx = (i + _i) * nx + (j + _j);
108                     if ( broadcast_result != outptr[indx] )
109                     {
110                         log_info("work_group_broadcast: Error at (%u, %u): expected = %f, got = %f\n", j+_j, i+_i, broadcast_result, outptr[indx]);
111                         return -1;
112                     }
113                 }
114             }
115         }
116     }
117 
118     return 0;
119 }
120 
121 static int
verify_wg_broadcast_3D(float * inptr,float * outptr,size_t nx,size_t ny,size_t nz,size_t wg_size_x,size_t wg_size_y,size_t wg_size_z)122 verify_wg_broadcast_3D(float *inptr, float *outptr, size_t nx, size_t ny, size_t nz, size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
123 {
124     size_t i, j, k, _i, _j, _k;
125     size_t group_id_x, group_id_y, group_id_z;
126 
127     for (i=0,group_id_z=0; i<nz; i+=wg_size_z,group_id_z++)
128     {
129         size_t z = group_id_z % wg_size_z;
130         size_t local_size_z = (nz-i) > wg_size_z ? wg_size_z : (nz-i);
131         for (_i=0; _i < local_size_z; _i++)
132         {
133             for (j=0,group_id_y=0; j<ny; j+=wg_size_y,group_id_y++)
134             {
135                 size_t y = group_id_y % wg_size_y;
136                 size_t local_size_y = (ny-j) > wg_size_y ? wg_size_y : (ny-j);
137                 for (_j=0; _j < local_size_y; _j++)
138                 {
139                     for (k=0,group_id_x=0; k<nx; k+=wg_size_x,group_id_x++)
140                     {
141                         size_t x = group_id_x % wg_size_x;
142                         size_t local_size_x = (nx-k) > wg_size_x ? wg_size_x : (nx-k);
143                         float  broadcast_result = inptr[(i + z) * ny * nz + (j + y) * nx + (k + x)];
144                         for (_k=0; _k < local_size_x; _k++)
145                         {
146                             size_t indx = (i + _i) * ny * nx + (j + _j) * nx + (k + _k);
147                             if ( broadcast_result != outptr[indx] )
148                             {
149                                 log_info("work_group_broadcast: Error at (%u, %u, %u): expected = %f, got = %f\n", k+_k, j+_j, i+_i, broadcast_result, outptr[indx]);
150                                 return -1;
151                             }
152                         }
153                     }
154                 }
155             }
156         }
157     }
158 
159     return 0;
160 }
161 
162 
163 int
test_work_group_broadcast_1D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)164 test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
165 {
166     cl_mem       streams[2];
167     cl_float     *input_ptr[1], *p;
168     cl_float     *output_ptr;
169     cl_program   program;
170     cl_kernel    kernel;
171     void         *values[2];
172     size_t       globalsize[1];
173     size_t       wg_size[1];
174     size_t       num_elements;
175     int          err;
176     int          i;
177     MTdata       d;
178 
179     err = create_single_kernel_helper(context, &program, &kernel, 1,
180                                       &wg_broadcast_1D_kernel_code,
181                                       "test_wg_broadcast_1D");
182     if (err)
183         return -1;
184 
185     // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
186     err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
187     test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
188 
189     num_elements = n_elems;
190 
191     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
192     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
193     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
194                                 sizeof(cl_float) * num_elements, NULL, NULL);
195     if (!streams[0])
196     {
197         log_error("clCreateBuffer failed\n");
198         return -1;
199     }
200 
201     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
202                                 sizeof(cl_float) * num_elements, NULL, NULL);
203     if (!streams[1])
204     {
205         log_error("clCreateBuffer failed\n");
206         return -1;
207     }
208 
209     p = input_ptr[0];
210     d = init_genrand( gRandomSeed );
211     for (i=0; i<num_elements; i++)
212     {
213         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
214     }
215     free_mtdata(d); d = NULL;
216 
217     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
218     if (err != CL_SUCCESS)
219     {
220         log_error("clWriteArray failed\n");
221         return -1;
222     }
223 
224     values[0] = streams[0];
225     values[1] = streams[1];
226     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
227     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
228     if (err != CL_SUCCESS)
229     {
230         log_error("clSetKernelArgs failed\n");
231         return -1;
232     }
233 
234     // Line below is troublesome...
235     globalsize[0] = (size_t)n_elems;
236     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalsize, wg_size, 0, NULL, NULL );
237     if (err != CL_SUCCESS)
238     {
239         log_error("clEnqueueNDRangeKernel failed\n");
240         return -1;
241     }
242 
243     cl_uint dead = 0xdeaddead;
244     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
245     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
246     if (err != CL_SUCCESS)
247     {
248         log_error("clEnqueueReadBuffer failed\n");
249         return -1;
250     }
251 
252     if (verify_wg_broadcast_1D(input_ptr[0], output_ptr, num_elements, wg_size[0]))
253     {
254         log_error("work_group_broadcast_1D test failed\n");
255         return -1;
256     }
257     log_info("work_group_broadcast_1D test passed\n");
258 
259     clReleaseMemObject(streams[0]);
260     clReleaseMemObject(streams[1]);
261     clReleaseKernel(kernel);
262     clReleaseProgram(program);
263     free(input_ptr[0]);
264     free(output_ptr);
265 
266     return err;
267 }
268 
269 
270 int
test_work_group_broadcast_2D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)271 test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
272 {
273     cl_mem       streams[2];
274     cl_float     *input_ptr[1], *p;
275     cl_float     *output_ptr;
276     cl_program   program;
277     cl_kernel    kernel;
278     void         *values[2];
279     size_t       globalsize[2];
280     size_t       localsize[2];
281     size_t       wg_size[1];
282     size_t       num_workgroups;
283     size_t       num_elements;
284     int          err;
285     int          i;
286     MTdata       d;
287 
288     err = create_single_kernel_helper(context, &program, &kernel, 1,
289                                       &wg_broadcast_2D_kernel_code,
290                                       "test_wg_broadcast_2D");
291     if (err)
292         return -1;
293 
294     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
295     if (err)
296         return -1;
297 
298     if (wg_size[0] >= 256)
299     {
300         localsize[0] = localsize[1] = 16;
301     }
302     else if (wg_size[0] >=64)
303     {
304         localsize[0] = localsize[1] = 8;
305     }
306     else if (wg_size[0] >= 16)
307     {
308         localsize[0] = localsize[1] = 4;
309     }
310     else
311     {
312         localsize[0] = localsize[1] = 1;
313     }
314 
315     num_workgroups = std::max(n_elems / wg_size[0], (size_t)16);
316     globalsize[0] = num_workgroups * localsize[0];
317     globalsize[1] = num_workgroups * localsize[1];
318     num_elements = globalsize[0] * globalsize[1];
319 
320     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
321     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
322     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
323                                 sizeof(cl_float) * num_elements, NULL, NULL);
324     if (!streams[0])
325     {
326         log_error("clCreateBuffer failed\n");
327         return -1;
328     }
329 
330     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
331                                 sizeof(cl_float) * num_elements, NULL, NULL);
332     if (!streams[1])
333     {
334         log_error("clCreateBuffer failed\n");
335         return -1;
336     }
337 
338     p = input_ptr[0];
339     d = init_genrand( gRandomSeed );
340     for (i=0; i<num_elements; i++)
341     {
342         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
343     }
344     free_mtdata(d); d = NULL;
345 
346     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
347     if (err != CL_SUCCESS)
348     {
349         log_error("clWriteArray failed\n");
350         return -1;
351     }
352 
353     values[0] = streams[0];
354     values[1] = streams[1];
355     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
356     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
357     if (err != CL_SUCCESS)
358     {
359         log_error("clSetKernelArgs failed\n");
360         return -1;
361     }
362 
363     err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, globalsize, localsize, 0, NULL, NULL );
364     if (err != CL_SUCCESS)
365     {
366         log_error("clEnqueueNDRangeKernel failed\n");
367         return -1;
368     }
369 
370     cl_uint dead = 0xdeaddead;
371     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
372     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
373     if (err != CL_SUCCESS)
374     {
375         log_error("clEnqueueReadBuffer failed\n");
376         return -1;
377     }
378 
379     if (verify_wg_broadcast_2D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], localsize[0], localsize[1]))
380     {
381         log_error("work_group_broadcast_2D test failed\n");
382         return -1;
383     }
384     log_info("work_group_broadcast_2D test passed\n");
385 
386     clReleaseMemObject(streams[0]);
387     clReleaseMemObject(streams[1]);
388     clReleaseKernel(kernel);
389     clReleaseProgram(program);
390     free(input_ptr[0]);
391     free(output_ptr);
392 
393     return err;
394 }
395 
396 
397 int
test_work_group_broadcast_3D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)398 test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
399 {
400     cl_mem       streams[2];
401     cl_float     *input_ptr[1], *p;
402     cl_float     *output_ptr;
403     cl_program   program;
404     cl_kernel    kernel;
405     void         *values[2];
406     size_t       globalsize[3];
407     size_t       localsize[3];
408     size_t       wg_size[1];
409     size_t       num_workgroups;
410     size_t       num_elements;
411     int          err;
412     int          i;
413     MTdata       d;
414 
415     err = create_single_kernel_helper(context, &program, &kernel, 1,
416                                       &wg_broadcast_3D_kernel_code,
417                                       "test_wg_broadcast_3D");
418     if (err)
419         return -1;
420 
421     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
422     if (err)
423         return -1;
424 
425     if (wg_size[0] >=512)
426     {
427         localsize[0] = localsize[1] = localsize[2] = 8;
428     }
429     else if (wg_size[0] >= 64)
430     {
431         localsize[0] = localsize[1] = localsize[2] = 4;
432     }
433     else if (wg_size[0] >= 8)
434     {
435         localsize[0] = localsize[1] = localsize[2] = 2;
436     }
437     else
438     {
439         localsize[0] = localsize[1] = localsize[2] = 1;
440     }
441 
442     num_workgroups = std::max(n_elems / wg_size[0], (size_t)8);
443     globalsize[0] = num_workgroups * localsize[0];
444     globalsize[1] = num_workgroups * localsize[1];
445     globalsize[2] = num_workgroups * localsize[2];
446     num_elements = globalsize[0] * globalsize[1] * globalsize[2];
447 
448     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
449     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
450     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
451                                 sizeof(cl_float) * num_elements, NULL, NULL);
452     if (!streams[0])
453     {
454         log_error("clCreateBuffer failed\n");
455         return -1;
456     }
457 
458     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
459                                 sizeof(cl_float) * num_elements, NULL, NULL);
460     if (!streams[1])
461     {
462         log_error("clCreateBuffer failed\n");
463         return -1;
464     }
465 
466     p = input_ptr[0];
467     d = init_genrand( gRandomSeed );
468     for (i=0; i<num_elements; i++)
469     {
470         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
471     }
472     free_mtdata(d); d = NULL;
473 
474     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
475     if (err != CL_SUCCESS)
476     {
477         log_error("clWriteArray failed\n");
478         return -1;
479     }
480 
481     values[0] = streams[0];
482     values[1] = streams[1];
483     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
484     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
485     if (err != CL_SUCCESS)
486     {
487         log_error("clSetKernelArgs failed\n");
488         return -1;
489     }
490 
491     err = clEnqueueNDRangeKernel( queue, kernel, 3, NULL, globalsize, localsize, 0, NULL, NULL );
492     if (err != CL_SUCCESS)
493     {
494         log_error("clEnqueueNDRangeKernel failed\n");
495         return -1;
496     }
497 
498     cl_uint dead = 0xdeaddead;
499     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
500     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
501     if (err != CL_SUCCESS)
502     {
503         log_error("clEnqueueReadBuffer failed\n");
504         return -1;
505     }
506 
507     if (verify_wg_broadcast_3D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], globalsize[2], localsize[0], localsize[1], localsize[2]))
508     {
509         log_error("work_group_broadcast_3D test failed\n");
510         return -1;
511     }
512     log_info("work_group_broadcast_3D test passed\n");
513 
514     clReleaseMemObject(streams[0]);
515     clReleaseMemObject(streams[1]);
516     clReleaseKernel(kernel);
517     clReleaseProgram(program);
518     free(input_ptr[0]);
519     free(output_ptr);
520 
521     return err;
522 }
523 
524 
525 int
test_work_group_broadcast(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)526 test_work_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
527 {
528     int err;
529 
530     err = test_work_group_broadcast_1D(device, context, queue, n_elems);
531     if (err) return err;
532     err = test_work_group_broadcast_2D(device, context, queue, n_elems);
533     if (err) return err;
534     return err;
535 }
536 
537 
538