1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22
23 #include "procs.h"
24
25
26 const char *wg_broadcast_1D_kernel_code =
27 "__kernel void test_wg_broadcast_1D(global float *input, global float *output)\n"
28 "{\n"
29 " int tid = get_global_id(0);\n"
30 "\n"
31 " float result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
32 " output[tid] = result;\n"
33 "}\n";
34
35 const char *wg_broadcast_2D_kernel_code =
36 "__kernel void test_wg_broadcast_2D(global float *input, global float *output)\n"
37 "{\n"
38 " size_t tid_x = get_global_id(0);\n"
39 " size_t tid_y = get_global_id(1);\n"
40 " size_t x = get_group_id(0) % get_local_size(0);\n"
41 " size_t y = get_group_id(1) % get_local_size(1);\n"
42 "\n"
43 " size_t indx = (tid_y * get_global_size(0)) + tid_x;\n"
44 " float result = work_group_broadcast(input[indx], x, y);\n"
45 " output[indx] = result;\n"
46 "}\n";
47
48 const char *wg_broadcast_3D_kernel_code =
49 "__kernel void test_wg_broadcast_3D(global float *input, global float *output)\n"
50 "{\n"
51 " size_t tid_x = get_global_id(0);\n"
52 " size_t tid_y = get_global_id(1);\n"
53 " size_t tid_z = get_global_id(2);\n"
54 " size_t x = get_group_id(0) % get_local_size(0);\n"
55 " size_t y = get_group_id(1) % get_local_size(1);\n"
56 " size_t z = get_group_id(2) % get_local_size(2);\n"
57 "\n"
58 " size_t indx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
59 " float result = work_group_broadcast(input[indx], x, y, z);\n"
60 " output[indx] = result;\n"
61 "}\n";
62
63 static int
verify_wg_broadcast_1D(float * inptr,float * outptr,size_t n,size_t wg_size)64 verify_wg_broadcast_1D(float *inptr, float *outptr, size_t n, size_t wg_size)
65 {
66 size_t i, j;
67 size_t group_id;
68
69 for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
70 {
71 int local_size = (n-i) > wg_size ? wg_size : (n-i);
72 float broadcast_result = inptr[i + (group_id % local_size)];
73 for (j=0; j<local_size; j++)
74 {
75 if ( broadcast_result != outptr[i+j] )
76 {
77 log_info("work_group_broadcast: Error at %u: expected = %f, got = %f\n", i+j, broadcast_result, outptr[i+j]);
78 return -1;
79 }
80 }
81 }
82
83 return 0;
84 }
85
86 static int
verify_wg_broadcast_2D(float * inptr,float * outptr,size_t nx,size_t ny,size_t wg_size_x,size_t wg_size_y)87 verify_wg_broadcast_2D(float *inptr, float *outptr, size_t nx, size_t ny, size_t wg_size_x, size_t wg_size_y)
88 {
89 size_t i, j, _i, _j;
90 size_t group_id_x, group_id_y;
91
92 for (i=0,group_id_y=0; i<ny; i+=wg_size_y,group_id_y++)
93 {
94 size_t y = group_id_y % wg_size_y;
95 size_t local_size_y = (ny-i) > wg_size_y ? wg_size_y : (ny-i);
96 for (_i=0; _i < local_size_y; _i++)
97 {
98 for (j=0,group_id_x=0; j<nx; j+=wg_size_x,group_id_x++)
99 {
100 size_t x = group_id_x % wg_size_x;
101 size_t local_size_x = (nx-j) > wg_size_x ? wg_size_x : (nx-j);
102 float broadcast_result = inptr[(i + y) * nx + (j + x)];
103 for (_j=0; _j < local_size_x; _j++)
104 {
105 size_t indx = (i + _i) * nx + (j + _j);
106 if ( broadcast_result != outptr[indx] )
107 {
108 log_info("work_group_broadcast: Error at (%u, %u): expected = %f, got = %f\n", j+_j, i+_i, broadcast_result, outptr[indx]);
109 return -1;
110 }
111 }
112 }
113 }
114 }
115
116 return 0;
117 }
118
119 static int
verify_wg_broadcast_3D(float * inptr,float * outptr,size_t nx,size_t ny,size_t nz,size_t wg_size_x,size_t wg_size_y,size_t wg_size_z)120 verify_wg_broadcast_3D(float *inptr, float *outptr, size_t nx, size_t ny, size_t nz, size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
121 {
122 size_t i, j, k, _i, _j, _k;
123 size_t group_id_x, group_id_y, group_id_z;
124
125 for (i=0,group_id_z=0; i<nz; i+=wg_size_z,group_id_z++)
126 {
127 size_t z = group_id_z % wg_size_z;
128 size_t local_size_z = (nz-i) > wg_size_z ? wg_size_z : (nz-i);
129 for (_i=0; _i < local_size_z; _i++)
130 {
131 for (j=0,group_id_y=0; j<ny; j+=wg_size_y,group_id_y++)
132 {
133 size_t y = group_id_y % wg_size_y;
134 size_t local_size_y = (ny-j) > wg_size_y ? wg_size_y : (ny-j);
135 for (_j=0; _j < local_size_y; _j++)
136 {
137 for (k=0,group_id_x=0; k<nx; k+=wg_size_x,group_id_x++)
138 {
139 size_t x = group_id_x % wg_size_x;
140 size_t local_size_x = (nx-k) > wg_size_x ? wg_size_x : (nx-k);
141 float broadcast_result = inptr[(i + z) * ny * nz + (j + y) * nx + (k + x)];
142 for (_k=0; _k < local_size_x; _k++)
143 {
144 size_t indx = (i + _i) * ny * nx + (j + _j) * nx + (k + _k);
145 if ( broadcast_result != outptr[indx] )
146 {
147 log_info("work_group_broadcast: Error at (%u, %u, %u): expected = %f, got = %f\n", k+_k, j+_j, i+_i, broadcast_result, outptr[indx]);
148 return -1;
149 }
150 }
151 }
152 }
153 }
154 }
155 }
156
157 return 0;
158 }
159
160
161 int
test_work_group_broadcast_1D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)162 test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
163 {
164 cl_mem streams[2];
165 cl_float *input_ptr[1], *p;
166 cl_float *output_ptr;
167 cl_program program;
168 cl_kernel kernel;
169 void *values[2];
170 size_t globalsize[1];
171 size_t wg_size[1];
172 size_t num_elements;
173 int err;
174 int i;
175 MTdata d;
176
177 err = create_single_kernel_helper(context, &program, &kernel, 1,
178 &wg_broadcast_1D_kernel_code,
179 "test_wg_broadcast_1D");
180 if (err)
181 return -1;
182
183 // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
184 err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
185 test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
186
187 num_elements = n_elems;
188
189 input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
190 output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
191 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
192 sizeof(cl_float) * num_elements, NULL, NULL);
193 if (!streams[0])
194 {
195 log_error("clCreateBuffer failed\n");
196 return -1;
197 }
198
199 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
200 sizeof(cl_float) * num_elements, NULL, NULL);
201 if (!streams[1])
202 {
203 log_error("clCreateBuffer failed\n");
204 return -1;
205 }
206
207 p = input_ptr[0];
208 d = init_genrand( gRandomSeed );
209 for (i=0; i<num_elements; i++)
210 {
211 p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
212 }
213 free_mtdata(d); d = NULL;
214
215 err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
216 if (err != CL_SUCCESS)
217 {
218 log_error("clWriteArray failed\n");
219 return -1;
220 }
221
222 values[0] = streams[0];
223 values[1] = streams[1];
224 err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
225 err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
226 if (err != CL_SUCCESS)
227 {
228 log_error("clSetKernelArgs failed\n");
229 return -1;
230 }
231
232 // Line below is troublesome...
233 globalsize[0] = (size_t)n_elems;
234 err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalsize, wg_size, 0, NULL, NULL );
235 if (err != CL_SUCCESS)
236 {
237 log_error("clEnqueueNDRangeKernel failed\n");
238 return -1;
239 }
240
241 cl_uint dead = 0xdeaddead;
242 memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
243 err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
244 if (err != CL_SUCCESS)
245 {
246 log_error("clEnqueueReadBuffer failed\n");
247 return -1;
248 }
249
250 if (verify_wg_broadcast_1D(input_ptr[0], output_ptr, num_elements, wg_size[0]))
251 {
252 log_error("work_group_broadcast_1D test failed\n");
253 return -1;
254 }
255 log_info("work_group_broadcast_1D test passed\n");
256
257 clReleaseMemObject(streams[0]);
258 clReleaseMemObject(streams[1]);
259 clReleaseKernel(kernel);
260 clReleaseProgram(program);
261 free(input_ptr[0]);
262 free(output_ptr);
263
264 return err;
265 }
266
267
268 int
test_work_group_broadcast_2D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)269 test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
270 {
271 cl_mem streams[2];
272 cl_float *input_ptr[1], *p;
273 cl_float *output_ptr;
274 cl_program program;
275 cl_kernel kernel;
276 void *values[2];
277 size_t globalsize[2];
278 size_t localsize[2];
279 size_t wg_size[1];
280 size_t num_workgroups;
281 size_t num_elements;
282 int err;
283 int i;
284 MTdata d;
285
286 err = create_single_kernel_helper(context, &program, &kernel, 1,
287 &wg_broadcast_2D_kernel_code,
288 "test_wg_broadcast_2D");
289 if (err)
290 return -1;
291
292 err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
293 if (err)
294 return -1;
295
296 if (wg_size[0] >= 256)
297 {
298 localsize[0] = localsize[1] = 16;
299 }
300 else if (wg_size[0] >=64)
301 {
302 localsize[0] = localsize[1] = 8;
303 }
304 else if (wg_size[0] >= 16)
305 {
306 localsize[0] = localsize[1] = 4;
307 }
308 else
309 {
310 localsize[0] = localsize[1] = 1;
311 }
312
313 num_workgroups = MAX(n_elems/wg_size[0], 16);
314 globalsize[0] = num_workgroups * localsize[0];
315 globalsize[1] = num_workgroups * localsize[1];
316 num_elements = globalsize[0] * globalsize[1];
317
318 input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
319 output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
320 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
321 sizeof(cl_float) * num_elements, NULL, NULL);
322 if (!streams[0])
323 {
324 log_error("clCreateBuffer failed\n");
325 return -1;
326 }
327
328 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
329 sizeof(cl_float) * num_elements, NULL, NULL);
330 if (!streams[1])
331 {
332 log_error("clCreateBuffer failed\n");
333 return -1;
334 }
335
336 p = input_ptr[0];
337 d = init_genrand( gRandomSeed );
338 for (i=0; i<num_elements; i++)
339 {
340 p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
341 }
342 free_mtdata(d); d = NULL;
343
344 err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
345 if (err != CL_SUCCESS)
346 {
347 log_error("clWriteArray failed\n");
348 return -1;
349 }
350
351 values[0] = streams[0];
352 values[1] = streams[1];
353 err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
354 err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
355 if (err != CL_SUCCESS)
356 {
357 log_error("clSetKernelArgs failed\n");
358 return -1;
359 }
360
361 err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, globalsize, localsize, 0, NULL, NULL );
362 if (err != CL_SUCCESS)
363 {
364 log_error("clEnqueueNDRangeKernel failed\n");
365 return -1;
366 }
367
368 cl_uint dead = 0xdeaddead;
369 memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
370 err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
371 if (err != CL_SUCCESS)
372 {
373 log_error("clEnqueueReadBuffer failed\n");
374 return -1;
375 }
376
377 if (verify_wg_broadcast_2D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], localsize[0], localsize[1]))
378 {
379 log_error("work_group_broadcast_2D test failed\n");
380 return -1;
381 }
382 log_info("work_group_broadcast_2D test passed\n");
383
384 clReleaseMemObject(streams[0]);
385 clReleaseMemObject(streams[1]);
386 clReleaseKernel(kernel);
387 clReleaseProgram(program);
388 free(input_ptr[0]);
389 free(output_ptr);
390
391 return err;
392 }
393
394
395 int
test_work_group_broadcast_3D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)396 test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
397 {
398 cl_mem streams[2];
399 cl_float *input_ptr[1], *p;
400 cl_float *output_ptr;
401 cl_program program;
402 cl_kernel kernel;
403 void *values[2];
404 size_t globalsize[3];
405 size_t localsize[3];
406 size_t wg_size[1];
407 size_t num_workgroups;
408 size_t num_elements;
409 int err;
410 int i;
411 MTdata d;
412
413 err = create_single_kernel_helper(context, &program, &kernel, 1,
414 &wg_broadcast_3D_kernel_code,
415 "test_wg_broadcast_3D");
416 if (err)
417 return -1;
418
419 err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
420 if (err)
421 return -1;
422
423 if (wg_size[0] >=512)
424 {
425 localsize[0] = localsize[1] = localsize[2] = 8;
426 }
427 else if (wg_size[0] >= 64)
428 {
429 localsize[0] = localsize[1] = localsize[2] = 4;
430 }
431 else if (wg_size[0] >= 8)
432 {
433 localsize[0] = localsize[1] = localsize[2] = 2;
434 }
435 else
436 {
437 localsize[0] = localsize[1] = localsize[2] = 1;
438 }
439
440 num_workgroups = MAX(n_elems/wg_size[0], 8);
441 globalsize[0] = num_workgroups * localsize[0];
442 globalsize[1] = num_workgroups * localsize[1];
443 globalsize[2] = num_workgroups * localsize[2];
444 num_elements = globalsize[0] * globalsize[1] * globalsize[2];
445
446 input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
447 output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
448 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
449 sizeof(cl_float) * num_elements, NULL, NULL);
450 if (!streams[0])
451 {
452 log_error("clCreateBuffer failed\n");
453 return -1;
454 }
455
456 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
457 sizeof(cl_float) * num_elements, NULL, NULL);
458 if (!streams[1])
459 {
460 log_error("clCreateBuffer failed\n");
461 return -1;
462 }
463
464 p = input_ptr[0];
465 d = init_genrand( gRandomSeed );
466 for (i=0; i<num_elements; i++)
467 {
468 p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
469 }
470 free_mtdata(d); d = NULL;
471
472 err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
473 if (err != CL_SUCCESS)
474 {
475 log_error("clWriteArray failed\n");
476 return -1;
477 }
478
479 values[0] = streams[0];
480 values[1] = streams[1];
481 err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
482 err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
483 if (err != CL_SUCCESS)
484 {
485 log_error("clSetKernelArgs failed\n");
486 return -1;
487 }
488
489 err = clEnqueueNDRangeKernel( queue, kernel, 3, NULL, globalsize, localsize, 0, NULL, NULL );
490 if (err != CL_SUCCESS)
491 {
492 log_error("clEnqueueNDRangeKernel failed\n");
493 return -1;
494 }
495
496 cl_uint dead = 0xdeaddead;
497 memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
498 err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
499 if (err != CL_SUCCESS)
500 {
501 log_error("clEnqueueReadBuffer failed\n");
502 return -1;
503 }
504
505 if (verify_wg_broadcast_3D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], globalsize[2], localsize[0], localsize[1], localsize[2]))
506 {
507 log_error("work_group_broadcast_3D test failed\n");
508 return -1;
509 }
510 log_info("work_group_broadcast_3D test passed\n");
511
512 clReleaseMemObject(streams[0]);
513 clReleaseMemObject(streams[1]);
514 clReleaseKernel(kernel);
515 clReleaseProgram(program);
516 free(input_ptr[0]);
517 free(output_ptr);
518
519 return err;
520 }
521
522
523 int
test_work_group_broadcast(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)524 test_work_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
525 {
526 int err;
527
528 err = test_work_group_broadcast_1D(device, context, queue, n_elems);
529 if (err) return err;
530 err = test_work_group_broadcast_2D(device, context, queue, n_elems);
531 if (err) return err;
532 return err;
533 }
534
535
536