1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22
23 #include "procs.h"
24
25
26 const char *wg_reduce_add_kernel_code_int =
27 "__kernel void test_wg_reduce_add_int(global int *input, global int *output)\n"
28 "{\n"
29 " int tid = get_global_id(0);\n"
30 "\n"
31 " int result = work_group_reduce_add(input[tid]);\n"
32 " output[tid] = result;\n"
33 "}\n";
34
35
36 const char *wg_reduce_add_kernel_code_uint =
37 "__kernel void test_wg_reduce_add_uint(global uint *input, global uint *output)\n"
38 "{\n"
39 " int tid = get_global_id(0);\n"
40 "\n"
41 " uint result = work_group_reduce_add(input[tid]);\n"
42 " output[tid] = result;\n"
43 "}\n";
44
45 const char *wg_reduce_add_kernel_code_long =
46 "__kernel void test_wg_reduce_add_long(global long *input, global long *output)\n"
47 "{\n"
48 " int tid = get_global_id(0);\n"
49 "\n"
50 " long result = work_group_reduce_add(input[tid]);\n"
51 " output[tid] = result;\n"
52 "}\n";
53
54
55 const char *wg_reduce_add_kernel_code_ulong =
56 "__kernel void test_wg_reduce_add_ulong(global ulong *input, global ulong *output)\n"
57 "{\n"
58 " int tid = get_global_id(0);\n"
59 "\n"
60 " ulong result = work_group_reduce_add(input[tid]);\n"
61 " output[tid] = result;\n"
62 "}\n";
63
64
65 static int
verify_wg_reduce_add_int(int * inptr,int * outptr,size_t n,size_t wg_size)66 verify_wg_reduce_add_int(int *inptr, int *outptr, size_t n, size_t wg_size)
67 {
68 size_t i, j;
69
70 for (i=0; i<n; i+=wg_size)
71 {
72 int sum = 0;
73 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
74 sum += inptr[i+j];
75
76 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
77 {
78 if ( sum != outptr[i+j] )
79 {
80 log_info("work_group_reduce_add int: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]);
81 return -1;
82 }
83 }
84 }
85
86 return 0;
87 }
88
89 static int
verify_wg_reduce_add_uint(unsigned int * inptr,unsigned int * outptr,size_t n,size_t wg_size)90 verify_wg_reduce_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size)
91 {
92 size_t i, j;
93
94 for (i=0; i<n; i+=wg_size)
95 {
96 unsigned int sum = 0;
97 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
98 sum += inptr[i+j];
99
100 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
101 {
102 if ( sum != outptr[i+j] )
103 {
104 log_info("work_group_reduce_add uint: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]);
105 return -1;
106 }
107 }
108 }
109
110 return 0;
111 }
112
113 static int
verify_wg_reduce_add_long(cl_long * inptr,cl_long * outptr,size_t n,size_t wg_size)114 verify_wg_reduce_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size)
115 {
116 size_t i, j;
117
118 for (i=0; i<n; i+=wg_size)
119 {
120 cl_long sum = 0;
121 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
122 sum += inptr[i+j];
123
124 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
125 {
126 if ( sum != outptr[i+j] )
127 {
128 log_info("work_group_reduce_add long: Error at %u: expected = %lld, got = %lld\n", i+j, sum, outptr[i+j]);
129 return -1;
130 }
131 }
132 }
133
134 return 0;
135 }
136
137 static int
verify_wg_reduce_add_ulong(cl_ulong * inptr,cl_ulong * outptr,size_t n,size_t wg_size)138 verify_wg_reduce_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size)
139 {
140 size_t i, j;
141
142 for (i=0; i<n; i+=wg_size)
143 {
144 cl_ulong sum = 0;
145 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
146 sum += inptr[i+j];
147
148 for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
149 {
150 if ( sum != outptr[i+j] )
151 {
152 log_info("work_group_reduce_add ulong: Error at %u: expected = %llu, got = %llu\n", i+j, sum, outptr[i+j]);
153 return -1;
154 }
155 }
156 }
157
158 return 0;
159 }
160
161
162
163 int
test_work_group_reduce_add_int(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)164 test_work_group_reduce_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
165 {
166 cl_mem streams[2];
167 cl_int *input_ptr[1], *p;
168 cl_int *output_ptr;
169 cl_program program;
170 cl_kernel kernel;
171 void *values[2];
172 size_t threads[1];
173 size_t wg_size[1];
174 size_t num_elements;
175 int err;
176 int i;
177 MTdata d;
178
179 err = create_single_kernel_helper(context, &program, &kernel, 1,
180 &wg_reduce_add_kernel_code_int,
181 "test_wg_reduce_add_int");
182 if (err)
183 return -1;
184
185 // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
186 err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
187 test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
188
189 num_elements = n_elems;
190
191 input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
192 output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
193 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
194 sizeof(cl_int) * num_elements, NULL, NULL);
195 if (!streams[0])
196 {
197 log_error("clCreateBuffer failed\n");
198 return -1;
199 }
200
201 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
202 sizeof(cl_int) * num_elements, NULL, NULL);
203 if (!streams[1])
204 {
205 log_error("clCreateBuffer failed\n");
206 return -1;
207 }
208
209 p = input_ptr[0];
210 d = init_genrand( gRandomSeed );
211 for (i=0; i<num_elements; i++)
212 p[i] = genrand_int32(d);
213 free_mtdata(d); d = NULL;
214
215 err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
216 if (err != CL_SUCCESS)
217 {
218 log_error("clWriteArray failed\n");
219 return -1;
220 }
221
222 values[0] = streams[0];
223 values[1] = streams[1];
224 err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
225 err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
226 if (err != CL_SUCCESS)
227 {
228 log_error("clSetKernelArgs failed\n");
229 return -1;
230 }
231
232 // Line below is troublesome...
233 threads[0] = (size_t)num_elements;
234 err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
235 if (err != CL_SUCCESS)
236 {
237 log_error("clEnqueueNDRangeKernel failed\n");
238 return -1;
239 }
240
241 cl_uint dead = 0xdeaddead;
242 memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
243 err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
244 if (err != CL_SUCCESS)
245 {
246 log_error("clEnqueueReadBuffer failed\n");
247 return -1;
248 }
249
250 if (verify_wg_reduce_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
251 {
252 log_error("work_group_reduce_add int failed\n");
253 return -1;
254 }
255 log_info("work_group_reduce_add int passed\n");
256
257 clReleaseMemObject(streams[0]);
258 clReleaseMemObject(streams[1]);
259 clReleaseKernel(kernel);
260 clReleaseProgram(program);
261 free(input_ptr[0]);
262 free(output_ptr);
263
264 return err;
265 }
266
267
268 int
test_work_group_reduce_add_uint(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)269 test_work_group_reduce_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
270 {
271 cl_mem streams[2];
272 cl_uint *input_ptr[1], *p;
273 cl_uint *output_ptr;
274 cl_program program;
275 cl_kernel kernel;
276 void *values[2];
277 size_t threads[1];
278 size_t wg_size[1];
279 size_t num_elements;
280 int err;
281 int i;
282 MTdata d;
283
284 err = create_single_kernel_helper(context, &program, &kernel, 1,
285 &wg_reduce_add_kernel_code_uint,
286 "test_wg_reduce_add_uint");
287 if (err)
288 return -1;
289
290 // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
291 err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
292 test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
293
294 num_elements = n_elems;
295
296 input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
297 output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
298 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
299 sizeof(cl_uint) * num_elements, NULL, NULL);
300 if (!streams[0])
301 {
302 log_error("clCreateBuffer failed\n");
303 return -1;
304 }
305
306 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
307 sizeof(cl_uint) * num_elements, NULL, NULL);
308 if (!streams[1])
309 {
310 log_error("clCreateBuffer failed\n");
311 return -1;
312 }
313
314 p = input_ptr[0];
315 d = init_genrand( gRandomSeed );
316 for (i=0; i<num_elements; i++)
317 p[i] = genrand_int32(d);
318 free_mtdata(d); d = NULL;
319
320 err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
321 if (err != CL_SUCCESS)
322 {
323 log_error("clWriteArray failed\n");
324 return -1;
325 }
326
327 values[0] = streams[0];
328 values[1] = streams[1];
329 err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
330 err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
331 if (err != CL_SUCCESS)
332 {
333 log_error("clSetKernelArgs failed\n");
334 return -1;
335 }
336
337 // Line below is troublesome...
338 threads[0] = (size_t)n_elems;
339 err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
340 if (err != CL_SUCCESS)
341 {
342 log_error("clEnqueueNDRangeKernel failed\n");
343 return -1;
344 }
345
346 cl_uint dead = 0xdeaddead;
347 memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
348 err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
349 if (err != CL_SUCCESS)
350 {
351 log_error("clEnqueueReadBuffer failed\n");
352 return -1;
353 }
354
355 if (verify_wg_reduce_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
356 {
357 log_error("work_group_reduce_add uint failed\n");
358 return -1;
359 }
360 log_info("work_group_reduce_add uint passed\n");
361
362 clReleaseMemObject(streams[0]);
363 clReleaseMemObject(streams[1]);
364 clReleaseKernel(kernel);
365 clReleaseProgram(program);
366 free(input_ptr[0]);
367 free(output_ptr);
368
369 return err;
370 }
371
372 int
test_work_group_reduce_add_long(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)373 test_work_group_reduce_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
374 {
375 cl_mem streams[2];
376 cl_long *input_ptr[1], *p;
377 cl_long *output_ptr;
378 cl_program program;
379 cl_kernel kernel;
380 void *values[2];
381 size_t threads[1];
382 size_t wg_size[1];
383 size_t num_elements;
384 int err;
385 int i;
386 MTdata d;
387
388 err = create_single_kernel_helper(context, &program, &kernel, 1,
389 &wg_reduce_add_kernel_code_long,
390 "test_wg_reduce_add_long");
391 if (err)
392 return -1;
393
394 // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
395 err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
396 test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
397
398 num_elements = n_elems;
399
400 input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
401 output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
402 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
403 sizeof(cl_long) * num_elements, NULL, NULL);
404 if (!streams[0])
405 {
406 log_error("clCreateBuffer failed\n");
407 return -1;
408 }
409
410 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
411 sizeof(cl_long) * num_elements, NULL, NULL);
412 if (!streams[1])
413 {
414 log_error("clCreateBuffer failed\n");
415 return -1;
416 }
417
418 p = input_ptr[0];
419 d = init_genrand( gRandomSeed );
420 for (i=0; i<num_elements; i++)
421 p[i] = genrand_int64(d);
422 free_mtdata(d); d = NULL;
423
424 err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
425 if (err != CL_SUCCESS)
426 {
427 log_error("clWriteArray failed\n");
428 return -1;
429 }
430
431 values[0] = streams[0];
432 values[1] = streams[1];
433 err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
434 err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
435 if (err != CL_SUCCESS)
436 {
437 log_error("clSetKernelArgs failed\n");
438 return -1;
439 }
440
441 // Line below is troublesome...
442 threads[0] = (size_t)n_elems;
443 err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
444 if (err != CL_SUCCESS)
445 {
446 log_error("clEnqueueNDRangeKernel failed\n");
447 return -1;
448 }
449
450 cl_uint dead = 0xdeaddead;
451 memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
452 err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
453 if (err != CL_SUCCESS)
454 {
455 log_error("clEnqueueReadBuffer failed\n");
456 return -1;
457 }
458
459 if (verify_wg_reduce_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
460 {
461 log_error("work_group_reduce_add long failed\n");
462 return -1;
463 }
464 log_info("work_group_reduce_add long passed\n");
465
466 clReleaseMemObject(streams[0]);
467 clReleaseMemObject(streams[1]);
468 clReleaseKernel(kernel);
469 clReleaseProgram(program);
470 free(input_ptr[0]);
471 free(output_ptr);
472
473 return err;
474 }
475
476
477 int
test_work_group_reduce_add_ulong(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)478 test_work_group_reduce_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
479 {
480 cl_mem streams[2];
481 cl_ulong *input_ptr[1], *p;
482 cl_ulong *output_ptr;
483 cl_program program;
484 cl_kernel kernel;
485 void *values[2];
486 size_t threads[1];
487 size_t wg_size[1];
488 size_t num_elements;
489 int err;
490 int i;
491 MTdata d;
492
493 err = create_single_kernel_helper(context, &program, &kernel, 1,
494 &wg_reduce_add_kernel_code_ulong,
495 "test_wg_reduce_add_ulong");
496 if (err)
497 return -1;
498
499 // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
500 err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
501 test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
502
503 num_elements = n_elems;
504
505 input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
506 output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
507 streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
508 sizeof(cl_ulong) * num_elements, NULL, NULL);
509 if (!streams[0])
510 {
511 log_error("clCreateBuffer failed\n");
512 return -1;
513 }
514
515 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
516 sizeof(cl_ulong) * num_elements, NULL, NULL);
517 if (!streams[1])
518 {
519 log_error("clCreateBuffer failed\n");
520 return -1;
521 }
522
523 p = input_ptr[0];
524 d = init_genrand( gRandomSeed );
525 for (i=0; i<num_elements; i++)
526 p[i] = genrand_int64(d);
527 free_mtdata(d); d = NULL;
528
529 err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
530 if (err != CL_SUCCESS)
531 {
532 log_error("clWriteArray failed\n");
533 return -1;
534 }
535
536 values[0] = streams[0];
537 values[1] = streams[1];
538 err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
539 err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
540 if (err != CL_SUCCESS)
541 {
542 log_error("clSetKernelArgs failed\n");
543 return -1;
544 }
545
546 // Line below is troublesome...
547 threads[0] = (size_t)n_elems;
548 err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
549 if (err != CL_SUCCESS)
550 {
551 log_error("clEnqueueNDRangeKernel failed\n");
552 return -1;
553 }
554
555 cl_uint dead = 0xdeaddead;
556 memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
557 err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
558 if (err != CL_SUCCESS)
559 {
560 log_error("clEnqueueReadBuffer failed\n");
561 return -1;
562 }
563
564 if (verify_wg_reduce_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
565 {
566 log_error("work_group_reduce_add ulong failed\n");
567 return -1;
568 }
569 log_info("work_group_reduce_add ulong passed\n");
570
571 clReleaseMemObject(streams[0]);
572 clReleaseMemObject(streams[1]);
573 clReleaseKernel(kernel);
574 clReleaseProgram(program);
575 free(input_ptr[0]);
576 free(output_ptr);
577
578 return err;
579 }
580
581
582 int
test_work_group_reduce_add(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)583 test_work_group_reduce_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
584 {
585 int err;
586
587 err = test_work_group_reduce_add_int(device, context, queue, n_elems);
588 if (err) return err;
589 err = test_work_group_reduce_add_uint(device, context, queue, n_elems);
590 if (err) return err;
591 err = test_work_group_reduce_add_long(device, context, queue, n_elems);
592 if (err) return err;
593 err = test_work_group_reduce_add_ulong(device, context, queue, n_elems);
594 return err;
595 }
596
597