• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 
23 #include "procs.h"
24 
25 
26 const char *wg_reduce_add_kernel_code_int =
27 "__kernel void test_wg_reduce_add_int(global int *input, global int *output)\n"
28 "{\n"
29 "    int  tid = get_global_id(0);\n"
30 "\n"
31 "    int result = work_group_reduce_add(input[tid]);\n"
32 "    output[tid] = result;\n"
33 "}\n";
34 
35 
36 const char *wg_reduce_add_kernel_code_uint =
37 "__kernel void test_wg_reduce_add_uint(global uint *input, global uint *output)\n"
38 "{\n"
39 "    int  tid = get_global_id(0);\n"
40 "\n"
41 "    uint result = work_group_reduce_add(input[tid]);\n"
42 "    output[tid] = result;\n"
43 "}\n";
44 
45 const char *wg_reduce_add_kernel_code_long =
46 "__kernel void test_wg_reduce_add_long(global long *input, global long *output)\n"
47 "{\n"
48 "    int  tid = get_global_id(0);\n"
49 "\n"
50 "    long result = work_group_reduce_add(input[tid]);\n"
51 "    output[tid] = result;\n"
52 "}\n";
53 
54 
55 const char *wg_reduce_add_kernel_code_ulong =
56 "__kernel void test_wg_reduce_add_ulong(global ulong *input, global ulong *output)\n"
57 "{\n"
58 "    int  tid = get_global_id(0);\n"
59 "\n"
60 "    ulong result = work_group_reduce_add(input[tid]);\n"
61 "    output[tid] = result;\n"
62 "}\n";
63 
64 
65 static int
verify_wg_reduce_add_int(int * inptr,int * outptr,size_t n,size_t wg_size)66 verify_wg_reduce_add_int(int *inptr, int *outptr, size_t n, size_t wg_size)
67 {
68     size_t     i, j;
69 
70     for (i=0; i<n; i+=wg_size)
71     {
72         int sum = 0;
73         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
74             sum += inptr[i+j];
75 
76         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
77         {
78             if ( sum != outptr[i+j] )
79             {
80                 log_info("work_group_reduce_add int: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]);
81                 return -1;
82             }
83         }
84     }
85 
86     return 0;
87 }
88 
89 static int
verify_wg_reduce_add_uint(unsigned int * inptr,unsigned int * outptr,size_t n,size_t wg_size)90 verify_wg_reduce_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size)
91 {
92     size_t     i, j;
93 
94     for (i=0; i<n; i+=wg_size)
95     {
96         unsigned int sum = 0;
97         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
98             sum += inptr[i+j];
99 
100         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
101         {
102             if ( sum != outptr[i+j] )
103             {
104                 log_info("work_group_reduce_add uint: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]);
105                 return -1;
106             }
107         }
108     }
109 
110     return 0;
111 }
112 
113 static int
verify_wg_reduce_add_long(cl_long * inptr,cl_long * outptr,size_t n,size_t wg_size)114 verify_wg_reduce_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size)
115 {
116     size_t     i, j;
117 
118     for (i=0; i<n; i+=wg_size)
119     {
120         cl_long sum = 0;
121         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
122             sum += inptr[i+j];
123 
124         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
125         {
126             if ( sum != outptr[i+j] )
127             {
128                 log_info("work_group_reduce_add long: Error at %u: expected = %lld, got = %lld\n", i+j, sum, outptr[i+j]);
129                 return -1;
130             }
131         }
132     }
133 
134     return 0;
135 }
136 
137 static int
verify_wg_reduce_add_ulong(cl_ulong * inptr,cl_ulong * outptr,size_t n,size_t wg_size)138 verify_wg_reduce_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size)
139 {
140     size_t     i, j;
141 
142     for (i=0; i<n; i+=wg_size)
143     {
144         cl_ulong sum = 0;
145         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
146             sum += inptr[i+j];
147 
148         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
149         {
150             if ( sum != outptr[i+j] )
151             {
152                 log_info("work_group_reduce_add ulong: Error at %u: expected = %llu, got = %llu\n", i+j, sum, outptr[i+j]);
153                 return -1;
154             }
155         }
156     }
157 
158     return 0;
159 }
160 
161 
162 
163 int
test_work_group_reduce_add_int(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)164 test_work_group_reduce_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
165 {
166     cl_mem       streams[2];
167     cl_int       *input_ptr[1], *p;
168     cl_int       *output_ptr;
169     cl_program   program;
170     cl_kernel    kernel;
171     void         *values[2];
172     size_t       threads[1];
173     size_t       wg_size[1];
174     size_t       num_elements;
175     int          err;
176     int          i;
177     MTdata       d;
178 
179     err = create_single_kernel_helper(context, &program, &kernel, 1,
180                                       &wg_reduce_add_kernel_code_int,
181                                       "test_wg_reduce_add_int");
182     if (err)
183         return -1;
184 
185     // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
186     err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
187     test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
188 
189     num_elements = n_elems;
190 
191     input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
192     output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
193     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
194                                 sizeof(cl_int) * num_elements, NULL, NULL);
195     if (!streams[0])
196     {
197         log_error("clCreateBuffer failed\n");
198         return -1;
199     }
200 
201     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
202                                 sizeof(cl_int) * num_elements, NULL, NULL);
203     if (!streams[1])
204     {
205         log_error("clCreateBuffer failed\n");
206         return -1;
207     }
208 
209     p = input_ptr[0];
210     d = init_genrand( gRandomSeed );
211     for (i=0; i<num_elements; i++)
212         p[i] = genrand_int32(d);
213     free_mtdata(d); d = NULL;
214 
215     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
216     if (err != CL_SUCCESS)
217     {
218         log_error("clWriteArray failed\n");
219         return -1;
220     }
221 
222     values[0] = streams[0];
223     values[1] = streams[1];
224     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
225     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
226     if (err != CL_SUCCESS)
227     {
228         log_error("clSetKernelArgs failed\n");
229         return -1;
230     }
231 
232     // Line below is troublesome...
233     threads[0] = (size_t)num_elements;
234     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
235     if (err != CL_SUCCESS)
236     {
237         log_error("clEnqueueNDRangeKernel failed\n");
238         return -1;
239     }
240 
241     cl_uint dead = 0xdeaddead;
242     memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
243     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
244     if (err != CL_SUCCESS)
245     {
246         log_error("clEnqueueReadBuffer failed\n");
247         return -1;
248     }
249 
250     if (verify_wg_reduce_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
251     {
252         log_error("work_group_reduce_add int failed\n");
253         return -1;
254     }
255     log_info("work_group_reduce_add int passed\n");
256 
257     clReleaseMemObject(streams[0]);
258     clReleaseMemObject(streams[1]);
259     clReleaseKernel(kernel);
260     clReleaseProgram(program);
261     free(input_ptr[0]);
262     free(output_ptr);
263 
264     return err;
265 }
266 
267 
268 int
test_work_group_reduce_add_uint(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)269 test_work_group_reduce_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
270 {
271     cl_mem       streams[2];
272     cl_uint      *input_ptr[1], *p;
273     cl_uint      *output_ptr;
274     cl_program   program;
275     cl_kernel    kernel;
276     void         *values[2];
277     size_t       threads[1];
278     size_t       wg_size[1];
279     size_t       num_elements;
280     int          err;
281     int          i;
282     MTdata       d;
283 
284     err = create_single_kernel_helper(context, &program, &kernel, 1,
285                                       &wg_reduce_add_kernel_code_uint,
286                                       "test_wg_reduce_add_uint");
287     if (err)
288         return -1;
289 
290     // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
291     err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
292     test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
293 
294     num_elements = n_elems;
295 
296     input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
297     output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
298     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
299                                 sizeof(cl_uint) * num_elements, NULL, NULL);
300     if (!streams[0])
301     {
302         log_error("clCreateBuffer failed\n");
303         return -1;
304     }
305 
306     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
307                                 sizeof(cl_uint) * num_elements, NULL, NULL);
308     if (!streams[1])
309     {
310         log_error("clCreateBuffer failed\n");
311         return -1;
312     }
313 
314     p = input_ptr[0];
315     d = init_genrand( gRandomSeed );
316     for (i=0; i<num_elements; i++)
317         p[i] = genrand_int32(d);
318     free_mtdata(d); d = NULL;
319 
320     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
321     if (err != CL_SUCCESS)
322     {
323         log_error("clWriteArray failed\n");
324         return -1;
325     }
326 
327     values[0] = streams[0];
328     values[1] = streams[1];
329     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
330     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
331     if (err != CL_SUCCESS)
332     {
333         log_error("clSetKernelArgs failed\n");
334         return -1;
335     }
336 
337     // Line below is troublesome...
338     threads[0] = (size_t)n_elems;
339     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
340     if (err != CL_SUCCESS)
341     {
342         log_error("clEnqueueNDRangeKernel failed\n");
343         return -1;
344     }
345 
346     cl_uint dead = 0xdeaddead;
347     memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
348     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
349     if (err != CL_SUCCESS)
350     {
351         log_error("clEnqueueReadBuffer failed\n");
352         return -1;
353     }
354 
355     if (verify_wg_reduce_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
356     {
357         log_error("work_group_reduce_add uint failed\n");
358         return -1;
359     }
360     log_info("work_group_reduce_add uint passed\n");
361 
362     clReleaseMemObject(streams[0]);
363     clReleaseMemObject(streams[1]);
364     clReleaseKernel(kernel);
365     clReleaseProgram(program);
366     free(input_ptr[0]);
367     free(output_ptr);
368 
369     return err;
370 }
371 
372 int
test_work_group_reduce_add_long(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)373 test_work_group_reduce_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
374 {
375     cl_mem       streams[2];
376     cl_long      *input_ptr[1], *p;
377     cl_long      *output_ptr;
378     cl_program   program;
379     cl_kernel    kernel;
380     void         *values[2];
381     size_t       threads[1];
382     size_t       wg_size[1];
383     size_t       num_elements;
384     int          err;
385     int          i;
386     MTdata       d;
387 
388     err = create_single_kernel_helper(context, &program, &kernel, 1,
389                                       &wg_reduce_add_kernel_code_long,
390                                       "test_wg_reduce_add_long");
391     if (err)
392         return -1;
393 
394     // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
395     err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
396     test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
397 
398     num_elements = n_elems;
399 
400     input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
401     output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
402     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
403                                 sizeof(cl_long) * num_elements, NULL, NULL);
404     if (!streams[0])
405     {
406         log_error("clCreateBuffer failed\n");
407         return -1;
408     }
409 
410     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
411                                 sizeof(cl_long) * num_elements, NULL, NULL);
412     if (!streams[1])
413     {
414         log_error("clCreateBuffer failed\n");
415         return -1;
416     }
417 
418     p = input_ptr[0];
419     d = init_genrand( gRandomSeed );
420     for (i=0; i<num_elements; i++)
421         p[i] = genrand_int64(d);
422     free_mtdata(d); d = NULL;
423 
424     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
425     if (err != CL_SUCCESS)
426     {
427         log_error("clWriteArray failed\n");
428         return -1;
429     }
430 
431     values[0] = streams[0];
432     values[1] = streams[1];
433     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
434     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
435     if (err != CL_SUCCESS)
436     {
437         log_error("clSetKernelArgs failed\n");
438         return -1;
439     }
440 
441     // Line below is troublesome...
442     threads[0] = (size_t)n_elems;
443     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
444     if (err != CL_SUCCESS)
445     {
446         log_error("clEnqueueNDRangeKernel failed\n");
447         return -1;
448     }
449 
450     cl_uint dead = 0xdeaddead;
451     memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
452     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
453     if (err != CL_SUCCESS)
454     {
455         log_error("clEnqueueReadBuffer failed\n");
456         return -1;
457     }
458 
459     if (verify_wg_reduce_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
460     {
461         log_error("work_group_reduce_add long failed\n");
462         return -1;
463     }
464     log_info("work_group_reduce_add long passed\n");
465 
466     clReleaseMemObject(streams[0]);
467     clReleaseMemObject(streams[1]);
468     clReleaseKernel(kernel);
469     clReleaseProgram(program);
470     free(input_ptr[0]);
471     free(output_ptr);
472 
473     return err;
474 }
475 
476 
477 int
test_work_group_reduce_add_ulong(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)478 test_work_group_reduce_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
479 {
480     cl_mem       streams[2];
481     cl_ulong     *input_ptr[1], *p;
482     cl_ulong     *output_ptr;
483     cl_program   program;
484     cl_kernel    kernel;
485     void         *values[2];
486     size_t       threads[1];
487     size_t       wg_size[1];
488     size_t       num_elements;
489     int          err;
490     int          i;
491     MTdata       d;
492 
493     err = create_single_kernel_helper(context, &program, &kernel, 1,
494                                       &wg_reduce_add_kernel_code_ulong,
495                                       "test_wg_reduce_add_ulong");
496     if (err)
497         return -1;
498 
499     // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
500     err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
501     test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
502 
503     num_elements = n_elems;
504 
505     input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
506     output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
507     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
508                                 sizeof(cl_ulong) * num_elements, NULL, NULL);
509     if (!streams[0])
510     {
511         log_error("clCreateBuffer failed\n");
512         return -1;
513     }
514 
515     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
516                                 sizeof(cl_ulong) * num_elements, NULL, NULL);
517     if (!streams[1])
518     {
519         log_error("clCreateBuffer failed\n");
520         return -1;
521     }
522 
523     p = input_ptr[0];
524     d = init_genrand( gRandomSeed );
525     for (i=0; i<num_elements; i++)
526         p[i] = genrand_int64(d);
527     free_mtdata(d); d = NULL;
528 
529     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
530     if (err != CL_SUCCESS)
531     {
532         log_error("clWriteArray failed\n");
533         return -1;
534     }
535 
536     values[0] = streams[0];
537     values[1] = streams[1];
538     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
539     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
540     if (err != CL_SUCCESS)
541     {
542         log_error("clSetKernelArgs failed\n");
543         return -1;
544     }
545 
546     // Line below is troublesome...
547     threads[0] = (size_t)n_elems;
548     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
549     if (err != CL_SUCCESS)
550     {
551         log_error("clEnqueueNDRangeKernel failed\n");
552         return -1;
553     }
554 
555     cl_uint dead = 0xdeaddead;
556     memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
557     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
558     if (err != CL_SUCCESS)
559     {
560         log_error("clEnqueueReadBuffer failed\n");
561         return -1;
562     }
563 
564     if (verify_wg_reduce_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
565     {
566         log_error("work_group_reduce_add ulong failed\n");
567         return -1;
568     }
569     log_info("work_group_reduce_add ulong passed\n");
570 
571     clReleaseMemObject(streams[0]);
572     clReleaseMemObject(streams[1]);
573     clReleaseKernel(kernel);
574     clReleaseProgram(program);
575     free(input_ptr[0]);
576     free(output_ptr);
577 
578     return err;
579 }
580 
581 
582 int
test_work_group_reduce_add(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)583 test_work_group_reduce_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
584 {
585     int err;
586 
587     err = test_work_group_reduce_add_int(device, context, queue, n_elems);
588     if (err) return err;
589     err = test_work_group_reduce_add_uint(device, context, queue, n_elems);
590     if (err) return err;
591     err = test_work_group_reduce_add_long(device, context, queue, n_elems);
592     if (err) return err;
593     err = test_work_group_reduce_add_ulong(device, context, queue, n_elems);
594     return err;
595 }
596 
597