• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 
23 #include "procs.h"
24 
25 
26 const char *wg_reduce_max_kernel_code_int =
27 "__kernel void test_wg_reduce_max_int(global int *input, global int *output)\n"
28 "{\n"
29 "    int  tid = get_global_id(0);\n"
30 "\n"
31 "    int result = work_group_reduce_max(input[tid]);\n"
32 "    output[tid] = result;\n"
33 "}\n";
34 
35 
36 const char *wg_reduce_max_kernel_code_uint =
37 "__kernel void test_wg_reduce_max_uint(global uint *input, global uint *output)\n"
38 "{\n"
39 "    int  tid = get_global_id(0);\n"
40 "\n"
41 "    uint result = work_group_reduce_max(input[tid]);\n"
42 "    output[tid] = result;\n"
43 "}\n";
44 
45 const char *wg_reduce_max_kernel_code_long =
46 "__kernel void test_wg_reduce_max_long(global long *input, global long *output)\n"
47 "{\n"
48 "    int  tid = get_global_id(0);\n"
49 "\n"
50 "    long result = work_group_reduce_max(input[tid]);\n"
51 "    output[tid] = result;\n"
52 "}\n";
53 
54 
55 const char *wg_reduce_max_kernel_code_ulong =
56 "__kernel void test_wg_reduce_max_ulong(global ulong *input, global ulong *output)\n"
57 "{\n"
58 "    int  tid = get_global_id(0);\n"
59 "\n"
60 "    ulong result = work_group_reduce_max(input[tid]);\n"
61 "    output[tid] = result;\n"
62 "}\n";
63 
64 
65 static int
verify_wg_reduce_max_int(int * inptr,int * outptr,size_t n,size_t wg_size)66 verify_wg_reduce_max_int(int *inptr, int *outptr, size_t n, size_t wg_size)
67 {
68     size_t     i, j;
69 
70     for (i=0; i<n; i+=wg_size)
71     {
72         int max = CL_INT_MIN;
73         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
74             max = (max > inptr[i+j]) ? max : inptr[i+j];
75 
76         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
77         {
78             if ( max != outptr[i+j] )
79             {
80                 log_info("work_group_reduce_max int: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]);
81                 return -1;
82             }
83         }
84     }
85 
86     return 0;
87 }
88 
89 static int
verify_wg_reduce_max_uint(unsigned int * inptr,unsigned int * outptr,size_t n,size_t wg_size)90 verify_wg_reduce_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size)
91 {
92     size_t     i, j;
93 
94     for (i=0; i<n; i+=wg_size)
95     {
96         unsigned int max = 0;
97         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
98             max = (max > inptr[i+j]) ? max : inptr[i+j];
99 
100         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
101         {
102             if ( max != outptr[i+j] )
103             {
104                 log_info("work_group_reduce_max uint: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]);
105                 return -1;
106             }
107         }
108     }
109 
110     return 0;
111 }
112 
113 static int
verify_wg_reduce_max_long(cl_long * inptr,cl_long * outptr,size_t n,size_t wg_size)114 verify_wg_reduce_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size)
115 {
116     size_t     i, j;
117 
118     for (i=0; i<n; i+=wg_size)
119     {
120         cl_long max = CL_LONG_MIN;
121         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
122             max = (max > inptr[i+j]) ? max : inptr[i+j];
123 
124         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
125         {
126             if ( max != outptr[i+j] )
127             {
128                 log_info("work_group_reduce_max long: Error at %u: expected = %lld, got = %lld\n", i+j, max, outptr[i+j]);
129                 return -1;
130             }
131         }
132     }
133 
134     return 0;
135 }
136 
137 static int
verify_wg_reduce_max_ulong(cl_ulong * inptr,cl_ulong * outptr,size_t n,size_t wg_size)138 verify_wg_reduce_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size)
139 {
140     size_t     i, j;
141 
142     for (i=0; i<n; i+=wg_size)
143     {
144         cl_ulong max = 0;
145         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
146             max = (max > inptr[i+j]) ? max : inptr[i+j];
147 
148         for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++)
149         {
150             if ( max != outptr[i+j] )
151             {
152                 log_info("work_group_reduce_max ulong: Error at %u: expected = %llu, got = %llu\n", i+j, max, outptr[i+j]);
153                 return -1;
154             }
155         }
156     }
157 
158     return 0;
159 }
160 
161 
162 
163 int
test_work_group_reduce_max_int(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)164 test_work_group_reduce_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
165 {
166     cl_mem       streams[2];
167     cl_int       *input_ptr[1], *p;
168     cl_int       *output_ptr;
169     cl_program   program;
170     cl_kernel    kernel;
171     void         *values[2];
172     size_t       threads[1];
173     size_t       wg_size[1];
174     size_t       wg_sizes_per_dimension[3];
175     size_t       num_elements;
176     int          err;
177     int          i;
178     MTdata       d;
179 
180     err = create_single_kernel_helper(context, &program, &kernel, 1,
181                                       &wg_reduce_max_kernel_code_int,
182                                       "test_wg_reduce_max_int");
183     if (err)
184         return -1;
185 
186     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
187     if (err)
188         return -1;
189 
190     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
191     if (err)
192         return -1;
193     if(wg_sizes_per_dimension[0] < wg_size[0])
194     {
195         wg_size[0] = wg_sizes_per_dimension[0];
196     }
197 
198     num_elements = n_elems;
199 
200     input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements);
201     output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements);
202     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
203                                 sizeof(cl_int) * num_elements, NULL, NULL);
204     if (!streams[0])
205     {
206         log_error("clCreateBuffer failed\n");
207         return -1;
208     }
209 
210     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
211                                 sizeof(cl_int) * num_elements, NULL, NULL);
212     if (!streams[1])
213     {
214         log_error("clCreateBuffer failed\n");
215         return -1;
216     }
217 
218     p = input_ptr[0];
219     d = init_genrand( gRandomSeed );
220     for (i=0; i<num_elements; i++)
221         p[i] = genrand_int32(d);
222     free_mtdata(d); d = NULL;
223 
224     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL );
225     if (err != CL_SUCCESS)
226     {
227         log_error("clWriteArray failed\n");
228         return -1;
229     }
230 
231     values[0] = streams[0];
232     values[1] = streams[1];
233     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
234     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
235     if (err != CL_SUCCESS)
236     {
237         log_error("clSetKernelArgs failed\n");
238         return -1;
239     }
240 
241     // Line below is troublesome...
242     threads[0] = (size_t)num_elements;
243     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
244     if (err != CL_SUCCESS)
245     {
246         log_error("clEnqueueNDRangeKernel failed\n");
247         return -1;
248     }
249 
250     cl_uint dead = 0xdeaddead;
251     memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements);
252     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL );
253     if (err != CL_SUCCESS)
254     {
255         log_error("clEnqueueReadBuffer failed\n");
256         return -1;
257     }
258 
259     if (verify_wg_reduce_max_int(input_ptr[0], output_ptr, num_elements, wg_size[0]))
260     {
261         log_error("work_group_reduce_max int failed\n");
262         return -1;
263     }
264     log_info("work_group_reduce_max int passed\n");
265 
266     clReleaseMemObject(streams[0]);
267     clReleaseMemObject(streams[1]);
268     clReleaseKernel(kernel);
269     clReleaseProgram(program);
270     free(input_ptr[0]);
271     free(output_ptr);
272 
273     return err;
274 }
275 
276 
277 int
test_work_group_reduce_max_uint(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)278 test_work_group_reduce_max_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
279 {
280     cl_mem       streams[2];
281     cl_uint      *input_ptr[1], *p;
282     cl_uint      *output_ptr;
283     cl_program   program;
284     cl_kernel    kernel;
285     void         *values[2];
286     size_t       threads[1];
287     size_t       wg_size[1];
288     size_t       wg_sizes_per_dimension[3];
289     size_t       num_elements;
290     int          err;
291     int          i;
292     MTdata       d;
293 
294     err = create_single_kernel_helper(context, &program, &kernel, 1,
295                                       &wg_reduce_max_kernel_code_uint,
296                                       "test_wg_reduce_max_uint");
297     if (err)
298         return -1;
299 
300     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
301     if (err)
302         return -1;
303 
304     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
305     if (err)
306         return -1;
307     if(wg_sizes_per_dimension[0] < wg_size[0])
308     {
309         wg_size[0] = wg_sizes_per_dimension[0];
310     }
311 
312     num_elements = n_elems;
313 
314     input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
315     output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements);
316     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
317                                 sizeof(cl_uint) * num_elements, NULL, NULL);
318     if (!streams[0])
319     {
320         log_error("clCreateBuffer failed\n");
321         return -1;
322     }
323 
324     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
325                                 sizeof(cl_uint) * num_elements, NULL, NULL);
326     if (!streams[1])
327     {
328         log_error("clCreateBuffer failed\n");
329         return -1;
330     }
331 
332     p = input_ptr[0];
333     d = init_genrand( gRandomSeed );
334     for (i=0; i<num_elements; i++)
335         p[i] = genrand_int32(d);
336     free_mtdata(d); d = NULL;
337 
338     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
339     if (err != CL_SUCCESS)
340     {
341         log_error("clWriteArray failed\n");
342         return -1;
343     }
344 
345     values[0] = streams[0];
346     values[1] = streams[1];
347     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
348     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
349     if (err != CL_SUCCESS)
350     {
351         log_error("clSetKernelArgs failed\n");
352         return -1;
353     }
354 
355     // Line below is troublesome...
356     threads[0] = (size_t)n_elems;
357     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
358     if (err != CL_SUCCESS)
359     {
360         log_error("clEnqueueNDRangeKernel failed\n");
361         return -1;
362     }
363 
364     cl_uint dead = 0xdeaddead;
365     memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements);
366     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL );
367     if (err != CL_SUCCESS)
368     {
369         log_error("clEnqueueReadBuffer failed\n");
370         return -1;
371     }
372 
373     if (verify_wg_reduce_max_uint(input_ptr[0], output_ptr, num_elements, wg_size[0]))
374     {
375         log_error("work_group_reduce_max uint failed\n");
376         return -1;
377     }
378     log_info("work_group_reduce_max uint passed\n");
379 
380     clReleaseMemObject(streams[0]);
381     clReleaseMemObject(streams[1]);
382     clReleaseKernel(kernel);
383     clReleaseProgram(program);
384     free(input_ptr[0]);
385     free(output_ptr);
386 
387     return err;
388 }
389 
390 int
test_work_group_reduce_max_long(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)391 test_work_group_reduce_max_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
392 {
393     cl_mem       streams[2];
394     cl_long      *input_ptr[1], *p;
395     cl_long      *output_ptr;
396     cl_program   program;
397     cl_kernel    kernel;
398     void         *values[2];
399     size_t       threads[1];
400     size_t       wg_size[1];
401     size_t       wg_sizes_per_dimension[3];
402     size_t       num_elements;
403     int          err;
404     int          i;
405     MTdata       d;
406 
407     err = create_single_kernel_helper(context, &program, &kernel, 1,
408                                       &wg_reduce_max_kernel_code_long,
409                                       "test_wg_reduce_max_long");
410     if (err)
411         return -1;
412 
413     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
414     if (err)
415         return -1;
416 
417     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
418     if (err)
419         return -1;
420     if(wg_sizes_per_dimension[0] < wg_size[0])
421     {
422         wg_size[0] = wg_sizes_per_dimension[0];
423     }
424 
425     num_elements = n_elems;
426 
427     input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements);
428     output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements);
429     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
430                                 sizeof(cl_long) * num_elements, NULL, NULL);
431     if (!streams[0])
432     {
433         log_error("clCreateBuffer failed\n");
434         return -1;
435     }
436 
437     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
438                                 sizeof(cl_long) * num_elements, NULL, NULL);
439     if (!streams[1])
440     {
441         log_error("clCreateBuffer failed\n");
442         return -1;
443     }
444 
445     p = input_ptr[0];
446     d = init_genrand( gRandomSeed );
447     for (i=0; i<num_elements; i++)
448         p[i] = genrand_int64(d);
449     free_mtdata(d); d = NULL;
450 
451     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
452     if (err != CL_SUCCESS)
453     {
454         log_error("clWriteArray failed\n");
455         return -1;
456     }
457 
458     values[0] = streams[0];
459     values[1] = streams[1];
460     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
461     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
462     if (err != CL_SUCCESS)
463     {
464         log_error("clSetKernelArgs failed\n");
465         return -1;
466     }
467 
468     // Line below is troublesome...
469     threads[0] = (size_t)n_elems;
470     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
471     if (err != CL_SUCCESS)
472     {
473         log_error("clEnqueueNDRangeKernel failed\n");
474         return -1;
475     }
476 
477     cl_uint dead = 0xdeaddead;
478     memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements);
479     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL );
480     if (err != CL_SUCCESS)
481     {
482         log_error("clEnqueueReadBuffer failed\n");
483         return -1;
484     }
485 
486     if (verify_wg_reduce_max_long(input_ptr[0], output_ptr, num_elements, wg_size[0]))
487     {
488         log_error("work_group_reduce_max long failed\n");
489         return -1;
490     }
491     log_info("work_group_reduce_max long passed\n");
492 
493     clReleaseMemObject(streams[0]);
494     clReleaseMemObject(streams[1]);
495     clReleaseKernel(kernel);
496     clReleaseProgram(program);
497     free(input_ptr[0]);
498     free(output_ptr);
499 
500     return err;
501 }
502 
503 
504 int
test_work_group_reduce_max_ulong(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)505 test_work_group_reduce_max_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
506 {
507     cl_mem       streams[2];
508     cl_ulong     *input_ptr[1], *p;
509     cl_ulong     *output_ptr;
510     cl_program   program;
511     cl_kernel    kernel;
512     void         *values[2];
513     size_t       threads[1];
514     size_t       wg_size[1];
515     size_t       wg_sizes_per_dimension[3];
516     size_t       num_elements;
517     int          err;
518     int          i;
519     MTdata       d;
520 
521     err = create_single_kernel_helper(context, &program, &kernel, 1,
522                                       &wg_reduce_max_kernel_code_ulong,
523                                       "test_wg_reduce_max_ulong");
524     if (err)
525         return -1;
526 
527     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
528     if (err)
529         return -1;
530 
531     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL);
532     if (err)
533         return -1;
534     if(wg_sizes_per_dimension[0] < wg_size[0])
535     {
536         wg_size[0] = wg_sizes_per_dimension[0];
537     }
538 
539     num_elements = n_elems;
540 
541     input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
542     output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements);
543     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
544                                 sizeof(cl_ulong) * num_elements, NULL, NULL);
545     if (!streams[0])
546     {
547         log_error("clCreateBuffer failed\n");
548         return -1;
549     }
550 
551     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
552                                 sizeof(cl_ulong) * num_elements, NULL, NULL);
553     if (!streams[1])
554     {
555         log_error("clCreateBuffer failed\n");
556         return -1;
557     }
558 
559     p = input_ptr[0];
560     d = init_genrand( gRandomSeed );
561     for (i=0; i<num_elements; i++)
562         p[i] = genrand_int64(d);
563     free_mtdata(d); d = NULL;
564 
565     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
566     if (err != CL_SUCCESS)
567     {
568         log_error("clWriteArray failed\n");
569         return -1;
570     }
571 
572     values[0] = streams[0];
573     values[1] = streams[1];
574     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
575     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
576     if (err != CL_SUCCESS)
577     {
578         log_error("clSetKernelArgs failed\n");
579         return -1;
580     }
581 
582     // Line below is troublesome...
583     threads[0] = (size_t)n_elems;
584     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL );
585     if (err != CL_SUCCESS)
586     {
587         log_error("clEnqueueNDRangeKernel failed\n");
588         return -1;
589     }
590 
591     cl_uint dead = 0xdeaddead;
592     memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements);
593     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL );
594     if (err != CL_SUCCESS)
595     {
596         log_error("clEnqueueReadBuffer failed\n");
597         return -1;
598     }
599 
600     if (verify_wg_reduce_max_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0]))
601     {
602         log_error("work_group_reduce_max ulong failed\n");
603         return -1;
604     }
605     log_info("work_group_reduce_max ulong passed\n");
606 
607     clReleaseMemObject(streams[0]);
608     clReleaseMemObject(streams[1]);
609     clReleaseKernel(kernel);
610     clReleaseProgram(program);
611     free(input_ptr[0]);
612     free(output_ptr);
613 
614     return err;
615 }
616 
617 
618 int
test_work_group_reduce_max(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)619 test_work_group_reduce_max(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
620 {
621     int err;
622 
623     err = test_work_group_reduce_max_int(device, context, queue, n_elems);
624     if (err) return err;
625     err = test_work_group_reduce_max_uint(device, context, queue, n_elems);
626     if (err) return err;
627     err = test_work_group_reduce_max_long(device, context, queue, n_elems);
628     if (err) return err;
629     err = test_work_group_reduce_max_ulong(device, context, queue, n_elems);
630     return err;
631 }
632 
633