• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/kernelHelpers.h"
18 #include "harness/testHarness.h"
19 
20 #include <string.h>
21 
22 #include <algorithm>
23 
24 #include "cl_utils.h"
25 #include "tests.h"
26 
27 #include <CL/cl_half.h>
28 
29 typedef struct ComputeReferenceInfoF_
30 {
31     float *x;
32     cl_ushort *r;
33     f2h f;
34     cl_ulong i;
35     cl_uint lim;
36     cl_uint count;
37 } ComputeReferenceInfoF;
38 
39 typedef struct ComputeReferenceInfoD_
40 {
41     double *x;
42     cl_ushort *r;
43     d2h f;
44     cl_ulong i;
45     cl_uint lim;
46     cl_uint count;
47 } ComputeReferenceInfoD;
48 
49 typedef struct CheckResultInfoF_
50 {
51     const float *x;
52     const cl_ushort *r;
53     const cl_ushort *s;
54     f2h f;
55     const char *aspace;
56     cl_uint lim;
57     cl_uint count;
58     int vsz;
59 } CheckResultInfoF;
60 
61 typedef struct CheckResultInfoD_
62 {
63     const double *x;
64     const cl_ushort *r;
65     const cl_ushort *s;
66     d2h f;
67     const char *aspace;
68     cl_uint lim;
69     cl_uint count;
70     int vsz;
71 } CheckResultInfoD;
72 
73 static cl_int
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)74 ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
75 {
76     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
77     cl_uint lim = cri->lim;
78     cl_uint count = cri->count;
79     cl_uint off = jid * count;
80     float *x = cri->x + off;
81     cl_ushort *r = cri->r + off;
82     f2h f = cri->f;
83     cl_ulong i = cri->i + off;
84     cl_uint j;
85 
86     if (off + count > lim)
87         count = lim - off;
88 
89     for (j = 0; j < count; ++j) {
90         x[j] = as_float((cl_uint)(i + j));
91         r[j] = f(x[j]);
92     }
93 
94     return 0;
95 }
96 
97 static cl_int
CheckF(cl_uint jid,cl_uint tid,void * userInfo)98 CheckF(cl_uint jid, cl_uint tid, void *userInfo)
99 {
100     CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
101     cl_uint lim = cri->lim;
102     cl_uint count = cri->count;
103     cl_uint off = jid * count;
104     const float *x = cri->x + off;
105     const cl_ushort *r = cri->r + off;
106     const cl_ushort *s = cri->s + off;
107     f2h f = cri->f;
108     cl_uint j;
109     cl_ushort correct2 = f( 0.0f);
110     cl_ushort correct3 = f(-0.0f);
111     cl_int ret = 0;
112 
113     if (off + count > lim)
114         count = lim - off;
115 
116     if (!memcmp(r, s, count*sizeof(cl_ushort)))
117         return 0;
118 
119     for (j = 0; j < count; j++) {
120         if (s[j] == r[j]) continue;
121 
122         // Pass any NaNs
123         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
124             continue;
125 
126         // retry per section 6.5.3.3
127         if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
128             continue;
129 
130         // if reference result is subnormal, pass any zero
131         if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
132             continue;
133 
134         vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x,  vector_size = %d, address_space = %s\n",
135                    j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
136 
137         ret = 1;
138         break;
139     }
140 
141     return ret;
142 }
143 
144 static cl_int
ReferenceD(cl_uint jid,cl_uint tid,void * userInfo)145 ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
146 {
147     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
148     cl_uint lim = cri->lim;
149     cl_uint count = cri->count;
150     cl_uint off = jid * count;
151     double *x = cri->x + off;
152     cl_ushort *r = cri->r + off;
153     d2h f = cri->f;
154     cl_uint j;
155     cl_ulong i = cri->i + off;
156 
157     if (off + count > lim)
158         count = lim - off;
159 
160     for (j = 0; j < count; ++j) {
161         x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
162         r[j] = f(x[j]);
163     }
164 
165     return 0;
166 }
167 
168 static cl_int
CheckD(cl_uint jid,cl_uint tid,void * userInfo)169 CheckD(cl_uint jid, cl_uint tid, void *userInfo)
170 {
171     CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
172     cl_uint lim = cri->lim;
173     cl_uint count = cri->count;
174     cl_uint off = jid * count;
175     const double *x = cri->x + off;
176     const cl_ushort *r = cri->r + off;
177     const cl_ushort *s = cri->s + off;
178     d2h f = cri->f;
179     cl_uint j;
180     cl_ushort correct2 = f( 0.0);
181     cl_ushort correct3 = f(-0.0);
182     cl_int ret = 0;
183 
184     if (off + count > lim)
185         count = lim - off;
186 
187     if (!memcmp(r, s, count*sizeof(cl_ushort)))
188         return 0;
189 
190     for (j = 0; j < count; j++) {
191         if (s[j] == r[j]) continue;
192 
193         // Pass any NaNs
194         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
195             continue;
196 
197         if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
198             continue;
199 
200         // if reference result is subnormal, pass any zero result
201         if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
202             continue;
203 
204         vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n",
205                    j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
206 
207         ret = 1;
208     break;
209     }
210 
211     return ret;
212 }
213 
float2half_rte(float f)214 static cl_half float2half_rte(float f)
215 {
216     return cl_half_from_float(f, CL_HALF_RTE);
217 }
218 
float2half_rtz(float f)219 static cl_half float2half_rtz(float f)
220 {
221     return cl_half_from_float(f, CL_HALF_RTZ);
222 }
223 
float2half_rtp(float f)224 static cl_half float2half_rtp(float f)
225 {
226     return cl_half_from_float(f, CL_HALF_RTP);
227 }
228 
float2half_rtn(float f)229 static cl_half float2half_rtn(float f)
230 {
231     return cl_half_from_float(f, CL_HALF_RTN);
232 }
233 
double2half_rte(double f)234 static cl_half double2half_rte(double f)
235 {
236     return cl_half_from_double(f, CL_HALF_RTE);
237 }
238 
double2half_rtz(double f)239 static cl_half double2half_rtz(double f)
240 {
241     return cl_half_from_double(f, CL_HALF_RTZ);
242 }
243 
double2half_rtp(double f)244 static cl_half double2half_rtp(double f)
245 {
246     return cl_half_from_double(f, CL_HALF_RTP);
247 }
248 
double2half_rtn(double f)249 static cl_half double2half_rtn(double f)
250 {
251     return cl_half_from_double(f, CL_HALF_RTN);
252 }
253 
test_vstore_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)254 int test_vstore_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
255 {
256     switch (get_default_rounding_mode(deviceID))
257     {
258         case CL_FP_ROUND_TO_ZERO:
259             return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rte, "");
260         case 0:
261             return -1;
262         default:
263             return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "");
264     }
265 }
266 
test_vstore_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)267 int test_vstore_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
268 {
269     return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
270 }
271 
test_vstore_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)272 int test_vstore_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
273 {
274     return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
275 }
276 
test_vstore_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)277 int test_vstore_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
278 {
279     return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
280 }
281 
test_vstore_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)282 int test_vstore_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
283 {
284     return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
285 }
286 
test_vstorea_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)287 int test_vstorea_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
288 {
289     switch (get_default_rounding_mode(deviceID))
290     {
291         case CL_FP_ROUND_TO_ZERO:
292             return Test_vStoreaHalf_private(deviceID,float2half_rtz, double2half_rte, "");
293         case 0:
294             return -1;
295         default:
296             return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "");
297     }
298 }
299 
test_vstorea_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)300 int test_vstorea_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
301 {
302     return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
303 }
304 
test_vstorea_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)305 int test_vstorea_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
306 {
307     return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
308 }
309 
test_vstorea_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)310 int test_vstorea_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
311 {
312     return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
313 }
314 
test_vstorea_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)315 int test_vstorea_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
316 {
317     return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
318 }
319 
320 #pragma mark -
321 
Test_vStoreHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)322 int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
323 {
324     int vectorSize, error;
325     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
326     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
327 
328     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
329     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
330     memset( min_time, -1, sizeof( min_time ) );
331     cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
332     cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
333     uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
334     uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
335     memset( min_double_time, -1, sizeof( min_double_time ) );
336 
337     bool aligned= false;
338 
339     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
340     {
341         const char *source[] = {
342             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
343             "{\n"
344             "   size_t i = get_global_id(0);\n"
345             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
346             "}\n"
347         };
348 
349         const char *source_v3[] = {
350             "__kernel void test( __global float *p, __global half *f,\n"
351             "                   uint extra_last_thread)\n"
352             "{\n"
353             "   size_t i = get_global_id(0);\n"
354             "   size_t last_i = get_global_size(0)-1;\n"
355             "   size_t adjust = 0;\n"
356             "   if(last_i == i && extra_last_thread != 0) {\n"
357             "     adjust = 3-extra_last_thread;\n"
358             "   } "
359             "   vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n"
360             "}\n"
361         };
362 
363         const char *source_private_store[] = {
364             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
365             "{\n"
366             "   __private ushort data[16];\n"
367             "   size_t i = get_global_id(0);\n"
368             "   size_t offset = 0;\n"
369             "   size_t vecsize = vec_step(p[i]);\n"
370             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
371             "   for(offset = 0; offset < vecsize; offset++)\n"
372             "   {\n"
373             "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
374             "   }\n"
375             "}\n"
376         };
377 
378 
379         const char *source_private_store_v3[] = {
380             "__kernel void test( __global float *p, __global half *f,\n"
381             "                   uint extra_last_thread )\n"
382             "{\n"
383             "   __private ushort data[4];\n"
384             "   size_t i = get_global_id(0);\n"
385             "   size_t last_i = get_global_size(0)-1;\n"
386             "   size_t adjust = 0;\n"
387             "   size_t offset = 0;\n"
388             "   if(last_i == i && extra_last_thread != 0) {\n"
389             "     adjust = 3-extra_last_thread;\n"
390             "   } "
391             "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
392             "   for(offset = 0; offset < 3; offset++)\n"
393             "   {\n"
394             "       vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n"
395             "   }\n"
396             "}\n"
397         };
398 
399         char local_buf_size[10];
400         sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
401 
402 
403         const char *source_local_store[] = {
404             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
405             "{\n"
406             "   __local ushort data[16*", local_buf_size, "];\n"
407             "   size_t i = get_global_id(0);\n"
408             "   size_t lid = get_local_id(0);\n"
409             "   size_t lsize = get_local_size(0);\n"
410             "   size_t vecsize = vec_step(p[0]);\n"
411             "   event_t async_event;\n"
412             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
413             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
414             "   async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
415             "   wait_group_events(1, &async_event);\n"
416             "}\n"
417         };
418 
419         const char *source_local_store_v3[] = {
420             "__kernel void test( __global float *p, __global half *f,\n"
421             "                   uint extra_last_thread )\n"
422             "{\n"
423             "   __local ushort data[3*(",
424             local_buf_size,
425             "+1)];\n"
426             "   size_t i = get_global_id(0);\n"
427             "   size_t lid = get_local_id(0);\n"
428             "   size_t last_i = get_global_size(0)-1;\n"
429             "   size_t adjust = 0;\n"
430             "   size_t lsize = get_local_size(0);\n"
431             "   event_t async_event;\n"
432             "   if(last_i == i && extra_last_thread != 0) {\n"
433             "     adjust = 3-extra_last_thread;\n"
434             "   } "
435             "   vstore_half3",
436             roundName,
437             "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
438             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
439             "   if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
440             "       extra_last_thread != 0) {\n"
441             "     adjust = 3-extra_last_thread;\n"
442             "   }\n"
443             "   async_event = async_work_group_copy(\n"
444             "       (__global ushort*)(f+3*(i-lid)),\n"
445             "       (__local ushort *)(&data[adjust]),\n"
446             "       lsize*3-adjust, 0);\n" // investigate later
447             "   wait_group_events(1, &async_event);\n"
448             "}\n"
449         };
450 
451         const char *double_source[] = {
452             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
453             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
454             "{\n"
455             "   size_t i = get_global_id(0);\n"
456             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
457             "}\n"
458         };
459 
460         const char *double_source_private_store[] = {
461             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
462             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
463             "{\n"
464             "   __private ushort data[16];\n"
465             "   size_t i = get_global_id(0);\n"
466             "   size_t offset = 0;\n"
467             "   size_t vecsize = vec_step(p[i]);\n"
468             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
469             "   for(offset = 0; offset < vecsize; offset++)\n"
470             "   {\n"
471             "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
472             "   }\n"
473             "}\n"
474         };
475 
476 
477         const char *double_source_local_store[] = {
478             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
479             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
480             "{\n"
481             "   __local ushort data[16*", local_buf_size, "];\n"
482             "   size_t i = get_global_id(0);\n"
483             "   size_t lid = get_local_id(0);\n"
484             "   size_t vecsize = vec_step(p[0]);\n"
485             "   size_t lsize = get_local_size(0);\n"
486             "   event_t async_event;\n"
487             "   vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
488             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
489             "   async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
490             "   wait_group_events(1, &async_event);\n"
491             "}\n"
492         };
493 
494 
495         const char *double_source_v3[] = {
496             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
497             "__kernel void test( __global double *p, __global half *f ,\n"
498             "                   uint extra_last_thread)\n"
499             "{\n"
500             "   size_t i = get_global_id(0);\n"
501             "   size_t last_i = get_global_size(0)-1;\n"
502             "   size_t adjust = 0;\n"
503             "   if(last_i == i && extra_last_thread != 0) {\n"
504             "     adjust = 3-extra_last_thread;\n"
505             "   } "
506             "   vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n"
507             "}\n"
508         };
509 
510         const char *double_source_private_store_v3[] = {
511             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
512             "__kernel void test( __global double *p, __global half *f,\n"
513             "                   uint extra_last_thread )\n"
514             "{\n"
515             "   __private ushort data[4];\n"
516             "   size_t i = get_global_id(0);\n"
517             "   size_t last_i = get_global_size(0)-1;\n"
518             "   size_t adjust = 0;\n"
519             "   size_t offset = 0;\n"
520             "   if(last_i == i && extra_last_thread != 0) {\n"
521             "     adjust = 3-extra_last_thread;\n"
522             "   } "
523             "   vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
524             "   for(offset = 0; offset < 3; offset++)\n"
525             "   {\n"
526             "       vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n"
527             "   }\n"
528             "}\n"
529         };
530 
531         const char *double_source_local_store_v3[] = {
532             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
533             "__kernel void test( __global double *p, __global half *f,\n"
534             "                   uint extra_last_thread )\n"
535             "{\n"
536             "   __local ushort data[3*(",
537             local_buf_size,
538             "+1)];\n"
539             "   size_t i = get_global_id(0);\n"
540             "   size_t lid = get_local_id(0);\n"
541             "   size_t last_i = get_global_size(0)-1;\n"
542             "   size_t adjust = 0;\n"
543             "   size_t lsize = get_local_size(0);\n"
544             "   event_t async_event;\n"
545             "   if(last_i == i && extra_last_thread != 0) {\n"
546             "     adjust = 3-extra_last_thread;\n"
547             "   }\n "
548             "   vstore_half3",
549             roundName,
550             "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
551             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
552             "   if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
553             "       extra_last_thread != 0) {\n"
554             "     adjust = 3-extra_last_thread;\n"
555             "   }\n"
556             "   async_event = async_work_group_copy(\n"
557             "       (__global ushort *)(f+3*(i-lid)),\n"
558             "       (__local ushort *)(&data[adjust]),\n"
559             "       lsize*3-adjust, 0);\n" // investigate later
560             "   wait_group_events(1, &async_event);\n"
561             "}\n"
562         };
563 
564 
565         if(g_arrVecSizes[vectorSize] == 3) {
566             programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
567         } else {
568             programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
569         }
570         if( NULL == programs[ vectorSize ][0] )
571         {
572             gFailCount++;
573             return -1;
574         }
575 
576         kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
577         if( NULL == kernels[vectorSize][0] )
578         {
579             gFailCount++;
580             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
581             return error;
582         }
583 
584         if(g_arrVecSizes[vectorSize] == 3) {
585             programs[vectorSize][1] = MakeProgram( device, source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) );
586         } else {
587             programs[vectorSize][1] = MakeProgram( device, source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) );
588         }
589         if( NULL == programs[ vectorSize ][1] )
590         {
591             gFailCount++;
592             return -1;
593         }
594 
595         kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
596         if( NULL == kernels[vectorSize][1] )
597         {
598             gFailCount++;
599             vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
600             return error;
601         }
602 
603         if(g_arrVecSizes[vectorSize] == 3) {
604             programs[vectorSize][2] = MakeProgram( device, source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) );
605             if(  NULL == programs[ vectorSize ][2] )
606             {
607                 unsigned q;
608                 for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++)
609                     vlog_error("%s", source_local_store_v3[q]);
610 
611                 gFailCount++;
612                 return -1;
613 
614             }
615         } else {
616             programs[vectorSize][2] = MakeProgram( device, source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) );
617             if( NULL == programs[ vectorSize ][2] )
618             {
619                 unsigned q;
620                 for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++)
621                     vlog_error("%s", source_local_store[q]);
622 
623                 gFailCount++;
624                 return -1;
625 
626             }
627         }
628 
629         kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
630         if( NULL == kernels[vectorSize][2] )
631         {
632             gFailCount++;
633             vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
634             return error;
635         }
636 
637         if( gTestDouble )
638         {
639             if(g_arrVecSizes[vectorSize] == 3) {
640                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
641             } else {
642                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
643             }
644             if( NULL == doublePrograms[ vectorSize ][0] )
645             {
646                 gFailCount++;
647                 return -1;
648             }
649 
650             doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
651             if( NULL == kernels[vectorSize][0] )
652             {
653                 gFailCount++;
654                 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
655                 return error;
656             }
657 
658             if(g_arrVecSizes[vectorSize] == 3)
659                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) );
660             else
661                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) );
662 
663             if( NULL == doublePrograms[ vectorSize ][1] )
664             {
665                 gFailCount++;
666                 return -1;
667             }
668 
669             doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
670             if( NULL == kernels[vectorSize][1] )
671             {
672                 gFailCount++;
673                 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
674                 return error;
675             }
676 
677             if(g_arrVecSizes[vectorSize] == 3) {
678                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) );
679             } else {
680                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) );
681             }
682             if( NULL == doublePrograms[ vectorSize ][2] )
683             {
684                 gFailCount++;
685                 return -1;
686             }
687 
688             doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
689             if( NULL == kernels[vectorSize][2] )
690             {
691                 gFailCount++;
692                 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
693                 return error;
694             }
695         }
696     } // end for vector size
697 
698     // Figure out how many elements are in a work block
699     size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
700     size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
701     uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
702     size_t stride = blockCount;
703 
704     if (gWimpyMode)
705         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
706 
707     // we handle 64-bit types a bit differently.
708     if( lastCase == 0 )
709         lastCase = 0x100000000ULL;
710 
711     uint64_t i, j;
712     error = 0;
713     uint64_t printMask = (lastCase >> 4) - 1;
714     cl_uint count = 0;
715     int addressSpace;
716     size_t loopCount;
717     cl_uint threadCount = GetThreadCount();
718 
719     ComputeReferenceInfoF fref;
720     fref.x = (float *)gIn_single;
721     fref.r = (cl_half *)gOut_half_reference;
722     fref.f = referenceFunc;
723     fref.lim = blockCount;
724     fref.count = (blockCount + threadCount - 1) / threadCount;
725 
726     CheckResultInfoF fchk;
727     fchk.x = (const float *)gIn_single;
728     fchk.r = (const cl_half *)gOut_half_reference;
729     fchk.s = (const cl_half *)gOut_half;
730     fchk.f = referenceFunc;
731     fchk.lim = blockCount;
732     fchk.count = (blockCount + threadCount - 1) / threadCount;
733 
734     ComputeReferenceInfoD dref;
735     dref.x = (double *)gIn_double;
736     dref.r = (cl_half *)gOut_half_reference_double;
737     dref.f = doubleReferenceFunc;
738     dref.lim = blockCount;
739     dref.count = (blockCount + threadCount - 1) / threadCount;
740 
741     CheckResultInfoD dchk;
742     dchk.x = (const double *)gIn_double;
743     dchk.r = (const cl_half *)gOut_half_reference_double;
744     dchk.s = (const cl_half *)gOut_half;
745     dchk.f = doubleReferenceFunc;
746     dchk.lim = blockCount;
747     dchk.count = (blockCount + threadCount - 1) / threadCount;
748 
749     for( i = 0; i < lastCase; i += stride )
750     {
751         count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
752         fref.i = i;
753         dref.i = i;
754 
755         // Compute the input and reference
756         ThreadPool_Do(ReferenceF, threadCount, &fref);
757 
758         error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
759         if (error) {
760             vlog_error( "Failure in clWriteBuffer\n" );
761             gFailCount++;
762             goto exit;
763         }
764 
765         if (gTestDouble) {
766             ThreadPool_Do(ReferenceD, threadCount, &dref);
767 
768             error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
769             if (error) {
770                 vlog_error( "Failure in clWriteBuffer\n" );
771                 gFailCount++;
772                 goto exit;
773             }
774         }
775 
776         for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
777             // Loop through vector sizes
778             fchk.vsz = g_arrVecSizes[vectorSize];
779             dchk.vsz = g_arrVecSizes[vectorSize];
780 
781             for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
782                 // Loop over address spaces
783                 fchk.aspace = addressSpaceNames[addressSpace];
784                 dchk.aspace = addressSpaceNames[addressSpace];
785 
786                 cl_uint pattern = 0xdeaddead;
787                 memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
788 
789                 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
790                                              0, count * sizeof(cl_half),
791                                              gOut_half, 0, NULL, NULL);
792                 if (error) {
793                     vlog_error( "Failure in clWriteArray\n" );
794                     gFailCount++;
795                     goto exit;
796                 }
797 
798                 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
799                                        numVecs(count, vectorSize, aligned) ,
800                                   runsOverBy(count, vectorSize, aligned));
801                 if (error) {
802                     gFailCount++;
803                     goto exit;
804                 }
805 
806                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
807                                             count * sizeof(cl_half), gOut_half,
808                                             0, NULL, NULL);
809                 if (error) {
810                     vlog_error( "Failure in clReadArray\n" );
811                     gFailCount++;
812                     goto exit;
813                 }
814 
815                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
816                 if (error) {
817                             gFailCount++;
818                             goto exit;
819                         }
820 
821                 if (gTestDouble) {
822                     memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
823 
824                     error = clEnqueueWriteBuffer(
825                         gQueue, gOutBuffer_half, CL_FALSE, 0,
826                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
827                     if (error) {
828                         vlog_error( "Failure in clWriteArray\n" );
829                         gFailCount++;
830                         goto exit;
831                     }
832 
833                     error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
834                                       numVecs(count, vectorSize, aligned),
835                                       runsOverBy(count, vectorSize, aligned));
836                     if (error) {
837                         gFailCount++;
838                         goto exit;
839                     }
840 
841                     error = clEnqueueReadBuffer(
842                         gQueue, gOutBuffer_half, CL_TRUE, 0,
843                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
844                     if (error) {
845                         vlog_error( "Failure in clReadArray\n" );
846                         gFailCount++;
847                         goto exit;
848                     }
849 
850                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
851                     if (error) {
852                                 gFailCount++;
853                                 goto exit;
854                             }
855                         }
856                     }
857                 }
858 
859         if( ((i+blockCount) & ~printMask) == (i+blockCount) )
860         {
861             vlog( "." );
862             fflush( stdout );
863         }
864     }  // end last case
865 
866     loopCount = count == blockCount ? 1 : 100;
867     if( gReportTimes )
868     {
869         //Init the input stream
870         cl_float *p = (cl_float *)gIn_single;
871         for( j = 0; j < count; j++ )
872             p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
873 
874         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
875         {
876             vlog_error( "Failure in clWriteArray\n" );
877             gFailCount++;
878             goto exit;
879         }
880 
881         if( gTestDouble )
882         {
883             //Init the input stream
884             cl_double *q = (cl_double *)gIn_double;
885             for( j = 0; j < count; j++ )
886                 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
887 
888             if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
889             {
890                 vlog_error( "Failure in clWriteArray\n" );
891                 gFailCount++;
892                 goto exit;
893             }
894         }
895 
896         //Run again for timing
897         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
898         {
899             uint64_t bestTime = -1ULL;
900             for( j = 0; j < loopCount; j++ )
901             {
902                 uint64_t startTime = ReadTime();
903 
904 
905                 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
906                                        runsOverBy(count, vectorSize, aligned)) ) )
907                 {
908                     gFailCount++;
909                     goto exit;
910                 }
911 
912                 if( (error = clFinish(gQueue)) )
913                 {
914                     vlog_error( "Failure in clFinish\n" );
915                     gFailCount++;
916                     goto exit;
917                 }
918                 uint64_t currentTime = ReadTime() - startTime;
919                 if( currentTime < bestTime )
920                     bestTime = currentTime;
921                 time[ vectorSize ] += currentTime;
922             }
923             if( bestTime < min_time[ vectorSize ] )
924                 min_time[ vectorSize ] = bestTime ;
925 
926             if( gTestDouble )
927             {
928                 bestTime = -1ULL;
929                 for( j = 0; j < loopCount; j++ )
930                 {
931                     uint64_t startTime = ReadTime();
932                     if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
933                                            runsOverBy(count, vectorSize, aligned)) ) )
934                     {
935                         gFailCount++;
936                         goto exit;
937                     }
938 
939                     if( (error = clFinish(gQueue)) )
940                     {
941                         vlog_error( "Failure in clFinish\n" );
942                         gFailCount++;
943                         goto exit;
944                     }
945                     uint64_t currentTime = ReadTime() - startTime;
946                     if( currentTime < bestTime )
947                         bestTime = currentTime;
948                     doubleTime[ vectorSize ] += currentTime;
949                 }
950                 if( bestTime < min_double_time[ vectorSize ] )
951                     min_double_time[ vectorSize ] = bestTime;
952             }
953         }
954     }
955 
956     if( gReportTimes )
957     {
958         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
959             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
960                       "average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
961         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
962             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
963                       "best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
964         if( gTestDouble )
965         {
966             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
967                 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
968                           "average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0],  (g_arrVecSizes[vectorSize])  );
969             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
970                 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
971                           "best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
972         }
973     }
974 
975 exit:
976     //clean up
977     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
978     {
979         for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
980             clReleaseKernel( kernels[ vectorSize ][ addressSpace ] );
981             clReleaseProgram( programs[ vectorSize ][ addressSpace ] );
982             if( gTestDouble )
983             {
984                 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
985                 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
986             }
987         }
988     }
989 
990     return error;
991 }
992 
Test_vStoreaHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)993 int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
994 {
995     int vectorSize, error;
996     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
997     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
998 
999     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1000     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1001     memset( min_time, -1, sizeof( min_time ) );
1002     cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
1003     cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
1004     uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1005     uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1006     memset( min_double_time, -1, sizeof( min_double_time ) );
1007 
1008     bool aligned = true;
1009 
1010     int minVectorSize = kMinVectorSize;
1011     // There is no aligned scalar vstorea_half
1012     if( 0 == minVectorSize )
1013         minVectorSize = 1;
1014 
1015     //Loop over vector sizes
1016     for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1017     {
1018         const char *source[] = {
1019             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1020             "{\n"
1021             "   size_t i = get_global_id(0);\n"
1022             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1023             "}\n"
1024         };
1025 
1026         const char *source_v3[] = {
1027             "__kernel void test( __global float3 *p, __global half *f )\n"
1028             "{\n"
1029             "   size_t i = get_global_id(0);\n"
1030             "   vstorea_half3",roundName,"( p[i], i, f );\n"
1031             "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
1032             "}\n"
1033         };
1034 
1035         const char *source_private[] = {
1036             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1037             "{\n"
1038             "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
1039             "   size_t i = get_global_id(0);\n"
1040             "   data = p[i];\n"
1041             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1042             "}\n"
1043         };
1044 
1045         const char *source_private_v3[] = {
1046             "__kernel void test( __global float3 *p, __global half *f )\n"
1047             "{\n"
1048             "   __private float", vector_size_name_extensions[vectorSize], " data;\n"
1049             "   size_t i = get_global_id(0);\n"
1050             "   data = p[i];\n"
1051             "   vstorea_half3",roundName,"( data, i, f );\n"
1052             "   vstore_half",roundName,"( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
1053             "}\n"
1054         };
1055 
1056         char local_buf_size[10];
1057         sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
1058         const char *source_local[] = {
1059             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1060             "{\n"
1061             "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1062             "   size_t i = get_global_id(0);\n"
1063             "   size_t lid = get_local_id(0);\n"
1064             "   data[lid] = p[i];\n"
1065             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1066             "}\n"
1067         };
1068 
1069         const char *source_local_v3[] = {
1070             "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1071             "{\n"
1072             "   __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1073             "   size_t i = get_global_id(0);\n"
1074             "   size_t lid = get_local_id(0);\n"
1075             "   data[lid] = p[i];\n"
1076             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1077             "   vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1078             "}\n"
1079         };
1080 
1081         const char *double_source[] = {
1082             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1083             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1084             "{\n"
1085             "   size_t i = get_global_id(0);\n"
1086             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1087             "}\n"
1088         };
1089 
1090         const char *double_source_v3[] = {
1091             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1092             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1093             "{\n"
1094             "   size_t i = get_global_id(0);\n"
1095             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1096             "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1097             "}\n"
1098         };
1099 
1100         const char *double_source_private[] = {
1101             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1102             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1103             "{\n"
1104             "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
1105             "   size_t i = get_global_id(0);\n"
1106             "   data = p[i];\n"
1107             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1108             "}\n"
1109         };
1110 
1111         const char *double_source_private_v3[] = {
1112             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1113             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1114             "{\n"
1115             "   __private double", vector_size_name_extensions[vectorSize], " data;\n"
1116             "   size_t i = get_global_id(0);\n"
1117             "   data = p[i];\n"
1118             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1119             "   vstore_half",roundName,"( ((__global  double *)p)[4*i+3], 4*i+3, f);\n"
1120             "}\n"
1121         };
1122 
1123         const char *double_source_local[] = {
1124             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1125             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1126             "{\n"
1127             "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1128             "   size_t i = get_global_id(0);\n"
1129             "   size_t lid = get_local_id(0);\n"
1130             "   data[lid] = p[i];\n"
1131             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1132             "}\n"
1133         };
1134 
1135         const char *double_source_local_v3[] = {
1136             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1137             "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1138             "{\n"
1139             "   __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1140             "   size_t i = get_global_id(0);\n"
1141             "   size_t lid = get_local_id(0);\n"
1142             "   data[lid] = p[i];\n"
1143             "   vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1144             "   vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1145             "}\n"
1146         };
1147 
1148         if(g_arrVecSizes[vectorSize] == 3) {
1149             programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
1150             if( NULL == programs[ vectorSize ][0] )
1151             {
1152                 gFailCount++;
1153                 return -1;
1154             }
1155         } else {
1156             programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
1157             if( NULL == programs[ vectorSize ][0] )
1158             {
1159                 gFailCount++;
1160                 return -1;
1161             }
1162         }
1163 
1164         kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
1165         if( NULL == kernels[vectorSize][0] )
1166         {
1167             gFailCount++;
1168             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
1169             return error;
1170         }
1171 
1172         if(g_arrVecSizes[vectorSize] == 3) {
1173             programs[vectorSize][1] = MakeProgram( device, source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) );
1174             if( NULL == programs[ vectorSize ][1] )
1175             {
1176                 gFailCount++;
1177                 return -1;
1178             }
1179         } else {
1180             programs[vectorSize][1] = MakeProgram( device, source_private, sizeof(source_private) / sizeof( source_private[0]) );
1181             if( NULL == programs[ vectorSize ][1] )
1182             {
1183                 gFailCount++;
1184                 return -1;
1185             }
1186         }
1187 
1188         kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
1189         if( NULL == kernels[vectorSize][1] )
1190         {
1191             gFailCount++;
1192             vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
1193             return error;
1194         }
1195 
1196         if(g_arrVecSizes[vectorSize] == 3) {
1197             programs[vectorSize][2] = MakeProgram( device, source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) );
1198             if( NULL == programs[ vectorSize ][2] )
1199             {
1200                 gFailCount++;
1201                 return -1;
1202             }
1203         } else {
1204             programs[vectorSize][2] = MakeProgram( device, source_local, sizeof(source_local) / sizeof( source_local[0]) );
1205             if( NULL == programs[ vectorSize ][2] )
1206             {
1207                 gFailCount++;
1208                 return -1;
1209             }
1210         }
1211 
1212         kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
1213         if( NULL == kernels[vectorSize][2] )
1214         {
1215             gFailCount++;
1216             vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
1217             return error;
1218         }
1219 
1220         if( gTestDouble )
1221         {
1222             if(g_arrVecSizes[vectorSize] == 3) {
1223                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
1224                 if( NULL == doublePrograms[ vectorSize ][0] )
1225                 {
1226                     gFailCount++;
1227                     return -1;
1228                 }
1229             } else {
1230                 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
1231                 if( NULL == doublePrograms[ vectorSize ][0] )
1232                 {
1233                     gFailCount++;
1234                     return -1;
1235                 }
1236             }
1237 
1238             doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
1239             if( NULL == kernels[vectorSize][0] )
1240             {
1241                 gFailCount++;
1242                 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
1243                 return error;
1244             }
1245 
1246             if(g_arrVecSizes[vectorSize] == 3) {
1247                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) );
1248                 if( NULL == doublePrograms[ vectorSize ][1] )
1249                 {
1250                     gFailCount++;
1251                     return -1;
1252                 }
1253             } else {
1254                 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) );
1255                 if( NULL == doublePrograms[ vectorSize ][1] )
1256                 {
1257                     gFailCount++;
1258                     return -1;
1259                 }
1260             }
1261 
1262             doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
1263             if( NULL == kernels[vectorSize][1] )
1264             {
1265                 gFailCount++;
1266                 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
1267                 return error;
1268             }
1269 
1270             if(g_arrVecSizes[vectorSize] == 3) {
1271                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) );
1272                 if( NULL == doublePrograms[ vectorSize ][2] )
1273                 {
1274                     gFailCount++;
1275                     return -1;
1276                 }
1277             } else {
1278                 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) );
1279                 if( NULL == doublePrograms[ vectorSize ][2] )
1280                 {
1281                     gFailCount++;
1282                     return -1;
1283                 }
1284             }
1285 
1286             doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
1287             if( NULL == kernels[vectorSize][2] )
1288             {
1289                 gFailCount++;
1290                 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
1291                 return error;
1292             }
1293         }
1294     }
1295 
1296     // Figure out how many elements are in a work block
1297     size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
1298     size_t blockCount = BUFFER_SIZE / elementSize;
1299     uint64_t lastCase = 1ULL << (8*sizeof(float));
1300     size_t stride = blockCount;
1301 
1302     if (gWimpyMode)
1303         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
1304 
1305     // we handle 64-bit types a bit differently.
1306     if( lastCase == 0 )
1307         lastCase = 0x100000000ULL;
1308     uint64_t i, j;
1309     error = 0;
1310     uint64_t printMask = (lastCase >> 4) - 1;
1311     cl_uint count = 0;
1312     int addressSpace;
1313     size_t loopCount;
1314     cl_uint threadCount = GetThreadCount();
1315 
1316     ComputeReferenceInfoF fref;
1317     fref.x = (float *)gIn_single;
1318     fref.r = (cl_half *)gOut_half_reference;
1319     fref.f = referenceFunc;
1320     fref.lim = blockCount;
1321     fref.count = (blockCount + threadCount - 1) / threadCount;
1322 
1323     CheckResultInfoF fchk;
1324     fchk.x = (const float *)gIn_single;
1325     fchk.r = (const cl_half *)gOut_half_reference;
1326     fchk.s = (const cl_half *)gOut_half;
1327     fchk.f = referenceFunc;
1328     fchk.lim = blockCount;
1329     fchk.count = (blockCount + threadCount - 1) / threadCount;
1330 
1331     ComputeReferenceInfoD dref;
1332     dref.x = (double *)gIn_double;
1333     dref.r = (cl_half *)gOut_half_reference_double;
1334     dref.f = doubleReferenceFunc;
1335     dref.lim = blockCount;
1336     dref.count = (blockCount + threadCount - 1) / threadCount;
1337 
1338     CheckResultInfoD dchk;
1339     dchk.x = (const double *)gIn_double;
1340     dchk.r = (const cl_half *)gOut_half_reference_double;
1341     dchk.s = (const cl_half *)gOut_half;
1342     dchk.f = doubleReferenceFunc;
1343     dchk.lim = blockCount;
1344     dchk.count = (blockCount + threadCount - 1) / threadCount;
1345 
1346     for( i = 0; i < (uint64_t)lastCase; i += stride )
1347     {
1348         count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
1349         fref.i = i;
1350         dref.i = i;
1351 
1352         // Create the input and reference
1353         ThreadPool_Do(ReferenceF, threadCount, &fref);
1354 
1355         error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
1356         if (error) {
1357             vlog_error( "Failure in clWriteArray\n" );
1358             gFailCount++;
1359             goto exit;
1360         }
1361 
1362         if (gTestDouble) {
1363             ThreadPool_Do(ReferenceD, threadCount, &dref);
1364 
1365             error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
1366             if (error) {
1367                 vlog_error( "Failure in clWriteArray\n" );
1368                 gFailCount++;
1369                 goto exit;
1370             }
1371         }
1372 
1373         for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
1374             // Loop over vector legths
1375             fchk.vsz = g_arrVecSizes[vectorSize];
1376             dchk.vsz = g_arrVecSizes[vectorSize];
1377 
1378             for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1379                 // Loop over address spaces
1380                 fchk.aspace = addressSpaceNames[addressSpace];
1381                 dchk.aspace = addressSpaceNames[addressSpace];
1382 
1383                 cl_uint pattern = 0xdeaddead;
1384                 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1385 
1386                 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
1387                                              0, count * sizeof(cl_half),
1388                                              gOut_half, 0, NULL, NULL);
1389                 if (error) {
1390                     vlog_error( "Failure in clWriteArray\n" );
1391                     gFailCount++;
1392                     goto exit;
1393                 }
1394 
1395                 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
1396                                   numVecs(count, vectorSize, aligned),
1397                                   runsOverBy(count, vectorSize, aligned));
1398                 if (error) {
1399                     gFailCount++;
1400                     goto exit;
1401                 }
1402 
1403                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
1404                                             count * sizeof(cl_half), gOut_half,
1405                                             0, NULL, NULL);
1406                 if (error) {
1407                     vlog_error( "Failure in clReadArray\n" );
1408                     gFailCount++;
1409                     goto exit;
1410                 }
1411 
1412                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
1413                 if (error) {
1414                             gFailCount++;
1415                             goto exit;
1416                         }
1417 
1418                 if (gTestDouble) {
1419                     memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1420 
1421                     error = clEnqueueWriteBuffer(
1422                         gQueue, gOutBuffer_half, CL_FALSE, 0,
1423                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1424                     if (error) {
1425                         vlog_error( "Failure in clWriteArray\n" );
1426                         gFailCount++;
1427                         goto exit;
1428                     }
1429 
1430                     error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
1431                                       numVecs(count, vectorSize, aligned),
1432                                       runsOverBy(count, vectorSize, aligned));
1433                     if (error) {
1434                         gFailCount++;
1435                         goto exit;
1436                     }
1437 
1438                     error = clEnqueueReadBuffer(
1439                         gQueue, gOutBuffer_half, CL_TRUE, 0,
1440                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1441                     if (error) {
1442                         vlog_error( "Failure in clReadArray\n" );
1443                         gFailCount++;
1444                         goto exit;
1445                     }
1446 
1447                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
1448                     if (error) {
1449                                 gFailCount++;
1450                                 goto exit;
1451                             }
1452                         }
1453                     }
1454         }  // end for vector size
1455 
1456         if( ((i+blockCount) & ~printMask) == (i+blockCount) ) {
1457             vlog( "." );
1458             fflush( stdout );
1459         }
1460     }  // for end lastcase
1461 
1462     loopCount = count == blockCount ? 1 : 100;
1463     if( gReportTimes )
1464     {
1465         //Init the input stream
1466         cl_float *p = (cl_float *)gIn_single;
1467         for( j = 0; j < count; j++ )
1468             p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1469 
1470         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
1471         {
1472             vlog_error( "Failure in clWriteArray\n" );
1473             gFailCount++;
1474             goto exit;
1475         }
1476 
1477         if( gTestDouble )
1478         {
1479             //Init the input stream
1480             cl_double *q = (cl_double *)gIn_double;
1481             for( j = 0; j < count; j++ )
1482                 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1483 
1484             if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
1485             {
1486                 vlog_error( "Failure in clWriteArray\n" );
1487                 gFailCount++;
1488                 goto exit;
1489             }
1490         }
1491 
1492         //Run again for timing
1493         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1494         {
1495             uint64_t bestTime = -1ULL;
1496             for( j = 0; j < loopCount; j++ )
1497             {
1498                 uint64_t startTime = ReadTime();
1499                 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1500                                        runsOverBy(count, vectorSize, aligned)) ) )
1501                 {
1502                     gFailCount++;
1503                     goto exit;
1504                 }
1505 
1506                 if( (error = clFinish(gQueue)) )
1507                 {
1508                     vlog_error( "Failure in clFinish\n" );
1509                     gFailCount++;
1510                     goto exit;
1511                 }
1512                 uint64_t currentTime = ReadTime() - startTime;
1513                 if( currentTime < bestTime )
1514                     bestTime = currentTime;
1515                 time[ vectorSize ] += currentTime;
1516             }
1517             if( bestTime < min_time[ vectorSize ] )
1518                 min_time[ vectorSize ] = bestTime ;
1519 
1520             if( gTestDouble )
1521             {
1522                 bestTime = -1ULL;
1523                 for( j = 0; j < loopCount; j++ )
1524                 {
1525                     uint64_t startTime = ReadTime();
1526                     if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1527                                            runsOverBy(count, vectorSize, aligned)) ) )
1528                     {
1529                         gFailCount++;
1530                         goto exit;
1531                     }
1532 
1533                     if( (error = clFinish(gQueue)) )
1534                     {
1535                         vlog_error( "Failure in clFinish\n" );
1536                         gFailCount++;
1537                         goto exit;
1538                     }
1539                     uint64_t currentTime = ReadTime() - startTime;
1540                     if( currentTime < bestTime )
1541                         bestTime = currentTime;
1542                     doubleTime[ vectorSize ] += currentTime;
1543                 }
1544                 if( bestTime < min_double_time[ vectorSize ] )
1545                     min_double_time[ vectorSize ] = bestTime;
1546             }
1547         }
1548     }
1549 
1550     if( gReportTimes )
1551     {
1552         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1553             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1554                       "average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1555         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1556             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1557                       "best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
1558         if( gTestDouble )
1559         {
1560             for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1561                 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1562                           "average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
1563             for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1564                 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1565                           "best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1566         }
1567     }
1568 
1569 exit:
1570     //clean up
1571     for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1572     {
1573         for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1574             clReleaseKernel( kernels[ vectorSize ][addressSpace] );
1575             clReleaseProgram( programs[ vectorSize ][addressSpace] );
1576             if( gTestDouble )
1577             {
1578                 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
1579                 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
1580             }
1581         }
1582     }
1583 
1584     return error;
1585 }
1586 
1587