1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/kernelHelpers.h"
18 #include "harness/testHarness.h"
19
20 #include <string.h>
21 #include "cl_utils.h"
22 #include "tests.h"
23
24 #include <CL/cl_half.h>
25
26 typedef struct ComputeReferenceInfoF_
27 {
28 float *x;
29 cl_ushort *r;
30 f2h f;
31 cl_ulong i;
32 cl_uint lim;
33 cl_uint count;
34 } ComputeReferenceInfoF;
35
36 typedef struct ComputeReferenceInfoD_
37 {
38 double *x;
39 cl_ushort *r;
40 d2h f;
41 cl_ulong i;
42 cl_uint lim;
43 cl_uint count;
44 } ComputeReferenceInfoD;
45
46 typedef struct CheckResultInfoF_
47 {
48 const float *x;
49 const cl_ushort *r;
50 const cl_ushort *s;
51 f2h f;
52 const char *aspace;
53 cl_uint lim;
54 cl_uint count;
55 int vsz;
56 } CheckResultInfoF;
57
58 typedef struct CheckResultInfoD_
59 {
60 const double *x;
61 const cl_ushort *r;
62 const cl_ushort *s;
63 d2h f;
64 const char *aspace;
65 cl_uint lim;
66 cl_uint count;
67 int vsz;
68 } CheckResultInfoD;
69
70 static cl_int
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)71 ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
72 {
73 ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
74 cl_uint lim = cri->lim;
75 cl_uint count = cri->count;
76 cl_uint off = jid * count;
77 float *x = cri->x + off;
78 cl_ushort *r = cri->r + off;
79 f2h f = cri->f;
80 cl_ulong i = cri->i + off;
81 cl_uint j, rr;
82
83 if (off + count > lim)
84 count = lim - off;
85
86 for (j = 0; j < count; ++j) {
87 x[j] = as_float((cl_uint)(i + j));
88 r[j] = f(x[j]);
89 }
90
91 return 0;
92 }
93
94 static cl_int
CheckF(cl_uint jid,cl_uint tid,void * userInfo)95 CheckF(cl_uint jid, cl_uint tid, void *userInfo)
96 {
97 CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
98 cl_uint lim = cri->lim;
99 cl_uint count = cri->count;
100 cl_uint off = jid * count;
101 const float *x = cri->x + off;
102 const cl_ushort *r = cri->r + off;
103 const cl_ushort *s = cri->s + off;
104 f2h f = cri->f;
105 cl_uint j;
106 cl_ushort correct2 = f( 0.0f);
107 cl_ushort correct3 = f(-0.0f);
108 cl_int ret = 0;
109
110 if (off + count > lim)
111 count = lim - off;
112
113 if (!memcmp(r, s, count*sizeof(cl_ushort)))
114 return 0;
115
116 for (j = 0; j < count; j++) {
117 if (s[j] == r[j])
118 continue;
119
120 // Pass any NaNs
121 if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
122 continue;
123
124 // retry per section 6.5.3.3
125 if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
126 continue;
127
128 // if reference result is subnormal, pass any zero
129 if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
130 continue;
131
132 vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, vector_size = %d, address_space = %s\n",
133 j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
134
135 ret = 1;
136 break;
137 }
138
139 return ret;
140 }
141
142 static cl_int
ReferenceD(cl_uint jid,cl_uint tid,void * userInfo)143 ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
144 {
145 ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
146 cl_uint lim = cri->lim;
147 cl_uint count = cri->count;
148 cl_uint off = jid * count;
149 double *x = cri->x + off;
150 cl_ushort *r = cri->r + off;
151 d2h f = cri->f;
152 cl_uint j;
153 cl_ulong i = cri->i + off;
154
155 if (off + count > lim)
156 count = lim - off;
157
158 for (j = 0; j < count; ++j) {
159 x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
160 r[j] = f(x[j]);
161 }
162
163 return 0;
164 }
165
166 static cl_int
CheckD(cl_uint jid,cl_uint tid,void * userInfo)167 CheckD(cl_uint jid, cl_uint tid, void *userInfo)
168 {
169 CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
170 cl_uint lim = cri->lim;
171 cl_uint count = cri->count;
172 cl_uint off = jid * count;
173 const double *x = cri->x + off;
174 const cl_ushort *r = cri->r + off;
175 const cl_ushort *s = cri->s + off;
176 d2h f = cri->f;
177 cl_uint j;
178 cl_ushort correct2 = f( 0.0);
179 cl_ushort correct3 = f(-0.0);
180 cl_int ret = 0;
181
182 if (off + count > lim)
183 count = lim - off;
184
185 if (!memcmp(r, s, count*sizeof(cl_ushort)))
186 return 0;
187
188 for (j = 0; j < count; j++) {
189 if (s[j] == r[j])
190 continue;
191
192 // Pass any NaNs
193 if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
194 continue;
195
196 if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
197 continue;
198
199 // if reference result is subnormal, pass any zero result
200 if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
201 continue;
202
203 vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n",
204 j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
205
206 ret = 1;
207 break;
208 }
209
210 return ret;
211 }
212
float2half_rte(float f)213 static cl_half float2half_rte(float f)
214 {
215 return cl_half_from_float(f, CL_HALF_RTE);
216 }
217
float2half_rtz(float f)218 static cl_half float2half_rtz(float f)
219 {
220 return cl_half_from_float(f, CL_HALF_RTZ);
221 }
222
float2half_rtp(float f)223 static cl_half float2half_rtp(float f)
224 {
225 return cl_half_from_float(f, CL_HALF_RTP);
226 }
227
float2half_rtn(float f)228 static cl_half float2half_rtn(float f)
229 {
230 return cl_half_from_float(f, CL_HALF_RTN);
231 }
232
double2half_rte(double f)233 static cl_half double2half_rte(double f)
234 {
235 return cl_half_from_double(f, CL_HALF_RTE);
236 }
237
double2half_rtz(double f)238 static cl_half double2half_rtz(double f)
239 {
240 return cl_half_from_double(f, CL_HALF_RTZ);
241 }
242
double2half_rtp(double f)243 static cl_half double2half_rtp(double f)
244 {
245 return cl_half_from_double(f, CL_HALF_RTP);
246 }
247
double2half_rtn(double f)248 static cl_half double2half_rtn(double f)
249 {
250 return cl_half_from_double(f, CL_HALF_RTN);
251 }
252
test_vstore_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)253 int test_vstore_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
254 {
255 switch (get_default_rounding_mode(deviceID))
256 {
257 case CL_FP_ROUND_TO_ZERO:
258 return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rte, "");
259 case 0:
260 return -1;
261 default:
262 return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "");
263 }
264 }
265
test_vstore_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)266 int test_vstore_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
267 {
268 return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
269 }
270
test_vstore_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)271 int test_vstore_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
272 {
273 return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
274 }
275
test_vstore_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)276 int test_vstore_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
277 {
278 return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
279 }
280
test_vstore_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)281 int test_vstore_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
282 {
283 return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
284 }
285
test_vstorea_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)286 int test_vstorea_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
287 {
288 switch (get_default_rounding_mode(deviceID))
289 {
290 case CL_FP_ROUND_TO_ZERO:
291 return Test_vStoreaHalf_private(deviceID,float2half_rtz, double2half_rte, "");
292 case 0:
293 return -1;
294 default:
295 return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "");
296 }
297 }
298
test_vstorea_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)299 int test_vstorea_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
300 {
301 return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
302 }
303
test_vstorea_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)304 int test_vstorea_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
305 {
306 return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
307 }
308
test_vstorea_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)309 int test_vstorea_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
310 {
311 return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
312 }
313
test_vstorea_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)314 int test_vstorea_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
315 {
316 return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
317 }
318
319 #pragma mark -
320
Test_vStoreHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)321 int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
322 {
323 int vectorSize, error;
324 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
325 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
326
327 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
328 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
329 memset( min_time, -1, sizeof( min_time ) );
330 cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
331 cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
332 uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
333 uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
334 memset( min_double_time, -1, sizeof( min_double_time ) );
335
336 bool aligned= false;
337
338 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
339 {
340 const char *source[] = {
341 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
342 "{\n"
343 " size_t i = get_global_id(0);\n"
344 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
345 "}\n"
346 };
347
348 const char *source_v3[] = {
349 "__kernel void test( __global float *p, __global half *f,\n"
350 " uint extra_last_thread)\n"
351 "{\n"
352 " size_t i = get_global_id(0);\n"
353 " size_t last_i = get_global_size(0)-1;\n"
354 " size_t adjust = 0;\n"
355 " if(last_i == i && extra_last_thread != 0) {\n"
356 " adjust = 3-extra_last_thread;\n"
357 " } "
358 " vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n"
359 "}\n"
360 };
361
362 const char *source_private_store[] = {
363 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
364 "{\n"
365 " __private ushort data[16];\n"
366 " size_t i = get_global_id(0);\n"
367 " size_t offset = 0;\n"
368 " size_t vecsize = vec_step(p[i]);\n"
369 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
370 " for(offset = 0; offset < vecsize; offset++)\n"
371 " {\n"
372 " vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
373 " }\n"
374 "}\n"
375 };
376
377
378 const char *source_private_store_v3[] = {
379 "__kernel void test( __global float *p, __global half *f,\n"
380 " uint extra_last_thread )\n"
381 "{\n"
382 " __private ushort data[4];\n"
383 " size_t i = get_global_id(0);\n"
384 " size_t last_i = get_global_size(0)-1;\n"
385 " size_t adjust = 0;\n"
386 " size_t offset = 0;\n"
387 " if(last_i == i && extra_last_thread != 0) {\n"
388 " adjust = 3-extra_last_thread;\n"
389 " } "
390 " vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
391 " for(offset = 0; offset < 3; offset++)\n"
392 " {\n"
393 " vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n"
394 " }\n"
395 "}\n"
396 };
397
398 char local_buf_size[10];
399 sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
400
401
402 const char *source_local_store[] = {
403 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
404 "{\n"
405 " __local ushort data[16*", local_buf_size, "];\n"
406 " size_t i = get_global_id(0);\n"
407 " size_t lid = get_local_id(0);\n"
408 " size_t lsize = get_local_size(0);\n"
409 " size_t vecsize = vec_step(p[0]);\n"
410 " event_t async_event;\n"
411 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
412 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
413 " async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
414 " wait_group_events(1, &async_event);\n"
415 "}\n"
416 };
417
418 const char *source_local_store_v3[] = {
419 "__kernel void test( __global float *p, __global half *f,\n"
420 " uint extra_last_thread )\n"
421 "{\n"
422 " __local ushort data[3*(", local_buf_size, "+1)];\n"
423 " size_t i = get_global_id(0);\n"
424 " size_t lid = get_local_id(0);\n"
425 " size_t last_i = get_global_size(0)-1;\n"
426 " size_t adjust = 0;\n"
427 " size_t lsize = get_local_size(0);\n"
428 " event_t async_event;\n"
429 " if(last_i == i && extra_last_thread != 0) {\n"
430 " adjust = 3-extra_last_thread;\n"
431 " } "
432 " vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
433 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
434 " async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
435 " wait_group_events(1, &async_event);\n"
436 "}\n"
437 };
438
439 const char *double_source[] = {
440 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
441 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
442 "{\n"
443 " size_t i = get_global_id(0);\n"
444 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
445 "}\n"
446 };
447
448 const char *double_source_private_store[] = {
449 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
450 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
451 "{\n"
452 " __private ushort data[16];\n"
453 " size_t i = get_global_id(0);\n"
454 " size_t offset = 0;\n"
455 " size_t vecsize = vec_step(p[i]);\n"
456 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
457 " for(offset = 0; offset < vecsize; offset++)\n"
458 " {\n"
459 " vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
460 " }\n"
461 "}\n"
462 };
463
464
465 const char *double_source_local_store[] = {
466 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
467 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
468 "{\n"
469 " __local ushort data[16*", local_buf_size, "];\n"
470 " size_t i = get_global_id(0);\n"
471 " size_t lid = get_local_id(0);\n"
472 " size_t vecsize = vec_step(p[0]);\n"
473 " size_t lsize = get_local_size(0);\n"
474 " event_t async_event;\n"
475 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
476 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
477 " async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
478 " wait_group_events(1, &async_event);\n"
479 "}\n"
480 };
481
482
483 const char *double_source_v3[] = {
484 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
485 "__kernel void test( __global double *p, __global half *f ,\n"
486 " uint extra_last_thread)\n"
487 "{\n"
488 " size_t i = get_global_id(0);\n"
489 " size_t last_i = get_global_size(0)-1;\n"
490 " size_t adjust = 0;\n"
491 " if(last_i == i && extra_last_thread != 0) {\n"
492 " adjust = 3-extra_last_thread;\n"
493 " } "
494 " vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n"
495 "}\n"
496 };
497
498 const char *double_source_private_store_v3[] = {
499 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
500 "__kernel void test( __global double *p, __global half *f,\n"
501 " uint extra_last_thread )\n"
502 "{\n"
503 " __private ushort data[4];\n"
504 " size_t i = get_global_id(0);\n"
505 " size_t last_i = get_global_size(0)-1;\n"
506 " size_t adjust = 0;\n"
507 " size_t offset = 0;\n"
508 " if(last_i == i && extra_last_thread != 0) {\n"
509 " adjust = 3-extra_last_thread;\n"
510 " } "
511 " vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
512 " for(offset = 0; offset < 3; offset++)\n"
513 " {\n"
514 " vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n"
515 " }\n"
516 "}\n"
517 };
518
519 const char *double_source_local_store_v3[] = {
520 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
521 "__kernel void test( __global double *p, __global half *f,\n"
522 " uint extra_last_thread )\n"
523 "{\n"
524 " __local ushort data[3*(", local_buf_size, "+1)];\n"
525 " size_t i = get_global_id(0);\n"
526 " size_t lid = get_local_id(0);\n"
527 " size_t last_i = get_global_size(0)-1;\n"
528 " size_t adjust = 0;\n"
529 " size_t lsize = get_local_size(0);\n"
530 " event_t async_event;\n"
531 " if(last_i == i && extra_last_thread != 0) {\n"
532 " adjust = 3-extra_last_thread;\n"
533 " }\n "
534 " vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
535 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
536 " async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later
537 " wait_group_events(1, &async_event);\n"
538 "}\n"
539 };
540
541
542
543 if(g_arrVecSizes[vectorSize] == 3) {
544 programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
545 } else {
546 programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
547 }
548 if( NULL == programs[ vectorSize ][0] )
549 {
550 gFailCount++;
551 return -1;
552 }
553
554 kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
555 if( NULL == kernels[vectorSize][0] )
556 {
557 gFailCount++;
558 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
559 return error;
560 }
561
562 if(g_arrVecSizes[vectorSize] == 3) {
563 programs[vectorSize][1] = MakeProgram( device, source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) );
564 } else {
565 programs[vectorSize][1] = MakeProgram( device, source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) );
566 }
567 if( NULL == programs[ vectorSize ][1] )
568 {
569 gFailCount++;
570 return -1;
571 }
572
573 kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
574 if( NULL == kernels[vectorSize][1] )
575 {
576 gFailCount++;
577 vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
578 return error;
579 }
580
581 if(g_arrVecSizes[vectorSize] == 3) {
582 programs[vectorSize][2] = MakeProgram( device, source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) );
583 if( NULL == programs[ vectorSize ][2] )
584 {
585 unsigned q;
586 for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++)
587 vlog_error("%s", source_local_store_v3[q]);
588
589 gFailCount++;
590 return -1;
591
592 }
593 } else {
594 programs[vectorSize][2] = MakeProgram( device, source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) );
595 if( NULL == programs[ vectorSize ][2] )
596 {
597 unsigned q;
598 for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++)
599 vlog_error("%s", source_local_store[q]);
600
601 gFailCount++;
602 return -1;
603
604 }
605 }
606
607 kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
608 if( NULL == kernels[vectorSize][2] )
609 {
610 gFailCount++;
611 vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
612 return error;
613 }
614
615 if( gTestDouble )
616 {
617 if(g_arrVecSizes[vectorSize] == 3) {
618 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
619 } else {
620 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
621 }
622 if( NULL == doublePrograms[ vectorSize ][0] )
623 {
624 gFailCount++;
625 return -1;
626 }
627
628 doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
629 if( NULL == kernels[vectorSize][0] )
630 {
631 gFailCount++;
632 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
633 return error;
634 }
635
636 if(g_arrVecSizes[vectorSize] == 3)
637 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) );
638 else
639 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) );
640
641 if( NULL == doublePrograms[ vectorSize ][1] )
642 {
643 gFailCount++;
644 return -1;
645 }
646
647 doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
648 if( NULL == kernels[vectorSize][1] )
649 {
650 gFailCount++;
651 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
652 return error;
653 }
654
655 if(g_arrVecSizes[vectorSize] == 3) {
656 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) );
657 } else {
658 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) );
659 }
660 if( NULL == doublePrograms[ vectorSize ][2] )
661 {
662 gFailCount++;
663 return -1;
664 }
665
666 doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
667 if( NULL == kernels[vectorSize][2] )
668 {
669 gFailCount++;
670 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
671 return error;
672 }
673 }
674 } // end for vector size
675
676 // Figure out how many elements are in a work block
677 size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
678 size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
679 uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
680 size_t stride = blockCount;
681
682 if (gWimpyMode)
683 stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
684
685 // we handle 64-bit types a bit differently.
686 if( lastCase == 0 )
687 lastCase = 0x100000000ULL;
688
689 uint64_t i, j;
690 error = 0;
691 uint64_t printMask = (lastCase >> 4) - 1;
692 cl_uint count = 0;
693 int addressSpace;
694 size_t loopCount;
695 cl_uint threadCount = GetThreadCount();
696
697 ComputeReferenceInfoF fref;
698 fref.x = (float *)gIn_single;
699 fref.r = (cl_half *)gOut_half_reference;
700 fref.f = referenceFunc;
701 fref.lim = blockCount;
702 fref.count = (blockCount + threadCount - 1) / threadCount;
703
704 CheckResultInfoF fchk;
705 fchk.x = (const float *)gIn_single;
706 fchk.r = (const cl_half *)gOut_half_reference;
707 fchk.s = (const cl_half *)gOut_half;
708 fchk.f = referenceFunc;
709 fchk.lim = blockCount;
710 fchk.count = (blockCount + threadCount - 1) / threadCount;
711
712 ComputeReferenceInfoD dref;
713 dref.x = (double *)gIn_double;
714 dref.r = (cl_half *)gOut_half_reference_double;
715 dref.f = doubleReferenceFunc;
716 dref.lim = blockCount;
717 dref.count = (blockCount + threadCount - 1) / threadCount;
718
719 CheckResultInfoD dchk;
720 dchk.x = (const double *)gIn_double;
721 dchk.r = (const cl_half *)gOut_half_reference_double;
722 dchk.s = (const cl_half *)gOut_half;
723 dchk.f = doubleReferenceFunc;
724 dchk.lim = blockCount;
725 dchk.count = (blockCount + threadCount - 1) / threadCount;
726
727 for( i = 0; i < lastCase; i += stride )
728 {
729 count = (cl_uint) MIN( blockCount, lastCase - i );
730 fref.i = i;
731 dref.i = i;
732
733 // Compute the input and reference
734 ThreadPool_Do(ReferenceF, threadCount, &fref);
735
736 error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
737 if (error) {
738 vlog_error( "Failure in clWriteBuffer\n" );
739 gFailCount++;
740 goto exit;
741 }
742
743 if (gTestDouble) {
744 ThreadPool_Do(ReferenceD, threadCount, &dref);
745
746 error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
747 if (error) {
748 vlog_error( "Failure in clWriteBuffer\n" );
749 gFailCount++;
750 goto exit;
751 }
752 }
753
754 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
755 // Loop through vector sizes
756 fchk.vsz = g_arrVecSizes[vectorSize];
757 dchk.vsz = g_arrVecSizes[vectorSize];
758
759 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
760 // Loop over address spaces
761 fchk.aspace = addressSpaceNames[addressSpace];
762 dchk.aspace = addressSpaceNames[addressSpace];
763
764 cl_uint pattern = 0xdeaddead;
765 memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
766
767 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
768 0, count * sizeof(cl_half),
769 gOut_half, 0, NULL, NULL);
770 if (error) {
771 vlog_error( "Failure in clWriteArray\n" );
772 gFailCount++;
773 goto exit;
774 }
775
776 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
777 numVecs(count, vectorSize, aligned) ,
778 runsOverBy(count, vectorSize, aligned));
779 if (error) {
780 gFailCount++;
781 goto exit;
782 }
783
784 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
785 count * sizeof(cl_half), gOut_half,
786 0, NULL, NULL);
787 if (error) {
788 vlog_error( "Failure in clReadArray\n" );
789 gFailCount++;
790 goto exit;
791 }
792
793 error = ThreadPool_Do(CheckF, threadCount, &fchk);
794 if (error) {
795 gFailCount++;
796 goto exit;
797 }
798
799 if (gTestDouble) {
800 memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
801
802 error = clEnqueueWriteBuffer(
803 gQueue, gOutBuffer_half, CL_FALSE, 0,
804 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
805 if (error) {
806 vlog_error( "Failure in clWriteArray\n" );
807 gFailCount++;
808 goto exit;
809 }
810
811 error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
812 numVecs(count, vectorSize, aligned),
813 runsOverBy(count, vectorSize, aligned));
814 if (error) {
815 gFailCount++;
816 goto exit;
817 }
818
819 error = clEnqueueReadBuffer(
820 gQueue, gOutBuffer_half, CL_TRUE, 0,
821 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
822 if (error) {
823 vlog_error( "Failure in clReadArray\n" );
824 gFailCount++;
825 goto exit;
826 }
827
828 error = ThreadPool_Do(CheckD, threadCount, &dchk);
829 if (error) {
830 gFailCount++;
831 goto exit;
832 }
833 }
834 }
835 }
836
837 if( ((i+blockCount) & ~printMask) == (i+blockCount) )
838 {
839 vlog( "." );
840 fflush( stdout );
841 }
842 } // end last case
843
844 loopCount = count == blockCount ? 1 : 100;
845 if( gReportTimes )
846 {
847 //Init the input stream
848 cl_float *p = (cl_float *)gIn_single;
849 for( j = 0; j < count; j++ )
850 p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
851
852 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
853 {
854 vlog_error( "Failure in clWriteArray\n" );
855 gFailCount++;
856 goto exit;
857 }
858
859 if( gTestDouble )
860 {
861 //Init the input stream
862 cl_double *q = (cl_double *)gIn_double;
863 for( j = 0; j < count; j++ )
864 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
865
866 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
867 {
868 vlog_error( "Failure in clWriteArray\n" );
869 gFailCount++;
870 goto exit;
871 }
872 }
873
874 //Run again for timing
875 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
876 {
877 uint64_t bestTime = -1ULL;
878 for( j = 0; j < loopCount; j++ )
879 {
880 uint64_t startTime = ReadTime();
881
882
883 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
884 runsOverBy(count, vectorSize, aligned)) ) )
885 {
886 gFailCount++;
887 goto exit;
888 }
889
890 if( (error = clFinish(gQueue)) )
891 {
892 vlog_error( "Failure in clFinish\n" );
893 gFailCount++;
894 goto exit;
895 }
896 uint64_t currentTime = ReadTime() - startTime;
897 if( currentTime < bestTime )
898 bestTime = currentTime;
899 time[ vectorSize ] += currentTime;
900 }
901 if( bestTime < min_time[ vectorSize ] )
902 min_time[ vectorSize ] = bestTime ;
903
904 if( gTestDouble )
905 {
906 bestTime = -1ULL;
907 for( j = 0; j < loopCount; j++ )
908 {
909 uint64_t startTime = ReadTime();
910 if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
911 runsOverBy(count, vectorSize, aligned)) ) )
912 {
913 gFailCount++;
914 goto exit;
915 }
916
917 if( (error = clFinish(gQueue)) )
918 {
919 vlog_error( "Failure in clFinish\n" );
920 gFailCount++;
921 goto exit;
922 }
923 uint64_t currentTime = ReadTime() - startTime;
924 if( currentTime < bestTime )
925 bestTime = currentTime;
926 doubleTime[ vectorSize ] += currentTime;
927 }
928 if( bestTime < min_double_time[ vectorSize ] )
929 min_double_time[ vectorSize ] = bestTime;
930 }
931 }
932 }
933
934 if( gReportTimes )
935 {
936 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
937 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
938 "average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
939 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
940 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
941 "best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
942 if( gTestDouble )
943 {
944 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
945 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
946 "average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
947 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
948 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
949 "best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
950 }
951 }
952
953 exit:
954 //clean up
955 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
956 {
957 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
958 clReleaseKernel( kernels[ vectorSize ][ addressSpace ] );
959 clReleaseProgram( programs[ vectorSize ][ addressSpace ] );
960 if( gTestDouble )
961 {
962 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
963 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
964 }
965 }
966 }
967
968 return error;
969 }
970
Test_vStoreaHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)971 int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
972 {
973 int vectorSize, error;
974 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
975 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
976
977 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
978 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
979 memset( min_time, -1, sizeof( min_time ) );
980 cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
981 cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
982 uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
983 uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
984 memset( min_double_time, -1, sizeof( min_double_time ) );
985
986 bool aligned = true;
987
988 int minVectorSize = kMinVectorSize;
989 // There is no aligned scalar vstorea_half
990 if( 0 == minVectorSize )
991 minVectorSize = 1;
992
993 //Loop over vector sizes
994 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
995 {
996 const char *source[] = {
997 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
998 "{\n"
999 " size_t i = get_global_id(0);\n"
1000 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1001 "}\n"
1002 };
1003
1004 const char *source_v3[] = {
1005 "__kernel void test( __global float3 *p, __global half *f )\n"
1006 "{\n"
1007 " size_t i = get_global_id(0);\n"
1008 " vstorea_half3",roundName,"( p[i], i, f );\n"
1009 " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1010 "}\n"
1011 };
1012
1013 const char *source_private[] = {
1014 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1015 "{\n"
1016 " __private float", vector_size_name_extensions[vectorSize], " data;\n"
1017 " size_t i = get_global_id(0);\n"
1018 " data = p[i];\n"
1019 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1020 "}\n"
1021 };
1022
1023 const char *source_private_v3[] = {
1024 "__kernel void test( __global float3 *p, __global half *f )\n"
1025 "{\n"
1026 " __private float", vector_size_name_extensions[vectorSize], " data;\n"
1027 " size_t i = get_global_id(0);\n"
1028 " data = p[i];\n"
1029 " vstorea_half3",roundName,"( data, i, f );\n"
1030 " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1031 "}\n"
1032 };
1033
1034 char local_buf_size[10];
1035 sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
1036 const char *source_local[] = {
1037 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1038 "{\n"
1039 " __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1040 " size_t i = get_global_id(0);\n"
1041 " size_t lid = get_local_id(0);\n"
1042 " data[lid] = p[i];\n"
1043 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1044 "}\n"
1045 };
1046
1047 const char *source_local_v3[] = {
1048 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1049 "{\n"
1050 " __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1051 " size_t i = get_global_id(0);\n"
1052 " size_t lid = get_local_id(0);\n"
1053 " data[lid] = p[i];\n"
1054 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1055 " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1056 "}\n"
1057 };
1058
1059 const char *double_source[] = {
1060 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1061 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1062 "{\n"
1063 " size_t i = get_global_id(0);\n"
1064 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1065 "}\n"
1066 };
1067
1068 const char *double_source_v3[] = {
1069 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1070 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1071 "{\n"
1072 " size_t i = get_global_id(0);\n"
1073 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1074 " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1075 "}\n"
1076 };
1077
1078 const char *double_source_private[] = {
1079 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1080 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1081 "{\n"
1082 " __private double", vector_size_name_extensions[vectorSize], " data;\n"
1083 " size_t i = get_global_id(0);\n"
1084 " data = p[i];\n"
1085 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1086 "}\n"
1087 };
1088
1089 const char *double_source_private_v3[] = {
1090 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1091 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1092 "{\n"
1093 " __private double", vector_size_name_extensions[vectorSize], " data;\n"
1094 " size_t i = get_global_id(0);\n"
1095 " data = p[i];\n"
1096 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1097 " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1098 "}\n"
1099 };
1100
1101 const char *double_source_local[] = {
1102 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1103 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1104 "{\n"
1105 " __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1106 " size_t i = get_global_id(0);\n"
1107 " size_t lid = get_local_id(0);\n"
1108 " data[lid] = p[i];\n"
1109 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1110 "}\n"
1111 };
1112
1113 const char *double_source_local_v3[] = {
1114 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1115 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1116 "{\n"
1117 " __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1118 " size_t i = get_global_id(0);\n"
1119 " size_t lid = get_local_id(0);\n"
1120 " data[lid] = p[i];\n"
1121 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1122 " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1123 "}\n"
1124 };
1125
1126 if(g_arrVecSizes[vectorSize] == 3) {
1127 programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
1128 if( NULL == programs[ vectorSize ][0] )
1129 {
1130 gFailCount++;
1131 return -1;
1132 }
1133 } else {
1134 programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
1135 if( NULL == programs[ vectorSize ][0] )
1136 {
1137 gFailCount++;
1138 return -1;
1139 }
1140 }
1141
1142 kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
1143 if( NULL == kernels[vectorSize][0] )
1144 {
1145 gFailCount++;
1146 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
1147 return error;
1148 }
1149
1150 if(g_arrVecSizes[vectorSize] == 3) {
1151 programs[vectorSize][1] = MakeProgram( device, source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) );
1152 if( NULL == programs[ vectorSize ][1] )
1153 {
1154 gFailCount++;
1155 return -1;
1156 }
1157 } else {
1158 programs[vectorSize][1] = MakeProgram( device, source_private, sizeof(source_private) / sizeof( source_private[0]) );
1159 if( NULL == programs[ vectorSize ][1] )
1160 {
1161 gFailCount++;
1162 return -1;
1163 }
1164 }
1165
1166 kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
1167 if( NULL == kernels[vectorSize][1] )
1168 {
1169 gFailCount++;
1170 vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
1171 return error;
1172 }
1173
1174 if(g_arrVecSizes[vectorSize] == 3) {
1175 programs[vectorSize][2] = MakeProgram( device, source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) );
1176 if( NULL == programs[ vectorSize ][2] )
1177 {
1178 gFailCount++;
1179 return -1;
1180 }
1181 } else {
1182 programs[vectorSize][2] = MakeProgram( device, source_local, sizeof(source_local) / sizeof( source_local[0]) );
1183 if( NULL == programs[ vectorSize ][2] )
1184 {
1185 gFailCount++;
1186 return -1;
1187 }
1188 }
1189
1190 kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
1191 if( NULL == kernels[vectorSize][2] )
1192 {
1193 gFailCount++;
1194 vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
1195 return error;
1196 }
1197
1198 if( gTestDouble )
1199 {
1200 if(g_arrVecSizes[vectorSize] == 3) {
1201 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
1202 if( NULL == doublePrograms[ vectorSize ][0] )
1203 {
1204 gFailCount++;
1205 return -1;
1206 }
1207 } else {
1208 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
1209 if( NULL == doublePrograms[ vectorSize ][0] )
1210 {
1211 gFailCount++;
1212 return -1;
1213 }
1214 }
1215
1216 doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
1217 if( NULL == kernels[vectorSize][0] )
1218 {
1219 gFailCount++;
1220 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
1221 return error;
1222 }
1223
1224 if(g_arrVecSizes[vectorSize] == 3) {
1225 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) );
1226 if( NULL == doublePrograms[ vectorSize ][1] )
1227 {
1228 gFailCount++;
1229 return -1;
1230 }
1231 } else {
1232 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) );
1233 if( NULL == doublePrograms[ vectorSize ][1] )
1234 {
1235 gFailCount++;
1236 return -1;
1237 }
1238 }
1239
1240 doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
1241 if( NULL == kernels[vectorSize][1] )
1242 {
1243 gFailCount++;
1244 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
1245 return error;
1246 }
1247
1248 if(g_arrVecSizes[vectorSize] == 3) {
1249 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) );
1250 if( NULL == doublePrograms[ vectorSize ][2] )
1251 {
1252 gFailCount++;
1253 return -1;
1254 }
1255 } else {
1256 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) );
1257 if( NULL == doublePrograms[ vectorSize ][2] )
1258 {
1259 gFailCount++;
1260 return -1;
1261 }
1262 }
1263
1264 doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
1265 if( NULL == kernels[vectorSize][2] )
1266 {
1267 gFailCount++;
1268 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
1269 return error;
1270 }
1271 }
1272 }
1273
1274 // Figure out how many elements are in a work block
1275 size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float));
1276 size_t blockCount = BUFFER_SIZE / elementSize;
1277 uint64_t lastCase = 1ULL << (8*sizeof(float));
1278 size_t stride = blockCount;
1279
1280 if (gWimpyMode)
1281 stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
1282
1283 // we handle 64-bit types a bit differently.
1284 if( lastCase == 0 )
1285 lastCase = 0x100000000ULL;
1286 uint64_t i, j;
1287 error = 0;
1288 uint64_t printMask = (lastCase >> 4) - 1;
1289 cl_uint count = 0;
1290 int addressSpace;
1291 size_t loopCount;
1292 cl_uint threadCount = GetThreadCount();
1293
1294 ComputeReferenceInfoF fref;
1295 fref.x = (float *)gIn_single;
1296 fref.r = (cl_half *)gOut_half_reference;
1297 fref.f = referenceFunc;
1298 fref.lim = blockCount;
1299 fref.count = (blockCount + threadCount - 1) / threadCount;
1300
1301 CheckResultInfoF fchk;
1302 fchk.x = (const float *)gIn_single;
1303 fchk.r = (const cl_half *)gOut_half_reference;
1304 fchk.s = (const cl_half *)gOut_half;
1305 fchk.f = referenceFunc;
1306 fchk.lim = blockCount;
1307 fchk.count = (blockCount + threadCount - 1) / threadCount;
1308
1309 ComputeReferenceInfoD dref;
1310 dref.x = (double *)gIn_double;
1311 dref.r = (cl_half *)gOut_half_reference_double;
1312 dref.f = doubleReferenceFunc;
1313 dref.lim = blockCount;
1314 dref.count = (blockCount + threadCount - 1) / threadCount;
1315
1316 CheckResultInfoD dchk;
1317 dchk.x = (const double *)gIn_double;
1318 dchk.r = (const cl_half *)gOut_half_reference_double;
1319 dchk.s = (const cl_half *)gOut_half;
1320 dchk.f = doubleReferenceFunc;
1321 dchk.lim = blockCount;
1322 dchk.count = (blockCount + threadCount - 1) / threadCount;
1323
1324 for( i = 0; i < (uint64_t)lastCase; i += stride )
1325 {
1326 count = (cl_uint) MIN( blockCount, lastCase - i );
1327 fref.i = i;
1328 dref.i = i;
1329
1330 // Create the input and reference
1331 ThreadPool_Do(ReferenceF, threadCount, &fref);
1332
1333 error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
1334 if (error) {
1335 vlog_error( "Failure in clWriteArray\n" );
1336 gFailCount++;
1337 goto exit;
1338 }
1339
1340 if (gTestDouble) {
1341 ThreadPool_Do(ReferenceD, threadCount, &dref);
1342
1343 error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
1344 if (error) {
1345 vlog_error( "Failure in clWriteArray\n" );
1346 gFailCount++;
1347 goto exit;
1348 }
1349 }
1350
1351 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
1352 // Loop over vector legths
1353 fchk.vsz = g_arrVecSizes[vectorSize];
1354 dchk.vsz = g_arrVecSizes[vectorSize];
1355
1356 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1357 // Loop over address spaces
1358 fchk.aspace = addressSpaceNames[addressSpace];
1359 dchk.aspace = addressSpaceNames[addressSpace];
1360
1361 cl_uint pattern = 0xdeaddead;
1362 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1363
1364 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
1365 0, count * sizeof(cl_half),
1366 gOut_half, 0, NULL, NULL);
1367 if (error) {
1368 vlog_error( "Failure in clWriteArray\n" );
1369 gFailCount++;
1370 goto exit;
1371 }
1372
1373 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
1374 numVecs(count, vectorSize, aligned),
1375 runsOverBy(count, vectorSize, aligned));
1376 if (error) {
1377 gFailCount++;
1378 goto exit;
1379 }
1380
1381 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
1382 count * sizeof(cl_half), gOut_half,
1383 0, NULL, NULL);
1384 if (error) {
1385 vlog_error( "Failure in clReadArray\n" );
1386 gFailCount++;
1387 goto exit;
1388 }
1389
1390 error = ThreadPool_Do(CheckF, threadCount, &fchk);
1391 if (error) {
1392 gFailCount++;
1393 goto exit;
1394 }
1395
1396 if (gTestDouble) {
1397 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1398
1399 error = clEnqueueWriteBuffer(
1400 gQueue, gOutBuffer_half, CL_FALSE, 0,
1401 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1402 if (error) {
1403 vlog_error( "Failure in clWriteArray\n" );
1404 gFailCount++;
1405 goto exit;
1406 }
1407
1408 error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
1409 numVecs(count, vectorSize, aligned),
1410 runsOverBy(count, vectorSize, aligned));
1411 if (error) {
1412 gFailCount++;
1413 goto exit;
1414 }
1415
1416 error = clEnqueueReadBuffer(
1417 gQueue, gOutBuffer_half, CL_TRUE, 0,
1418 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1419 if (error) {
1420 vlog_error( "Failure in clReadArray\n" );
1421 gFailCount++;
1422 goto exit;
1423 }
1424
1425 error = ThreadPool_Do(CheckD, threadCount, &dchk);
1426 if (error) {
1427 gFailCount++;
1428 goto exit;
1429 }
1430 }
1431 }
1432 } // end for vector size
1433
1434 if( ((i+blockCount) & ~printMask) == (i+blockCount) ) {
1435 vlog( "." );
1436 fflush( stdout );
1437 }
1438 } // for end lastcase
1439
1440 loopCount = count == blockCount ? 1 : 100;
1441 if( gReportTimes )
1442 {
1443 //Init the input stream
1444 cl_float *p = (cl_float *)gIn_single;
1445 for( j = 0; j < count; j++ )
1446 p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1447
1448 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
1449 {
1450 vlog_error( "Failure in clWriteArray\n" );
1451 gFailCount++;
1452 goto exit;
1453 }
1454
1455 if( gTestDouble )
1456 {
1457 //Init the input stream
1458 cl_double *q = (cl_double *)gIn_double;
1459 for( j = 0; j < count; j++ )
1460 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1461
1462 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
1463 {
1464 vlog_error( "Failure in clWriteArray\n" );
1465 gFailCount++;
1466 goto exit;
1467 }
1468 }
1469
1470 //Run again for timing
1471 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1472 {
1473 uint64_t bestTime = -1ULL;
1474 for( j = 0; j < loopCount; j++ )
1475 {
1476 uint64_t startTime = ReadTime();
1477 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1478 runsOverBy(count, vectorSize, aligned)) ) )
1479 {
1480 gFailCount++;
1481 goto exit;
1482 }
1483
1484 if( (error = clFinish(gQueue)) )
1485 {
1486 vlog_error( "Failure in clFinish\n" );
1487 gFailCount++;
1488 goto exit;
1489 }
1490 uint64_t currentTime = ReadTime() - startTime;
1491 if( currentTime < bestTime )
1492 bestTime = currentTime;
1493 time[ vectorSize ] += currentTime;
1494 }
1495 if( bestTime < min_time[ vectorSize ] )
1496 min_time[ vectorSize ] = bestTime ;
1497
1498 if( gTestDouble )
1499 {
1500 bestTime = -1ULL;
1501 for( j = 0; j < loopCount; j++ )
1502 {
1503 uint64_t startTime = ReadTime();
1504 if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1505 runsOverBy(count, vectorSize, aligned)) ) )
1506 {
1507 gFailCount++;
1508 goto exit;
1509 }
1510
1511 if( (error = clFinish(gQueue)) )
1512 {
1513 vlog_error( "Failure in clFinish\n" );
1514 gFailCount++;
1515 goto exit;
1516 }
1517 uint64_t currentTime = ReadTime() - startTime;
1518 if( currentTime < bestTime )
1519 bestTime = currentTime;
1520 doubleTime[ vectorSize ] += currentTime;
1521 }
1522 if( bestTime < min_double_time[ vectorSize ] )
1523 min_double_time[ vectorSize ] = bestTime;
1524 }
1525 }
1526 }
1527
1528 if( gReportTimes )
1529 {
1530 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1531 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1532 "average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1533 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1534 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1535 "best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1536 if( gTestDouble )
1537 {
1538 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1539 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1540 "average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1541 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1542 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1543 "best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1544 }
1545 }
1546
1547 exit:
1548 //clean up
1549 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1550 {
1551 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1552 clReleaseKernel( kernels[ vectorSize ][addressSpace] );
1553 clReleaseProgram( programs[ vectorSize ][addressSpace] );
1554 if( gTestDouble )
1555 {
1556 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
1557 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
1558 }
1559 }
1560 }
1561
1562 return error;
1563 }
1564
1565