1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/kernelHelpers.h"
18 #include "harness/testHarness.h"
19
20 #include <string.h>
21
22 #include <algorithm>
23
24 #include "cl_utils.h"
25 #include "tests.h"
26
27 #include <CL/cl_half.h>
28
29 typedef struct ComputeReferenceInfoF_
30 {
31 float *x;
32 cl_ushort *r;
33 f2h f;
34 cl_ulong i;
35 cl_uint lim;
36 cl_uint count;
37 } ComputeReferenceInfoF;
38
39 typedef struct ComputeReferenceInfoD_
40 {
41 double *x;
42 cl_ushort *r;
43 d2h f;
44 cl_ulong i;
45 cl_uint lim;
46 cl_uint count;
47 } ComputeReferenceInfoD;
48
49 typedef struct CheckResultInfoF_
50 {
51 const float *x;
52 const cl_ushort *r;
53 const cl_ushort *s;
54 f2h f;
55 const char *aspace;
56 cl_uint lim;
57 cl_uint count;
58 int vsz;
59 } CheckResultInfoF;
60
61 typedef struct CheckResultInfoD_
62 {
63 const double *x;
64 const cl_ushort *r;
65 const cl_ushort *s;
66 d2h f;
67 const char *aspace;
68 cl_uint lim;
69 cl_uint count;
70 int vsz;
71 } CheckResultInfoD;
72
73 static cl_int
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)74 ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
75 {
76 ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
77 cl_uint lim = cri->lim;
78 cl_uint count = cri->count;
79 cl_uint off = jid * count;
80 float *x = cri->x + off;
81 cl_ushort *r = cri->r + off;
82 f2h f = cri->f;
83 cl_ulong i = cri->i + off;
84 cl_uint j;
85
86 if (off + count > lim)
87 count = lim - off;
88
89 for (j = 0; j < count; ++j) {
90 x[j] = as_float((cl_uint)(i + j));
91 r[j] = f(x[j]);
92 }
93
94 return 0;
95 }
96
97 static cl_int
CheckF(cl_uint jid,cl_uint tid,void * userInfo)98 CheckF(cl_uint jid, cl_uint tid, void *userInfo)
99 {
100 CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
101 cl_uint lim = cri->lim;
102 cl_uint count = cri->count;
103 cl_uint off = jid * count;
104 const float *x = cri->x + off;
105 const cl_ushort *r = cri->r + off;
106 const cl_ushort *s = cri->s + off;
107 f2h f = cri->f;
108 cl_uint j;
109 cl_ushort correct2 = f( 0.0f);
110 cl_ushort correct3 = f(-0.0f);
111 cl_int ret = 0;
112
113 if (off + count > lim)
114 count = lim - off;
115
116 if (!memcmp(r, s, count*sizeof(cl_ushort)))
117 return 0;
118
119 for (j = 0; j < count; j++) {
120 if (s[j] == r[j]) continue;
121
122 // Pass any NaNs
123 if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 )
124 continue;
125
126 // retry per section 6.5.3.3
127 if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
128 continue;
129
130 // if reference result is subnormal, pass any zero
131 if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
132 continue;
133
134 vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, vector_size = %d, address_space = %s\n",
135 j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
136
137 ret = 1;
138 break;
139 }
140
141 return ret;
142 }
143
144 static cl_int
ReferenceD(cl_uint jid,cl_uint tid,void * userInfo)145 ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
146 {
147 ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
148 cl_uint lim = cri->lim;
149 cl_uint count = cri->count;
150 cl_uint off = jid * count;
151 double *x = cri->x + off;
152 cl_ushort *r = cri->r + off;
153 d2h f = cri->f;
154 cl_uint j;
155 cl_ulong i = cri->i + off;
156
157 if (off + count > lim)
158 count = lim - off;
159
160 for (j = 0; j < count; ++j) {
161 x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
162 r[j] = f(x[j]);
163 }
164
165 return 0;
166 }
167
168 static cl_int
CheckD(cl_uint jid,cl_uint tid,void * userInfo)169 CheckD(cl_uint jid, cl_uint tid, void *userInfo)
170 {
171 CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
172 cl_uint lim = cri->lim;
173 cl_uint count = cri->count;
174 cl_uint off = jid * count;
175 const double *x = cri->x + off;
176 const cl_ushort *r = cri->r + off;
177 const cl_ushort *s = cri->s + off;
178 d2h f = cri->f;
179 cl_uint j;
180 cl_ushort correct2 = f( 0.0);
181 cl_ushort correct3 = f(-0.0);
182 cl_int ret = 0;
183
184 if (off + count > lim)
185 count = lim - off;
186
187 if (!memcmp(r, s, count*sizeof(cl_ushort)))
188 return 0;
189
190 for (j = 0; j < count; j++) {
191 if (s[j] == r[j]) continue;
192
193 // Pass any NaNs
194 if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00)
195 continue;
196
197 if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
198 continue;
199
200 // if reference result is subnormal, pass any zero result
201 if (gIsEmbedded && IsHalfSubnormal(r[j]) && (s[j] == 0x0000 || s[j] == 0x8000))
202 continue;
203
204 vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, vector_size = %d, address space = %s (double precision)\n",
205 j+off, x[j], r[j], s[j], cri->vsz, cri->aspace);
206
207 ret = 1;
208 break;
209 }
210
211 return ret;
212 }
213
float2half_rte(float f)214 static cl_half float2half_rte(float f)
215 {
216 return cl_half_from_float(f, CL_HALF_RTE);
217 }
218
float2half_rtz(float f)219 static cl_half float2half_rtz(float f)
220 {
221 return cl_half_from_float(f, CL_HALF_RTZ);
222 }
223
float2half_rtp(float f)224 static cl_half float2half_rtp(float f)
225 {
226 return cl_half_from_float(f, CL_HALF_RTP);
227 }
228
float2half_rtn(float f)229 static cl_half float2half_rtn(float f)
230 {
231 return cl_half_from_float(f, CL_HALF_RTN);
232 }
233
double2half_rte(double f)234 static cl_half double2half_rte(double f)
235 {
236 return cl_half_from_double(f, CL_HALF_RTE);
237 }
238
double2half_rtz(double f)239 static cl_half double2half_rtz(double f)
240 {
241 return cl_half_from_double(f, CL_HALF_RTZ);
242 }
243
double2half_rtp(double f)244 static cl_half double2half_rtp(double f)
245 {
246 return cl_half_from_double(f, CL_HALF_RTP);
247 }
248
double2half_rtn(double f)249 static cl_half double2half_rtn(double f)
250 {
251 return cl_half_from_double(f, CL_HALF_RTN);
252 }
253
test_vstore_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)254 int test_vstore_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
255 {
256 switch (get_default_rounding_mode(deviceID))
257 {
258 case CL_FP_ROUND_TO_ZERO:
259 return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rte, "");
260 case 0:
261 return -1;
262 default:
263 return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "");
264 }
265 }
266
test_vstore_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)267 int test_vstore_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
268 {
269 return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
270 }
271
test_vstore_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)272 int test_vstore_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
273 {
274 return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
275 }
276
test_vstore_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)277 int test_vstore_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
278 {
279 return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
280 }
281
test_vstore_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)282 int test_vstore_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
283 {
284 return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
285 }
286
test_vstorea_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)287 int test_vstorea_half( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
288 {
289 switch (get_default_rounding_mode(deviceID))
290 {
291 case CL_FP_ROUND_TO_ZERO:
292 return Test_vStoreaHalf_private(deviceID,float2half_rtz, double2half_rte, "");
293 case 0:
294 return -1;
295 default:
296 return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "");
297 }
298 }
299
test_vstorea_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)300 int test_vstorea_half_rte( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
301 {
302 return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte, "_rte");
303 }
304
test_vstorea_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)305 int test_vstorea_half_rtz( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
306 {
307 return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz, "_rtz");
308 }
309
test_vstorea_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)310 int test_vstorea_half_rtp( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
311 {
312 return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp, "_rtp");
313 }
314
test_vstorea_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)315 int test_vstorea_half_rtn( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
316 {
317 return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn, "_rtn");
318 }
319
320 #pragma mark -
321
Test_vStoreHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)322 int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
323 {
324 int vectorSize, error;
325 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
326 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
327
328 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
329 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
330 memset( min_time, -1, sizeof( min_time ) );
331 cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
332 cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
333 uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
334 uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
335 memset( min_double_time, -1, sizeof( min_double_time ) );
336
337 bool aligned= false;
338
339 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
340 {
341 const char *source[] = {
342 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
343 "{\n"
344 " size_t i = get_global_id(0);\n"
345 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
346 "}\n"
347 };
348
349 const char *source_v3[] = {
350 "__kernel void test( __global float *p, __global half *f,\n"
351 " uint extra_last_thread)\n"
352 "{\n"
353 " size_t i = get_global_id(0);\n"
354 " size_t last_i = get_global_size(0)-1;\n"
355 " size_t adjust = 0;\n"
356 " if(last_i == i && extra_last_thread != 0) {\n"
357 " adjust = 3-extra_last_thread;\n"
358 " } "
359 " vstore_half3",roundName,"( vload3(i, p-adjust), i, f-adjust );\n"
360 "}\n"
361 };
362
363 const char *source_private_store[] = {
364 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
365 "{\n"
366 " __private ushort data[16];\n"
367 " size_t i = get_global_id(0);\n"
368 " size_t offset = 0;\n"
369 " size_t vecsize = vec_step(p[i]);\n"
370 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
371 " for(offset = 0; offset < vecsize; offset++)\n"
372 " {\n"
373 " vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
374 " }\n"
375 "}\n"
376 };
377
378
379 const char *source_private_store_v3[] = {
380 "__kernel void test( __global float *p, __global half *f,\n"
381 " uint extra_last_thread )\n"
382 "{\n"
383 " __private ushort data[4];\n"
384 " size_t i = get_global_id(0);\n"
385 " size_t last_i = get_global_size(0)-1;\n"
386 " size_t adjust = 0;\n"
387 " size_t offset = 0;\n"
388 " if(last_i == i && extra_last_thread != 0) {\n"
389 " adjust = 3-extra_last_thread;\n"
390 " } "
391 " vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
392 " for(offset = 0; offset < 3; offset++)\n"
393 " {\n"
394 " vstore_half(vload_half(offset, (__private half *) data), 0, &f[3*i+offset-adjust]);\n"
395 " }\n"
396 "}\n"
397 };
398
399 char local_buf_size[10];
400 sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
401
402
403 const char *source_local_store[] = {
404 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
405 "{\n"
406 " __local ushort data[16*", local_buf_size, "];\n"
407 " size_t i = get_global_id(0);\n"
408 " size_t lid = get_local_id(0);\n"
409 " size_t lsize = get_local_size(0);\n"
410 " size_t vecsize = vec_step(p[0]);\n"
411 " event_t async_event;\n"
412 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
413 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
414 " async_event = async_work_group_copy((__global ushort *)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
415 " wait_group_events(1, &async_event);\n"
416 "}\n"
417 };
418
419 const char *source_local_store_v3[] = {
420 "__kernel void test( __global float *p, __global half *f,\n"
421 " uint extra_last_thread )\n"
422 "{\n"
423 " __local ushort data[3*(",
424 local_buf_size,
425 "+1)];\n"
426 " size_t i = get_global_id(0);\n"
427 " size_t lid = get_local_id(0);\n"
428 " size_t last_i = get_global_size(0)-1;\n"
429 " size_t adjust = 0;\n"
430 " size_t lsize = get_local_size(0);\n"
431 " event_t async_event;\n"
432 " if(last_i == i && extra_last_thread != 0) {\n"
433 " adjust = 3-extra_last_thread;\n"
434 " } "
435 " vstore_half3",
436 roundName,
437 "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
438 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
439 " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
440 " extra_last_thread != 0) {\n"
441 " adjust = 3-extra_last_thread;\n"
442 " }\n"
443 " async_event = async_work_group_copy(\n"
444 " (__global ushort*)(f+3*(i-lid)),\n"
445 " (__local ushort *)(&data[adjust]),\n"
446 " lsize*3-adjust, 0);\n" // investigate later
447 " wait_group_events(1, &async_event);\n"
448 "}\n"
449 };
450
451 const char *double_source[] = {
452 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
453 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
454 "{\n"
455 " size_t i = get_global_id(0);\n"
456 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
457 "}\n"
458 };
459
460 const char *double_source_private_store[] = {
461 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
462 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
463 "{\n"
464 " __private ushort data[16];\n"
465 " size_t i = get_global_id(0);\n"
466 " size_t offset = 0;\n"
467 " size_t vecsize = vec_step(p[i]);\n"
468 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], 0, (__private half *)(&data[0]) );\n"
469 " for(offset = 0; offset < vecsize; offset++)\n"
470 " {\n"
471 " vstore_half(vload_half(offset, (__private half *)data), 0, &f[vecsize*i+offset]);\n"
472 " }\n"
473 "}\n"
474 };
475
476
477 const char *double_source_local_store[] = {
478 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
479 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
480 "{\n"
481 " __local ushort data[16*", local_buf_size, "];\n"
482 " size_t i = get_global_id(0);\n"
483 " size_t lid = get_local_id(0);\n"
484 " size_t vecsize = vec_step(p[0]);\n"
485 " size_t lsize = get_local_size(0);\n"
486 " event_t async_event;\n"
487 " vstore_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], lid, (__local half *)(&data[0]) );\n"
488 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
489 " async_event = async_work_group_copy((__global ushort *)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), vecsize*lsize, 0);\n" // investigate later
490 " wait_group_events(1, &async_event);\n"
491 "}\n"
492 };
493
494
495 const char *double_source_v3[] = {
496 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
497 "__kernel void test( __global double *p, __global half *f ,\n"
498 " uint extra_last_thread)\n"
499 "{\n"
500 " size_t i = get_global_id(0);\n"
501 " size_t last_i = get_global_size(0)-1;\n"
502 " size_t adjust = 0;\n"
503 " if(last_i == i && extra_last_thread != 0) {\n"
504 " adjust = 3-extra_last_thread;\n"
505 " } "
506 " vstore_half3",roundName,"( vload3(i,p-adjust), i, f -adjust);\n"
507 "}\n"
508 };
509
510 const char *double_source_private_store_v3[] = {
511 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
512 "__kernel void test( __global double *p, __global half *f,\n"
513 " uint extra_last_thread )\n"
514 "{\n"
515 " __private ushort data[4];\n"
516 " size_t i = get_global_id(0);\n"
517 " size_t last_i = get_global_size(0)-1;\n"
518 " size_t adjust = 0;\n"
519 " size_t offset = 0;\n"
520 " if(last_i == i && extra_last_thread != 0) {\n"
521 " adjust = 3-extra_last_thread;\n"
522 " } "
523 " vstore_half3",roundName,"( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
524 " for(offset = 0; offset < 3; offset++)\n"
525 " {\n"
526 " vstore_half(vload_half(offset, (__private half *)data), 0, &f[3*i+offset-adjust]);\n"
527 " }\n"
528 "}\n"
529 };
530
531 const char *double_source_local_store_v3[] = {
532 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
533 "__kernel void test( __global double *p, __global half *f,\n"
534 " uint extra_last_thread )\n"
535 "{\n"
536 " __local ushort data[3*(",
537 local_buf_size,
538 "+1)];\n"
539 " size_t i = get_global_id(0);\n"
540 " size_t lid = get_local_id(0);\n"
541 " size_t last_i = get_global_size(0)-1;\n"
542 " size_t adjust = 0;\n"
543 " size_t lsize = get_local_size(0);\n"
544 " event_t async_event;\n"
545 " if(last_i == i && extra_last_thread != 0) {\n"
546 " adjust = 3-extra_last_thread;\n"
547 " }\n "
548 " vstore_half3",
549 roundName,
550 "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
551 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
552 " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
553 " extra_last_thread != 0) {\n"
554 " adjust = 3-extra_last_thread;\n"
555 " }\n"
556 " async_event = async_work_group_copy(\n"
557 " (__global ushort *)(f+3*(i-lid)),\n"
558 " (__local ushort *)(&data[adjust]),\n"
559 " lsize*3-adjust, 0);\n" // investigate later
560 " wait_group_events(1, &async_event);\n"
561 "}\n"
562 };
563
564
565 if(g_arrVecSizes[vectorSize] == 3) {
566 programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
567 } else {
568 programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
569 }
570 if( NULL == programs[ vectorSize ][0] )
571 {
572 gFailCount++;
573 return -1;
574 }
575
576 kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
577 if( NULL == kernels[vectorSize][0] )
578 {
579 gFailCount++;
580 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
581 return error;
582 }
583
584 if(g_arrVecSizes[vectorSize] == 3) {
585 programs[vectorSize][1] = MakeProgram( device, source_private_store_v3, sizeof(source_private_store_v3) / sizeof( source_private_store_v3[0]) );
586 } else {
587 programs[vectorSize][1] = MakeProgram( device, source_private_store, sizeof(source_private_store) / sizeof( source_private_store[0]) );
588 }
589 if( NULL == programs[ vectorSize ][1] )
590 {
591 gFailCount++;
592 return -1;
593 }
594
595 kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
596 if( NULL == kernels[vectorSize][1] )
597 {
598 gFailCount++;
599 vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
600 return error;
601 }
602
603 if(g_arrVecSizes[vectorSize] == 3) {
604 programs[vectorSize][2] = MakeProgram( device, source_local_store_v3, sizeof(source_local_store_v3) / sizeof( source_local_store_v3[0]) );
605 if( NULL == programs[ vectorSize ][2] )
606 {
607 unsigned q;
608 for ( q= 0; q < sizeof( source_local_store_v3) / sizeof( source_local_store_v3[0]); q++)
609 vlog_error("%s", source_local_store_v3[q]);
610
611 gFailCount++;
612 return -1;
613
614 }
615 } else {
616 programs[vectorSize][2] = MakeProgram( device, source_local_store, sizeof(source_local_store) / sizeof( source_local_store[0]) );
617 if( NULL == programs[ vectorSize ][2] )
618 {
619 unsigned q;
620 for ( q= 0; q < sizeof( source_local_store) / sizeof( source_local_store[0]); q++)
621 vlog_error("%s", source_local_store[q]);
622
623 gFailCount++;
624 return -1;
625
626 }
627 }
628
629 kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
630 if( NULL == kernels[vectorSize][2] )
631 {
632 gFailCount++;
633 vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
634 return error;
635 }
636
637 if( gTestDouble )
638 {
639 if(g_arrVecSizes[vectorSize] == 3) {
640 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
641 } else {
642 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
643 }
644 if( NULL == doublePrograms[ vectorSize ][0] )
645 {
646 gFailCount++;
647 return -1;
648 }
649
650 doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
651 if( NULL == kernels[vectorSize][0] )
652 {
653 gFailCount++;
654 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
655 return error;
656 }
657
658 if(g_arrVecSizes[vectorSize] == 3)
659 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store_v3, sizeof(double_source_private_store_v3) / sizeof( double_source_private_store_v3[0]) );
660 else
661 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_store, sizeof(double_source_private_store) / sizeof( double_source_private_store[0]) );
662
663 if( NULL == doublePrograms[ vectorSize ][1] )
664 {
665 gFailCount++;
666 return -1;
667 }
668
669 doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
670 if( NULL == kernels[vectorSize][1] )
671 {
672 gFailCount++;
673 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
674 return error;
675 }
676
677 if(g_arrVecSizes[vectorSize] == 3) {
678 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store_v3, sizeof(double_source_local_store_v3) / sizeof( double_source_local_store_v3[0]) );
679 } else {
680 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_store, sizeof(double_source_local_store) / sizeof( double_source_local_store[0]) );
681 }
682 if( NULL == doublePrograms[ vectorSize ][2] )
683 {
684 gFailCount++;
685 return -1;
686 }
687
688 doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
689 if( NULL == kernels[vectorSize][2] )
690 {
691 gFailCount++;
692 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
693 return error;
694 }
695 }
696 } // end for vector size
697
698 // Figure out how many elements are in a work block
699 size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
700 size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
701 uint64_t lastCase = 1ULL << (8*sizeof(float)); // number of floats.
702 size_t stride = blockCount;
703
704 if (gWimpyMode)
705 stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
706
707 // we handle 64-bit types a bit differently.
708 if( lastCase == 0 )
709 lastCase = 0x100000000ULL;
710
711 uint64_t i, j;
712 error = 0;
713 uint64_t printMask = (lastCase >> 4) - 1;
714 cl_uint count = 0;
715 int addressSpace;
716 size_t loopCount;
717 cl_uint threadCount = GetThreadCount();
718
719 ComputeReferenceInfoF fref;
720 fref.x = (float *)gIn_single;
721 fref.r = (cl_half *)gOut_half_reference;
722 fref.f = referenceFunc;
723 fref.lim = blockCount;
724 fref.count = (blockCount + threadCount - 1) / threadCount;
725
726 CheckResultInfoF fchk;
727 fchk.x = (const float *)gIn_single;
728 fchk.r = (const cl_half *)gOut_half_reference;
729 fchk.s = (const cl_half *)gOut_half;
730 fchk.f = referenceFunc;
731 fchk.lim = blockCount;
732 fchk.count = (blockCount + threadCount - 1) / threadCount;
733
734 ComputeReferenceInfoD dref;
735 dref.x = (double *)gIn_double;
736 dref.r = (cl_half *)gOut_half_reference_double;
737 dref.f = doubleReferenceFunc;
738 dref.lim = blockCount;
739 dref.count = (blockCount + threadCount - 1) / threadCount;
740
741 CheckResultInfoD dchk;
742 dchk.x = (const double *)gIn_double;
743 dchk.r = (const cl_half *)gOut_half_reference_double;
744 dchk.s = (const cl_half *)gOut_half;
745 dchk.f = doubleReferenceFunc;
746 dchk.lim = blockCount;
747 dchk.count = (blockCount + threadCount - 1) / threadCount;
748
749 for( i = 0; i < lastCase; i += stride )
750 {
751 count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
752 fref.i = i;
753 dref.i = i;
754
755 // Compute the input and reference
756 ThreadPool_Do(ReferenceF, threadCount, &fref);
757
758 error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
759 if (error) {
760 vlog_error( "Failure in clWriteBuffer\n" );
761 gFailCount++;
762 goto exit;
763 }
764
765 if (gTestDouble) {
766 ThreadPool_Do(ReferenceD, threadCount, &dref);
767
768 error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
769 if (error) {
770 vlog_error( "Failure in clWriteBuffer\n" );
771 gFailCount++;
772 goto exit;
773 }
774 }
775
776 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
777 // Loop through vector sizes
778 fchk.vsz = g_arrVecSizes[vectorSize];
779 dchk.vsz = g_arrVecSizes[vectorSize];
780
781 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
782 // Loop over address spaces
783 fchk.aspace = addressSpaceNames[addressSpace];
784 dchk.aspace = addressSpaceNames[addressSpace];
785
786 cl_uint pattern = 0xdeaddead;
787 memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
788
789 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
790 0, count * sizeof(cl_half),
791 gOut_half, 0, NULL, NULL);
792 if (error) {
793 vlog_error( "Failure in clWriteArray\n" );
794 gFailCount++;
795 goto exit;
796 }
797
798 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
799 numVecs(count, vectorSize, aligned) ,
800 runsOverBy(count, vectorSize, aligned));
801 if (error) {
802 gFailCount++;
803 goto exit;
804 }
805
806 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
807 count * sizeof(cl_half), gOut_half,
808 0, NULL, NULL);
809 if (error) {
810 vlog_error( "Failure in clReadArray\n" );
811 gFailCount++;
812 goto exit;
813 }
814
815 error = ThreadPool_Do(CheckF, threadCount, &fchk);
816 if (error) {
817 gFailCount++;
818 goto exit;
819 }
820
821 if (gTestDouble) {
822 memset_pattern4( gOut_half, &pattern, BUFFER_SIZE/2);
823
824 error = clEnqueueWriteBuffer(
825 gQueue, gOutBuffer_half, CL_FALSE, 0,
826 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
827 if (error) {
828 vlog_error( "Failure in clWriteArray\n" );
829 gFailCount++;
830 goto exit;
831 }
832
833 error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
834 numVecs(count, vectorSize, aligned),
835 runsOverBy(count, vectorSize, aligned));
836 if (error) {
837 gFailCount++;
838 goto exit;
839 }
840
841 error = clEnqueueReadBuffer(
842 gQueue, gOutBuffer_half, CL_TRUE, 0,
843 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
844 if (error) {
845 vlog_error( "Failure in clReadArray\n" );
846 gFailCount++;
847 goto exit;
848 }
849
850 error = ThreadPool_Do(CheckD, threadCount, &dchk);
851 if (error) {
852 gFailCount++;
853 goto exit;
854 }
855 }
856 }
857 }
858
859 if( ((i+blockCount) & ~printMask) == (i+blockCount) )
860 {
861 vlog( "." );
862 fflush( stdout );
863 }
864 } // end last case
865
866 loopCount = count == blockCount ? 1 : 100;
867 if( gReportTimes )
868 {
869 //Init the input stream
870 cl_float *p = (cl_float *)gIn_single;
871 for( j = 0; j < count; j++ )
872 p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
873
874 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
875 {
876 vlog_error( "Failure in clWriteArray\n" );
877 gFailCount++;
878 goto exit;
879 }
880
881 if( gTestDouble )
882 {
883 //Init the input stream
884 cl_double *q = (cl_double *)gIn_double;
885 for( j = 0; j < count; j++ )
886 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
887
888 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
889 {
890 vlog_error( "Failure in clWriteArray\n" );
891 gFailCount++;
892 goto exit;
893 }
894 }
895
896 //Run again for timing
897 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
898 {
899 uint64_t bestTime = -1ULL;
900 for( j = 0; j < loopCount; j++ )
901 {
902 uint64_t startTime = ReadTime();
903
904
905 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
906 runsOverBy(count, vectorSize, aligned)) ) )
907 {
908 gFailCount++;
909 goto exit;
910 }
911
912 if( (error = clFinish(gQueue)) )
913 {
914 vlog_error( "Failure in clFinish\n" );
915 gFailCount++;
916 goto exit;
917 }
918 uint64_t currentTime = ReadTime() - startTime;
919 if( currentTime < bestTime )
920 bestTime = currentTime;
921 time[ vectorSize ] += currentTime;
922 }
923 if( bestTime < min_time[ vectorSize ] )
924 min_time[ vectorSize ] = bestTime ;
925
926 if( gTestDouble )
927 {
928 bestTime = -1ULL;
929 for( j = 0; j < loopCount; j++ )
930 {
931 uint64_t startTime = ReadTime();
932 if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
933 runsOverBy(count, vectorSize, aligned)) ) )
934 {
935 gFailCount++;
936 goto exit;
937 }
938
939 if( (error = clFinish(gQueue)) )
940 {
941 vlog_error( "Failure in clFinish\n" );
942 gFailCount++;
943 goto exit;
944 }
945 uint64_t currentTime = ReadTime() - startTime;
946 if( currentTime < bestTime )
947 bestTime = currentTime;
948 doubleTime[ vectorSize ] += currentTime;
949 }
950 if( bestTime < min_double_time[ vectorSize ] )
951 min_double_time[ vectorSize ] = bestTime;
952 }
953 }
954 }
955
956 if( gReportTimes )
957 {
958 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
959 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
960 "average us/elem", "vStoreHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
961 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
962 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
963 "best us/elem", "vStoreHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
964 if( gTestDouble )
965 {
966 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
967 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
968 "average us/elem (double)", "vStoreHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
969 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
970 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
971 "best us/elem (double)", "vStoreHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
972 }
973 }
974
975 exit:
976 //clean up
977 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
978 {
979 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
980 clReleaseKernel( kernels[ vectorSize ][ addressSpace ] );
981 clReleaseProgram( programs[ vectorSize ][ addressSpace ] );
982 if( gTestDouble )
983 {
984 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
985 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
986 }
987 }
988 }
989
990 return error;
991 }
992
Test_vStoreaHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)993 int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleReferenceFunc, const char *roundName )
994 {
995 int vectorSize, error;
996 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][3];
997 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
998
999 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1000 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1001 memset( min_time, -1, sizeof( min_time ) );
1002 cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount][3];
1003 cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount][3];
1004 uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1005 uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
1006 memset( min_double_time, -1, sizeof( min_double_time ) );
1007
1008 bool aligned = true;
1009
1010 int minVectorSize = kMinVectorSize;
1011 // There is no aligned scalar vstorea_half
1012 if( 0 == minVectorSize )
1013 minVectorSize = 1;
1014
1015 //Loop over vector sizes
1016 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1017 {
1018 const char *source[] = {
1019 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1020 "{\n"
1021 " size_t i = get_global_id(0);\n"
1022 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1023 "}\n"
1024 };
1025
1026 const char *source_v3[] = {
1027 "__kernel void test( __global float3 *p, __global half *f )\n"
1028 "{\n"
1029 " size_t i = get_global_id(0);\n"
1030 " vstorea_half3",roundName,"( p[i], i, f );\n"
1031 " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1032 "}\n"
1033 };
1034
1035 const char *source_private[] = {
1036 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1037 "{\n"
1038 " __private float", vector_size_name_extensions[vectorSize], " data;\n"
1039 " size_t i = get_global_id(0);\n"
1040 " data = p[i];\n"
1041 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1042 "}\n"
1043 };
1044
1045 const char *source_private_v3[] = {
1046 "__kernel void test( __global float3 *p, __global half *f )\n"
1047 "{\n"
1048 " __private float", vector_size_name_extensions[vectorSize], " data;\n"
1049 " size_t i = get_global_id(0);\n"
1050 " data = p[i];\n"
1051 " vstorea_half3",roundName,"( data, i, f );\n"
1052 " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1053 "}\n"
1054 };
1055
1056 char local_buf_size[10];
1057 sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
1058 const char *source_local[] = {
1059 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1060 "{\n"
1061 " __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1062 " size_t i = get_global_id(0);\n"
1063 " size_t lid = get_local_id(0);\n"
1064 " data[lid] = p[i];\n"
1065 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1066 "}\n"
1067 };
1068
1069 const char *source_local_v3[] = {
1070 "__kernel void test( __global float", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1071 "{\n"
1072 " __local float", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1073 " size_t i = get_global_id(0);\n"
1074 " size_t lid = get_local_id(0);\n"
1075 " data[lid] = p[i];\n"
1076 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1077 " vstore_half",roundName,"( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1078 "}\n"
1079 };
1080
1081 const char *double_source[] = {
1082 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1083 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1084 "{\n"
1085 " size_t i = get_global_id(0);\n"
1086 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1087 "}\n"
1088 };
1089
1090 const char *double_source_v3[] = {
1091 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1092 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1093 "{\n"
1094 " size_t i = get_global_id(0);\n"
1095 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( p[i], i, f );\n"
1096 " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1097 "}\n"
1098 };
1099
1100 const char *double_source_private[] = {
1101 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1102 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1103 "{\n"
1104 " __private double", vector_size_name_extensions[vectorSize], " data;\n"
1105 " size_t i = get_global_id(0);\n"
1106 " data = p[i];\n"
1107 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1108 "}\n"
1109 };
1110
1111 const char *double_source_private_v3[] = {
1112 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1113 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1114 "{\n"
1115 " __private double", vector_size_name_extensions[vectorSize], " data;\n"
1116 " size_t i = get_global_id(0);\n"
1117 " data = p[i];\n"
1118 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data, i, f );\n"
1119 " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1120 "}\n"
1121 };
1122
1123 const char *double_source_local[] = {
1124 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1125 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1126 "{\n"
1127 " __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1128 " size_t i = get_global_id(0);\n"
1129 " size_t lid = get_local_id(0);\n"
1130 " data[lid] = p[i];\n"
1131 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1132 "}\n"
1133 };
1134
1135 const char *double_source_local_v3[] = {
1136 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1137 "__kernel void test( __global double", vector_size_name_extensions[vectorSize]," *p, __global half *f )\n"
1138 "{\n"
1139 " __local double", vector_size_name_extensions[vectorSize], " data[", local_buf_size, "];\n"
1140 " size_t i = get_global_id(0);\n"
1141 " size_t lid = get_local_id(0);\n"
1142 " data[lid] = p[i];\n"
1143 " vstorea_half",vector_size_name_extensions[vectorSize],roundName,"( data[lid], i, f );\n"
1144 " vstore_half",roundName,"( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1145 "}\n"
1146 };
1147
1148 if(g_arrVecSizes[vectorSize] == 3) {
1149 programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) );
1150 if( NULL == programs[ vectorSize ][0] )
1151 {
1152 gFailCount++;
1153 return -1;
1154 }
1155 } else {
1156 programs[vectorSize][0] = MakeProgram( device, source, sizeof(source) / sizeof( source[0]) );
1157 if( NULL == programs[ vectorSize ][0] )
1158 {
1159 gFailCount++;
1160 return -1;
1161 }
1162 }
1163
1164 kernels[ vectorSize ][0] = clCreateKernel( programs[ vectorSize ][0], "test", &error );
1165 if( NULL == kernels[vectorSize][0] )
1166 {
1167 gFailCount++;
1168 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
1169 return error;
1170 }
1171
1172 if(g_arrVecSizes[vectorSize] == 3) {
1173 programs[vectorSize][1] = MakeProgram( device, source_private_v3, sizeof(source_private_v3) / sizeof( source_private_v3[0]) );
1174 if( NULL == programs[ vectorSize ][1] )
1175 {
1176 gFailCount++;
1177 return -1;
1178 }
1179 } else {
1180 programs[vectorSize][1] = MakeProgram( device, source_private, sizeof(source_private) / sizeof( source_private[0]) );
1181 if( NULL == programs[ vectorSize ][1] )
1182 {
1183 gFailCount++;
1184 return -1;
1185 }
1186 }
1187
1188 kernels[ vectorSize ][1] = clCreateKernel( programs[ vectorSize ][1], "test", &error );
1189 if( NULL == kernels[vectorSize][1] )
1190 {
1191 gFailCount++;
1192 vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
1193 return error;
1194 }
1195
1196 if(g_arrVecSizes[vectorSize] == 3) {
1197 programs[vectorSize][2] = MakeProgram( device, source_local_v3, sizeof(source_local_v3) / sizeof( source_local_v3[0]) );
1198 if( NULL == programs[ vectorSize ][2] )
1199 {
1200 gFailCount++;
1201 return -1;
1202 }
1203 } else {
1204 programs[vectorSize][2] = MakeProgram( device, source_local, sizeof(source_local) / sizeof( source_local[0]) );
1205 if( NULL == programs[ vectorSize ][2] )
1206 {
1207 gFailCount++;
1208 return -1;
1209 }
1210 }
1211
1212 kernels[ vectorSize ][2] = clCreateKernel( programs[ vectorSize ][2], "test", &error );
1213 if( NULL == kernels[vectorSize][2] )
1214 {
1215 gFailCount++;
1216 vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
1217 return error;
1218 }
1219
1220 if( gTestDouble )
1221 {
1222 if(g_arrVecSizes[vectorSize] == 3) {
1223 doublePrograms[vectorSize][0] = MakeProgram( device, double_source_v3, sizeof(double_source_v3) / sizeof( double_source_v3[0]) );
1224 if( NULL == doublePrograms[ vectorSize ][0] )
1225 {
1226 gFailCount++;
1227 return -1;
1228 }
1229 } else {
1230 doublePrograms[vectorSize][0] = MakeProgram( device, double_source, sizeof(double_source) / sizeof( double_source[0]) );
1231 if( NULL == doublePrograms[ vectorSize ][0] )
1232 {
1233 gFailCount++;
1234 return -1;
1235 }
1236 }
1237
1238 doubleKernels[ vectorSize ][0] = clCreateKernel( doublePrograms[ vectorSize ][0], "test", &error );
1239 if( NULL == kernels[vectorSize][0] )
1240 {
1241 gFailCount++;
1242 vlog_error( "\t\tFAILED -- Failed to create double kernel. (%d)\n", error );
1243 return error;
1244 }
1245
1246 if(g_arrVecSizes[vectorSize] == 3) {
1247 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private_v3, sizeof(double_source_private_v3) / sizeof( double_source_private_v3[0]) );
1248 if( NULL == doublePrograms[ vectorSize ][1] )
1249 {
1250 gFailCount++;
1251 return -1;
1252 }
1253 } else {
1254 doublePrograms[vectorSize][1] = MakeProgram( device, double_source_private, sizeof(double_source_private) / sizeof( double_source_private[0]) );
1255 if( NULL == doublePrograms[ vectorSize ][1] )
1256 {
1257 gFailCount++;
1258 return -1;
1259 }
1260 }
1261
1262 doubleKernels[ vectorSize ][1] = clCreateKernel( doublePrograms[ vectorSize ][1], "test", &error );
1263 if( NULL == kernels[vectorSize][1] )
1264 {
1265 gFailCount++;
1266 vlog_error( "\t\tFAILED -- Failed to create double private kernel. (%d)\n", error );
1267 return error;
1268 }
1269
1270 if(g_arrVecSizes[vectorSize] == 3) {
1271 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local_v3, sizeof(double_source_local_v3) / sizeof( double_source_local_v3[0]) );
1272 if( NULL == doublePrograms[ vectorSize ][2] )
1273 {
1274 gFailCount++;
1275 return -1;
1276 }
1277 } else {
1278 doublePrograms[vectorSize][2] = MakeProgram( device, double_source_local, sizeof(double_source_local) / sizeof( double_source_local[0]) );
1279 if( NULL == doublePrograms[ vectorSize ][2] )
1280 {
1281 gFailCount++;
1282 return -1;
1283 }
1284 }
1285
1286 doubleKernels[ vectorSize ][2] = clCreateKernel( doublePrograms[ vectorSize ][2], "test", &error );
1287 if( NULL == kernels[vectorSize][2] )
1288 {
1289 gFailCount++;
1290 vlog_error( "\t\tFAILED -- Failed to create double local kernel. (%d)\n", error );
1291 return error;
1292 }
1293 }
1294 }
1295
1296 // Figure out how many elements are in a work block
1297 size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
1298 size_t blockCount = BUFFER_SIZE / elementSize;
1299 uint64_t lastCase = 1ULL << (8*sizeof(float));
1300 size_t stride = blockCount;
1301
1302 if (gWimpyMode)
1303 stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
1304
1305 // we handle 64-bit types a bit differently.
1306 if( lastCase == 0 )
1307 lastCase = 0x100000000ULL;
1308 uint64_t i, j;
1309 error = 0;
1310 uint64_t printMask = (lastCase >> 4) - 1;
1311 cl_uint count = 0;
1312 int addressSpace;
1313 size_t loopCount;
1314 cl_uint threadCount = GetThreadCount();
1315
1316 ComputeReferenceInfoF fref;
1317 fref.x = (float *)gIn_single;
1318 fref.r = (cl_half *)gOut_half_reference;
1319 fref.f = referenceFunc;
1320 fref.lim = blockCount;
1321 fref.count = (blockCount + threadCount - 1) / threadCount;
1322
1323 CheckResultInfoF fchk;
1324 fchk.x = (const float *)gIn_single;
1325 fchk.r = (const cl_half *)gOut_half_reference;
1326 fchk.s = (const cl_half *)gOut_half;
1327 fchk.f = referenceFunc;
1328 fchk.lim = blockCount;
1329 fchk.count = (blockCount + threadCount - 1) / threadCount;
1330
1331 ComputeReferenceInfoD dref;
1332 dref.x = (double *)gIn_double;
1333 dref.r = (cl_half *)gOut_half_reference_double;
1334 dref.f = doubleReferenceFunc;
1335 dref.lim = blockCount;
1336 dref.count = (blockCount + threadCount - 1) / threadCount;
1337
1338 CheckResultInfoD dchk;
1339 dchk.x = (const double *)gIn_double;
1340 dchk.r = (const cl_half *)gOut_half_reference_double;
1341 dchk.s = (const cl_half *)gOut_half;
1342 dchk.f = doubleReferenceFunc;
1343 dchk.lim = blockCount;
1344 dchk.count = (blockCount + threadCount - 1) / threadCount;
1345
1346 for( i = 0; i < (uint64_t)lastCase; i += stride )
1347 {
1348 count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
1349 fref.i = i;
1350 dref.i = i;
1351
1352 // Create the input and reference
1353 ThreadPool_Do(ReferenceF, threadCount, &fref);
1354
1355 error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0, count * sizeof(float ), gIn_single, 0, NULL, NULL);
1356 if (error) {
1357 vlog_error( "Failure in clWriteArray\n" );
1358 gFailCount++;
1359 goto exit;
1360 }
1361
1362 if (gTestDouble) {
1363 ThreadPool_Do(ReferenceD, threadCount, &dref);
1364
1365 error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0, count * sizeof(double ), gIn_double, 0, NULL, NULL);
1366 if (error) {
1367 vlog_error( "Failure in clWriteArray\n" );
1368 gFailCount++;
1369 goto exit;
1370 }
1371 }
1372
1373 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) {
1374 // Loop over vector legths
1375 fchk.vsz = g_arrVecSizes[vectorSize];
1376 dchk.vsz = g_arrVecSizes[vectorSize];
1377
1378 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1379 // Loop over address spaces
1380 fchk.aspace = addressSpaceNames[addressSpace];
1381 dchk.aspace = addressSpaceNames[addressSpace];
1382
1383 cl_uint pattern = 0xdeaddead;
1384 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1385
1386 error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_FALSE,
1387 0, count * sizeof(cl_half),
1388 gOut_half, 0, NULL, NULL);
1389 if (error) {
1390 vlog_error( "Failure in clWriteArray\n" );
1391 gFailCount++;
1392 goto exit;
1393 }
1394
1395 error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_single, gOutBuffer_half,
1396 numVecs(count, vectorSize, aligned),
1397 runsOverBy(count, vectorSize, aligned));
1398 if (error) {
1399 gFailCount++;
1400 goto exit;
1401 }
1402
1403 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
1404 count * sizeof(cl_half), gOut_half,
1405 0, NULL, NULL);
1406 if (error) {
1407 vlog_error( "Failure in clReadArray\n" );
1408 gFailCount++;
1409 goto exit;
1410 }
1411
1412 error = ThreadPool_Do(CheckF, threadCount, &fchk);
1413 if (error) {
1414 gFailCount++;
1415 goto exit;
1416 }
1417
1418 if (gTestDouble) {
1419 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE/2);
1420
1421 error = clEnqueueWriteBuffer(
1422 gQueue, gOutBuffer_half, CL_FALSE, 0,
1423 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1424 if (error) {
1425 vlog_error( "Failure in clWriteArray\n" );
1426 gFailCount++;
1427 goto exit;
1428 }
1429
1430 error = RunKernel(device, doubleKernels[vectorSize][addressSpace], gInBuffer_double, gOutBuffer_half,
1431 numVecs(count, vectorSize, aligned),
1432 runsOverBy(count, vectorSize, aligned));
1433 if (error) {
1434 gFailCount++;
1435 goto exit;
1436 }
1437
1438 error = clEnqueueReadBuffer(
1439 gQueue, gOutBuffer_half, CL_TRUE, 0,
1440 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1441 if (error) {
1442 vlog_error( "Failure in clReadArray\n" );
1443 gFailCount++;
1444 goto exit;
1445 }
1446
1447 error = ThreadPool_Do(CheckD, threadCount, &dchk);
1448 if (error) {
1449 gFailCount++;
1450 goto exit;
1451 }
1452 }
1453 }
1454 } // end for vector size
1455
1456 if( ((i+blockCount) & ~printMask) == (i+blockCount) ) {
1457 vlog( "." );
1458 fflush( stdout );
1459 }
1460 } // for end lastcase
1461
1462 loopCount = count == blockCount ? 1 : 100;
1463 if( gReportTimes )
1464 {
1465 //Init the input stream
1466 cl_float *p = (cl_float *)gIn_single;
1467 for( j = 0; j < count; j++ )
1468 p[j] = (float)((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1469
1470 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0, count * sizeof( float ), gIn_single, 0, NULL, NULL)) )
1471 {
1472 vlog_error( "Failure in clWriteArray\n" );
1473 gFailCount++;
1474 goto exit;
1475 }
1476
1477 if( gTestDouble )
1478 {
1479 //Init the input stream
1480 cl_double *q = (cl_double *)gIn_double;
1481 for( j = 0; j < count; j++ )
1482 q[j] = ((double) (rand() - RAND_MAX/2) / (RAND_MAX/2));
1483
1484 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE, 0, count * sizeof( double ), gIn_double, 0, NULL, NULL)) )
1485 {
1486 vlog_error( "Failure in clWriteArray\n" );
1487 gFailCount++;
1488 goto exit;
1489 }
1490 }
1491
1492 //Run again for timing
1493 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1494 {
1495 uint64_t bestTime = -1ULL;
1496 for( j = 0; j < loopCount; j++ )
1497 {
1498 uint64_t startTime = ReadTime();
1499 if( (error = RunKernel(device, kernels[vectorSize][0], gInBuffer_single, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1500 runsOverBy(count, vectorSize, aligned)) ) )
1501 {
1502 gFailCount++;
1503 goto exit;
1504 }
1505
1506 if( (error = clFinish(gQueue)) )
1507 {
1508 vlog_error( "Failure in clFinish\n" );
1509 gFailCount++;
1510 goto exit;
1511 }
1512 uint64_t currentTime = ReadTime() - startTime;
1513 if( currentTime < bestTime )
1514 bestTime = currentTime;
1515 time[ vectorSize ] += currentTime;
1516 }
1517 if( bestTime < min_time[ vectorSize ] )
1518 min_time[ vectorSize ] = bestTime ;
1519
1520 if( gTestDouble )
1521 {
1522 bestTime = -1ULL;
1523 for( j = 0; j < loopCount; j++ )
1524 {
1525 uint64_t startTime = ReadTime();
1526 if( (error = RunKernel(device, doubleKernels[vectorSize][0], gInBuffer_double, gOutBuffer_half, numVecs(count, vectorSize, aligned) ,
1527 runsOverBy(count, vectorSize, aligned)) ) )
1528 {
1529 gFailCount++;
1530 goto exit;
1531 }
1532
1533 if( (error = clFinish(gQueue)) )
1534 {
1535 vlog_error( "Failure in clFinish\n" );
1536 gFailCount++;
1537 goto exit;
1538 }
1539 uint64_t currentTime = ReadTime() - startTime;
1540 if( currentTime < bestTime )
1541 bestTime = currentTime;
1542 doubleTime[ vectorSize ] += currentTime;
1543 }
1544 if( bestTime < min_double_time[ vectorSize ] )
1545 min_double_time[ vectorSize ] = bestTime;
1546 }
1547 }
1548 }
1549
1550 if( gReportTimes )
1551 {
1552 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1553 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1554 "average us/elem", "vStoreaHalf%s avg. (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1555 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1556 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1557 "best us/elem", "vStoreaHalf%s best (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1558 if( gTestDouble )
1559 {
1560 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1561 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0,
1562 "average us/elem (double)", "vStoreaHalf%s avg. d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1563 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1564 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
1565 "best us/elem (double)", "vStoreaHalf%s best d (%s vector size: %d)", roundName, addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
1566 }
1567 }
1568
1569 exit:
1570 //clean up
1571 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
1572 {
1573 for ( addressSpace = 0; addressSpace < 3; addressSpace++) {
1574 clReleaseKernel( kernels[ vectorSize ][addressSpace] );
1575 clReleaseProgram( programs[ vectorSize ][addressSpace] );
1576 if( gTestDouble )
1577 {
1578 clReleaseKernel( doubleKernels[ vectorSize ][addressSpace] );
1579 clReleaseProgram( doublePrograms[ vectorSize ][addressSpace] );
1580 }
1581 }
1582 }
1583
1584 return error;
1585 }
1586
1587