1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "Utility.h"
17
18 #include <string.h>
19 #include "FunctionList.h"
20
21 #if defined( __APPLE__ )
22 #include <sys/time.h>
23 #endif
24
25 int TestFunc_Float_Float(const Func *f, MTdata);
26 int TestFunc_Double_Double(const Func *f, MTdata);
27
28 extern const vtbl _unary = { "unary", TestFunc_Float_Float,
29 TestFunc_Double_Double };
30
31 static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
32 static int BuildKernelDouble( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p );
33
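// Builds the single-precision test kernel for one vector size. The kernel source is
// assembled by concatenating the string fragments below; the float3 case (c3) is
// special-cased because it must use vload3/vstore3 and guard the partial vector at the
// end of the buffer. kernel_count copies of the kernel are created, one per worker
// thread, since cl_kernel objects are not shared across threads here.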
34 static int BuildKernel( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p )
35 {
36 const char *c[] = {
37 "__kernel void math_kernel", sizeNames[vectorSize], "( __global float", sizeNames[vectorSize], "* out, __global float", sizeNames[vectorSize], "* in)\n"
38 "{\n"
39 " int i = get_global_id(0);\n"
40 " out[i] = ", name, "( in[i] );\n"
41 "}\n"
42 };
43 const char *c3[] = { "__kernel void math_kernel", sizeNames[vectorSize], "( __global float* out, __global float* in)\n"
44 "{\n"
45 " size_t i = get_global_id(0);\n"
46 " if( i + 1 < get_global_size(0) )\n"
47 " {\n"
48 " float3 f0 = vload3( 0, in + 3 * i );\n"
49 " f0 = ", name, "( f0 );\n"
50 " vstore3( f0, 0, out + 3*i );\n"
51 " }\n"
52 " else\n"
53 " {\n"
54 " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
55 " float3 f0;\n"
56 " switch( parity )\n"
57 " {\n"
58 " case 1:\n"
59 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
60 " break;\n"
61 " case 0:\n"
62 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
63 " break;\n"
64 " }\n"
65 " f0 = ", name, "( f0 );\n"
66 " switch( parity )\n"
67 " {\n"
68 " case 0:\n"
69 " out[3*i+1] = f0.y; \n"
70 " // fall through\n"
71 " case 1:\n"
72 " out[3*i] = f0.x; \n"
73 " break;\n"
74 " }\n"
75 " }\n"
76 "}\n"
77 };
78
79
80 const char **kern = c;
81 size_t kernSize = sizeof(c)/sizeof(c[0]);
82
83 if( sizeValues[vectorSize] == 3 )
84 {
85 kern = c3;
86 kernSize = sizeof(c3)/sizeof(c3[0]);
87 }
88
89 char testName[32];
90 snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
91
92 return MakeKernels(kern, (cl_uint) kernSize, testName, kernel_count, k, p);
93 }
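// For illustration only (not part of the build): with sizeNames[vectorSize] == "4" and
// name == "log2" as example values, the concatenated source above would read roughly:
//
//   __kernel void math_kernel4( __global float4* out, __global float4* in)
//   {
//       int i = get_global_id(0);
//       out[i] = log2( in[i] );
//   }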
94
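// Double-precision counterpart of BuildKernel. Identical structure, but the generated
// source starts with the cl_khr_fp64 pragma and operates on double/double3 instead of
// float/float3.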
95 static int BuildKernelDouble( const char *name, int vectorSize, cl_uint kernel_count, cl_kernel *k, cl_program *p )
96 {
97 const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
98 "__kernel void math_kernel", sizeNames[vectorSize], "( __global double", sizeNames[vectorSize], "* out, __global double", sizeNames[vectorSize], "* in)\n"
99 "{\n"
100 " int i = get_global_id(0);\n"
101 " out[i] = ", name, "( in[i] );\n"
102 "}\n"
103 };
104
105 const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
106 "__kernel void math_kernel", sizeNames[vectorSize], "( __global double* out, __global double* in)\n"
107 "{\n"
108 " size_t i = get_global_id(0);\n"
109 " if( i + 1 < get_global_size(0) )\n"
110 " {\n"
111 " double3 f0 = vload3( 0, in + 3 * i );\n"
112 " f0 = ", name, "( f0 );\n"
113 " vstore3( f0, 0, out + 3*i );\n"
114 " }\n"
115 " else\n"
116 " {\n"
117 " size_t parity = i & 1; // Figure out how many elements are left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two buffer size \n"
118 " double3 f0;\n"
119 " switch( parity )\n"
120 " {\n"
121 " case 1:\n"
122 " f0 = (double3)( in[3*i], NAN, NAN ); \n"
123 " break;\n"
124 " case 0:\n"
125 " f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
126 " break;\n"
127 " }\n"
128 " f0 = ", name, "( f0 );\n"
129 " switch( parity )\n"
130 " {\n"
131 " case 0:\n"
132 " out[3*i+1] = f0.y; \n"
133 " // fall through\n"
134 " case 1:\n"
135 " out[3*i] = f0.x; \n"
136 " break;\n"
137 " }\n"
138 " }\n"
139 "}\n"
140 };
141
142 const char **kern = c;
143 size_t kernSize = sizeof(c)/sizeof(c[0]);
144
145 if( sizeValues[vectorSize] == 3 )
146 {
147 kern = c3;
148 kernSize = sizeof(c3)/sizeof(c3[0]);
149 }
150
151
152 char testName[32];
153 snprintf( testName, sizeof( testName ) -1, "math_kernel%s", sizeNames[vectorSize] );
154
155 return MakeKernels(kern, (cl_uint) kernSize, testName, kernel_count, k, p);
156 }
157
158 typedef struct BuildKernelInfo
159 {
160 cl_uint offset; // the first vector size to build
161 cl_uint kernel_count;
162 cl_kernel **kernels;
163 cl_program *programs;
164 const char *nameInCode;
165 }BuildKernelInfo;
166
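// ThreadPool trampolines: each job_id maps to one vector size (offset + job_id), so the
// per-vector-size kernel builds can proceed in parallel on the worker pool.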
167 static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
168 static cl_int BuildKernel_FloatFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
169 {
170 BuildKernelInfo *info = (BuildKernelInfo*) p;
171 cl_uint i = info->offset + job_id;
172 return BuildKernel( info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i );
173 }
174
175 static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p );
176 static cl_int BuildKernel_DoubleFn( cl_uint job_id, cl_uint thread_id UNUSED, void *p )
177 {
178 BuildKernelInfo *info = (BuildKernelInfo*) p;
179 cl_uint i = info->offset + job_id;
180 return BuildKernelDouble( info->nameInCode, i, info->kernel_count, info->kernels[i], info->programs + i );
181 }
182
183 //Thread specific data for a worker thread
184 typedef struct ThreadInfo
185 {
186 cl_mem inBuf; // input buffer for the thread
187 cl_mem outBuf[ VECTOR_SIZE_COUNT ]; // output buffers for the thread
188 float maxError; // max error value. Init to 0.
189 double maxErrorValue; // position of the max error value. Init to 0.
190 cl_command_queue tQueue; // per thread command queue to improve performance
191 }ThreadInfo;
192
193 typedef struct TestInfo
194 {
195 size_t subBufferSize; // Size of the sub-buffer in elements
196 const Func *f; // A pointer to the function info
197 cl_program programs[ VECTOR_SIZE_COUNT ]; // programs for various vector sizes
198 cl_kernel *k[VECTOR_SIZE_COUNT ]; // arrays of thread-specific kernels for each worker thread: k[vector_size][thread_id]
199 ThreadInfo *tinfo; // An array of thread specific information for each worker thread
200 cl_uint threadCount; // Number of worker threads
201 cl_uint jobCount; // Number of jobs
202 cl_uint step; // step between each chunk and the next.
203 cl_uint scale; // stride between individual test values
204 float ulps; // max allowed ulps
205 int ftz; // non-zero if running in flush to zero mode
206
207 int isRangeLimited; // 1 if the function is only to be evaluated over a range
208 float half_sin_cos_tan_limit;
209 }TestInfo;
210
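// Each job covers subBufferSize input values starting at bit pattern job_id * step,
// stepping by scale between consecutive values; with jobCount of roughly 2^32 / step,
// the jobs collectively sweep the whole 32-bit input space (a sparser stride is used
// in wimpy mode).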
211 static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *p );
212
213 int TestFunc_Float_Float(const Func *f, MTdata d)
214 {
215 TestInfo test_info;
216 cl_int error;
217 size_t i, j;
218 float maxError = 0.0f;
219 double maxErrorVal = 0.0;
220 int skipTestingRelaxed = ( gTestFastRelaxed && strcmp(f->name,"tan") == 0 );
221
222 logFunctionInfo(f->name,sizeof(cl_float),gTestFastRelaxed);
223
224 // Init test_info
225 memset( &test_info, 0, sizeof( test_info ) );
226 test_info.threadCount = GetThreadCount();
227
228 test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
229 test_info.scale = 1;
230 if (gWimpyMode)
231 {
232 test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
233 test_info.scale = (cl_uint) sizeof(cl_float) * 2 * gWimpyReductionFactor;
234 }
235 test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
236 if (test_info.step / test_info.subBufferSize != test_info.scale)
237 {
238 //there was overflow
239 test_info.jobCount = 1;
240 }
241 else
242 {
243 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
244 }
245
246 test_info.f = f;
247 test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
248 test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
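// FTZ-style checking is used if the function requires it, the user forced it, or the
// device does not support single-precision denormals.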
249 // cl_kernels aren't thread safe, so we make one for each vector size for every thread
250 for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
251 {
252 size_t array_size = test_info.threadCount * sizeof( cl_kernel );
253 test_info.k[i] = (cl_kernel*)malloc( array_size );
254 if( NULL == test_info.k[i] )
255 {
256 vlog_error( "Error: Unable to allocate storage for kernels!\n" );
257 error = CL_OUT_OF_HOST_MEMORY;
258 goto exit;
259 }
260 memset( test_info.k[i], 0, array_size );
261 }
262 test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
263 if( NULL == test_info.tinfo )
264 {
265 vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
266 error = CL_OUT_OF_HOST_MEMORY;
267 goto exit;
268 }
269 memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
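// Carve a private region of the global input/output buffers out for each worker thread,
// and give each thread its own command queue so threads can submit work independently.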
270 for( i = 0; i < test_info.threadCount; i++ )
271 {
272 cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_float), test_info.subBufferSize * sizeof( cl_float) };
273 test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
274 if( error || NULL == test_info.tinfo[i].inBuf)
275 {
276 vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
277 goto exit;
278 }
279
280 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
281 {
282 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
283 if( error || NULL == test_info.tinfo[i].outBuf[j] )
284 {
285 vlog_error( "Error: Unable to create sub-buffer of gOutBuffer[j] for region {%zd, %zd}\n", region.origin, region.size );
286 goto exit;
287 }
288 }
289 test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
290 if( NULL == test_info.tinfo[i].tQueue || error )
291 {
292 vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
293 goto exit;
294 }
295
296 }
297
298 // Check for special cases for unary float
299 test_info.isRangeLimited = 0;
300 test_info.half_sin_cos_tan_limit = 0;
301 if( 0 == strcmp( f->name, "half_sin") || 0 == strcmp( f->name, "half_cos") )
302 {
303 test_info.isRangeLimited = 1;
304 test_info.half_sin_cos_tan_limit = 1.0f + test_info.ulps * (FLT_EPSILON/2.0f); // out of range results from finite inputs must be in [-1,1]
305 }
306 else if( 0 == strcmp( f->name, "half_tan"))
307 {
308 test_info.isRangeLimited = 1;
309 test_info.half_sin_cos_tan_limit = INFINITY; // out of range result from finite inputs must be numeric
310 }
311
312 // Init the kernels
313 {
314 BuildKernelInfo build_info = { gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode };
315 if( (error = ThreadPool_Do( BuildKernel_FloatFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
316 goto exit;
317 }
318
319 if( !gSkipCorrectnessTesting || skipTestingRelaxed)
320 {
321 error = ThreadPool_Do( TestFloat, test_info.jobCount, &test_info );
322
323 // Accumulate the arithmetic errors
324 for( i = 0; i < test_info.threadCount; i++ )
325 {
326 if( test_info.tinfo[i].maxError > maxError )
327 {
328 maxError = test_info.tinfo[i].maxError;
329 maxErrorVal = test_info.tinfo[i].maxErrorValue;
330 }
331 }
332
333 if( error )
334 goto exit;
335
336 if( gWimpyMode )
337 vlog( "Wimp pass" );
338 else
339 vlog( "passed" );
340
341 if( skipTestingRelaxed )
342 {
343 vlog(" (rlx skip correctness testing)\n");
344 goto exit;
345 }
346 }
347
348 if( gMeasureTimes )
349 {
350 //Init input array
351 uint32_t *p = (uint32_t *)gIn;
352 if( strstr( f->name, "exp" ) || strstr( f->name, "sin" ) || strstr( f->name, "cos" ) || strstr( f->name, "tan" ) )
353 for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
354 ((float*)p)[j] = (float) genrand_real1(d);
355 else if( strstr( f->name, "log" ) )
356 for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
357 p[j] = genrand_int32(d) & 0x7fffffff;
358 else
359 for( j = 0; j < BUFFER_SIZE / sizeof( float ); j++ )
360 p[j] = genrand_int32(d);
361 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
362 {
363 vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
364 return error;
365 }
366
367
368 // Run the kernels
369 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
370 {
371 size_t vectorSize = sizeValues[j] * sizeof(cl_float);
372 size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
373 if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError( test_info.programs[j]); goto exit; }
374 if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
375
376 double sum = 0.0;
377 double bestTime = INFINITY;
378 for( i = 0; i < PERF_LOOP_COUNT; i++ )
379 {
380 uint64_t startTime = GetTime();
381 if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
382 {
383 vlog_error( "FAILED -- could not execute kernel\n" );
384 goto exit;
385 }
386
387 // Make sure OpenCL is done
388 if( (error = clFinish(gQueue) ) )
389 {
390 vlog_error( "Error %d at clFinish\n", error );
391 goto exit;
392 }
393
394 uint64_t endTime = GetTime();
395 double current_time = SubtractTime( endTime, startTime );
396 sum += current_time;
397 if( current_time < bestTime )
398 bestTime = current_time;
399 }
400
401 if( gReportAverageTimes )
402 bestTime = sum / PERF_LOOP_COUNT;
403 double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( float ) );
404 vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s", f->name, sizeNames[j] );
405 }
406 }
407
408 if( ! gSkipCorrectnessTesting )
409 vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
410 vlog( "\n" );
411
412 exit:
413 for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
414 {
415 clReleaseProgram(test_info.programs[i]);
416 if( test_info.k[i] )
417 {
418 for( j = 0; j < test_info.threadCount; j++ )
419 clReleaseKernel(test_info.k[i][j]);
420
421 free( test_info.k[i] );
422 }
423 }
424 if( test_info.tinfo )
425 {
426 for( i = 0; i < test_info.threadCount; i++ )
427 {
428 clReleaseMemObject(test_info.tinfo[i].inBuf);
429 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
430 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
431 clReleaseCommandQueue(test_info.tinfo[i].tQueue);
432 }
433
434 free( test_info.tinfo );
435 }
436
437 return error;
438 }
439
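// Per-job worker: maps the output sub-buffers, generates this job's input bit patterns,
// launches one kernel per vector size, computes the reference results on the host, and
// then compares each device result against the reference within the allowed ulps.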
440 static cl_int TestFloat( cl_uint job_id, cl_uint thread_id, void *data )
441 {
442 const TestInfo *job = (const TestInfo *) data;
443 size_t buffer_elements = job->subBufferSize;
444 size_t buffer_size = buffer_elements * sizeof( cl_float );
445 cl_uint scale = job->scale;
446 cl_uint base = job_id * (cl_uint) job->step;
447 ThreadInfo *tinfo = job->tinfo + thread_id;
448 float ulps = job->ulps;
449 fptr func = job->f->func;
450 const char * fname = job->f->name;
451 if ( gTestFastRelaxed )
452 {
453 ulps = job->f->relaxed_error;
454 func = job->f->rfunc;
455 }
456
457 cl_uint j, k;
458 cl_int error;
459
460 int isRangeLimited = job->isRangeLimited;
461 float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit;
462 int ftz = job->ftz;
463
464 // start the map of the output arrays
465 cl_event e[ VECTOR_SIZE_COUNT ];
466 cl_uint *out[ VECTOR_SIZE_COUNT ];
467 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
468 {
469 out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
470 if( error || NULL == out[j])
471 {
472 vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
473 return error;
474 }
475 }
476
477 // Get that moving
478 if( (error = clFlush(tinfo->tQueue) ))
479 vlog( "clFlush failed\n" );
480
481 // Write the new values to the input array
482 cl_uint *p = (cl_uint*) gIn + thread_id * buffer_elements;
483 for( j = 0; j < buffer_elements; j++ )
484 {
485 p[j] = base + j * scale;
486 if( gTestFastRelaxed )
487 {
488 float p_j = *(float *) &p[j];
489 if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 ) //the domain of the function is [-pi,pi]
490 {
491 if( fabs(p_j) > M_PI )
492 p[j] = NAN;
493 }
494
495 if ( strcmp( fname, "reciprocal" ) == 0 )
496 {
497 if( fabs(p_j) > 0x7E800000 ) //the domain of the function is [2^-126,2^126]
498 p[j] = NAN;
499 }
500 }
501 }
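// Inputs are consecutive 32-bit patterns (base + j * scale) reinterpreted as floats; in
// fast-relaxed mode, values outside the tested domain of sin/cos or reciprocal were
// replaced with NaN above.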
502
503 if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
504 {
505 vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
506 return error;
507 }
508
509 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
510 {
511 //Wait for the map to finish
512 if( (error = clWaitForEvents(1, e + j) ))
513 {
514 vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
515 return error;
516 }
517 if( (error = clReleaseEvent( e[j] ) ))
518 {
519 vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
520 return error;
521 }
522
523 // Fill the result buffer with garbage, so that old results don't carry over
524 uint32_t pattern = 0xffffdead;
525 memset_pattern4(out[j], &pattern, buffer_size);
526 if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
527 {
528 vlog_error( "Error: clEnqueueUnmapMemObject failed! err: %d\n", error );
529 return error;
530 }
531
532 // run the kernel
533 size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
534 cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel
535 cl_program program = job->programs[j];
536
537 if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
538 if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
539
540 if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
541 {
542 vlog_error( "FAILED -- could not execute kernel\n" );
543 return error;
544 }
545 }
546
547
548 // Get that moving
549 if( (error = clFlush(tinfo->tQueue) ))
550 vlog( "clFlush 2 failed\n" );
551
552 if( gSkipCorrectnessTesting )
553 return CL_SUCCESS;
554
555 //Calculate the correctly rounded reference result
556 float *r = (float *)gOut_Ref + thread_id * buffer_elements;
557 float *s = (float *)p;
558 for( j = 0; j < buffer_elements; j++ )
559 r[j] = (float) func.f_f( s[j] );
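// func.f_f is the scalar host reference function from the Func table; it is evaluated
// in double precision and rounded to float to form the reference result.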
560
561 // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
562 for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
563 {
564 out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
565 if( error || NULL == out[j] )
566 {
567 vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
568 return error;
569 }
570 }
571 // Wait for the last buffer
572 out[j] = (uint32_t*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
573 if( error || NULL == out[j] )
574 {
575 vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
576 return error;
577 }
578
579 //Verify data
580 uint32_t *t = (uint32_t *)r;
581 for( j = 0; j < buffer_elements; j++ )
582 {
583 for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
584 {
585 uint32_t *q = out[k];
586
587 // If we aren't getting the correctly rounded result
588 if( t[j] != q[j] )
589 {
590 float test = ((float*) q)[j];
591 double correct = func.f_f( s[j] );
592 float err = Ulp_Error( test, correct );
593 float abs_error = Abs_Error( test, correct );
594 int fail = 0;
595 int use_abs_error = 0;
596
597 // it is possible for the output to not match the reference result but for Ulp_Error
598 // to be zero, for example -1.#QNAN vs. 1.#QNAN. In such cases there is no failure
599 if (err == 0.0f)
600 {
601 fail = 0;
602 }
603 else if( gTestFastRelaxed )
604 {
605 if ( strcmp(fname,"sin")==0 || strcmp(fname,"cos")==0 )
606 {
607 fail = ! (fabsf(abs_error) <= ulps);
608 use_abs_error = 1;
609 }
610
611 if ( strcmp(fname, "reciprocal") == 0 )
612 {
613 fail = ! (fabsf(err) <= ulps);
614 }
615
616 if ( strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0 )
617 {
618
619 float exp_error = 3+floor(fabs(2*s[j]));
620 fail = ! (fabsf(err) <= exp_error);
621 ulps = exp_error;
622 }
623 if (strcmp(fname, "tan") == 0) {
624
625 if( !gFastRelaxedDerived )
626 {
627 fail = ! (fabsf(err) <= ulps);
628 }
629 // Else fast math derived implementation does not require ULP verification
630 }
631 if (strcmp(fname, "exp10") == 0)
632 {
633 if( !gFastRelaxedDerived )
634 {
635 fail = ! (fabsf(err) <= ulps);
636 }
637 // Else fast math derived implementation does not require ULP verification
638 }
639 if ( strcmp(fname,"log") == 0 || strcmp(fname,"log2") == 0 )
640 {
641 if( s[j] >= 0.5 && s[j] <= 2 )
642 {
643 fail = ! (fabsf(abs_error) <= ulps );
644 }
645 else
646 {
647 ulps = gIsEmbedded ? job->f->float_embedded_ulps : job->f->float_ulps;
648 fail = ! (fabsf(err) <= ulps);
649 }
650
651 }
652
653
654 // fast-relaxed implies finite-only
655 if( IsFloatInfinity(correct) || IsFloatNaN(correct) ||
656 IsFloatInfinity(s[j]) || IsFloatNaN(s[j]) ) {
657 fail = 0;
658 err = 0;
659 }
660 }
661 else
662 {
663 fail = ! (fabsf(err) <= ulps);
664 }
665
666 // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
667 if( isRangeLimited && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) && fabsf(s[j]) < INFINITY )
668 {
669 if( fabsf( test ) <= half_sin_cos_tan_limit )
670 {
671 err = 0;
672 fail = 0;
673 }
674 }
675
676 if( fail )
677 {
678 if( ftz )
679 {
680 typedef int (*CheckForSubnormal) (double,float); // If we are in fast relaxed math, we have a different calculation for the subnormal threshold.
681 CheckForSubnormal isFloatResultSubnormalPtr;
682
683 if ( gTestFastRelaxed )
684 {
685 isFloatResultSubnormalPtr = &IsFloatResultSubnormalAbsError;
686 }
687 else
688 {
689 isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
690 }
691 // retry per section 6.5.3.2
692 if( (*isFloatResultSubnormalPtr)(correct, ulps) )
693 {
694 fail = fail && ( test != 0.0f );
695 if( ! fail )
696 err = 0.0f;
697 }
698
699 // retry per section 6.5.3.3
700 if( IsFloatSubnormal( s[j] ) )
701 {
702 double correct2 = func.f_f( 0.0 );
703 double correct3 = func.f_f( -0.0 );
704 float err2;
705 float err3;
706 if( use_abs_error )
707 {
708 err2 = Abs_Error( test, correct2 );
709 err3 = Abs_Error( test, correct3 );
710 }
711 else
712 {
713 err2 = Ulp_Error( test, correct2 );
714 err3 = Ulp_Error( test, correct3 );
715 }
716 fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
717 if( fabsf( err2 ) < fabsf(err ) )
718 err = err2;
719 if( fabsf( err3 ) < fabsf(err ) )
720 err = err3;
721
722 // retry per section 6.5.3.4
723 if( (*isFloatResultSubnormalPtr)(correct2, ulps ) || (*isFloatResultSubnormalPtr)(correct3, ulps ) )
724 {
725 fail = fail && ( test != 0.0f);
726 if( ! fail )
727 err = 0.0f;
728 }
729 }
730 }
731 }
732 if( fabsf(err ) > tinfo->maxError )
733 {
734 tinfo->maxError = fabsf(err);
735 tinfo->maxErrorValue = s[j];
736 }
737 if( fail )
738 {
739 vlog_error( "\nERROR: %s%s: %f ulp error at %a (0x%8.8x): *%a vs. %a\n", job->f->name, sizeNames[k], err, ((float*) s)[j], ((uint32_t*) s)[j], ((float*) t)[j], test);
740 return -1;
741 }
742 }
743 }
744 }
745
746 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
747 {
748 if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
749 {
750 vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
751 return error;
752 }
753 }
754
755 if( (error = clFlush(tinfo->tQueue) ))
756 vlog( "clFlush 3 failed\n" );
757
758
759 if( 0 == ( base & 0x0fffffff) )
760 {
761 if (gVerboseBruteForce)
762 {
763 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount);
764 } else
765 {
766 vlog("." );
767 }
768 fflush(stdout);
769 }
770
771 return CL_SUCCESS;
772 }
773
774
775
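// Double-precision analogue of TestFloat: same map/write/launch/verify flow, but inputs
// are derived via DoubleFromUInt32, the reference is computed in long double, and the
// fast-relaxed special cases do not apply.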
776 static cl_int TestDouble( cl_uint job_id, cl_uint thread_id, void *data )
777 {
778 const TestInfo *job = (const TestInfo *) data;
779 size_t buffer_elements = job->subBufferSize;
780 size_t buffer_size = buffer_elements * sizeof( cl_double );
781 cl_uint scale = job->scale;
782 cl_uint base = job_id * (cl_uint) job->step;
783 ThreadInfo *tinfo = job->tinfo + thread_id;
784 float ulps = job->ulps;
785 dptr func = job->f->dfunc;
786 cl_uint j, k;
787 cl_int error;
788 int ftz = job->ftz;
789
790 Force64BitFPUPrecision();
791
792 // start the map of the output arrays
793 cl_event e[ VECTOR_SIZE_COUNT ];
794 cl_ulong *out[ VECTOR_SIZE_COUNT ];
795 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
796 {
797 out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, buffer_size, 0, NULL, e + j, &error);
798 if( error || NULL == out[j])
799 {
800 vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
801 return error;
802 }
803 }
804
805 // Get that moving
806 if( (error = clFlush(tinfo->tQueue) ))
807 vlog( "clFlush failed\n" );
808
809 // Write the new values to the input array
810 cl_double *p = (cl_double*) gIn + thread_id * buffer_elements;
811 for( j = 0; j < buffer_elements; j++ )
812 p[j] = DoubleFromUInt32( base + j * scale);
813
814 if( (error = clEnqueueWriteBuffer( tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, buffer_size, p, 0, NULL, NULL) ))
815 {
816 vlog_error( "Error: clEnqueueWriteBuffer failed! err: %d\n", error );
817 return error;
818 }
819
820 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
821 {
822 //Wait for the map to finish
823 if( (error = clWaitForEvents(1, e + j) ))
824 {
825 vlog_error( "Error: clWaitForEvents failed! err: %d\n", error );
826 return error;
827 }
828 if( (error = clReleaseEvent( e[j] ) ))
829 {
830 vlog_error( "Error: clReleaseEvent failed! err: %d\n", error );
831 return error;
832 }
833
834 // Fill the result buffer with garbage, so that old results don't carry over
835 uint32_t pattern = 0xffffdead;
836 memset_pattern4(out[j], &pattern, buffer_size);
837 if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL) ))
838 {
839 vlog_error( "Error: clEnqueueUnmapMemObject failed! err: %d\n", error );
840 return error;
841 }
842
843 // run the kernel
844 size_t vectorCount = (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
845 cl_kernel kernel = job->k[j][thread_id]; //each worker thread has its own copy of the cl_kernel
846 cl_program program = job->programs[j];
847
848 if( ( error = clSetKernelArg( kernel, 0, sizeof( tinfo->outBuf[j] ), &tinfo->outBuf[j] ))){ LogBuildError(program); return error; }
849 if( ( error = clSetKernelArg( kernel, 1, sizeof( tinfo->inBuf ), &tinfo->inBuf ) )) { LogBuildError(program); return error; }
850
851 if( (error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, &vectorCount, NULL, 0, NULL, NULL)))
852 {
853 vlog_error( "FAILED -- could not execute kernel\n" );
854 return error;
855 }
856 }
857
858
859 // Get that moving
860 if( (error = clFlush(tinfo->tQueue) ))
861 vlog( "clFlush 2 failed\n" );
862
863 if( gSkipCorrectnessTesting )
864 return CL_SUCCESS;
865
866 //Calculate the correctly rounded reference result
867 cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
868 cl_double *s = (cl_double *)p;
869 for( j = 0; j < buffer_elements; j++ )
870 r[j] = (cl_double) func.f_f( s[j] );
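// The reference is computed with the long double host function (dfunc.f_f), giving
// extra precision for the ulp comparison done by Bruteforce_Ulp_Error_Double below.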
871
872 // Read the data back -- no need to wait for the first N-1 buffers. This is an in order queue.
873 for( j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++ )
874 {
875 out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
876 if( error || NULL == out[j] )
877 {
878 vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
879 return error;
880 }
881 }
882 // Wait for the last buffer
883 out[j] = (cl_ulong*) clEnqueueMapBuffer( tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error);
884 if( error || NULL == out[j] )
885 {
886 vlog_error( "Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error );
887 return error;
888 }
889
890
891 //Verify data
892 cl_ulong *t = (cl_ulong *)r;
893 for( j = 0; j < buffer_elements; j++ )
894 {
895 for( k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++ )
896 {
897 cl_ulong *q = out[k];
898
899 // If we aren't getting the correctly rounded result
900 if( t[j] != q[j] )
901 {
902 cl_double test = ((cl_double*) q)[j];
903 long double correct = func.f_f( s[j] );
904 float err = Bruteforce_Ulp_Error_Double( test, correct );
905 int fail = ! (fabsf(err) <= ulps);
906
907 if( fail )
908 {
909 if( ftz )
910 {
911 // retry per section 6.5.3.2
912 if( IsDoubleResultSubnormal(correct, ulps) )
913 {
914 fail = fail && ( test != 0.0f );
915 if( ! fail )
916 err = 0.0f;
917 }
918
919 // retry per section 6.5.3.3
920 if( IsDoubleSubnormal( s[j] ) )
921 {
922 long double correct2 = func.f_f( 0.0L );
923 long double correct3 = func.f_f( -0.0L );
924 float err2 = Bruteforce_Ulp_Error_Double( test, correct2 );
925 float err3 = Bruteforce_Ulp_Error_Double( test, correct3 );
926 fail = fail && ((!(fabsf(err2) <= ulps)) && (!(fabsf(err3) <= ulps)));
927 if( fabsf( err2 ) < fabsf(err ) )
928 err = err2;
929 if( fabsf( err3 ) < fabsf(err ) )
930 err = err3;
931
932 // retry per section 6.5.3.4
933 if( IsDoubleResultSubnormal(correct2, ulps ) || IsDoubleResultSubnormal(correct3, ulps ) )
934 {
935 fail = fail && ( test != 0.0f);
936 if( ! fail )
937 err = 0.0f;
938 }
939 }
940 }
941 }
942 if( fabsf(err ) > tinfo->maxError )
943 {
944 tinfo->maxError = fabsf(err);
945 tinfo->maxErrorValue = s[j];
946 }
947 if( fail )
948 {
949 vlog_error( "\nERROR: %s%s: %f ulp error at %.13la (0x%16.16llx): *%.13la vs. %.13la\n", job->f->name, sizeNames[k], err, s[j], ((cl_ulong*) s)[j], ((cl_double*) t)[j], test );
950 return -1;
951 }
952 }
953 }
954 }
955
956 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
957 {
958 if( (error = clEnqueueUnmapMemObject( tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)) )
959 {
960 vlog_error( "Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error );
961 return error;
962 }
963 }
964
965 if( (error = clFlush(tinfo->tQueue) ))
966 vlog( "clFlush 3 failed\n" );
967
968
969 if( 0 == ( base & 0x0fffffff) )
970 {
971 if (gVerboseBruteForce)
972 {
973 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount);
974 } else
975 {
976 vlog("." );
977 }
978 fflush(stdout);
979 }
980
981 return CL_SUCCESS;
982 }
983
984 int TestFunc_Double_Double(const Func *f, MTdata d)
985 {
986 TestInfo test_info;
987 cl_int error;
988 size_t i, j;
989 float maxError = 0.0f;
990 double maxErrorVal = 0.0;
991 #if defined( __APPLE__ )
992 struct timeval time_val;
993 gettimeofday( &time_val, NULL );
994 double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
995 double end_time;
996 #endif
997
998 logFunctionInfo(f->name,sizeof(cl_double),gTestFastRelaxed);
999 // Init test_info
1000 memset( &test_info, 0, sizeof( test_info ) );
1001 test_info.threadCount = GetThreadCount();
1002 test_info.subBufferSize = BUFFER_SIZE / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
1003 test_info.scale = 1;
1004 if (gWimpyMode)
1005 {
1006 test_info.subBufferSize = gWimpyBufferSize / (sizeof( cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
1007 test_info.scale = (cl_uint) sizeof(cl_double) * 2 * gWimpyReductionFactor;
1008 }
1009 test_info.step = (cl_uint) test_info.subBufferSize * test_info.scale;
1010 if (test_info.step / test_info.subBufferSize != test_info.scale)
1011 {
1012 //there was overflow
1013 test_info.jobCount = 1;
1014 }
1015 else
1016 {
1017 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
1018 }
1019
1020 test_info.f = f;
1021 test_info.ulps = f->double_ulps;
1022 test_info.ftz = f->ftz || gForceFTZ;
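// Note: unlike the float path, the device's double-precision capabilities are not
// consulted here; FTZ checking depends only on the function entry and gForceFTZ.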
1023
1024 // cl_kernels aren't thread safe, so we make one for each vector size for every thread
1025 for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
1026 {
1027 size_t array_size = test_info.threadCount * sizeof( cl_kernel );
1028 test_info.k[i] = (cl_kernel*)malloc( array_size );
1029 if( NULL == test_info.k[i] )
1030 {
1031 vlog_error( "Error: Unable to allocate storage for kernels!\n" );
1032 error = CL_OUT_OF_HOST_MEMORY;
1033 goto exit;
1034 }
1035 memset( test_info.k[i], 0, array_size );
1036 }
1037 test_info.tinfo = (ThreadInfo*)malloc( test_info.threadCount * sizeof(*test_info.tinfo) );
1038 if( NULL == test_info.tinfo )
1039 {
1040 vlog_error( "Error: Unable to allocate storage for thread specific data.\n" );
1041 error = CL_OUT_OF_HOST_MEMORY;
1042 goto exit;
1043 }
1044 memset( test_info.tinfo, 0, test_info.threadCount * sizeof(*test_info.tinfo) );
1045 for( i = 0; i < test_info.threadCount; i++ )
1046 {
1047 cl_buffer_region region = { i * test_info.subBufferSize * sizeof( cl_double), test_info.subBufferSize * sizeof( cl_double) };
1048 test_info.tinfo[i].inBuf = clCreateSubBuffer( gInBuffer, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
1049 if( error || NULL == test_info.tinfo[i].inBuf)
1050 {
1051 vlog_error( "Error: Unable to create sub-buffer of gInBuffer for region {%zd, %zd}\n", region.origin, region.size );
1052 goto exit;
1053 }
1054
1055 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
1056 {
1057 /* Qualcomm fix: 9461 read-write flags must be compatible with parent buffer */
1058 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
1059 /* Qualcomm fix: end */
1060 if( error || NULL == test_info.tinfo[i].outBuf[j] )
1061 {
1062 vlog_error( "Error: Unable to create sub-buffer of gOutBuffer[j] for region {%zd, %zd}\n", region.origin, region.size );
1063 goto exit;
1064 }
1065 }
1066 test_info.tinfo[i].tQueue = clCreateCommandQueue(gContext, gDevice, 0, &error);
1067 if( NULL == test_info.tinfo[i].tQueue || error )
1068 {
1069 vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
1070 goto exit;
1071 }
1072 }
1073
1074 // Init the kernels
1075 {
1076 BuildKernelInfo build_info = { gMinVectorSizeIndex, test_info.threadCount, test_info.k, test_info.programs, f->nameInCode };
1077 if( (error = ThreadPool_Do( BuildKernel_DoubleFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info ) ))
1078 goto exit;
1079 }
1080
1081 if( !gSkipCorrectnessTesting )
1082 {
1083 error = ThreadPool_Do( TestDouble, test_info.jobCount, &test_info );
1084
1085 // Accumulate the arithmetic errors
1086 for( i = 0; i < test_info.threadCount; i++ )
1087 {
1088 if( test_info.tinfo[i].maxError > maxError )
1089 {
1090 maxError = test_info.tinfo[i].maxError;
1091 maxErrorVal = test_info.tinfo[i].maxErrorValue;
1092 }
1093 }
1094
1095 if( error )
1096 goto exit;
1097
1098 if( gWimpyMode )
1099 vlog( "Wimp pass" );
1100 else
1101 vlog( "passed" );
1102 }
1103
1104
1105 #if defined( __APPLE__ )
1106 gettimeofday( &time_val, NULL);
1107 end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
1108 #endif
1109
1110 if( gMeasureTimes )
1111 {
1112 //Init input array
1113 double *p = (double *)gIn;
1114
1115 if( strstr( f->name, "exp" ) )
1116 for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
1117 p[j] = (double)genrand_real1(d);
1118 else if( strstr( f->name, "log" ) )
1119 for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
1120 p[j] = fabs(DoubleFromUInt32( genrand_int32(d)));
1121 else
1122 for( j = 0; j < BUFFER_SIZE / sizeof( double ); j++ )
1123 p[j] = DoubleFromUInt32( genrand_int32(d) );
1124 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, BUFFER_SIZE, gIn, 0, NULL, NULL) ))
1125 {
1126 vlog_error( "\n*** Error %d in clEnqueueWriteBuffer ***\n", error );
1127 return error;
1128 }
1129
1130
1131 // Run the kernels
1132 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
1133 {
1134 size_t vectorSize = sizeValues[j] * sizeof(cl_double);
1135 size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
1136 if( ( error = clSetKernelArg( test_info.k[j][0], 0, sizeof( gOutBuffer[j] ), &gOutBuffer[j] ) )) { LogBuildError(test_info.programs[j]); goto exit; }
1137 if( ( error = clSetKernelArg( test_info.k[j][0], 1, sizeof( gInBuffer ), &gInBuffer ) )) { LogBuildError(test_info.programs[j]); goto exit; }
1138
1139 double sum = 0.0;
1140 double bestTime = INFINITY;
1141 for( i = 0; i < PERF_LOOP_COUNT; i++ )
1142 {
1143 uint64_t startTime = GetTime();
1144 if( (error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0], 1, NULL, &localCount, NULL, 0, NULL, NULL)) )
1145 {
1146 vlog_error( "FAILED -- could not execute kernel\n" );
1147 goto exit;
1148 }
1149
1150 // Make sure OpenCL is done
1151 if( (error = clFinish(gQueue) ) )
1152 {
1153 vlog_error( "Error %d at clFinish\n", error );
1154 goto exit;
1155 }
1156
1157 uint64_t endTime = GetTime();
1158 double current_time = SubtractTime( endTime, startTime );
1159 sum += current_time;
1160 if( current_time < bestTime )
1161 bestTime = current_time;
1162 }
1163
1164 if( gReportAverageTimes )
1165 bestTime = sum / PERF_LOOP_COUNT;
1166 double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (BUFFER_SIZE / sizeof( double ) );
1167 vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s", f->name, sizeNames[j] );
1168 }
1169 for( ; j < gMaxVectorSizeIndex; j++ )
1170 vlog( "\t -- " );
1171 }
1172
1173 if( ! gSkipCorrectnessTesting )
1174 vlog( "\t%8.2f @ %a", maxError, maxErrorVal );
1175
1176 #if defined( __APPLE__ )
1177 vlog( "\t(%2.2f seconds)", end_time - start_time );
1178 #endif
1179 vlog( "\n" );
1180
1181 exit:
1182 for( i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++ )
1183 {
1184 clReleaseProgram(test_info.programs[i]);
1185 if( test_info.k[i] )
1186 {
1187 for( j = 0; j < test_info.threadCount; j++ )
1188 clReleaseKernel(test_info.k[i][j]);
1189
1190 free( test_info.k[i] );
1191 }
1192 }
1193 if( test_info.tinfo )
1194 {
1195 for( i = 0; i < test_info.threadCount; i++ )
1196 {
1197 clReleaseMemObject(test_info.tinfo[i].inBuf);
1198 for( j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++ )
1199 clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
1200 clReleaseCommandQueue(test_info.tinfo[i].tQueue);
1201 }
1202
1203 free( test_info.tinfo );
1204 }
1205
1206 return error;
1207 }
1208
1209
1210