1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17
18 #include "testBase.h"
19 #include "harness/testHarness.h"
20 #include "harness/typeWrappers.h"
21 #include "harness/conversions.h"
22 #include "harness/errorHelpers.h"
23 #include <float.h>
24
// OpenCL C kernel sources used by the geometric-function tests below.
// The "V3" variants handle float3: three-wide vectors are read/written with
// vload3/vstore3 so the host buffers stay tightly packed (3 floats/element).
const char *crossKernelSource =
"__kernel void sample_test(__global float4 *sourceA, __global float4 *sourceB, __global float4 *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n"
"\n"
"}\n" ;

const char *crossKernelSourceV3 =
"__kernel void sample_test(__global float *sourceA, __global float *sourceB, __global float *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    vstore3( cross( vload3( tid, sourceA), vload3( tid, sourceB) ), tid, destValues );\n"
"\n"
"}\n";

// Patterns below take printf-style arguments: vector-width suffix(es)
// ("", "2", "3", "4", "8", "16") followed by the builtin function name.

// (vecN, vecN) -> float, e.g. dot, distance, fast_distance.
const char *twoToFloatKernelPattern =
"__kernel void sample_test(__global float%s *sourceA, __global float%s *sourceB, __global float *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n"
"\n"
"}\n";

const char *twoToFloatKernelPatternV3 =
"__kernel void sample_test(__global float%s *sourceA, __global float%s *sourceB, __global float *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    destValues[tid] = %s( vload3( tid, (__global float*) sourceA), vload3( tid, (__global float*) sourceB) );\n"
"\n"
"}\n";

// (vecN) -> float, e.g. length, fast_length.
const char *oneToFloatKernelPattern =
"__kernel void sample_test(__global float%s *sourceA, __global float *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    destValues[tid] = %s( sourceA[tid] );\n"
"\n"
"}\n";

const char *oneToFloatKernelPatternV3 =
"__kernel void sample_test(__global float%s *sourceA, __global float *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    destValues[tid] = %s( vload3( tid, (__global float*) sourceA) );\n"
"\n"
"}\n";

// (vecN) -> vecN, e.g. normalize, fast_normalize.
const char *oneToOneKernelPattern =
"__kernel void sample_test(__global float%s *sourceA, __global float%s *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    destValues[tid] = %s( sourceA[tid] );\n"
"\n"
"}\n";

const char *oneToOneKernelPatternV3 =
"__kernel void sample_test(__global float%s *sourceA, __global float%s *destValues)\n"
"{\n"
"    int tid = get_global_id(0);\n"
"    vstore3( %s( vload3( tid, (__global float*) sourceA) ), tid, (__global float*) destValues );\n"
"\n"
"}\n";
88
// Number of test vectors exercised per kernel run (1M elements).
#define TEST_SIZE (1 << 20)

// Forward declarations: the kernel-test helpers compare verifyFn against
// these to detect the fast_ variants, whose inputs must be range-clamped.
double verifyFastDistance( float *srcA, float *srcB, size_t vecSize );
double verifyFastLength( float *srcA, size_t vecSize );
93
94
95
// Render a float vector into `string` as "{ <hex>, <hex>, ... }" using the
// %a hexadecimal-float format. The caller supplies a buffer large enough for
// `elements` formatted values plus the braces; the result is NUL-terminated.
void vector2string( char *string, float *vector, size_t elements )
{
    char *cursor = string;
    cursor += sprintf( cursor, "{ %a", vector[ 0 ] );
    for( size_t idx = 1; idx < elements; idx++ )
        cursor += sprintf( cursor, ", %a", vector[ idx ] );
    sprintf( cursor, " }" );
}
108
// Seed the front of aVectors (and optionally bVectors) with combinations of
// edge-case values: +/-FLT_EPSILON, large and tiny powers of two, FLT_MAX/2,
// infinities and signed zeros. Components cycle through the table at
// different rates (see stride[]) so different value pairings are produced.
void fillWithTrickyNumbers( float *aVectors, float *bVectors, size_t vecSize )
{
    // NOTE(review): -0x1.8p-63f appears both here in the large-magnitude row
    // (where -0x1.8p63f might be expected) and again in the small-magnitude
    // row below — confirm whether the duplication is intentional.
    static const cl_float trickyValues[] = { -FLT_EPSILON, FLT_EPSILON,
        MAKE_HEX_FLOAT(0x1.0p63f, 0x1L, 63), MAKE_HEX_FLOAT(0x1.8p63f, 0x18L, 59), MAKE_HEX_FLOAT(0x1.0p64f, 0x1L, 64), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.8p-63f, -0x18L, -67), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
        MAKE_HEX_FLOAT(0x1.0p-63f, 0x1L, -63), MAKE_HEX_FLOAT(0x1.8p-63f, 0x18L, -67), MAKE_HEX_FLOAT(0x1.0p-64f, 0x1L, -64), MAKE_HEX_FLOAT(-0x1.0p-63f, -0x1L, -63), MAKE_HEX_FLOAT(-0x1.8p-63f, -0x18L, -67), MAKE_HEX_FLOAT(-0x1.0p-64f, -0x1L, -64),
        FLT_MAX / 2.f, -FLT_MAX / 2.f, INFINITY, -INFINITY, 0.f, -0.f };
    static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] );
    // stride[j] controls how fast component j walks through the table, so
    // that each component index samples a different slice of combinations.
    static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount };
    size_t i, j, k;

    for( j = 0; j < vecSize; j++ )
        for( k = 0; k < vecSize; k++ )
            for( i = 0; i < trickyCount; i++ )
                aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i];

    if( bVectors )
    {
        // Pair A's tricky values against zeros in B, then mirror: zeros in A
        // against the same tricky values copied into the next region of B.
        size_t copySize = vecSize * vecSize * trickyCount;
        memset( bVectors, 0, sizeof(float) * copySize );
        memset( aVectors + copySize, 0, sizeof(float) * copySize );
        memcpy( bVectors + copySize, aVectors, sizeof(float) * copySize );
    }
}
132
133
// Host reference for the 3-component cross product of vecA and vecB, written
// into outVector (w forced to 0). Also derives a per-component absolute
// error tolerance: the largest magnitude among that component's four
// operands, squared, scaled by ulpTolerance ULPs — i.e. the worst-case
// variance we could expect from this result.
void cross_product( const float *vecA, const float *vecB, float *outVector, float *errorTolerances, float ulpTolerance )
{
    outVector[ 0 ] = ( vecA[ 1 ] * vecB[ 2 ] ) - ( vecA[ 2 ] * vecB[ 1 ] );
    outVector[ 1 ] = ( vecA[ 2 ] * vecB[ 0 ] ) - ( vecA[ 0 ] * vecB[ 2 ] );
    outVector[ 2 ] = ( vecA[ 0 ] * vecB[ 1 ] ) - ( vecA[ 1 ] * vecB[ 0 ] );
    outVector[ 3 ] = 0.0f;

    const float scale = ulpTolerance * FLT_EPSILON;
    // Operand indices feeding each output component: component c uses
    // vecA[idxP[c]]*vecB[idxQ[c]] - vecA[idxQ[c]]*vecB[idxP[c]].
    static const int idxP[3] = { 1, 2, 0 };
    static const int idxQ[3] = { 2, 0, 1 };
    for( int c = 0; c < 3; c++ )
    {
        float m = fmaxf( fabsf( vecA[ idxP[c] ] ), fabsf( vecB[ idxQ[c] ] ) );
        m = fmaxf( m, fmaxf( fabsf( vecA[ idxQ[c] ] ), fabsf( vecB[ idxP[c] ] ) ) );
        errorTolerances[ c ] = m * m * scale;
    }
}
149
150
151
152
test_geom_cross(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)153 int test_geom_cross(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements )
154 {
155 int vecsize;
156 RandomSeed seed(gRandomSeed);
157
158 /* Get the default rounding mode */
159 cl_device_fp_config defaultRoundingMode = get_default_rounding_mode(deviceID);
160 if( 0 == defaultRoundingMode )
161 return -1;
162
163
164 for(vecsize = 3; vecsize <= 4; ++vecsize)
165 {
166 clProgramWrapper program;
167 clKernelWrapper kernel;
168 clMemWrapper streams[3];
169 BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
170 BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
171 BufferOwningPtr<cl_float> C(malloc(sizeof(cl_float) * TEST_SIZE * vecsize));
172 cl_float testVector[4];
173 int error, i;
174 cl_float *inDataA = A;
175 cl_float *inDataB = B;
176 cl_float *outData = C;
177 size_t threads[1], localThreads[1];
178
179 /* Create kernels */
180 if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSourceV3 : &crossKernelSource, "sample_test" ) )
181 return -1;
182
183 /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
184 for( i = 0; i < TEST_SIZE * vecsize; i++ )
185 {
186 inDataA[ i ] = get_random_float( -512.f, 512.f, seed );
187 inDataB[ i ] = get_random_float( -512.f, 512.f, seed );
188 }
189 fillWithTrickyNumbers( inDataA, inDataB, vecsize );
190
191 streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
192 sizeof(cl_float) * vecsize * TEST_SIZE,
193 inDataA, NULL);
194 if( streams[0] == NULL )
195 {
196 log_error("ERROR: Creating input array A failed!\n");
197 return -1;
198 }
199 streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
200 sizeof(cl_float) * vecsize * TEST_SIZE,
201 inDataB, NULL);
202 if( streams[1] == NULL )
203 {
204 log_error("ERROR: Creating input array B failed!\n");
205 return -1;
206 }
207 streams[2] =
208 clCreateBuffer(context, CL_MEM_READ_WRITE,
209 sizeof(cl_float) * vecsize * TEST_SIZE, NULL, NULL);
210 if( streams[2] == NULL )
211 {
212 log_error("ERROR: Creating output array failed!\n");
213 return -1;
214 }
215
216 /* Assign streams and execute */
217 for( i = 0; i < 3; i++ )
218 {
219 error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
220 test_error( error, "Unable to set indexed kernel arguments" );
221 }
222
223 /* Run the kernel */
224 threads[0] = TEST_SIZE;
225
226 error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
227 test_error( error, "Unable to get work group size to use" );
228
229 error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
230 test_error( error, "Unable to execute test kernel" );
231
232 /* Now get the results */
233 error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_float ) * TEST_SIZE * vecsize, outData, 0, NULL, NULL );
234 test_error( error, "Unable to read output array!" );
235
236 /* And verify! */
237 for( i = 0; i < TEST_SIZE; i++ )
238 {
239 float errorTolerances[ 4 ];
240 // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
241 cross_product( inDataA + i * vecsize, inDataB + i * vecsize, testVector, errorTolerances, 3.f );
242
243 // RTZ devices accrue approximately double the amount of error per operation. Allow for that.
244 if( defaultRoundingMode == CL_FP_ROUND_TO_ZERO )
245 {
246 errorTolerances[0] *= 2.0f;
247 errorTolerances[1] *= 2.0f;
248 errorTolerances[2] *= 2.0f;
249 errorTolerances[3] *= 2.0f;
250 }
251
252 float errs[] = { fabsf( testVector[ 0 ] - outData[ i * vecsize + 0 ] ),
253 fabsf( testVector[ 1 ] - outData[ i * vecsize + 1 ] ),
254 fabsf( testVector[ 2 ] - outData[ i * vecsize + 2 ] ) };
255
256 if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
257 {
258 log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
259 i, testVector[0], testVector[1], testVector[2], testVector[3],
260 outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] );
261 log_error( " Input: (%a %a %a) and (%a %a %a)\n",
262 inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ],
263 inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] );
264 log_error( " Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n",
265 errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] );
266 log_error(" ulp %f\n", Ulp_Error( outData[ i * vecsize + 1 ], testVector[ 1 ] ) );
267 return -1;
268 }
269 }
270 } // for(vecsize=...
271
272 if(!is_extension_available(deviceID, "cl_khr_fp64")) {
273 log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
274 return 0;
275 } else {
276 log_info("Testing doubles...\n");
277 return test_geom_cross_double( deviceID, context, queue, num_elements, seed);
278 }
279 }
280
// Return the largest absolute value found across both vecSize-element vectors.
float getMaxValue( float vecA[], float vecB[], size_t vecSize )
{
    float best = fmaxf( fabsf( vecA[ 0 ] ), fabsf( vecB[ 0 ] ) );
    for( size_t k = 1; k < vecSize; k++ )
    {
        float pairMax = fmaxf( fabsf( vecA[ k ] ), fabsf( vecB[ k ] ) );
        best = fmaxf( best, pairMax );
    }
    return best;
}
288
289 typedef double (*twoToFloatVerifyFn)( float *srcA, float *srcB, size_t vecSize );
290
test_twoToFloat_kernel(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,twoToFloatVerifyFn verifyFn,float ulpLimit,MTdata d)291 int test_twoToFloat_kernel(cl_command_queue queue, cl_context context, const char *fnName,
292 size_t vecSize, twoToFloatVerifyFn verifyFn, float ulpLimit, MTdata d )
293 {
294 clProgramWrapper program;
295 clKernelWrapper kernel;
296 clMemWrapper streams[3];
297 int error;
298 size_t i, threads[1], localThreads[1];
299 char kernelSource[10240];
300 char *programPtr;
301 char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
302 int hasInfNan = 1;
303 cl_device_id device = NULL;
304
305 error = clGetCommandQueueInfo( queue, CL_QUEUE_DEVICE, sizeof( device ), &device, NULL );
306 test_error( error, "Unable to get command queue device" );
307
308 /* Check for embedded devices doing nutty stuff */
309 error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( kernelSource ), kernelSource, NULL );
310 test_error( error, "Unable to get device profile" );
311 if( 0 == strcmp( kernelSource, "EMBEDDED_PROFILE" ) )
312 {
313 cl_device_fp_config config = 0;
314 error = clGetDeviceInfo( device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( config ), &config, NULL );
315 test_error( error, "Unable to get CL_DEVICE_SINGLE_FP_CONFIG" );
316
317 if( CL_FP_ROUND_TO_ZERO == (config & (CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO)))
318 ulpLimit *= 2.0f; // rtz operations average twice the accrued error of rte operations
319
320 if( 0 == (config & CL_FP_INF_NAN) )
321 hasInfNan = 0;
322 }
323
324 BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * 4));
325 BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE * 4));
326 BufferOwningPtr<cl_float> C(malloc(sizeof(cl_float) * TEST_SIZE));
327
328 cl_float *inDataA = A;
329 cl_float *inDataB = B;
330 cl_float *outData = C;
331
332 /* Create the source */
333 sprintf( kernelSource, vecSize == 3 ? twoToFloatKernelPatternV3 : twoToFloatKernelPattern, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
334
335 /* Create kernels */
336 programPtr = kernelSource;
337 if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
338 {
339 return -1;
340 }
341 /* Generate some streams */
342 for( i = 0; i < TEST_SIZE * vecSize; i++ )
343 {
344 inDataA[ i ] = get_random_float( -512.f, 512.f, d );
345 inDataB[ i ] = get_random_float( -512.f, 512.f, d );
346 }
347 fillWithTrickyNumbers( inDataA, inDataB, vecSize );
348
349 /* Clamp values to be in range for fast_ functions */
350 if( verifyFn == verifyFastDistance )
351 {
352 for( i = 0; i < TEST_SIZE * vecSize; i++ )
353 {
354 if( fabsf( inDataA[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataA[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
355 inDataA[ i ] = get_random_float( -512.f, 512.f, d );
356 if( fabsf( inDataB[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataB[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
357 inDataB[ i ] = get_random_float( -512.f, 512.f, d );
358 }
359 }
360
361
362 streams[0] =
363 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
364 sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
365 if( streams[0] == NULL )
366 {
367 log_error("ERROR: Creating input array A failed!\n");
368 return -1;
369 }
370 streams[1] =
371 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
372 sizeof(cl_float) * vecSize * TEST_SIZE, inDataB, NULL);
373 if( streams[1] == NULL )
374 {
375 log_error("ERROR: Creating input array B failed!\n");
376 return -1;
377 }
378 streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
379 sizeof(cl_float) * TEST_SIZE, NULL, NULL);
380 if( streams[2] == NULL )
381 {
382 log_error("ERROR: Creating output array failed!\n");
383 return -1;
384 }
385
386 /* Assign streams and execute */
387 for( i = 0; i < 3; i++ )
388 {
389 error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]);
390 test_error( error, "Unable to set indexed kernel arguments" );
391 }
392
393 /* Run the kernel */
394 threads[0] = TEST_SIZE;
395
396 error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
397 test_error( error, "Unable to get work group size to use" );
398
399 error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
400 test_error( error, "Unable to execute test kernel" );
401
402 /* Now get the results */
403 error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_float ) * TEST_SIZE, outData, 0, NULL, NULL );
404 test_error( error, "Unable to read output array!" );
405
406
407 /* And verify! */
408 int skipCount = 0;
409 for( i = 0; i < TEST_SIZE; i++ )
410 {
411 cl_float *src1 = inDataA + i * vecSize;
412 cl_float *src2 = inDataB + i * vecSize;
413 double expected = verifyFn( src1, src2, vecSize );
414 if( (float) expected != outData[ i ] )
415 {
416 if( isnan(expected) && isnan( outData[i] ) )
417 continue;
418
419 if( ! hasInfNan )
420 {
421 size_t ii;
422 for( ii = 0; ii < vecSize; ii++ )
423 {
424 if( ! isfinite( src1[ii] ) || ! isfinite( src2[ii] ) )
425 {
426 skipCount++;
427 continue;
428 }
429 }
430 if( ! isfinite( (cl_float) expected ) )
431 {
432 skipCount++;
433 continue;
434 }
435 }
436
437 if( ulpLimit < 0 )
438 {
439 // Limit below zero means we need to test via a computed error (like cross product does)
440 float maxValue =
441 getMaxValue( inDataA + i * vecSize, inDataB + i * vecSize,vecSize );
442 // In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors)
443 float errorTolerance = maxValue * maxValue * ( 2.f * (float)vecSize - 1.f ) * FLT_EPSILON;
444
445 // Limit below zero means test via epsilon instead
446 double error =
447 fabs( (double)expected - (double)outData[ i ] );
448 if( error > errorTolerance )
449 {
450
451 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n",
452 (int)i, (int)vecSize, expected,
453 outData[ i ],
454 inDataA[i*vecSize],
455 inDataB[i*vecSize],
456 (float)error,
457 (float)errorTolerance );
458
459 char vecA[1000], vecB[1000];
460 vector2string( vecA, inDataA +i * vecSize, vecSize );
461 vector2string( vecB, inDataB + i * vecSize, vecSize );
462 log_error( "\tvector A: %s, vector B: %s\n", vecA, vecB );
463 return -1;
464 }
465 }
466 else
467 {
468 float error = Ulp_Error( outData[ i ], expected );
469 if( fabsf(error) > ulpLimit )
470 {
471 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n",
472 (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], inDataB[i*vecSize], error );
473
474 char vecA[1000], vecB[1000];
475 vector2string( vecA, inDataA + i * vecSize, vecSize );
476 vector2string( vecB, inDataB + i * vecSize, vecSize );
477 log_error( "\tvector A: %s, vector B: %s\n", vecA, vecB );
478 return -1;
479 }
480 }
481 }
482 }
483
484 if( skipCount )
485 log_info( "Skipped %d tests out of %d because they contained Infs or NaNs\n\tEMBEDDED_PROFILE Device does not support CL_FP_INF_NAN\n", skipCount, TEST_SIZE );
486
487 return 0;
488 }
489
// Reference dot product: accumulate in double precision to minimise the
// host-side rounding error against which the device result is compared.
double verifyDot( float *srcA, float *srcB, size_t vecSize )
{
    double sum = 0.0;
    size_t k = 0;
    while( k < vecSize )
    {
        sum += (double)srcA[ k ] * (double)srcB[ k ];
        ++k;
    }
    return sum;
}
499
test_geom_dot(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)500 int test_geom_dot(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
501 {
502 size_t sizes[] = { 1, 2, 3, 4, 0 };
503 unsigned int size;
504 int retVal = 0;
505 RandomSeed seed(gRandomSeed);
506
507 for( size = 0; sizes[ size ] != 0 ; size++ )
508 {
509 if( test_twoToFloat_kernel( queue, context, "dot", sizes[size], verifyDot, -1.0f /*magic value*/, seed ) != 0 )
510 {
511 log_error( " dot vector size %d FAILED\n", (int)sizes[ size ] );
512 retVal = -1;
513 }
514 }
515
516 if (retVal)
517 return retVal;
518
519 if(!is_extension_available(deviceID, "cl_khr_fp64"))
520 {
521 log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
522 return 0;
523 }
524
525 log_info("Testing doubles...\n");
526 return test_geom_dot_double( deviceID, context, queue, num_elements, seed);
527 }
528
// Reference for fast_distance(): Euclidean distance between srcA and srcB.
// We calculate the distance as a double, to try and make up for the fact
// that the GPU has better precision distance since it's a single op.
double verifyFastDistance( float *srcA, float *srcB, size_t vecSize )
{
    double sumOfSquares = 0.0;
    for( size_t k = 0; k < vecSize; k++ )
    {
        double diff = (double)srcA[ k ] - (double)srcB[ k ];
        sumOfSquares += diff * diff;
    }
    return sqrt( sumOfSquares );
}
544
test_geom_fast_distance(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)545 int test_geom_fast_distance(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
546 {
547 size_t sizes[] = { 1, 2, 3, 4, 0 };
548 unsigned int size;
549 int retVal = 0;
550 RandomSeed seed(gRandomSeed);
551
552 for( size = 0; sizes[ size ] != 0 ; size++ )
553 {
554 float maxUlps = 8192.0f + // error in sqrt
555 ( 1.5f * (float) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
556 0.5f * (float) (sizes[size]-1)); // cumulative error for additions
557
558 if( test_twoToFloat_kernel( queue, context, "fast_distance",
559 sizes[ size ], verifyFastDistance,
560 maxUlps, seed ) != 0 )
561 {
562 log_error( " fast_distance vector size %d FAILED\n",
563 (int)sizes[ size ] );
564 retVal = -1;
565 }
566 else
567 {
568 log_info( " fast_distance vector size %d passed\n",
569 (int)sizes[ size ] );
570 }
571 }
572 return retVal;
573 }
574
575
// Reference for distance(): Euclidean distance between srcA and srcB,
// accumulated in double precision to make up for the fact that the GPU has
// better precision distance since it's a single op.
double verifyDistance( float *srcA, float *srcB, size_t vecSize )
{
    double acc = 0.0;
    size_t k = 0;
    while( k < vecSize )
    {
        double delta = (double)srcA[ k ] - (double)srcB[ k ];
        acc += delta * delta;
        ++k;
    }
    return sqrt( acc );
}
591
test_geom_distance(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)592 int test_geom_distance(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
593 {
594 size_t sizes[] = { 1, 2, 3, 4, 0 };
595 unsigned int size;
596 int retVal = 0;
597 RandomSeed seed(gRandomSeed );
598
599 for( size = 0; sizes[ size ] != 0 ; size++ )
600 {
601 float maxUlps = 3.0f + // error in sqrt
602 ( 1.5f * (float) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
603 0.5f * (float) (sizes[size]-1)); // cumulative error for additions
604
605 if( test_twoToFloat_kernel( queue, context, "distance", sizes[ size ], verifyDistance, maxUlps, seed ) != 0 )
606 {
607 log_error( " distance vector size %d FAILED\n",
608 (int)sizes[ size ] );
609 retVal = -1;
610 }
611 else
612 {
613 log_info( " distance vector size %d passed\n", (int)sizes[ size ] );
614 }
615 }
616 if (retVal)
617 return retVal;
618
619 if(!is_extension_available(deviceID, "cl_khr_fp64"))
620 {
621 log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
622 return 0;
623 } else {
624 log_info("Testing doubles...\n");
625 return test_geom_distance_double( deviceID, context, queue, num_elements, seed);
626 }
627 }
628
// Host-side verifier signature for (vecN) -> float builtins.
typedef double (*oneToFloatVerifyFn)( float *srcA, size_t vecSize );

/* Build and run a kernel computing fnName(a) -> float at the given vector
   width, then validate every result against verifyFn within ulpLimit ulps.
   Returns 0 on success, -1 on failure. */
int test_oneToFloat_kernel(cl_command_queue queue, cl_context context, const char *fnName,
                           size_t vecSize, oneToFloatVerifyFn verifyFn, float ulpLimit, MTdata d )
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[2];
    // A holds inputs at the maximum width (float4) regardless of vecSize;
    // B holds one float result per sample.
    BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE * 4));
    BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE));
    int error;
    size_t i, threads[1], localThreads[1];
    char kernelSource[10240];
    char *programPtr;
    char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
    cl_float *inDataA = A;
    cl_float *outData = B;

    /* Create the source */
    // float3 uses the vload3 pattern so host buffers stay tightly packed.
    sprintf( kernelSource, vecSize == 3? oneToFloatKernelPatternV3 : oneToFloatKernelPattern, sizeNames[vecSize-1], fnName );

    /* Create kernels */
    programPtr = kernelSource;
    if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
    {
        return -1;
    }

    /* Generate some streams */
    for( i = 0; i < TEST_SIZE * vecSize; i++ )
    {
        inDataA[ i ] = get_random_float( -512.f, 512.f, d );
    }
    fillWithTrickyNumbers( inDataA, NULL, vecSize );

    /* Clamp values to be in range for fast_ functions */
    // fast_length is only specified for a limited magnitude range, so huge
    // and tiny values are replaced with moderate random ones.
    if( verifyFn == verifyFastLength )
    {
        for( i = 0; i < TEST_SIZE * vecSize; i++ )
        {
            if( fabsf( inDataA[i] ) > MAKE_HEX_FLOAT(0x1.0p62f, 0x1L, 62) || fabsf( inDataA[i] ) < MAKE_HEX_FLOAT(0x1.0p-62f, 0x1L, -62) )
                inDataA[ i ] = get_random_float( -512.f, 512.f, d );
        }
    }

    streams[0] =
        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                       sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
    if( streams[0] == NULL )
    {
        log_error("ERROR: Creating input array A failed!\n");
        return -1;
    }
    streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                sizeof(cl_float) * TEST_SIZE, NULL, NULL);
    if( streams[1] == NULL )
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }

    /* Assign streams and execute */
    error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] );
    test_error( error, "Unable to set indexed kernel arguments" );
    error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] );
    test_error( error, "Unable to set indexed kernel arguments" );

    /* Run the kernel */
    threads[0] = TEST_SIZE;

    error = get_max_common_work_group_size( context, kernel, threads[0],
                                            &localThreads[0] );
    test_error( error, "Unable to get work group size to use" );

    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads,
                                    localThreads, 0, NULL, NULL );
    test_error( error, "Unable to execute test kernel" );

    /* Now get the results */
    error = clEnqueueReadBuffer( queue, streams[1], true, 0,
                                 sizeof( cl_float ) * TEST_SIZE, outData,
                                 0, NULL, NULL );
    test_error( error, "Unable to read output array!" );

    /* And verify! */
    for( i = 0; i < TEST_SIZE; i++ )
    {
        double expected = verifyFn( inDataA + i * vecSize, vecSize );
        if( (float) expected != outData[ i ] )
        {
            float ulps = Ulp_Error( outData[i], expected );
            if( fabsf( ulps ) <= ulpLimit )
                continue;

            // We have to special case NAN
            if( isnan( outData[ i ] ) && isnan( expected ) )
                continue;

            // NOTE(review): this condition is always true when reached —
            // the <= check above already continued on passing samples.
            if(! (fabsf(ulps) < ulpLimit) )
            {
                log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n",
                          (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], ulps );
                char vecA[1000];
                vector2string( vecA, inDataA + i *vecSize, vecSize );
                // NOTE(review): no trailing newline here — presumably
                // intentional; confirm against the log format.
                log_error( "\tvector: %s", vecA );
                return -1;
            }
        }
    }

    return 0;
}
741
// Reference for length(): Euclidean norm of srcA, accumulated in double
// precision to make up for the fact that the GPU has better precision
// length since it's a single op.
double verifyLength( float *srcA, size_t vecSize )
{
    double acc = 0.0;
    for( size_t k = 0; k < vecSize; ++k )
        acc += (double)srcA[ k ] * (double)srcA[ k ];
    return sqrt( acc );
}
756
test_geom_length(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)757 int test_geom_length(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
758 {
759 size_t sizes[] = { 1, 2, 3, 4, 0 };
760 unsigned int size;
761 int retVal = 0;
762 RandomSeed seed( gRandomSeed );
763
764 for( size = 0; sizes[ size ] != 0 ; size++ )
765 {
766 float maxUlps = 3.0f + // error in sqrt
767 0.5f * // effect on e of taking sqrt( x + e )
768 ( 0.5f * (float) sizes[size] + // cumulative error for multiplications
769 0.5f * (float) (sizes[size]-1)); // cumulative error for additions
770
771 if( test_oneToFloat_kernel( queue, context, "length", sizes[ size ], verifyLength, maxUlps, seed ) != 0 )
772 {
773 log_error( " length vector size %d FAILED\n", (int)sizes[ size ] );
774 retVal = -1;
775 }
776 else
777 {
778 log_info( " length vector vector size %d passed\n", (int)sizes[ size ] );
779 }
780 }
781 if (retVal)
782 return retVal;
783
784 if(!is_extension_available(deviceID, "cl_khr_fp64"))
785 {
786 log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
787 return 0;
788 }
789 else
790 {
791 log_info("Testing doubles...\n");
792 return test_geom_length_double( deviceID, context, queue, num_elements, seed);
793 }
794 }
795
796
// Reference for fast_length(): Euclidean norm of srcA. Double-precision
// accumulation compensates for the GPU computing this as a single, more
// precise operation.
double verifyFastLength( float *srcA, size_t vecSize )
{
    double sumSq = 0.0;
    size_t k = 0;
    while( k < vecSize )
    {
        double term = (double)srcA[ k ];
        sumSq += term * term;
        ++k;
    }
    return sqrt( sumSq );
}
811
test_geom_fast_length(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)812 int test_geom_fast_length(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
813 {
814 size_t sizes[] = { 1, 2, 3, 4, 0 };
815 unsigned int size;
816 int retVal = 0;
817 RandomSeed seed(gRandomSeed);
818
819 for( size = 0; sizes[ size ] != 0 ; size++ )
820 {
821 float maxUlps = 8192.0f + // error in half_sqrt
822 ( 0.5f * (float) sizes[size] + // cumulative error for multiplications
823 0.5f * (float) (sizes[size]-1)); // cumulative error for additions
824
825 if( test_oneToFloat_kernel( queue, context, "fast_length", sizes[ size ], verifyFastLength, maxUlps, seed ) != 0 )
826 {
827 log_error( " fast_length vector size %d FAILED\n", (int)sizes[ size ] );
828 retVal = -1;
829 }
830 else
831 {
832 log_info( " fast_length vector size %d passed\n", (int)sizes[ size ] );
833 }
834 }
835 return retVal;
836 }
837
838
// Host-side verifier signature for (vecN) -> vecN builtins (e.g. normalize):
// writes vecSize reference results into dstA.
typedef void (*oneToOneVerifyFn)( float *srcA, float *dstA, size_t vecSize );
840
841
test_oneToOne_kernel(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,oneToOneVerifyFn verifyFn,float ulpLimit,int softball,MTdata d)842 int test_oneToOne_kernel(cl_command_queue queue, cl_context context, const char *fnName,
843 size_t vecSize, oneToOneVerifyFn verifyFn, float ulpLimit, int softball, MTdata d )
844 {
845 clProgramWrapper program;
846 clKernelWrapper kernel;
847 clMemWrapper streams[2];
848 BufferOwningPtr<cl_float> A(malloc(sizeof(cl_float) * TEST_SIZE
849 * vecSize));
850 BufferOwningPtr<cl_float> B(malloc(sizeof(cl_float) * TEST_SIZE
851 * vecSize));
852 int error;
853 size_t i, j, threads[1], localThreads[1];
854 char kernelSource[10240];
855 char *programPtr;
856 char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
857 cl_float *inDataA = A;
858 cl_float *outData = B;
859 float ulp_error = 0;
860
861 /* Create the source */
862 sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPatternV3: oneToOneKernelPattern, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
863
864 /* Create kernels */
865 programPtr = kernelSource;
866 if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
867 return -1;
868
869 /* Initialize data. First element always 0. */
870 memset( inDataA, 0, sizeof(cl_float) * vecSize );
871 if( 0 == strcmp( fnName, "fast_normalize" ))
872 { // keep problematic cases out of the fast function
873 for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
874 {
875 cl_float z = get_random_float( -MAKE_HEX_FLOAT( 0x1.0p60f, 1, 60), MAKE_HEX_FLOAT( 0x1.0p60f, 1, 60), d);
876 if( fabsf(z) < MAKE_HEX_FLOAT( 0x1.0p-60f, 1, -60) )
877 z = copysignf( 0.0f, z );
878 inDataA[i] = z;
879 }
880 }
881 else
882 {
883 for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
884 inDataA[i] = any_float(d);
885 }
886
887 streams[0] =
888 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
889 sizeof(cl_float) * vecSize * TEST_SIZE, inDataA, NULL);
890 if( streams[0] == NULL )
891 {
892 log_error("ERROR: Creating input array A failed!\n");
893 return -1;
894 }
895 streams[1] =
896 clCreateBuffer(context, CL_MEM_READ_WRITE,
897 sizeof(cl_float) * vecSize * TEST_SIZE, NULL, NULL);
898 if( streams[1] == NULL )
899 {
900 log_error("ERROR: Creating output array failed!\n");
901 return -1;
902 }
903
904 /* Assign streams and execute */
905 error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
906 test_error( error, "Unable to set indexed kernel arguments" );
907 error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
908 test_error( error, "Unable to set indexed kernel arguments" );
909
910 /* Run the kernel */
911 threads[0] = TEST_SIZE;
912
913 error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
914 test_error( error, "Unable to get work group size to use" );
915
916 error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
917 test_error( error, "Unable to execute test kernel" );
918
919 /* Now get the results */
920 error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_float ) * TEST_SIZE * vecSize, outData, 0, NULL, NULL );
921 test_error( error, "Unable to read output array!" );
922
923 /* And verify! */
924 for( i = 0; i < TEST_SIZE; i++ )
925 {
926 float expected[4];
927 int fail = 0;
928 verifyFn( inDataA + i * vecSize, expected, vecSize );
929 for( j = 0; j < vecSize; j++ )
930 {
931 // We have to special case NAN
932 if( isnan( outData[ i * vecSize + j ] )
933 && isnan( expected[ j ] ) )
934 continue;
935
936 if( expected[j] != outData[ i * vecSize + j ] ) {
937 ulp_error = Ulp_Error( outData[i*vecSize+j], expected[ j ] );
938
939 if( fabsf(ulp_error) > ulpLimit ) {
940 fail = 1;
941 break;
942 }
943 }
944
945 }
946
947 // try again with subnormals flushed to zero if the platform flushes
948 if( fail && gFlushDenormsToZero )
949 {
950 float temp[4], expected2[4];
951 for( j = 0; j < vecSize; j++ )
952 {
953 if( IsFloatSubnormal(inDataA[i*vecSize+j] ) )
954 temp[j] = copysignf( 0.0f, inDataA[i*vecSize+j] );
955 else
956 temp[j] = inDataA[ i*vecSize +j];
957 }
958
959 verifyFn( temp, expected2, vecSize );
960 fail = 0;
961
962 for( j = 0; j < vecSize; j++ )
963 {
964 // We have to special case NAN
965 if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) )
966 continue;
967
968 if( expected2[j] != outData[ i * vecSize + j ] )
969 {
970 ulp_error = Ulp_Error(outData[i*vecSize + j ], expected[ j ] );
971
972 if( fabsf(ulp_error) > ulpLimit )
973 {
974 if( IsFloatSubnormal(expected2[j]) )
975 {
976 expected2[j] = 0.0f;
977 if( expected2[j] != outData[i*vecSize + j ] )
978 {
979 ulp_error = Ulp_Error( outData[ i * vecSize + j ], expected[ j ] );
980 if( fabsf(ulp_error) > ulpLimit ) {
981 fail = 1;
982 break;
983 }
984 }
985 }
986 }
987 }
988 }
989 }
990
991 if( fail )
992 {
993 log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n",
994 (int)i, (int)j, (int)vecSize, expected[j], expected[j], outData[ i*vecSize+j], outData[ i*vecSize+j], ulp_error );
995 log_error( " Source: " );
996 for( size_t q = 0; q < vecSize; q++ )
997 log_error( "%g ", inDataA[ i * vecSize+q]);
998 log_error( "\n : " );
999 for( size_t q = 0; q < vecSize; q++ )
1000 log_error( "%a ", inDataA[i*vecSize +q] );
1001 log_error( "\n" );
1002 log_error( " Result: " );
1003 for( size_t q = 0; q < vecSize; q++ )
1004 log_error( "%g ", outData[ i *vecSize + q ] );
1005 log_error( "\n : " );
1006 for( size_t q = 0; q < vecSize; q++ )
1007 log_error( "%a ", outData[ i * vecSize + q ] );
1008 log_error( "\n" );
1009 log_error( " Expected: " );
1010 for( size_t q = 0; q < vecSize; q++ )
1011 log_error( "%g ", expected[ q ] );
1012 log_error( "\n : " );
1013 for( size_t q = 0; q < vecSize; q++ )
1014 log_error( "%a ", expected[ q ] );
1015 log_error( "\n" );
1016 return -1;
1017 }
1018 }
1019
1020 return 0;
1021 }
1022
// Host reference for normalize(): writes the unit-length version of the
// vecSize-wide vector srcA into dst. All arithmetic is done in double
// precision so the reference is at least as accurate as the device result.
// Edge cases: a zero vector is copied through unchanged; if the sum of
// squares overflows to infinity, infinite lanes become +-1 and finite lanes
// become +-0, and that surrogate vector is normalized instead.
void verifyNormalize( float *srcA, float *dst, size_t vecSize )
{
    double sumSquares = 0.0;

    for( unsigned int k = 0; k < vecSize; k++ )
        sumSquares += (double) srcA[k] * (double) srcA[k];

    // Zero vector: pass the input through untouched.
    if( sumSquares == 0.0 )
    {
        for( unsigned int k = 0; k < vecSize; k++ )
            dst[k] = srcA[k];
        return;
    }

    // Overflow to infinity: rebuild a representative vector of +-1 / +-0
    // lanes and normalize that instead.
    if( sumSquares == INFINITY )
    {
        sumSquares = 0.0;
        for( unsigned int k = 0; k < vecSize; k++ )
        {
            dst[k] = ( fabsf( srcA[k] ) == INFINITY ) ? copysignf( 1.0f, srcA[k] )
                                                      : copysignf( 0.0f, srcA[k] );
            sumSquares += (double) dst[k] * (double) dst[k];
        }
        srcA = dst;
    }

    double norm = sqrt( sumSquares );
    for( unsigned int k = 0; k < vecSize; k++ )
        dst[k] = (float)( (double) srcA[k] / norm );
}
1061
test_geom_normalize(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1062 int test_geom_normalize(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
1063 {
1064 size_t sizes[] = { 1, 2, 3, 4, 0 };
1065 unsigned int size;
1066 int retVal = 0;
1067 RandomSeed seed(gRandomSeed);
1068
1069 for( size = 0; sizes[ size ] != 0 ; size++ )
1070 {
1071 float maxUlps = 2.5f + // error in rsqrt + error in multiply
1072 ( 0.5f * (float) sizes[size] + // cumulative error for multiplications
1073 0.5f * (float) (sizes[size]-1)); // cumulative error for additions
1074 if( test_oneToOne_kernel( queue, context, "normalize", sizes[ size ], verifyNormalize, maxUlps, 0, seed ) != 0 )
1075 {
1076 log_error( " normalized vector size %d FAILED\n", (int)sizes[ size ] );
1077 retVal = -1;
1078 }
1079 else
1080 {
1081 log_info( " normalized vector size %d passed\n", (int)sizes[ size ] );
1082 }
1083 }
1084 if (retVal)
1085 return retVal;
1086
1087 if(!is_extension_available(deviceID, "cl_khr_fp64"))
1088 {
1089 log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
1090 return 0;
1091 } else {
1092 log_info("Testing doubles...\n");
1093 return test_geom_normalize_double( deviceID, context, queue, num_elements, seed);
1094 }
1095 }
1096
1097
test_geom_fast_normalize(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1098 int test_geom_fast_normalize(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
1099 {
1100 size_t sizes[] = { 1, 2, 3, 4, 0 };
1101 unsigned int size;
1102 int retVal = 0;
1103 RandomSeed seed( gRandomSeed );
1104
1105 for( size = 0; sizes[ size ] != 0 ; size++ )
1106 {
1107 float maxUlps = 8192.5f + // error in rsqrt + error in multiply
1108 ( 0.5f * (float) sizes[size] + // cumulative error for multiplications
1109 0.5f * (float) (sizes[size]-1)); // cumulative error for additions
1110
1111 if( test_oneToOne_kernel( queue, context, "fast_normalize", sizes[ size ], verifyNormalize, maxUlps, 1, seed ) != 0 )
1112 {
1113 log_error( " fast_normalize vector size %d FAILED\n", (int)sizes[ size ] );
1114 retVal = -1;
1115 }
1116 else
1117 {
1118 log_info( " fast_normalize vector size %d passed\n", (int)sizes[ size ] );
1119 }
1120 }
1121 return retVal;
1122 }
1123
1124
1125
1126