1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "testBase.h"
17 #include "harness/typeWrappers.h"
18 #include "harness/conversions.h"
19 #include "harness/errorHelpers.h"
20
21 const char *crossKernelSource_double =
22 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
23 "__kernel void sample_test(__global double4 *sourceA, __global double4 *sourceB, __global double4 *destValues)\n"
24 "{\n"
25 " int tid = get_global_id(0);\n"
26 " destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n"
27 "\n"
28 "}\n";
29
30 const char *crossKernelSource_doubleV3 =
31 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
32 "__kernel void sample_test(__global double *sourceA, __global double *sourceB, __global double *destValues)\n"
33 "{\n"
34 " int tid = get_global_id(0);\n"
35 " vstore3( cross( vload3( tid, sourceA), vload3( tid, sourceB) ), tid, destValues);\n"
36 "\n"
37 "}\n";
38
39 const char *twoToFloatKernelPattern_double =
40 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
41 "__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n"
42 "{\n"
43 " int tid = get_global_id(0);\n"
44 " destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n"
45 "\n"
46 "}\n";
47
48 const char *twoToFloatKernelPattern_doubleV3 =
49 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
50 "__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n"
51 "{\n"
52 " int tid = get_global_id(0);\n"
53 " destValues[tid] = %s( vload3( tid, (__global double*) sourceA), vload3( tid, (__global double*) sourceB ) );\n"
54 "\n"
55 "}\n";
56
57 const char *oneToFloatKernelPattern_double =
58 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
59 "__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n"
60 "{\n"
61 " int tid = get_global_id(0);\n"
62 " destValues[tid] = %s( sourceA[tid] );\n"
63 "\n"
64 "}\n";
65
66 const char *oneToFloatKernelPattern_doubleV3 =
67 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
68 "__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n"
69 "{\n"
70 " int tid = get_global_id(0);\n"
71 " destValues[tid] = %s( vload3( tid, (__global double*) sourceA) );\n"
72 "\n"
73 "}\n";
74
75 const char *oneToOneKernelPattern_double =
76 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
77 "__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n"
78 "{\n"
79 " int tid = get_global_id(0);\n"
80 " destValues[tid] = %s( sourceA[tid] );\n"
81 "\n"
82 "}\n";
83
84 const char *oneToOneKernelPattern_doubleV3 =
85 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
86 "__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n"
87 "{\n"
88 " int tid = get_global_id(0);\n"
89 " vstore3( %s( vload3( tid, (__global double*) sourceA) ), tid, (__global double*) destValues );\n"
90 "\n"
91 "}\n";
92
93 #define TEST_SIZE (1 << 20)
94
95 double verifyLength_double( double *srcA, size_t vecSize );
96 double verifyDistance_double( double *srcA, double *srcB, size_t vecSize );
97
98
99
vector2string_double(char * string,double * vector,size_t elements)100 void vector2string_double( char *string, double *vector, size_t elements )
101 {
102 *string++ = '{';
103 *string++ = ' ';
104 string += sprintf( string, "%a", vector[0] );
105 size_t i;
106 for( i = 1; i < elements; i++ )
107 string += sprintf( string, ", %a", vector[i] );
108 *string++ = ' ';
109 *string++ = '}';
110 *string = '\0';
111 }
112
fillWithTrickyNumbers_double(double * aVectors,double * bVectors,size_t vecSize)113 void fillWithTrickyNumbers_double( double *aVectors, double *bVectors, size_t vecSize )
114 {
115 static const cl_double trickyValues[] = { -FLT_EPSILON, FLT_EPSILON,
116 MAKE_HEX_DOUBLE(0x1.0p511, 0x1L, 511), MAKE_HEX_DOUBLE(0x1.8p511, 0x18L, 507), MAKE_HEX_DOUBLE(0x1.0p512, 0x1L, 512), MAKE_HEX_DOUBLE(-0x1.0p511, -0x1L, 511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p512, -0x1L, 512),
117 MAKE_HEX_DOUBLE(0x1.0p-511, 0x1L, -511), MAKE_HEX_DOUBLE(0x1.8p-511, 0x18L, -515), MAKE_HEX_DOUBLE(0x1.0p-512, 0x1L, -512), MAKE_HEX_DOUBLE(-0x1.0p-511, -0x1L, -511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p-512, -0x1L, -512),
118 DBL_MAX / 2., -DBL_MAX / 2., INFINITY, -INFINITY, 0., -0. };
119 static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] );
120 static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount };
121 size_t i, j, k;
122
123 for( j = 0; j < vecSize; j++ )
124 for( k = 0; k < vecSize; k++ )
125 for( i = 0; i < trickyCount; i++ )
126 aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i];
127
128 if( bVectors )
129 {
130 size_t copySize = vecSize * vecSize * trickyCount;
131 memset( bVectors, 0, sizeof(double) * copySize );
132 memset( aVectors + copySize, 0, sizeof(double) * copySize );
133 memcpy( bVectors + copySize, aVectors, sizeof(double) * copySize );
134 }
135 }
136
137
cross_product_double(const double * vecA,const double * vecB,double * outVector,double * errorTolerances,double ulpTolerance)138 void cross_product_double( const double *vecA, const double *vecB, double *outVector, double *errorTolerances, double ulpTolerance )
139 {
140 outVector[ 0 ] = ( vecA[ 1 ] * vecB[ 2 ] ) - ( vecA[ 2 ] * vecB[ 1 ] );
141 outVector[ 1 ] = ( vecA[ 2 ] * vecB[ 0 ] ) - ( vecA[ 0 ] * vecB[ 2 ] );
142 outVector[ 2 ] = ( vecA[ 0 ] * vecB[ 1 ] ) - ( vecA[ 1 ] * vecB[ 0 ] );
143 outVector[ 3 ] = 0.0f;
144
145 errorTolerances[ 0 ] = fmax( fabs( vecA[ 1 ] ), fmax( fabs( vecB[ 2 ] ), fmax( fabs( vecA[ 2 ] ), fabs( vecB[ 1 ] ) ) ) );
146 errorTolerances[ 1 ] = fmax( fabs( vecA[ 2 ] ), fmax( fabs( vecB[ 0 ] ), fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 2 ] ) ) ) );
147 errorTolerances[ 2 ] = fmax( fabs( vecA[ 0 ] ), fmax( fabs( vecB[ 1 ] ), fmax( fabs( vecA[ 1 ] ), fabs( vecB[ 0 ] ) ) ) );
148
149 errorTolerances[ 0 ] = errorTolerances[ 0 ] * errorTolerances[ 0 ] * ( ulpTolerance * FLT_EPSILON ); // This gives us max squared times ulp tolerance, i.e. the worst-case expected variance we could expect from this result
150 errorTolerances[ 1 ] = errorTolerances[ 1 ] * errorTolerances[ 1 ] * ( ulpTolerance * FLT_EPSILON );
151 errorTolerances[ 2 ] = errorTolerances[ 2 ] * errorTolerances[ 2 ] * ( ulpTolerance * FLT_EPSILON );
152 }
153
test_geom_cross_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)154 int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
155 {
156 cl_int error;
157 cl_ulong maxAllocSize, maxGlobalMemSize;
158
159 error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
160 error |= clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( maxGlobalMemSize ), &maxGlobalMemSize, NULL );
161 test_error( error, "Unable to get device config" );
162
163 log_info("Device supports:\nCL_DEVICE_MAX_MEM_ALLOC_SIZE: %gMB\nCL_DEVICE_GLOBAL_MEM_SIZE: %gMB\n",
164 maxGlobalMemSize/(1024.0*1024.0), maxAllocSize/(1024.0*1024.0));
165
166 if (maxGlobalMemSize > (cl_ulong)SIZE_MAX) {
167 maxGlobalMemSize = (cl_ulong)SIZE_MAX;
168 }
169
170 unsigned int size;
171 unsigned int bufSize;
172 unsigned int adjustment;
173 int vecsize;
174
175 adjustment = 32*1024*1024; /* Try to allocate a bit less than the limits */
176 for(vecsize = 3; vecsize <= 4; ++vecsize)
177 {
178 /* Make sure we adhere to the maximum individual allocation size and global memory size limits. */
179 size = TEST_SIZE;
180 bufSize = sizeof(cl_double) * TEST_SIZE * vecsize;
181
182 while ((bufSize > (maxAllocSize - adjustment)) || (3*bufSize > (maxGlobalMemSize - adjustment))) {
183 size /= 2;
184 bufSize = sizeof(cl_double) * size * vecsize;
185 }
186
187 /* Perform the test */
188 clProgramWrapper program;
189 clKernelWrapper kernel;
190 clMemWrapper streams[3];
191 cl_double testVector[4];
192 int error, i;
193 size_t threads[1], localThreads[1];
194 BufferOwningPtr<cl_double> A(malloc(bufSize));
195 BufferOwningPtr<cl_double> B(malloc(bufSize));
196 BufferOwningPtr<cl_double> C(malloc(bufSize));
197 cl_double *inDataA = A;
198 cl_double *inDataB = B;
199 cl_double *outData = C;
200
201 /* Create kernels */
202 if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSource_doubleV3 : &crossKernelSource_double, "sample_test" ) )
203 return -1;
204
205 /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
206 for( i = 0; i < size * vecsize; i++ )
207 {
208 inDataA[ i ] = get_random_double( -512.f, 512.f, d );
209 inDataB[ i ] = get_random_double( -512.f, 512.f, d );
210 }
211 fillWithTrickyNumbers_double( inDataA, inDataB, vecsize );
212
213 streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, bufSize,
214 inDataA, NULL);
215 if( streams[0] == NULL )
216 {
217 log_error("ERROR: Creating input array A failed!\n");
218 return -1;
219 }
220 streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, bufSize,
221 inDataB, NULL);
222 if( streams[1] == NULL )
223 {
224 log_error("ERROR: Creating input array B failed!\n");
225 return -1;
226 }
227 streams[2] =
228 clCreateBuffer(context, CL_MEM_READ_WRITE, bufSize, NULL, NULL);
229 if( streams[2] == NULL )
230 {
231 log_error("ERROR: Creating output array failed!\n");
232 return -1;
233 }
234
235 /* Assign streams and execute */
236 for( i = 0; i < 3; i++ )
237 {
238 error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
239 test_error( error, "Unable to set indexed kernel arguments" );
240 }
241
242 /* Run the kernel */
243 threads[0] = size;
244
245 error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
246 test_error( error, "Unable to get work group size to use" );
247
248 error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
249 test_error( error, "Unable to execute test kernel" );
250
251 /* Now get the results */
252 error = clEnqueueReadBuffer( queue, streams[2], true, 0, bufSize, outData, 0, NULL, NULL );
253 test_error( error, "Unable to read output array!" );
254
255 /* And verify! */
256 for( i = 0; i < size; i++ )
257 {
258 double errorTolerances[ 4 ];
259 // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
260 cross_product_double( inDataA + i * vecsize, inDataB + i * vecsize, testVector, errorTolerances, 3.f );
261
262 double errs[] = { fabs( testVector[ 0 ] - outData[ i * vecsize + 0 ] ),
263 fabs( testVector[ 1 ] - outData[ i * vecsize + 1 ] ),
264 fabs( testVector[ 2 ] - outData[ i * vecsize + 2 ] ) };
265
266 if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
267 {
268 log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
269 i, testVector[0], testVector[1], testVector[2], testVector[3],
270 outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] );
271 log_error( " Input: (%a %a %a) and (%a %a %a)\n",
272 inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ],
273 inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] );
274 log_error( " Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n",
275 errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] );
276 log_error(" ulp %g\n", Ulp_Error_Double( outData[ i * vecsize + 1 ], testVector[ 1 ] ) );
277 return -1;
278 }
279 }
280 }
281 return 0;
282 }
283
getMaxValue_double(double vecA[],double vecB[],size_t vecSize)284 double getMaxValue_double( double vecA[], double vecB[], size_t vecSize )
285 {
286 double a = fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 0 ] ) );
287 for( size_t i = 1; i < vecSize; i++ )
288 a = fmax( fabs( vecA[ i ] ), fmax( fabs( vecB[ i ] ), a ) );
289 return a;
290 }
291
292 typedef double (*twoToFloatVerifyFn_double)( double *srcA, double *srcB, size_t vecSize );
293
test_twoToFloat_kernel_double(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,twoToFloatVerifyFn_double verifyFn,double ulpLimit,MTdata d)294 int test_twoToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
295 size_t vecSize, twoToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
296 {
297 clProgramWrapper program;
298 clKernelWrapper kernel;
299 clMemWrapper streams[3];
300 int error;
301 size_t i, threads[1], localThreads[1];
302 char kernelSource[10240];
303 char *programPtr;
304 char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
305 BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
306 BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
307 BufferOwningPtr<cl_double> C(malloc(sizeof(cl_double) * TEST_SIZE));
308
309 cl_double *inDataA = A;
310 cl_double *inDataB = B;
311 cl_double *outData = C;
312
313 /* Create the source */
314 sprintf( kernelSource, vecSize == 3 ? twoToFloatKernelPattern_doubleV3 : twoToFloatKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
315
316 /* Create kernels */
317 programPtr = kernelSource;
318 if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
319 return -1;
320
321 /* Generate some streams */
322 for( i = 0; i < TEST_SIZE * vecSize; i++ )
323 {
324 inDataA[ i ] = any_double(d);
325 inDataB[ i ] = any_double(d);
326 }
327 fillWithTrickyNumbers_double( inDataA, inDataB, vecSize );
328
329
330 streams[0] =
331 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
332 sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
333 if( streams[0] == NULL )
334 {
335 log_error("ERROR: Creating input array A failed!\n");
336 return -1;
337 }
338 streams[1] =
339 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
340 sizeof(cl_double) * vecSize * TEST_SIZE, inDataB, NULL);
341 if( streams[1] == NULL )
342 {
343 log_error("ERROR: Creating input array B failed!\n");
344 return -1;
345 }
346 streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
347 sizeof(cl_double) * TEST_SIZE, NULL, NULL);
348 if( streams[2] == NULL )
349 {
350 log_error("ERROR: Creating output array failed!\n");
351 return -1;
352 }
353
354 /* Assign streams and execute */
355 for( i = 0; i < 3; i++ )
356 {
357 error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]);
358 test_error( error, "Unable to set indexed kernel arguments" );
359 }
360
361 /* Run the kernel */
362 threads[0] = TEST_SIZE;
363
364 error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
365 test_error( error, "Unable to get work group size to use" );
366
367 error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
368 test_error( error, "Unable to execute test kernel" );
369
370 /* Now get the results */
371 error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
372 test_error( error, "Unable to read output array!" );
373
374 /* And verify! */
375 for( i = 0; i < TEST_SIZE; i++ )
376 {
377 double expected = verifyFn( inDataA + i * vecSize, inDataB + i * vecSize, vecSize );
378 if( (double) expected != outData[ i ] )
379 {
380 if( isnan(expected) && isnan( outData[i] ) )
381 continue;
382
383 if( ulpLimit < 0 )
384 {
385 // Limit below zero means we need to test via a computed error (like cross product does)
386 double maxValue =
387 getMaxValue_double( inDataA + i * vecSize, inDataB + i * vecSize, vecSize );
388
389 // In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors)
390 double errorTolerance = maxValue * maxValue * ( 2.f * (double)vecSize - 1.f ) * FLT_EPSILON;
391
392 // Limit below zero means test via epsilon instead
393 double error = fabs( (double)expected - (double)outData[ i ] );
394 if( error > errorTolerance )
395 {
396
397 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n",
398 (int)i, (int)vecSize, expected,
399 outData[ i ],
400 inDataA[i*vecSize],
401 inDataB[i*vecSize],
402 (double)error,
403 (double)errorTolerance );
404
405 char vecA[1000], vecB[1000];
406 vector2string_double( vecA, inDataA + i * vecSize, vecSize );
407 vector2string_double( vecB, inDataB + i * vecSize, vecSize );
408 log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB );
409 return -1;
410 }
411 }
412 else
413 {
414 double error = Ulp_Error_Double( outData[ i ],
415 expected );
416 if( fabs(error) > ulpLimit )
417 {
418 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n",
419 (int)i, (int)vecSize, expected,
420 outData[ i ],
421 inDataA[i*vecSize],
422 inDataB[i*vecSize],
423 error );
424
425 char vecA[1000], vecB[1000];
426 vector2string_double( vecA, inDataA + i * vecSize, vecSize );
427 vector2string_double( vecB, inDataB + i * vecSize, vecSize );
428 log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB );
429 return -1;
430 }
431 }
432 }
433 }
434 return 0;
435 }
436
verifyDot_double(double * srcA,double * srcB,size_t vecSize)437 double verifyDot_double( double *srcA, double *srcB, size_t vecSize )
438 {
439 double total = 0.f;
440
441 for( unsigned int i = 0; i < vecSize; i++ )
442 total += (double)srcA[ i ] * (double)srcB[ i ];
443
444 return total;
445 }
446
test_geom_dot_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)447 int test_geom_dot_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
448 {
449 size_t sizes[] = { 1, 2, 3, 4, 0 };
450 unsigned int size;
451 int retVal = 0;
452
453
454 for( size = 0; sizes[ size ] != 0 ; size++ )
455 {
456 if( test_twoToFloat_kernel_double( queue, context, "dot", sizes[ size ], verifyDot_double, -1.0f /*magic value*/, d ) != 0 )
457 {
458 log_error( " dot double vector size %d FAILED\n", (int)sizes[ size ] );
459 retVal = -1;
460 }
461 }
462 return retVal;
463 }
464
465
test_geom_fast_distance_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)466 int test_geom_fast_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
467 {
468 size_t sizes[] = { 1, 2, 3, 4, 0 };
469 unsigned int size;
470 int retVal = 0;
471
472 abort(); //there is no double precision fast_distance
473
474 for( size = 0; sizes[ size ] != 0 ; size++ )
475 {
476 double maxUlps = 8192.0f + // error in sqrt
477 0.5f * // effect on e of taking sqrt( x + e )
478 ( 1.5f * (double) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
479 0.5f * (double) (sizes[size]-1)); // cumulative error for additions
480
481 if( test_twoToFloat_kernel_double( queue, context, "fast_distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 )
482 {
483 log_error( " fast_distance double vector size %d FAILED\n", (int)sizes[ size ] );
484 retVal = -1;
485 }
486 else
487 {
488 log_info( " fast_distance double vector size %d passed\n", (int)sizes[ size ] );
489 }
490 }
491 return retVal;
492 }
493
494
verifyDistance_double(double * srcA,double * srcB,size_t vecSize)495 double verifyDistance_double( double *srcA, double *srcB, size_t vecSize )
496 {
497 unsigned int i;
498 double diff[4];
499
500 for( i = 0; i < vecSize; i++ )
501 diff[i] = srcA[i] - srcB[i];
502
503 return verifyLength_double( diff, vecSize );
504 }
505
test_geom_distance_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)506 int test_geom_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
507 {
508 size_t sizes[] = { 1, 2, 3, 4, 0 };
509 unsigned int size;
510 int retVal = 0;
511
512 for( size = 0; sizes[ size ] != 0 ; size++ )
513 {
514 double maxUlps = 3.0f + // error in sqrt
515 0.5f * // effect on e of taking sqrt( x + e )
516 ( 1.5f * (double) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
517 0.5f * (double) (sizes[size]-1)); // cumulative error for additions
518
519 maxUlps *= 2.0; // our reference code may be in error too
520
521 if( test_twoToFloat_kernel_double( queue, context, "distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 )
522 {
523 log_error( " distance double vector size %d FAILED\n", (int)sizes[ size ] );
524 retVal = -1;
525 }
526 else
527 {
528 log_info( " distance double vector size %d passed\n", (int)sizes[ size ] );
529 }
530 }
531 return retVal;
532 }
533
534 typedef double (*oneToFloatVerifyFn_double)( double *srcA, size_t vecSize );
535
test_oneToFloat_kernel_double(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,oneToFloatVerifyFn_double verifyFn,double ulpLimit,MTdata d)536 int test_oneToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
537 size_t vecSize, oneToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
538 {
539 clProgramWrapper program;
540 clKernelWrapper kernel;
541 clMemWrapper streams[2];
542 BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
543 BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE));
544 int error;
545 size_t i, threads[1], localThreads[1];
546 char kernelSource[10240];
547 char *programPtr;
548 char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
549 cl_double *inDataA = A;
550 cl_double *outData = B;
551
552 /* Create the source */
553 sprintf( kernelSource, vecSize == 3 ? oneToFloatKernelPattern_doubleV3 : oneToFloatKernelPattern_double, sizeNames[vecSize-1], fnName );
554
555 /* Create kernels */
556 programPtr = kernelSource;
557 if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
558 return -1;
559
560 /* Generate some streams */
561 for( i = 0; i < TEST_SIZE * vecSize; i++ )
562 inDataA[ i ] = any_double(d);
563
564 fillWithTrickyNumbers_double( inDataA, NULL, vecSize );
565
566 streams[0] =
567 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
568 sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
569 if( streams[0] == NULL )
570 {
571 log_error("ERROR: Creating input array A failed!\n");
572 return -1;
573 }
574 streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
575 sizeof(cl_double) * TEST_SIZE, NULL, NULL);
576 if( streams[1] == NULL )
577 {
578 log_error("ERROR: Creating output array failed!\n");
579 return -1;
580 }
581
582 /* Assign streams and execute */
583 error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] );
584 test_error( error, "Unable to set indexed kernel arguments" );
585 error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] );
586 test_error( error, "Unable to set indexed kernel arguments" );
587
588 /* Run the kernel */
589 threads[0] = TEST_SIZE;
590
591 error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
592 test_error( error, "Unable to get work group size to use" );
593
594 error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
595 test_error( error, "Unable to execute test kernel" );
596
597 /* Now get the results */
598 error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
599 test_error( error, "Unable to read output array!" );
600
601 /* And verify! */
602 for( i = 0; i < TEST_SIZE; i++ )
603 {
604 double expected = verifyFn( inDataA + i * vecSize, vecSize );
605 if( (double) expected != outData[ i ] )
606 {
607 double ulps = Ulp_Error_Double( outData[i], expected );
608 if( fabs( ulps ) <= ulpLimit )
609 continue;
610
611 // We have to special case NAN
612 if( isnan( outData[ i ] ) && isnan( expected ) )
613 continue;
614
615 if(! (fabs(ulps) < ulpLimit) )
616 {
617 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n",
618 (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], ulps );
619 char vecA[1000];
620 vector2string_double( vecA, inDataA + i * vecSize, vecSize );
621 log_error( "\tvector: %s", vecA );
622 return -1;
623 }
624 }
625 }
626
627 return 0;
628 }
629
verifyLength_double(double * srcA,size_t vecSize)630 double verifyLength_double( double *srcA, size_t vecSize )
631 {
632 double total = 0;
633 unsigned int i;
634
635 // We calculate the distance as a double, to try and make up for the fact that
636 // the GPU has better precision distance since it's a single op
637 for( i = 0; i < vecSize; i++ )
638 total += srcA[i] * srcA[i];
639
640 // Deal with spurious overflow
641 if( total == INFINITY )
642 {
643 total = 0.0;
644 for( i = 0; i < vecSize; i++ )
645 {
646 double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p-600, 0x1LL, -600);
647 total += f * f;
648 }
649
650 return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p600, 0x1LL, 600);
651 }
652
653 // Deal with spurious underflow
654 if( total < 4 /*max vector length*/ * DBL_MIN / DBL_EPSILON )
655 {
656 total = 0.0;
657 for( i = 0; i < vecSize; i++ )
658 {
659 double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700);
660 total += f * f;
661 }
662
663 return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p-700, 0x1LL, -700);
664 }
665
666 return sqrt( total );
667 }
668
test_geom_length_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)669 int test_geom_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
670 {
671 size_t sizes[] = { 1, 2, 3, 4, 0 };
672 unsigned int size;
673 int retVal = 0;
674
675 for( size = 0; sizes[ size ] != 0 ; size++ )
676 {
677 double maxUlps = 3.0f + // error in sqrt
678 0.5f * // effect on e of taking sqrt( x + e )
679 ( 0.5f * (double) sizes[size] + // cumulative error for multiplications
680 0.5f * (double) (sizes[size]-1)); // cumulative error for additions
681
682 maxUlps *= 2.0; // our reference code may be in error too
683 if( test_oneToFloat_kernel_double( queue, context, "length", sizes[ size ], verifyLength_double, maxUlps, d ) != 0 )
684 {
685 log_error( " length double vector size %d FAILED\n", (int)sizes[ size ] );
686 retVal = -1;
687 }
688 else
689 {
690 log_info( " length double vector size %d passed\n", (int)sizes[ size ] );
691 }
692 }
693 return retVal;
694 }
695
696
verifyFastLength_double(double * srcA,size_t vecSize)697 double verifyFastLength_double( double *srcA, size_t vecSize )
698 {
699 double total = 0;
700 unsigned int i;
701
702 // We calculate the distance as a double, to try and make up for the fact that
703 // the GPU has better precision distance since it's a single op
704 for( i = 0; i < vecSize; i++ )
705 {
706 total += (double)srcA[i] * (double)srcA[i];
707 }
708
709 return sqrt( total );
710 }
711
test_geom_fast_length_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)712 int test_geom_fast_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
713 {
714 size_t sizes[] = { 1, 2, 3, 4, 0 };
715 unsigned int size;
716 int retVal = 0;
717
718 abort(); //there is no double precision fast_length
719
720 for( size = 0; sizes[ size ] != 0 ; size++ )
721 {
722 double maxUlps = 8192.0f + // error in half_sqrt
723 0.5f * // effect on e of taking sqrt( x + e )
724 ( 0.5f * (double) sizes[size] + // cumulative error for multiplications
725 0.5f * (double) (sizes[size]-1)); // cumulative error for additions
726
727 if( test_oneToFloat_kernel_double( queue, context, "fast_length", sizes[ size ], verifyFastLength_double, maxUlps, d ) != 0 )
728 {
729 log_error( " fast_length double vector size %d FAILED\n", (int)sizes[ size ] );
730 retVal = -1;
731 }
732 else
733 {
734 log_info( " fast_length double vector size %d passed\n", (int)sizes[ size ] );
735 }
736 }
737 return retVal;
738 }
739
740
741 typedef void (*oneToOneVerifyFn_double)( double *srcA, double *dstA, size_t vecSize );
742
test_oneToOne_kernel_double(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,oneToOneVerifyFn_double verifyFn,double ulpLimit,MTdata d)743 int test_oneToOne_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
744 size_t vecSize, oneToOneVerifyFn_double verifyFn, double ulpLimit, MTdata d )
745 {
746 clProgramWrapper program;
747 clKernelWrapper kernel;
748 clMemWrapper streams[2];
749 BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
750 BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
751 int error;
752 size_t i, j, threads[1], localThreads[1];
753 char kernelSource[10240];
754 char *programPtr;
755 char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
756 cl_double *inDataA = A;
757 cl_double *outData = B;
758
759 /* Create the source */
760 sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPattern_doubleV3 : oneToOneKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
761
762 /* Create kernels */
763 programPtr = kernelSource;
764 if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
765 return -1;
766
767 /* initialize data */
768 memset( inDataA, 0, vecSize * sizeof( cl_double ) );
769 for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
770 inDataA[ i ] = any_double(d);
771
772
773 streams[0] =
774 clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
775 sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
776 if( streams[0] == NULL )
777 {
778 log_error("ERROR: Creating input array A failed!\n");
779 return -1;
780 }
781 streams[1] =
782 clCreateBuffer(context, CL_MEM_READ_WRITE,
783 sizeof(cl_double) * vecSize * TEST_SIZE, NULL, NULL);
784 if( streams[1] == NULL )
785 {
786 log_error("ERROR: Creating output array failed!\n");
787 return -1;
788 }
789
790 /* Assign streams and execute */
791 error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
792 test_error( error, "Unable to set indexed kernel arguments" );
793 error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
794 test_error( error, "Unable to set indexed kernel arguments" );
795
796 /* Run the kernel */
797 threads[0] = TEST_SIZE;
798
799 error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
800 test_error( error, "Unable to get work group size to use" );
801
802 error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
803 test_error( error, "Unable to execute test kernel" );
804
805 /* Now get the results */
806 error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE * vecSize, outData, 0, NULL, NULL );
807 test_error( error, "Unable to read output array!" );
808
809 /* And verify! */
810 for( i = 0; i < TEST_SIZE; i++ )
811 {
812 double expected[4];
813 verifyFn( inDataA + i * vecSize, expected, vecSize );
814 for( j = 0; j < vecSize; j++ )
815 {
816 // We have to special case NAN
817 if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) )
818 continue;
819
820 if( expected[j] != outData[ i *vecSize+j ] )
821 {
822 double error =
823 Ulp_Error_Double( outData[i*vecSize + j ], expected[ j ] );
824 if( fabs(error) > ulpLimit )
825 {
826 log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n",
827 (int)i, (int)j, (int)vecSize,
828 expected[j], expected[j],
829 outData[i*vecSize +j],
830 outData[i*vecSize +j], error );
831 log_error( " Source: " );
832 for( size_t q = 0; q < vecSize; q++ )
833 log_error( "%g ", inDataA[ i * vecSize + q ] );
834 log_error( "\n : " );
835 for( size_t q = 0; q < vecSize; q++ )
836 log_error( "%a ", inDataA[ i * vecSize + q ] );
837 log_error( "\n" );
838 log_error( " Result: " );
839 for( size_t q = 0; q < vecSize; q++ )
840 log_error( "%g ", outData[i * vecSize + q ] );
841 log_error( "\n : " );
842 for( size_t q = 0; q < vecSize; q++ )
843 log_error( "%a ", outData[i * vecSize + q ] );
844 log_error( "\n" );
845 log_error( " Expected: " );
846 for( size_t q = 0; q < vecSize; q++ )
847 log_error( "%g ", expected[ q ] );
848 log_error( "\n : " );
849 for( size_t q = 0; q < vecSize; q++ )
850 log_error( "%a ", expected[ q ] );
851 log_error( "\n" );
852 return -1;
853 }
854 }
855 }
856 }
857
858 return 0;
859 }
860
verifyNormalize_double(double * srcA,double * dst,size_t vecSize)861 void verifyNormalize_double( double *srcA, double *dst, size_t vecSize )
862 {
863 double total = 0, value;
864 unsigned int i;
865
866 // We calculate everything as a double, to try and make up for the fact that
867 // the GPU has better precision distance since it's a single op
868 for( i = 0; i < vecSize; i++ )
869 total += (double)srcA[i] * (double)srcA[i];
870
871 if( total < vecSize * DBL_MIN / DBL_EPSILON )
872 { //we may have incurred denormalization loss -- rescale
873 total = 0;
874 for( i = 0; i < vecSize; i++ )
875 {
876 dst[i] = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700); //exact
877 total += dst[i] * dst[i];
878 }
879
880 //If still zero
881 if( total == 0.0 )
882 {
883 // Special edge case: copy vector over without change
884 for( i = 0; i < vecSize; i++ )
885 dst[i] = srcA[i];
886 return;
887 }
888
889 srcA = dst;
890 }
891 else if( total == INFINITY )
892 { //we may have incurred spurious overflow
893 double scale = MAKE_HEX_DOUBLE(0x1.0p-512, 0x1LL, -512) / vecSize;
894 total = 0;
895 for( i = 0; i < vecSize; i++ )
896 {
897 dst[i] = srcA[i] * scale; //exact
898 total += dst[i] * dst[i];
899 }
900
901 // If there are infinities here, handle those
902 if( total == INFINITY )
903 {
904 total = 0;
905 for( i = 0; i < vecSize; i++ )
906 {
907 if( isinf(dst[i]) )
908 {
909 dst[i] = copysign( 1.0, srcA[i] );
910 total += 1.0;
911 }
912 else
913 dst[i] = copysign( 0.0, srcA[i] );
914 }
915 }
916
917 srcA = dst;
918 }
919
920 value = sqrt( total );
921
922 for( i = 0; i < vecSize; i++ )
923 dst[i] = srcA[i] / value;
924 }
925
test_geom_normalize_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)926 int test_geom_normalize_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
927 {
928 size_t sizes[] = { 1, 2, 3, 4, 0 };
929 unsigned int size;
930 int retVal = 0;
931
932 for( size = 0; sizes[ size ] != 0 ; size++ )
933 {
934 double maxUlps = 2.5f + // error in rsqrt + error in multiply
935 0.5f * // effect on e of taking sqrt( x + e )
936 ( 0.5f * (double) sizes[size] + // cumulative error for multiplications
937 0.5f * (double) (sizes[size]-1)); // cumulative error for additions
938
939 maxUlps *= 2.0; //our reference code is not infinitely precise and may have error of its own
940 if( test_oneToOne_kernel_double( queue, context, "normalize", sizes[ size ], verifyNormalize_double, maxUlps, d ) != 0 )
941 {
942 log_error( " normalize double vector size %d FAILED\n", (int)sizes[ size ] );
943 retVal = -1;
944 }
945 else
946 {
947 log_info( " normalize double vector size %d passed\n", (int)sizes[ size ] );
948 }
949 }
950 return retVal;
951 }
952
953
954
955
956
957