• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "testBase.h"
17 #include "harness/typeWrappers.h"
18 #include "harness/conversions.h"
19 #include "harness/errorHelpers.h"
20 
21 const char *crossKernelSource_double =
22 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
23 "__kernel void sample_test(__global double4 *sourceA, __global double4 *sourceB, __global double4 *destValues)\n"
24 "{\n"
25 "    int  tid = get_global_id(0);\n"
26 "    destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n"
27 "\n"
28 "}\n";
29 
30 const char *crossKernelSource_doubleV3 =
31 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
32 "__kernel void sample_test(__global double *sourceA, __global double *sourceB, __global double *destValues)\n"
33 "{\n"
34 "    int  tid = get_global_id(0);\n"
35 "    vstore3( cross( vload3( tid, sourceA), vload3( tid, sourceB) ), tid, destValues);\n"
36 "\n"
37 "}\n";
38 
39 const char *twoToFloatKernelPattern_double =
40 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
41 "__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n"
42 "{\n"
43 "    int  tid = get_global_id(0);\n"
44 "    destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n"
45 "\n"
46 "}\n";
47 
48 const char *twoToFloatKernelPattern_doubleV3 =
49 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
50 "__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n"
51 "{\n"
52 "    int  tid = get_global_id(0);\n"
53 "    destValues[tid] = %s( vload3( tid, (__global double*) sourceA), vload3( tid, (__global double*) sourceB ) );\n"
54 "\n"
55 "}\n";
56 
57 const char *oneToFloatKernelPattern_double =
58 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
59 "__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n"
60 "{\n"
61 "    int  tid = get_global_id(0);\n"
62 "    destValues[tid] = %s( sourceA[tid] );\n"
63 "\n"
64 "}\n";
65 
66 const char *oneToFloatKernelPattern_doubleV3 =
67 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
68 "__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n"
69 "{\n"
70 "    int  tid = get_global_id(0);\n"
71 "    destValues[tid] = %s( vload3( tid, (__global double*) sourceA) );\n"
72 "\n"
73 "}\n";
74 
75 const char *oneToOneKernelPattern_double =
76 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
77 "__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n"
78 "{\n"
79 "    int  tid = get_global_id(0);\n"
80 "    destValues[tid] = %s( sourceA[tid] );\n"
81 "\n"
82 "}\n";
83 
84 const char *oneToOneKernelPattern_doubleV3 =
85 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
86 "__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n"
87 "{\n"
88 "    int  tid = get_global_id(0);\n"
89 "    vstore3( %s( vload3( tid, (__global double*) sourceA) ), tid, (__global double*) destValues );\n"
90 "\n"
91 "}\n";
92 
93 #define TEST_SIZE (1 << 20)
94 
95 double verifyLength_double( double *srcA, size_t vecSize );
96 double verifyDistance_double( double *srcA, double *srcB, size_t vecSize );
97 
98 
99 
vector2string_double(char * string,double * vector,size_t elements)100 void vector2string_double( char *string, double *vector, size_t elements )
101 {
102     *string++ = '{';
103     *string++ = ' ';
104     string += sprintf( string, "%a", vector[0] );
105     size_t i;
106     for( i = 1; i < elements; i++ )
107         string += sprintf( string, ", %a", vector[i] );
108     *string++ = ' ';
109     *string++ = '}';
110     *string = '\0';
111 }
112 
fillWithTrickyNumbers_double(double * aVectors,double * bVectors,size_t vecSize)113 void fillWithTrickyNumbers_double( double *aVectors, double *bVectors, size_t vecSize )
114 {
115     static const cl_double trickyValues[] = { -FLT_EPSILON, FLT_EPSILON,
116         MAKE_HEX_DOUBLE(0x1.0p511, 0x1L, 511), MAKE_HEX_DOUBLE(0x1.8p511, 0x18L, 507), MAKE_HEX_DOUBLE(0x1.0p512, 0x1L, 512), MAKE_HEX_DOUBLE(-0x1.0p511, -0x1L, 511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p512, -0x1L, 512),
117         MAKE_HEX_DOUBLE(0x1.0p-511, 0x1L, -511), MAKE_HEX_DOUBLE(0x1.8p-511, 0x18L, -515), MAKE_HEX_DOUBLE(0x1.0p-512, 0x1L, -512), MAKE_HEX_DOUBLE(-0x1.0p-511, -0x1L, -511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p-512, -0x1L, -512),
118         DBL_MAX / 2., -DBL_MAX / 2., INFINITY,  -INFINITY, 0., -0. };
119     static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] );
120     static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount };
121     size_t i, j, k;
122 
123     for( j = 0; j < vecSize; j++ )
124         for( k = 0; k < vecSize; k++ )
125             for( i = 0; i < trickyCount; i++ )
126                 aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i];
127 
128     if( bVectors )
129     {
130         size_t copySize = vecSize * vecSize * trickyCount;
131         memset( bVectors, 0, sizeof(double) * copySize );
132         memset( aVectors + copySize, 0, sizeof(double) * copySize );
133         memcpy( bVectors + copySize, aVectors, sizeof(double) * copySize );
134     }
135 }
136 
137 
cross_product_double(const double * vecA,const double * vecB,double * outVector,double * errorTolerances,double ulpTolerance)138 void cross_product_double( const double *vecA, const double *vecB, double *outVector, double *errorTolerances, double ulpTolerance )
139 {
140     outVector[ 0 ] = ( vecA[ 1 ] * vecB[ 2 ] ) - ( vecA[ 2 ] * vecB[ 1 ] );
141     outVector[ 1 ] = ( vecA[ 2 ] * vecB[ 0 ] ) - ( vecA[ 0 ] * vecB[ 2 ] );
142     outVector[ 2 ] = ( vecA[ 0 ] * vecB[ 1 ] ) - ( vecA[ 1 ] * vecB[ 0 ] );
143     outVector[ 3 ] = 0.0f;
144 
145     errorTolerances[ 0 ] = fmax( fabs( vecA[ 1 ] ), fmax( fabs( vecB[ 2 ] ), fmax( fabs( vecA[ 2 ] ), fabs( vecB[ 1 ] ) ) ) );
146     errorTolerances[ 1 ] = fmax( fabs( vecA[ 2 ] ), fmax( fabs( vecB[ 0 ] ), fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 2 ] ) ) ) );
147     errorTolerances[ 2 ] = fmax( fabs( vecA[ 0 ] ), fmax( fabs( vecB[ 1 ] ), fmax( fabs( vecA[ 1 ] ), fabs( vecB[ 0 ] ) ) ) );
148 
149     errorTolerances[ 0 ] = errorTolerances[ 0 ] * errorTolerances[ 0 ] * ( ulpTolerance * FLT_EPSILON );    // This gives us max squared times ulp tolerance, i.e. the worst-case expected variance we could expect from this result
150     errorTolerances[ 1 ] = errorTolerances[ 1 ] * errorTolerances[ 1 ] * ( ulpTolerance * FLT_EPSILON );
151     errorTolerances[ 2 ] = errorTolerances[ 2 ] * errorTolerances[ 2 ] * ( ulpTolerance * FLT_EPSILON );
152 }
153 
test_geom_cross_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)154 int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
155 {
156     cl_int error;
157     cl_ulong maxAllocSize, maxGlobalMemSize;
158 
159     error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL );
160     error |= clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( maxGlobalMemSize ), &maxGlobalMemSize, NULL );
161     test_error( error, "Unable to get device config" );
162 
163     log_info("Device supports:\nCL_DEVICE_MAX_MEM_ALLOC_SIZE: %gMB\nCL_DEVICE_GLOBAL_MEM_SIZE: %gMB\n",
164              maxGlobalMemSize/(1024.0*1024.0), maxAllocSize/(1024.0*1024.0));
165 
166     if (maxGlobalMemSize > (cl_ulong)SIZE_MAX) {
167       maxGlobalMemSize = (cl_ulong)SIZE_MAX;
168     }
169 
170     unsigned int size;
171     unsigned int bufSize;
172     unsigned int adjustment;
173     int vecsize;
174 
175     adjustment = 32*1024*1024; /* Try to allocate a bit less than the limits */
176     for(vecsize = 3; vecsize <= 4; ++vecsize)
177     {
178         /* Make sure we adhere to the maximum individual allocation size and global memory size limits. */
179         size = TEST_SIZE;
180         bufSize = sizeof(cl_double) * TEST_SIZE * vecsize;
181 
182         while ((bufSize > (maxAllocSize - adjustment)) || (3*bufSize > (maxGlobalMemSize - adjustment))) {
183             size /= 2;
184             bufSize = sizeof(cl_double) * size * vecsize;
185         }
186 
187         /* Perform the test */
188         clProgramWrapper program;
189         clKernelWrapper kernel;
190         clMemWrapper streams[3];
191         cl_double testVector[4];
192         int error, i;
193         size_t threads[1], localThreads[1];
194         BufferOwningPtr<cl_double> A(malloc(bufSize));
195         BufferOwningPtr<cl_double> B(malloc(bufSize));
196         BufferOwningPtr<cl_double> C(malloc(bufSize));
197         cl_double *inDataA = A;
198         cl_double *inDataB = B;
199         cl_double *outData = C;
200 
201         /* Create kernels */
202         if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSource_doubleV3 : &crossKernelSource_double, "sample_test" ) )
203             return -1;
204 
205         /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
206         for( i = 0; i < size * vecsize; i++ )
207         {
208             inDataA[ i ] = get_random_double( -512.f, 512.f, d );
209             inDataB[ i ] = get_random_double( -512.f, 512.f, d );
210         }
211         fillWithTrickyNumbers_double( inDataA, inDataB, vecsize );
212 
213         streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, bufSize,
214                                     inDataA, NULL);
215         if( streams[0] == NULL )
216         {
217             log_error("ERROR: Creating input array A failed!\n");
218             return -1;
219         }
220         streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, bufSize,
221                                     inDataB, NULL);
222         if( streams[1] == NULL )
223         {
224             log_error("ERROR: Creating input array B failed!\n");
225             return -1;
226         }
227         streams[2] =
228             clCreateBuffer(context, CL_MEM_READ_WRITE, bufSize, NULL, NULL);
229         if( streams[2] == NULL )
230         {
231             log_error("ERROR: Creating output array failed!\n");
232             return -1;
233         }
234 
235         /* Assign streams and execute */
236         for( i = 0; i < 3; i++ )
237         {
238             error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
239             test_error( error, "Unable to set indexed kernel arguments" );
240         }
241 
242         /* Run the kernel */
243         threads[0] = size;
244 
245         error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
246         test_error( error, "Unable to get work group size to use" );
247 
248         error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
249         test_error( error, "Unable to execute test kernel" );
250 
251         /* Now get the results */
252         error = clEnqueueReadBuffer( queue, streams[2], true, 0, bufSize, outData, 0, NULL, NULL );
253         test_error( error, "Unable to read output array!" );
254 
255         /* And verify! */
256         for( i = 0; i < size; i++ )
257         {
258             double errorTolerances[ 4 ];
259             // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
260             cross_product_double( inDataA + i * vecsize, inDataB + i * vecsize, testVector, errorTolerances, 3.f );
261 
262             double errs[] = {   fabs( testVector[ 0 ] - outData[ i * vecsize + 0 ] ),
263                 fabs( testVector[ 1 ] - outData[ i * vecsize + 1 ] ),
264                 fabs( testVector[ 2 ] - outData[ i * vecsize + 2 ] ) };
265 
266             if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
267             {
268                 log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
269                           i, testVector[0], testVector[1], testVector[2], testVector[3],
270                           outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] );
271                 log_error( "    Input: (%a %a %a) and (%a %a %a)\n",
272                           inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ],
273                           inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] );
274                 log_error( "    Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n",
275                           errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] );
276                 log_error("     ulp %g\n", Ulp_Error_Double( outData[ i * vecsize + 1 ], testVector[ 1 ] ) );
277                 return -1;
278             }
279         }
280     }
281     return 0;
282 }
283 
getMaxValue_double(double vecA[],double vecB[],size_t vecSize)284 double getMaxValue_double( double vecA[], double vecB[], size_t vecSize )
285 {
286     double a = fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 0 ] ) );
287     for( size_t i = 1; i < vecSize; i++ )
288         a = fmax( fabs( vecA[ i ] ), fmax( fabs( vecB[ i ] ), a ) );
289     return a;
290 }
291 
292 typedef double (*twoToFloatVerifyFn_double)( double *srcA, double *srcB, size_t vecSize );
293 
test_twoToFloat_kernel_double(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,twoToFloatVerifyFn_double verifyFn,double ulpLimit,MTdata d)294 int test_twoToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
295                                   size_t vecSize, twoToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
296 {
297     clProgramWrapper program;
298     clKernelWrapper kernel;
299     clMemWrapper streams[3];
300     int error;
301     size_t i, threads[1], localThreads[1];
302     char kernelSource[10240];
303     char *programPtr;
304     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
305     BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
306     BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
307     BufferOwningPtr<cl_double> C(malloc(sizeof(cl_double) * TEST_SIZE));
308 
309     cl_double *inDataA = A;
310     cl_double *inDataB = B;
311     cl_double *outData = C;
312 
313     /* Create the source */
314     sprintf( kernelSource, vecSize == 3 ? twoToFloatKernelPattern_doubleV3 : twoToFloatKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
315 
316     /* Create kernels */
317     programPtr = kernelSource;
318     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
319         return -1;
320 
321     /* Generate some streams */
322     for( i = 0; i < TEST_SIZE * vecSize; i++ )
323     {
324         inDataA[ i ] = any_double(d);
325         inDataB[ i ] = any_double(d);
326     }
327     fillWithTrickyNumbers_double( inDataA, inDataB, vecSize );
328 
329 
330     streams[0] =
331         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
332                        sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
333     if( streams[0] == NULL )
334     {
335         log_error("ERROR: Creating input array A failed!\n");
336         return -1;
337     }
338     streams[1] =
339         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
340                        sizeof(cl_double) * vecSize * TEST_SIZE, inDataB, NULL);
341     if( streams[1] == NULL )
342     {
343         log_error("ERROR: Creating input array B failed!\n");
344         return -1;
345     }
346     streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
347                                 sizeof(cl_double) * TEST_SIZE, NULL, NULL);
348     if( streams[2] == NULL )
349     {
350         log_error("ERROR: Creating output array failed!\n");
351         return -1;
352     }
353 
354     /* Assign streams and execute */
355     for( i = 0; i < 3; i++ )
356     {
357         error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]);
358         test_error( error, "Unable to set indexed kernel arguments" );
359     }
360 
361     /* Run the kernel */
362     threads[0] = TEST_SIZE;
363 
364     error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
365     test_error( error, "Unable to get work group size to use" );
366 
367     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
368     test_error( error, "Unable to execute test kernel" );
369 
370     /* Now get the results */
371     error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
372     test_error( error, "Unable to read output array!" );
373 
374     /* And verify! */
375     for( i = 0; i < TEST_SIZE; i++ )
376     {
377         double expected = verifyFn( inDataA + i * vecSize, inDataB + i * vecSize, vecSize );
378         if( (double) expected != outData[ i ] )
379         {
380             if( isnan(expected) && isnan( outData[i] ) )
381                 continue;
382 
383             if( ulpLimit < 0 )
384             {
385                 // Limit below zero means we need to test via a computed error (like cross product does)
386                 double maxValue =
387                 getMaxValue_double( inDataA + i * vecSize, inDataB + i * vecSize, vecSize );
388 
389                 // In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors)
390                 double errorTolerance = maxValue * maxValue * ( 2.f * (double)vecSize - 1.f ) * FLT_EPSILON;
391 
392                 // Limit below zero means test via epsilon instead
393                 double error = fabs( (double)expected - (double)outData[ i ] );
394                 if( error > errorTolerance )
395                 {
396 
397                     log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n",
398                               (int)i, (int)vecSize, expected,
399                               outData[ i ],
400                               inDataA[i*vecSize],
401                               inDataB[i*vecSize],
402                               (double)error,
403                               (double)errorTolerance );
404 
405                     char vecA[1000], vecB[1000];
406                     vector2string_double( vecA, inDataA + i * vecSize, vecSize );
407                     vector2string_double( vecB, inDataB + i * vecSize, vecSize );
408                     log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB );
409                     return -1;
410                 }
411             }
412             else
413             {
414                 double error = Ulp_Error_Double( outData[ i ],
415                                                 expected );
416                 if( fabs(error) > ulpLimit )
417                 {
418                     log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n",
419                               (int)i, (int)vecSize, expected,
420                               outData[ i ],
421                               inDataA[i*vecSize],
422                               inDataB[i*vecSize],
423                               error );
424 
425                     char vecA[1000], vecB[1000];
426                     vector2string_double( vecA, inDataA + i * vecSize, vecSize );
427                     vector2string_double( vecB, inDataB + i * vecSize, vecSize );
428                     log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB );
429                     return -1;
430                 }
431             }
432         }
433     }
434     return 0;
435 }
436 
verifyDot_double(double * srcA,double * srcB,size_t vecSize)437 double verifyDot_double( double *srcA, double *srcB, size_t vecSize )
438 {
439     double total = 0.f;
440 
441     for( unsigned int i = 0; i < vecSize; i++ )
442         total += (double)srcA[ i ] * (double)srcB[ i ];
443 
444     return total;
445 }
446 
test_geom_dot_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)447 int test_geom_dot_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
448 {
449     size_t sizes[] = { 1, 2, 3, 4, 0 };
450     unsigned int size;
451     int retVal = 0;
452 
453 
454     for( size = 0; sizes[ size ] != 0 ; size++ )
455     {
456         if( test_twoToFloat_kernel_double( queue, context, "dot", sizes[ size ], verifyDot_double, -1.0f /*magic value*/, d ) != 0 )
457         {
458             log_error( "   dot double vector size %d FAILED\n", (int)sizes[ size ] );
459             retVal = -1;
460         }
461     }
462     return retVal;
463 }
464 
465 
test_geom_fast_distance_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)466 int test_geom_fast_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
467 {
468     size_t sizes[] = { 1, 2, 3, 4, 0 };
469     unsigned int size;
470     int retVal = 0;
471 
472     abort();    //there is no double precision fast_distance
473 
474     for( size = 0; sizes[ size ] != 0 ; size++ )
475     {
476         double maxUlps = 8192.0f +                           // error in sqrt
477         0.5f *                              // effect on e of taking sqrt( x + e )
478         ( 1.5f * (double) sizes[size] +      // cumulative error for multiplications  (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
479          0.5f * (double) (sizes[size]-1));    // cumulative error for additions
480 
481         if( test_twoToFloat_kernel_double( queue, context, "fast_distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 )
482         {
483             log_error( "   fast_distance double vector size %d FAILED\n", (int)sizes[ size ] );
484             retVal = -1;
485         }
486         else
487         {
488             log_info( "   fast_distance double vector size %d passed\n", (int)sizes[ size ] );
489         }
490     }
491     return retVal;
492 }
493 
494 
verifyDistance_double(double * srcA,double * srcB,size_t vecSize)495 double verifyDistance_double( double *srcA, double *srcB, size_t vecSize )
496 {
497     unsigned int i;
498     double diff[4];
499 
500     for( i = 0; i < vecSize; i++ )
501         diff[i] = srcA[i] - srcB[i];
502 
503     return verifyLength_double( diff, vecSize );
504 }
505 
test_geom_distance_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)506 int test_geom_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
507 {
508     size_t sizes[] = { 1, 2, 3, 4, 0 };
509     unsigned int size;
510     int retVal = 0;
511 
512     for( size = 0; sizes[ size ] != 0 ; size++ )
513     {
514         double maxUlps = 3.0f +                              // error in sqrt
515         0.5f *                              // effect on e of taking sqrt( x + e )
516         ( 1.5f * (double) sizes[size] +      // cumulative error for multiplications  (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication
517          0.5f * (double) (sizes[size]-1));    // cumulative error for additions
518 
519         maxUlps *= 2.0;         // our reference code may be in error too
520 
521         if( test_twoToFloat_kernel_double( queue, context, "distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 )
522         {
523             log_error( "   distance double vector size %d FAILED\n", (int)sizes[ size ] );
524             retVal = -1;
525         }
526         else
527         {
528             log_info( "   distance double vector size %d passed\n", (int)sizes[ size ] );
529         }
530     }
531     return retVal;
532 }
533 
534 typedef double (*oneToFloatVerifyFn_double)( double *srcA, size_t vecSize );
535 
test_oneToFloat_kernel_double(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,oneToFloatVerifyFn_double verifyFn,double ulpLimit,MTdata d)536 int test_oneToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
537                                   size_t vecSize, oneToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d )
538 {
539     clProgramWrapper program;
540     clKernelWrapper kernel;
541     clMemWrapper streams[2];
542     BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
543     BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE));
544     int error;
545     size_t i, threads[1], localThreads[1];
546     char kernelSource[10240];
547     char *programPtr;
548     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
549     cl_double *inDataA = A;
550     cl_double *outData = B;
551 
552     /* Create the source */
553     sprintf( kernelSource, vecSize == 3 ? oneToFloatKernelPattern_doubleV3 : oneToFloatKernelPattern_double, sizeNames[vecSize-1], fnName );
554 
555     /* Create kernels */
556     programPtr = kernelSource;
557     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
558         return -1;
559 
560     /* Generate some streams */
561     for( i = 0; i < TEST_SIZE * vecSize; i++ )
562         inDataA[ i ] = any_double(d);
563 
564     fillWithTrickyNumbers_double( inDataA, NULL, vecSize );
565 
566     streams[0] =
567         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
568                        sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
569     if( streams[0] == NULL )
570     {
571         log_error("ERROR: Creating input array A failed!\n");
572         return -1;
573     }
574     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
575                                 sizeof(cl_double) * TEST_SIZE, NULL, NULL);
576     if( streams[1] == NULL )
577     {
578         log_error("ERROR: Creating output array failed!\n");
579         return -1;
580     }
581 
582     /* Assign streams and execute */
583     error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] );
584     test_error( error, "Unable to set indexed kernel arguments" );
585     error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] );
586     test_error( error, "Unable to set indexed kernel arguments" );
587 
588     /* Run the kernel */
589     threads[0] = TEST_SIZE;
590 
591     error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
592     test_error( error, "Unable to get work group size to use" );
593 
594     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
595     test_error( error, "Unable to execute test kernel" );
596 
597     /* Now get the results */
598     error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL );
599     test_error( error, "Unable to read output array!" );
600 
601     /* And verify! */
602     for( i = 0; i < TEST_SIZE; i++ )
603     {
604         double expected = verifyFn( inDataA + i * vecSize, vecSize );
605         if( (double) expected != outData[ i ] )
606         {
607             double ulps = Ulp_Error_Double( outData[i], expected );
608             if( fabs( ulps ) <= ulpLimit )
609                 continue;
610 
611             // We have to special case NAN
612             if( isnan( outData[ i ] ) && isnan( expected ) )
613                 continue;
614 
615             if(! (fabs(ulps) < ulpLimit) )
616             {
617                 log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n",
618                           (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], ulps );
619                 char vecA[1000];
620                 vector2string_double( vecA, inDataA + i * vecSize, vecSize );
621                 log_error( "\tvector: %s", vecA );
622                 return -1;
623             }
624         }
625     }
626 
627     return 0;
628 }
629 
verifyLength_double(double * srcA,size_t vecSize)630 double verifyLength_double( double *srcA, size_t vecSize )
631 {
632     double total = 0;
633     unsigned int i;
634 
635     // We calculate the distance as a double, to try and make up for the fact that
636     // the GPU has better precision distance since it's a single op
637     for( i = 0; i < vecSize; i++ )
638         total += srcA[i] * srcA[i];
639 
640     // Deal with spurious overflow
641     if( total == INFINITY )
642     {
643         total = 0.0;
644         for( i = 0; i < vecSize; i++ )
645         {
646             double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p-600, 0x1LL, -600);
647             total += f * f;
648         }
649 
650         return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p600, 0x1LL, 600);
651     }
652 
653     // Deal with spurious underflow
654     if( total < 4 /*max vector length*/ * DBL_MIN / DBL_EPSILON )
655     {
656         total = 0.0;
657         for( i = 0; i < vecSize; i++ )
658         {
659             double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700);
660             total += f * f;
661         }
662 
663         return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p-700, 0x1LL, -700);
664     }
665 
666     return sqrt( total );
667 }
668 
test_geom_length_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)669 int test_geom_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
670 {
671     size_t sizes[] = { 1, 2, 3, 4, 0 };
672     unsigned int size;
673     int retVal = 0;
674 
675     for( size = 0; sizes[ size ] != 0 ; size++ )
676     {
677         double maxUlps = 3.0f +                              // error in sqrt
678         0.5f *                              // effect on e of taking sqrt( x + e )
679         ( 0.5f * (double) sizes[size] +      // cumulative error for multiplications
680          0.5f * (double) (sizes[size]-1));    // cumulative error for additions
681 
682         maxUlps *= 2.0;         // our reference code may be in error too
683         if( test_oneToFloat_kernel_double( queue, context, "length", sizes[ size ], verifyLength_double, maxUlps, d ) != 0 )
684         {
685             log_error( "   length double vector size %d FAILED\n", (int)sizes[ size ] );
686             retVal = -1;
687         }
688         else
689         {
690             log_info( "   length double vector size %d passed\n", (int)sizes[ size ] );
691         }
692     }
693     return retVal;
694 }
695 
696 
verifyFastLength_double(double * srcA,size_t vecSize)697 double verifyFastLength_double( double *srcA, size_t vecSize )
698 {
699     double total = 0;
700     unsigned int i;
701 
702     // We calculate the distance as a double, to try and make up for the fact that
703     // the GPU has better precision distance since it's a single op
704     for( i = 0; i < vecSize; i++ )
705     {
706         total += (double)srcA[i] * (double)srcA[i];
707     }
708 
709     return sqrt( total );
710 }
711 
test_geom_fast_length_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)712 int test_geom_fast_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
713 {
714     size_t sizes[] = { 1, 2, 3, 4, 0 };
715     unsigned int size;
716     int retVal = 0;
717 
718     abort();    //there is no double precision fast_length
719 
720     for( size = 0; sizes[ size ] != 0 ; size++ )
721     {
722         double maxUlps = 8192.0f +                           // error in half_sqrt
723         0.5f *                              // effect on e of taking sqrt( x + e )
724         ( 0.5f * (double) sizes[size] +      // cumulative error for multiplications
725          0.5f * (double) (sizes[size]-1));    // cumulative error for additions
726 
727         if( test_oneToFloat_kernel_double( queue, context, "fast_length", sizes[ size ], verifyFastLength_double, maxUlps, d ) != 0 )
728         {
729             log_error( "   fast_length double vector size %d FAILED\n", (int)sizes[ size ] );
730             retVal = -1;
731         }
732         else
733         {
734             log_info( "   fast_length double vector size %d passed\n", (int)sizes[ size ] );
735         }
736     }
737     return retVal;
738 }
739 
740 
741 typedef void (*oneToOneVerifyFn_double)( double *srcA, double *dstA, size_t vecSize );
742 
test_oneToOne_kernel_double(cl_command_queue queue,cl_context context,const char * fnName,size_t vecSize,oneToOneVerifyFn_double verifyFn,double ulpLimit,MTdata d)743 int test_oneToOne_kernel_double(cl_command_queue queue, cl_context context, const char *fnName,
744                                 size_t vecSize, oneToOneVerifyFn_double verifyFn, double ulpLimit, MTdata d )
745 {
746     clProgramWrapper program;
747     clKernelWrapper kernel;
748     clMemWrapper streams[2];
749     BufferOwningPtr<cl_double> A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
750     BufferOwningPtr<cl_double> B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize));
751     int error;
752     size_t i, j, threads[1], localThreads[1];
753     char kernelSource[10240];
754     char *programPtr;
755     char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
756     cl_double *inDataA = A;
757     cl_double *outData = B;
758 
759     /* Create the source */
760     sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPattern_doubleV3 : oneToOneKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName );
761 
762     /* Create kernels */
763     programPtr = kernelSource;
764     if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) )
765         return -1;
766 
767     /* initialize data */
768     memset( inDataA, 0, vecSize * sizeof( cl_double ) );
769     for( i = vecSize; i < TEST_SIZE * vecSize; i++ )
770         inDataA[ i ] = any_double(d);
771 
772 
773     streams[0] =
774         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
775                        sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL);
776     if( streams[0] == NULL )
777     {
778         log_error("ERROR: Creating input array A failed!\n");
779         return -1;
780     }
781     streams[1] =
782         clCreateBuffer(context, CL_MEM_READ_WRITE,
783                        sizeof(cl_double) * vecSize * TEST_SIZE, NULL, NULL);
784     if( streams[1] == NULL )
785     {
786         log_error("ERROR: Creating output array failed!\n");
787         return -1;
788     }
789 
790     /* Assign streams and execute */
791     error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] );
792     test_error( error, "Unable to set indexed kernel arguments" );
793     error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] );
794     test_error( error, "Unable to set indexed kernel arguments" );
795 
796     /* Run the kernel */
797     threads[0] = TEST_SIZE;
798 
799     error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] );
800     test_error( error, "Unable to get work group size to use" );
801 
802     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
803     test_error( error, "Unable to execute test kernel" );
804 
805     /* Now get the results */
806     error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE * vecSize, outData, 0, NULL, NULL );
807     test_error( error, "Unable to read output array!" );
808 
809     /* And verify! */
810     for( i = 0; i < TEST_SIZE; i++ )
811     {
812         double expected[4];
813         verifyFn( inDataA + i * vecSize, expected, vecSize );
814         for( j = 0; j < vecSize; j++ )
815         {
816             // We have to special case NAN
817             if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) )
818                 continue;
819 
820             if( expected[j] != outData[ i *vecSize+j ] )
821             {
822                 double error =
823                 Ulp_Error_Double( outData[i*vecSize + j ], expected[ j ] );
824                 if( fabs(error) > ulpLimit )
825                 {
826                     log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n",
827                               (int)i, (int)j, (int)vecSize,
828                               expected[j], expected[j],
829                               outData[i*vecSize +j],
830                               outData[i*vecSize +j], error );
831                     log_error( "       Source: " );
832                     for( size_t q = 0; q < vecSize; q++ )
833                         log_error( "%g ", inDataA[ i * vecSize + q ] );
834                     log_error( "\n             : " );
835                     for( size_t q = 0; q < vecSize; q++ )
836                         log_error( "%a ", inDataA[ i * vecSize + q ] );
837                     log_error( "\n" );
838                     log_error( "       Result: " );
839                     for( size_t q = 0; q < vecSize; q++ )
840                         log_error( "%g ", outData[i * vecSize + q ] );
841                     log_error( "\n             : " );
842                     for( size_t q = 0; q < vecSize; q++ )
843                         log_error( "%a ", outData[i * vecSize + q ] );
844                     log_error( "\n" );
845                     log_error( "       Expected: " );
846                     for( size_t q = 0; q < vecSize; q++ )
847                         log_error( "%g ", expected[ q ] );
848                     log_error( "\n             : " );
849                     for( size_t q = 0; q < vecSize; q++ )
850                         log_error( "%a ", expected[ q ] );
851                     log_error( "\n" );
852                     return -1;
853                 }
854             }
855         }
856     }
857 
858     return 0;
859 }
860 
verifyNormalize_double(double * srcA,double * dst,size_t vecSize)861 void verifyNormalize_double( double *srcA, double *dst, size_t vecSize )
862 {
863     double total = 0, value;
864     unsigned int i;
865 
866     // We calculate everything as a double, to try and make up for the fact that
867     // the GPU has better precision distance since it's a single op
868     for( i = 0; i < vecSize; i++ )
869         total += (double)srcA[i] * (double)srcA[i];
870 
871     if( total < vecSize * DBL_MIN / DBL_EPSILON )
872     { //we may have incurred denormalization loss -- rescale
873         total = 0;
874         for( i = 0; i < vecSize; i++ )
875         {
876             dst[i] = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700);  //exact
877             total += dst[i] * dst[i];
878         }
879 
880         //If still zero
881         if( total == 0.0 )
882         {
883             // Special edge case: copy vector over without change
884             for( i = 0; i < vecSize; i++ )
885                 dst[i] = srcA[i];
886             return;
887         }
888 
889         srcA = dst;
890     }
891     else if( total == INFINITY )
892     { //we may have incurred spurious overflow
893         double scale = MAKE_HEX_DOUBLE(0x1.0p-512, 0x1LL, -512) / vecSize;
894         total = 0;
895         for( i = 0; i < vecSize; i++ )
896         {
897             dst[i] = srcA[i] * scale;  //exact
898             total += dst[i] * dst[i];
899         }
900 
901         // If there are infinities here, handle those
902         if( total == INFINITY )
903         {
904             total = 0;
905             for( i = 0; i < vecSize; i++ )
906                 {
907                 if( isinf(dst[i]) )
908                 {
909                     dst[i] = copysign( 1.0, srcA[i] );
910                     total += 1.0;
911                 }
912                 else
913                     dst[i] = copysign( 0.0, srcA[i] );
914         }
915         }
916 
917         srcA = dst;
918     }
919 
920     value = sqrt( total );
921 
922     for( i = 0; i < vecSize; i++ )
923         dst[i] = srcA[i] / value;
924 }
925 
test_geom_normalize_double(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,MTdata d)926 int test_geom_normalize_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d)
927 {
928     size_t sizes[] = { 1, 2, 3, 4, 0 };
929     unsigned int size;
930     int retVal = 0;
931 
932     for( size = 0; sizes[ size ] != 0 ; size++ )
933     {
934         double maxUlps = 2.5f +                              // error in rsqrt + error in multiply
935         0.5f *                               // effect on e of taking sqrt( x + e )
936         ( 0.5f * (double) sizes[size] +      // cumulative error for multiplications
937          0.5f * (double) (sizes[size]-1));    // cumulative error for additions
938 
939         maxUlps *= 2.0; //our reference code is not infinitely precise and may have error of its own
940         if( test_oneToOne_kernel_double( queue, context, "normalize", sizes[ size ], verifyNormalize_double, maxUlps, d ) != 0 )
941         {
942             log_error( "   normalize double vector size %d FAILED\n", (int)sizes[ size ] );
943             retVal = -1;
944         }
945         else
946         {
947             log_info( "   normalize double vector size %d passed\n", (int)sizes[ size ] );
948         }
949     }
950     return retVal;
951 }
952 
953 
954 
955 
956 
957