• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <string.h>
19 #include <stdio.h>
20 
21 #if !defined(_WIN32)
22 #include <libgen.h>
23 #include <sys/param.h>
24 #endif
25 
26 #include "mingw_compat.h"
27 #if defined (__MINGW32__)
28 #include <sys/param.h>
29 #endif
30 
31 #include <time.h>
32 #include "errorHelpers.h"
33 #include "harness/compat.h"
34 #include "harness/mt19937.h"
35 #include "harness/kernelHelpers.h"
36 #include "harness/rounding_mode.h"
37 #include "harness/fpcontrol.h"
38 #include "harness/testHarness.h"
39 #include "harness/parseParameters.h"
40 #if defined( __APPLE__ )
41 #include <sys/sysctl.h>
42 #endif
43 #if defined( __linux__ )
44 #include <unistd.h>
45 #include <sys/syscall.h>
46 #include <linux/sysctl.h>
47 #endif
48 
49 #if defined (_WIN32)
50 #include <string.h>
51 #endif
52 
53 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
54 #include <emmintrin.h>
55 #endif
56 
57 #if defined(__PPC__)
58 // Global varaiable used to hold the FPU control register state. The FPSCR register can not
59 // be used because not all Power implementations retain or observed the NI (non-IEEE
60 // mode) bit.
61 __thread fpu_control_t fpu_control = 0;
62 #endif
63 
64 #ifndef MAXPATHLEN
65 #define MAXPATHLEN  2048
66 #endif
67 
68 char                appName[ MAXPATHLEN ] = "";
69 cl_context          gContext = NULL;
70 cl_command_queue    gQueue = NULL;
71 cl_program          gProgram[5] = { NULL, NULL, NULL, NULL, NULL };
72 cl_program          gProgram_double[5] = { NULL, NULL, NULL, NULL, NULL };
73 int                 gForceFTZ = 0;
74 int                 gSeed = 0;
75 int                 gSeedSpecified = 0;
76 int                 gHasDouble = 0;
77 MTdata              gMTdata = NULL;
78 int                 gSkipNanInf = 0;
79 int                     gIgnoreZeroSign = 0;
80 
81 cl_mem              bufA = NULL;
82 cl_mem              bufB = NULL;
83 cl_mem              bufC = NULL;
84 cl_mem              bufD = NULL;
85 cl_mem              bufE = NULL;
86 cl_mem              bufC_double = NULL;
87 cl_mem              bufD_double = NULL;
88 float               *buf1, *buf2, *buf3, *buf4, *buf5, *buf6;
89 float               *correct[8];
90 int                     *skipTest[8];
91 
92 double              *buf3_double, *buf4_double, *buf5_double, *buf6_double;
93 double              *correct_double[8];
94 
95 static const char   **gArgList;
96 static size_t       gArgCount;
97 
98 #define BUFFER_SIZE         (1024*1024)
99 
100 
101 static int ParseArgs( int argc, const char **argv );
102 static void PrintUsage( void );
103 test_status InitCL( cl_device_id device );
104 static void ReleaseCL( void );
105 static int RunTest( int testNumber );
106 static int RunTest_Double( int testNumber );
107 
108 #if defined(__ANDROID__)
109 #define nanf( X ) strtof( "NAN", ( char ** ) NULL )
110 #define nan( X )  strtod( "NAN", ( char ** ) NULL )
111 #endif
112 
113 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
114 // defeat x87 on MSVC
sse_add(float x,float y)115 float sse_add(float x, float y)
116 {
117     volatile float a = x;
118     volatile float b = y;
119 
120     // defeat x87
121     __m128 va = _mm_set_ss( (float) a );
122     __m128 vb = _mm_set_ss( (float) b );
123     va = _mm_add_ss( va, vb );
124     _mm_store_ss( (float*) &a, va );
125     return a;
126 }
127 
sse_add_sd(double x,double y)128 double sse_add_sd(double x, double y)
129 {
130     volatile double a = x;
131     volatile double b = y;
132 
133     // defeat x87
134     __m128d va = _mm_set_sd( (double) a );
135     __m128d vb = _mm_set_sd( (double) b );
136     va = _mm_add_sd( va, vb );
137     _mm_store_sd( (double*) &a, va );
138     return a;
139 }
140 
sse_sub(float x,float y)141 float sse_sub(float x, float y)
142 {
143     volatile float a = x;
144     volatile float b = y;
145 
146     // defeat x87
147     __m128 va = _mm_set_ss( (float) a );
148     __m128 vb = _mm_set_ss( (float) b );
149     va = _mm_sub_ss( va, vb );
150     _mm_store_ss( (float*) &a, va );
151     return a;
152 }
153 
sse_sub_sd(double x,double y)154 double sse_sub_sd(double x, double y)
155 {
156     volatile double a = x;
157     volatile double b = y;
158 
159     // defeat x87
160     __m128d va = _mm_set_sd( (double) a );
161     __m128d vb = _mm_set_sd( (double) b );
162     va = _mm_sub_sd( va, vb );
163     _mm_store_sd( (double*) &a, va );
164     return a;
165 }
166 
sse_mul(float x,float y)167 float sse_mul(float x, float y)
168 {
169     volatile float a = x;
170     volatile float b = y;
171 
172     // defeat x87
173     __m128 va = _mm_set_ss( (float) a );
174     __m128 vb = _mm_set_ss( (float) b );
175     va = _mm_mul_ss( va, vb );
176     _mm_store_ss( (float*) &a, va );
177     return a;
178 }
179 
sse_mul_sd(double x,double y)180 double sse_mul_sd(double x, double y)
181 {
182     volatile double a = x;
183     volatile double b = y;
184 
185     // defeat x87
186     __m128d va = _mm_set_sd( (double) a );
187     __m128d vb = _mm_set_sd( (double) b );
188     va = _mm_mul_sd( va, vb );
189     _mm_store_sd( (double*) &a, va );
190     return a;
191 }
192 #endif
193 
194 #ifdef __PPC__
ppc_mul(float a,float b)195 float ppc_mul(float a, float b)
196 {
197     float p;
198 
199     if (gForceFTZ) {
200         // Flush input a to zero if it is sub-normal
201         if (fabsf(a) < FLT_MIN) {
202             a = copysignf(0.0, a);
203         }
204         // Flush input b to zero if it is sub-normal
205         if (fabsf(b) < FLT_MIN) {
206             b = copysignf(0.0, b);
207         }
208         // Perform multiply
209         p = a * b;
210         // Flush the product if it is a sub-normal
211         if (fabs((double)a * (double)b) < FLT_MIN) {
212             p = copysignf(0.0, p);
213         }
214     } else {
215         p = a * b;
216     }
217     return p;
218 }
219 #endif
220 
test_contractions_float_0(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)221 int test_contractions_float_0(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
222 {
223     return RunTest(0);
224 }
225 
test_contractions_float_1(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)226 int test_contractions_float_1(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
227 {
228     return RunTest(1);
229 }
230 
test_contractions_float_2(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)231 int test_contractions_float_2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
232 {
233     return RunTest(2);
234 }
235 
test_contractions_float_3(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)236 int test_contractions_float_3(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
237 {
238     return RunTest(3);
239 }
240 
test_contractions_float_4(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)241 int test_contractions_float_4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
242 {
243     return RunTest(4);
244 }
245 
test_contractions_float_5(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)246 int test_contractions_float_5(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
247 {
248     return RunTest(5);
249 }
250 
test_contractions_float_6(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)251 int test_contractions_float_6(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
252 {
253     return RunTest(6);
254 }
255 
test_contractions_float_7(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)256 int test_contractions_float_7(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
257 {
258     return RunTest(7);
259 }
260 
test_contractions_double_0(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)261 int test_contractions_double_0(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
262 {
263     return RunTest_Double(0);
264 }
265 
test_contractions_double_1(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)266 int test_contractions_double_1(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
267 {
268     return RunTest_Double(1);
269 }
270 
test_contractions_double_2(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)271 int test_contractions_double_2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
272 {
273     return RunTest_Double(2);
274 }
275 
test_contractions_double_3(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)276 int test_contractions_double_3(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
277 {
278     return RunTest_Double(3);
279 }
280 
test_contractions_double_4(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)281 int test_contractions_double_4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
282 {
283     return RunTest_Double(4);
284 }
285 
test_contractions_double_5(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)286 int test_contractions_double_5(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
287 {
288     return RunTest_Double(5);
289 }
290 
test_contractions_double_6(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)291 int test_contractions_double_6(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
292 {
293     return RunTest_Double(6);
294 }
295 
test_contractions_double_7(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)296 int test_contractions_double_7(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
297 {
298     return RunTest_Double(7);
299 }
300 
301 test_definition test_list[] = {
302     ADD_TEST( contractions_float_0 ),
303     ADD_TEST( contractions_float_1 ),
304     ADD_TEST( contractions_float_2 ),
305     ADD_TEST( contractions_float_3 ),
306     ADD_TEST( contractions_float_4 ),
307     ADD_TEST( contractions_float_5 ),
308     ADD_TEST( contractions_float_6 ),
309     ADD_TEST( contractions_float_7 ),
310     ADD_TEST( contractions_double_0 ),
311     ADD_TEST( contractions_double_1 ),
312     ADD_TEST( contractions_double_2 ),
313     ADD_TEST( contractions_double_3 ),
314     ADD_TEST( contractions_double_4 ),
315     ADD_TEST( contractions_double_5 ),
316     ADD_TEST( contractions_double_6 ),
317     ADD_TEST( contractions_double_7 ),
318 };
319 
320 const int test_num = ARRAY_SIZE( test_list );
321 
main(int argc,const char ** argv)322 int main( int argc, const char **argv )
323 {
324     argc = parseCustomParam(argc, argv);
325     if (argc == -1)
326     {
327         return -1;
328     }
329 
330     int error = ParseArgs( argc, argv );
331 
332     if( !error )
333     {
334         error = runTestHarnessWithCheck( gArgCount, gArgList, test_num, test_list, true, 0, InitCL );
335     }
336 
337     if( gQueue )
338     {
339         int flush_error = clFinish( gQueue );
340         if( flush_error )
341             log_error( "clFinish failed: %d\n", flush_error );
342     }
343 
344     ReleaseCL();
345     free( gArgList );
346 
347     return error;
348 }
349 
350 
351 
ParseArgs(int argc,const char ** argv)352 static int ParseArgs( int argc, const char **argv )
353 {
354     gArgList = (const char **)calloc( argc, sizeof( char*) );
355 
356     if( NULL == gArgList )
357     {
358         vlog_error( "Failed to allocate memory for argList\n" );
359         return 1;
360     }
361 
362     gArgList[0] = argv[0];
363     gArgCount = 1;
364 
365     int length_of_seed = 0;
366 
367     { // Extract the app name
368         strncpy( appName, argv[0], MAXPATHLEN );
369 
370 #if (defined( __APPLE__ ) || defined(__linux__) || defined(__MINGW32__))
371         char baseName[MAXPATHLEN];
372         char *base = NULL;
373         strncpy( baseName, argv[0], MAXPATHLEN );
374         base = basename( baseName );
375         if( NULL != base )
376         {
377             strncpy( appName, base, sizeof( appName )  );
378             appName[ sizeof( appName ) -1 ] = '\0';
379         }
380 #elif defined (_WIN32)
381         char fname[_MAX_FNAME + _MAX_EXT + 1];
382         char ext[_MAX_EXT];
383 
384         errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
385                                    fname, _MAX_FNAME, ext, _MAX_EXT );
386         if (err == 0) { // no error
387             strcat (fname, ext); //just cat them, size of frame can keep both
388             strncpy (appName, fname, sizeof(appName));
389             appName[ sizeof( appName ) -1 ] = '\0';
390         }
391 #endif
392     }
393 
394     for( int i = 1; i < argc; i++ )
395     {
396         const char *arg = argv[i];
397         if( NULL == arg )
398             break;
399 
400         if( arg[0] == '-' )
401         {
402             while( arg[1] != '\0' )
403             {
404                 arg++;
405                 switch( *arg )
406                 {
407                     case 'h':
408                         PrintUsage();
409                         return -1;
410 
411                     case 's':
412                         arg++;
413                         gSeed = atoi( arg );
414                         while (arg[length_of_seed] >='0' && arg[length_of_seed]<='9')
415                             length_of_seed++;
416                         gSeedSpecified = 1;
417                         arg+=length_of_seed-1;
418                         break;
419 
420                     case 'z':
421                         gForceFTZ ^= 1;
422                         break;
423 
424                     default:
425                         vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
426                         PrintUsage();
427                         return -1;
428                 }
429             }
430         }
431         else
432         {
433             gArgList[gArgCount] = arg;
434             gArgCount++;
435         }
436     }
437     vlog( "\n\nTest binary built %s %s\n", __DATE__, __TIME__ );
438 
439     PrintArch();
440 
441     return 0;
442 }
443 
PrintUsage(void)444 static void PrintUsage( void )
445 {
446     vlog( "%s [-z]: <optional: test names>\n", appName );
447     vlog( "\tOptions:\n" );
448     vlog( "\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by device capabilities by default.)\n" );
449     vlog( "\t\t-sNUMBER set random seed.\n");
450     vlog( "\n" );
451     vlog( "\tTest names:\n" );
452     for( int i = 0; i < test_num; i++ )
453     {
454         vlog( "\t\t%s\n", test_list[i].name );
455     }
456 }
457 
458 const char *sizeNames[] = { "float", "float2", "float4", "float8", "float16" };
459 const char *sizeNames_double[] = { "double", "double2", "double4", "double8", "double16" };
460 
InitCL(cl_device_id device)461 test_status InitCL( cl_device_id device )
462 {
463     int error;
464     uint32_t i, j;
465     int *bufSkip = NULL;
466     int isRTZ = 0;
467     RoundingMode oldRoundMode = kDefaultRoundingMode;
468 
469     cl_device_fp_config floatCapabilities = 0;
470     if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities, NULL)))
471         floatCapabilities = 0;
472     if(0 == (CL_FP_DENORM & floatCapabilities) )
473         gForceFTZ ^= 1;
474 
475     // check for cl_khr_fp64
476     gHasDouble = is_extension_available(device, "cl_khr_fp64" );
477 
478     if(0 == (CL_FP_INF_NAN & floatCapabilities) )
479         gSkipNanInf = 1;
480 
481     // Embedded devices that flush to zero are allowed to have an undefined sign.
482     if (gIsEmbedded && gForceFTZ)
483         gIgnoreZeroSign = 1;
484 
485     gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error );
486     if( NULL == gContext || error )
487     {
488         vlog_error( "clCreateDeviceGroup failed. %d\n", error );
489         return TEST_FAIL;
490     }
491 
492     gQueue = clCreateCommandQueue( gContext, device, 0, &error );
493     if( NULL == gQueue || error )
494     {
495         vlog_error( "clCreateContext failed. %d\n", error );
496         return TEST_FAIL;
497     }
498 
499     // setup input buffers
500     bufA = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
501     bufB = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
502     bufC = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
503     bufD = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
504     bufE = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
505 
506     if( bufA == NULL    ||
507        bufB == NULL    ||
508        bufC == NULL    ||
509        bufD == NULL    ||
510        bufE == NULL    )
511     {
512         vlog_error( "clCreateArray failed for input\n" );
513         return TEST_FAIL;
514     }
515 
516     if( gHasDouble )
517     {
518         bufC_double = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
519         bufD_double = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
520         if( bufC_double == NULL    ||
521            bufD_double == NULL    )
522         {
523             vlog_error( "clCreateArray failed for input DP\n" );
524             return TEST_FAIL;
525         }
526     }
527 
528     const char *kernels[] = {
529         "", "#pragma OPENCL FP_CONTRACT OFF\n"
530         "__kernel void kernel1( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
531         "{\n"
532         "   int i = get_global_id(0);\n"
533         "   out[i] = a[i] * b[i] + c[i];\n"
534         "}\n"
535         "\n"
536         "__kernel void kernel2( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
537         "{\n"
538         "   int i = get_global_id(0);\n"
539         "   out[i] = a[i] * b[i] - c[i];\n"
540         "}\n"
541         "\n"
542         "__kernel void kernel3( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
543         "{\n"
544         "   int i = get_global_id(0);\n"
545         "   out[i] = c[i] + a[i] * b[i];\n"
546         "}\n"
547         "\n"
548         "__kernel void kernel4( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
549         "{\n"
550         "   int i = get_global_id(0);\n"
551         "   out[i] = c[i] - a[i] * b[i];\n"
552         "}\n"
553         "\n"
554         "__kernel void kernel5( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
555         "{\n"
556         "   int i = get_global_id(0);\n"
557         "   out[i] = -(a[i] * b[i] + c[i]);\n"
558         "}\n"
559         "\n"
560         "__kernel void kernel6( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
561         "{\n"
562         "   int i = get_global_id(0);\n"
563         "   out[i] = -(a[i] * b[i] - c[i]);\n"
564         "}\n"
565         "\n"
566         "__kernel void kernel7( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
567         "{\n"
568         "   int i = get_global_id(0);\n"
569         "   out[i] = -(c[i] + a[i] * b[i]);\n"
570         "}\n"
571         "\n"
572         "__kernel void kernel8( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
573         "{\n"
574         "   int i = get_global_id(0);\n"
575         "   out[i] = -(c[i] - a[i] * b[i]);\n"
576         "}\n"
577         "\n" };
578 
579     for (i = 0; i < sizeof(sizeNames) / sizeof(sizeNames[0]); i++)
580     {
581         size_t strCount = sizeof(kernels) / sizeof(kernels[0]);
582         kernels[0] = "";
583 
584         for (j = 2; j < strCount; j += 2) kernels[j] = sizeNames[i];
585         error = create_single_kernel_helper(gContext, &gProgram[i], nullptr,
586                                             strCount, kernels, nullptr);
587         if (CL_SUCCESS != error || nullptr == gProgram[i])
588         {
589             log_error("Error: Unable to create test program! (%s) (in %s:%d)\n",
590                       IGetErrorString(error), __FILE__, __LINE__);
591             return TEST_FAIL;
592         }
593     }
594 
595     if (gHasDouble)
596     {
597         kernels[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
598         for (i = 0; i < sizeof(sizeNames_double) / sizeof(sizeNames_double[0]);
599              i++)
600         {
601             size_t strCount = sizeof(kernels) / sizeof(kernels[0]);
602 
603             for (j = 2; j < strCount; j += 2) kernels[j] = sizeNames_double[i];
604             error = create_single_kernel_helper(gContext, &gProgram_double[i],
605                                                 nullptr, strCount, kernels,
606                                                 nullptr);
607             if (CL_SUCCESS != error || nullptr == gProgram_double[i])
608             {
609                 log_error(
610                     "Error: Unable to create test program! (%s) (in %s:%d)\n",
611                     IGetErrorString(error), __FILE__, __LINE__);
612                 return TEST_FAIL;
613             }
614         }
615     }
616 
617     if( 0 == gSeedSpecified )
618     {
619         time_t currentTime = time( NULL );
620         struct tm *t = localtime(&currentTime);
621         gSeed = t->tm_sec + 60 * ( t->tm_min + 60 * (t->tm_hour + 24 * (t->tm_yday + 365 * t->tm_year)));
622         gSeed = (uint32_t) (((uint64_t) gSeed * (uint64_t) gSeed ) >> 16);
623     }
624     gMTdata = init_genrand( gSeed );
625 
626 
627     // Init bufA and bufB
628     {
629         buf1 = (float *)malloc( BUFFER_SIZE );
630         buf2 = (float *)malloc( BUFFER_SIZE );
631         buf3 = (float *)malloc( BUFFER_SIZE );
632         buf4 = (float *)malloc( BUFFER_SIZE );
633         buf5 = (float *)malloc( BUFFER_SIZE );
634         buf6 = (float *)malloc( BUFFER_SIZE );
635 
636         bufSkip = (int *)malloc( BUFFER_SIZE );
637 
638         if( NULL == buf1 || NULL == buf2 || NULL == buf3 || NULL == buf4 || NULL == buf5 || NULL == buf6 || NULL == bufSkip)
639         {
640             vlog_error( "Out of memory initializing buffers\n" );
641             return TEST_FAIL;
642         }
643         for( i = 0; i < sizeof( correct ) / sizeof( correct[0] ); i++ )
644         {
645             correct[i] = (float *)malloc( BUFFER_SIZE );
646             skipTest[i] = (int *)malloc( BUFFER_SIZE );
647             if(( NULL == correct[i] ) || ( NULL == skipTest[i]))
648             {
649                 vlog_error( "Out of memory initializing buffers 2\n" );
650                 return TEST_FAIL;
651             }
652         }
653 
654         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
655             ((uint32_t*) buf1)[i] = genrand_int32( gMTdata );
656 
657         if( (error = clEnqueueWriteBuffer(gQueue, bufA, CL_FALSE, 0, BUFFER_SIZE, buf1, 0, NULL, NULL) ))
658         {
659             vlog_error( "Failure %d at clEnqueueWriteBuffer1\n", error );
660             return TEST_FAIL;
661         }
662 
663         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
664             ((uint32_t*) buf2)[i] = genrand_int32( gMTdata );
665 
666         if( (error = clEnqueueWriteBuffer(gQueue, bufB, CL_FALSE, 0, BUFFER_SIZE, buf2, 0, NULL, NULL) ))
667         {
668             vlog_error( "Failure %d at clEnqueueWriteBuffer2\n", error );
669             return TEST_FAIL;
670         }
671 
672         void *ftzInfo = NULL;
673         if( gForceFTZ )
674             ftzInfo = FlushToZero();
675         if ((CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device)) && gIsEmbedded) {
676             oldRoundMode = set_round(kRoundTowardZero, kfloat);
677             isRTZ = 1;
678         }
679         float *f = (float*) buf1;
680         float *f2 = (float*) buf2;
681         float *f3 = (float*) buf3;
682         float *f4 = (float*) buf4;
683         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
684         {
685             float q = f[i];
686             float q2 = f2[i];
687 
688             feclearexcept(FE_OVERFLOW);
689 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
690             // VS2005 might use x87 for straight multiplies, and we can't
691             // turn that off
692             f3[i] = sse_mul(q, q2);
693             f4[i] = sse_mul(-q, q2);
694 #elif defined(__PPC__)
695             // None of the current generation PPC processors support HW
696             // FTZ, emulate it in sw.
697             f3[i] = ppc_mul(q, q2);
698             f4[i] = ppc_mul(-q, q2);
699 #else
700             f3[i] = q * q2;
701             f4[i] = -q * q2;
702 #endif
703             // Skip test if the device doesn't support infinities and NaN AND the result overflows
704             // or either input is an infinity of NaN
705             bufSkip[i] = (gSkipNanInf && ((FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW))) ||
706                                           (fabsf(q)  == FLT_MAX) || (q  != q)  ||
707                                           (fabsf(q2) == FLT_MAX) || (q2 != q2)));
708         }
709 
710         if( gForceFTZ )
711             UnFlushToZero(ftzInfo);
712 
713     if (isRTZ)
714       (void)set_round(oldRoundMode, kfloat);
715 
716 
717         if( (error = clEnqueueWriteBuffer(gQueue, bufC, CL_FALSE, 0, BUFFER_SIZE, buf3, 0, NULL, NULL) ))
718         {
719             vlog_error( "Failure %d at clEnqueueWriteBuffer3\n", error );
720             return TEST_FAIL;
721         }
722         if( (error = clEnqueueWriteBuffer(gQueue, bufD, CL_FALSE, 0, BUFFER_SIZE, buf4, 0, NULL, NULL) ))
723         {
724             vlog_error( "Failure %d at clEnqueueWriteBuffer4\n", error );
725             return TEST_FAIL;
726         }
727 
728         // Fill the buffers with NaN
729         float *f5 = (float*) buf5;
730         float nan_val = nanf("");
731         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
732             f5[i] = nan_val;
733 
734         // calculate reference results
735         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
736         {
737             for ( j=0; j<8; j++)
738             {
739                 feclearexcept(FE_OVERFLOW);
740                 switch (j)
741                 {
742 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
743                         // VS2005 might use x87 for straight add/sub, and we can't
744                         // turn that off
745                     case 0:
746                         correct[0][i] = sse_add(buf3[i],buf4[i]); break;
747                     case 1:
748                         correct[1][i] = sse_sub(buf3[i],buf3[i]); break;
749                     case 2:
750                         correct[2][i] = sse_add(buf4[i],buf3[i]); break;
751                     case 3:
752                         correct[3][i] = sse_sub(buf3[i],buf3[i]); break;
753                     case 4:
754                         correct[4][i] = -sse_add(buf3[i],buf4[i]); break;
755                     case 5:
756                         correct[5][i] = -sse_sub(buf3[i],buf3[i]); break;
757                     case 6:
758                         correct[6][i] = -sse_add(buf4[i],buf3[i]); break;
759                     case 7:
760                         correct[7][i] = -sse_sub(buf3[i],buf3[i]); break;
761 #else
762                     case 0:
763                         correct[0][i] = buf3[i] + buf4[i]; break;
764                     case 1:
765                         correct[1][i] = buf3[i] - buf3[i]; break;
766                     case 2:
767                         correct[2][i] = buf4[i] + buf3[i]; break;
768                     case 3:
769                         correct[3][i] = buf3[i] - buf3[i]; break;
770                     case 4:
771                         correct[4][i] = -(buf3[i] + buf4[i]); break;
772                     case 5:
773                         correct[5][i] = -(buf3[i] - buf3[i]); break;
774                     case 6:
775                         correct[6][i] = -(buf4[i] + buf3[i]); break;
776                     case 7:
777                         correct[7][i] = -(buf3[i] - buf3[i]); break;
778 #endif
779                 }
780                 // Further skip test inputs if the device doesn support infinities AND NaNs
781                 // resulting sum overflows
782                 skipTest[j][i] = (bufSkip[i] ||
783                                   (gSkipNanInf && (FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)))));
784 
785 #if defined(__PPC__)
786                 // Since the current Power processors don't emulate flush to zero in HW,
787                 // it must be emulated in SW instead.
788                 if (gForceFTZ)
789                 {
790                     if ((fabsf(correct[j][i]) < FLT_MIN) && (correct[j][i] != 0.0f))
791                         correct[j][i] = copysignf(0.0f, correct[j][i]);
792                 }
793 #endif
794             }
795         }
796         if( gHasDouble )
797         {
798             // Spec requires correct non-flushed results
799             // for doubles. We disable FTZ if this is default on
800             // the platform (like ARM) for reference result computation
801             // It is no-op if platform default is not FTZ (e.g. x86)
802             FPU_mode_type oldMode;
803             DisableFTZ( &oldMode );
804 
805             buf3_double = (double *)malloc( BUFFER_SIZE );
806             buf4_double = (double *)malloc( BUFFER_SIZE );
807             buf5_double = (double *)malloc( BUFFER_SIZE );
808             buf6_double = (double *)malloc( BUFFER_SIZE );
809             if( NULL == buf3_double || NULL == buf4_double || NULL == buf5_double || NULL == buf6_double )
810             {
811                 vlog_error( "Out of memory initializing DP buffers\n" );
812                 return TEST_FAIL;
813             }
814             for( i = 0; i < sizeof( correct_double ) / sizeof( correct_double[0] ); i++ )
815             {
816                 correct_double[i] = (double *)malloc( BUFFER_SIZE );
817                 if( NULL == correct_double[i] )
818                 {
819                     vlog_error( "Out of memory initializing DP buffers 2\n" );
820                     return TEST_FAIL;
821                 }
822             }
823 
824 
825             double *f  = (double*) buf1;
826             double *f2 = (double*) buf2;
827             double *f3 = (double*) buf3_double;
828             double *f4 = (double*) buf4_double;
829             for( i = 0; i < BUFFER_SIZE / sizeof(double); i++ )
830             {
831                 double q = f[i];
832                 double q2 = f2[i];
833 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
834                 // VS2005 might use x87 for straight multiplies, and we can't
835                 // turn that off
836                 f3[i] = sse_mul_sd(q, q2);
837                 f4[i] = sse_mul_sd(-q, q2);
838 #else
839                 f3[i] = q * q2;
840                 f4[i] = -q * q2;
841 #endif
842             }
843 
844             if( (error = clEnqueueWriteBuffer(gQueue, bufC_double, CL_FALSE, 0, BUFFER_SIZE, buf3_double, 0, NULL, NULL) ))
845             {
846                 vlog_error( "Failure %d at clEnqueueWriteBuffer3\n", error );
847                 return TEST_FAIL;
848             }
849             if( (error = clEnqueueWriteBuffer(gQueue, bufD_double, CL_FALSE, 0, BUFFER_SIZE, buf4_double, 0, NULL, NULL) ))
850             {
851                 vlog_error( "Failure %d at clEnqueueWriteBuffer4\n", error );
852                 return TEST_FAIL;
853             }
854 
855             // Fill the buffers with NaN
856             double *f5 = (double*) buf5_double;
857             double nan_val = nanf("");
858             for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
859                 f5[i] = nan_val;
860 
861             // calculate reference results
862             for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
863             {
864 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
865                 // VS2005 might use x87 for straight add/sub, and we can't
866                 // turn that off
867                 correct_double[0][i] = sse_add_sd(buf3_double[i],buf4_double[i]);
868                 correct_double[1][i] = sse_sub_sd(buf3_double[i],buf3_double[i]);
869                 correct_double[2][i] = sse_add_sd(buf4_double[i],buf3_double[i]);
870                 correct_double[3][i] = sse_sub_sd(buf3_double[i],buf3_double[i]);
871                 correct_double[4][i] = -sse_add_sd(buf3_double[i],buf4_double[i]);
872                 correct_double[5][i] = -sse_sub_sd(buf3_double[i],buf3_double[i]);
873                 correct_double[6][i] = -sse_add_sd(buf4_double[i],buf3_double[i]);
874                 correct_double[7][i] = -sse_sub_sd(buf3_double[i],buf3_double[i]);
875 #else
876                 correct_double[0][i] = buf3_double[i] + buf4_double[i];
877                 correct_double[1][i] = buf3_double[i] - buf3_double[i];
878                 correct_double[2][i] = buf4_double[i] + buf3_double[i];
879                 correct_double[3][i] = buf3_double[i] - buf3_double[i];
880                 correct_double[4][i] = -(buf3_double[i] + buf4_double[i]);
881                 correct_double[5][i] = -(buf3_double[i] - buf3_double[i]);
882                 correct_double[6][i] = -(buf4_double[i] + buf3_double[i]);
883                 correct_double[7][i] = -(buf3_double[i] - buf3_double[i]);
884 #endif
885             }
886 
887             // Restore previous FP state since we modified it for
888             // reference result computation (see DisableFTZ call above)
889             RestoreFPState(&oldMode);
890         }
891     }
892 
893     char c[1000];
894     static const char *no_yes[] = { "NO", "YES" };
895     vlog( "\nCompute Device info:\n" );
896     clGetDeviceInfo( device,  CL_DEVICE_NAME, sizeof(c), (void *)&c, NULL);
897     vlog( "\tDevice Name: %s\n", c );
898     clGetDeviceInfo( device,  CL_DEVICE_VENDOR, sizeof(c), (void *)&c, NULL);
899     vlog( "\tVendor: %s\n", c );
900     clGetDeviceInfo( device,  CL_DEVICE_VERSION, sizeof(c), (void *)&c, NULL);
901     vlog( "\tDevice Version: %s\n", c );
902     clGetDeviceInfo( device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
903     vlog( "\tCL C Version: %s\n", c );
904     clGetDeviceInfo( device,  CL_DRIVER_VERSION, sizeof(c), (void *)&c, NULL);
905     vlog( "\tDriver Version: %s\n", c );
906     vlog( "\tSubnormal values supported? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
907     vlog( "\tTesting with FTZ mode ON? %s\n", no_yes[0 != gForceFTZ] );
908     vlog( "\tTesting Doubles? %s\n", no_yes[0 != gHasDouble] );
909     vlog( "\tRandom Number seed: 0x%8.8x\n", gSeed );
910     vlog( "\n\n" );
911 
912     return TEST_PASS;
913 }
914 
ReleaseCL(void)915 static void ReleaseCL( void )
916 {
917     clReleaseMemObject(bufA);
918     clReleaseMemObject(bufB);
919     clReleaseMemObject(bufC);
920     clReleaseMemObject(bufD);
921     clReleaseMemObject(bufE);
922     clReleaseProgram(gProgram[0]);
923     clReleaseProgram(gProgram[1]);
924     clReleaseProgram(gProgram[2]);
925     clReleaseProgram(gProgram[3]);
926     clReleaseProgram(gProgram[4]);
927     if( gHasDouble )
928     {
929         clReleaseMemObject(bufC_double);
930         clReleaseMemObject(bufD_double);
931         clReleaseProgram(gProgram_double[0]);
932         clReleaseProgram(gProgram_double[1]);
933         clReleaseProgram(gProgram_double[2]);
934         clReleaseProgram(gProgram_double[3]);
935         clReleaseProgram(gProgram_double[4]);
936     }
937     clReleaseCommandQueue(gQueue);
938     clReleaseContext(gContext);
939 }
940 
941 
RunTest(int testNumber)942 static int RunTest( int testNumber )
943 {
944     size_t i;
945     int error = 0;
946     cl_mem args[4];
947     float *c;
948     const char *kernelName[] = { "kernel1", "kernel2", "kernel3", "kernel4",
949         "kernel5", "kernel6", "kernel7", "kernel8" };
950     switch( testNumber )
951     {
952         case 0:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;      // a * b + c
953         case 1:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
954         case 2:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
955         case 3:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
956         case 4:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
957         case 5:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
958         case 6:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
959         case 7:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
960         default:
961             vlog_error( "Unknown test case %d passed to RunTest\n", testNumber );
962             return -1;
963     }
964 
965 
966     int vectorSize;
967     for( vectorSize = 0; vectorSize < 5; vectorSize++ )
968     {
969         cl_kernel k = clCreateKernel( gProgram[ vectorSize ], kernelName[ testNumber ], &error );
970         if( NULL == k || error )
971         {
972             vlog_error( "%d) Unable to find kernel \"%s\" for vector size: %d\n", error, kernelName[ testNumber ], 1 << vectorSize );
973             return -2;
974         }
975 
976         // set the kernel args
977         for( i = 0; i < sizeof(args ) / sizeof( args[0]); i++ )
978             if( (error = clSetKernelArg(k, i, sizeof( cl_mem ), args + i) ))
979             {
980                 vlog_error( "Error %d setting kernel arg # %ld\n", error, i );
981                 return error;
982             }
983 
984         // write NaNs to the result array
985         if( (error = clEnqueueWriteBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf5, 0, NULL, NULL) ))
986         {
987             vlog_error( "Failure %d at clWriteArray %d\n", error, testNumber );
988             return error;
989         }
990 
991         // execute the kernel
992         size_t gDim[3] = { BUFFER_SIZE / (sizeof( cl_float ) * (1<<vectorSize)), 0, 0 };
993         if( ((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, gDim, NULL, 0, NULL, NULL) )))
994         {
995             vlog_error( "Got Error # %d trying to execture kernel\n", error );
996             return error;
997         }
998 
999         // read the data back
1000         if( (error = clEnqueueReadBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf6, 0, NULL, NULL ) ))
1001         {
1002             vlog_error( "Failure %d at clReadArray %d\n", error, testNumber );
1003             return error;
1004         }
1005 
1006         // verify results
1007         float *test = (float*) buf6;
1008         float *a = (float*) buf1;
1009         float *b = (float*) buf2;
1010         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
1011         {
1012             if( isnan(test[i]) && isnan(correct[testNumber][i] ) )
1013                 continue;
1014 
1015             if( skipTest[testNumber][i] )
1016                 continue;
1017 
1018             // sign of zero must be correct
1019             if(( ((uint32_t*) test)[i] != ((uint32_t*) correct[testNumber])[i] ) &&
1020                !(gIgnoreZeroSign && (test[i] == 0.0f) && (correct[testNumber][i] == 0.0f)) )
1021             {
1022                 switch( testNumber )
1023                 {
1024                         // Zeros for these should be positive
1025                     case 0:     vlog_error( "%ld) Error for %s %s: %a * %a + %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1026                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1027                     case 1:     vlog_error( "%ld) Error for %s %s: %a * %a - %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1028                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1029                     case 2:     vlog_error( "%ld) Error for %s %s: %a + %a * %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1030                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1031                     case 3:     vlog_error( "%ld) Error for %s %s: %a - %a * %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1032                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1033 
1034                         // Zeros for these should be negative
1035                     case 4:     vlog_error( "%ld) Error for %s %s: -(%a * %a + %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1036                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1037                     case 5:     vlog_error( "%ld) Error for %s %s: -(%a * %a - %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1038                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1039                     case 6:     vlog_error( "%ld) Error for %s %s: -(%a + %a * %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1040                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1041                     case 7:     vlog_error( "%ld) Error for %s %s: -(%a - %a * %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1042                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1043                     default:
1044                         vlog_error( "error: Unknown test number!\n" );
1045                         clReleaseKernel(k);
1046                         return -2;
1047                 }
1048             }
1049         }
1050 
1051         clReleaseKernel(k);
1052     }
1053 
1054     return error;
1055 }
1056 
RunTest_Double(int testNumber)1057 static int RunTest_Double( int testNumber )
1058 {
1059     if( !gHasDouble )
1060     {
1061         vlog("Double is not supported, test not run.\n");
1062         return 0;
1063     }
1064 
1065     size_t i;
1066     int error = 0;
1067     cl_mem args[4];
1068     double *c;
1069     const char *kernelName[] = { "kernel1", "kernel2", "kernel3", "kernel4",
1070         "kernel5", "kernel6", "kernel7", "kernel8" };
1071 
1072     switch( testNumber )
1073     {
1074         case 0:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;      // a * b + c
1075         case 1:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1076         case 2:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
1077         case 3:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1078         case 4:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
1079         case 5:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1080         case 6:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
1081         case 7:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1082         default:
1083             vlog_error( "Unknown test case %d passed to RunTest\n", testNumber );
1084             return -1;
1085     }
1086 
1087     int vectorSize;
1088     for( vectorSize = 0; vectorSize < 5; vectorSize++ )
1089     {
1090         cl_kernel k = clCreateKernel( gProgram_double[ vectorSize ], kernelName[ testNumber ], &error );
1091         if( NULL == k || error )
1092         {
1093             vlog_error( "%d) Unable to find kernel \"%s\" for vector size: %d\n", error, kernelName[ testNumber ], 1 << vectorSize );
1094             return -2;
1095         }
1096 
1097         // set the kernel args
1098         for( i = 0; i < sizeof(args ) / sizeof( args[0]); i++ )
1099             if( (error = clSetKernelArg(k, i, sizeof( cl_mem ), args + i) ))
1100             {
1101                 vlog_error( "Error %d setting kernel arg # %ld\n", error, i );
1102                 return error;
1103             }
1104 
1105         // write NaNs to the result array
1106         if( (error = clEnqueueWriteBuffer(gQueue, bufE, CL_FALSE, 0, BUFFER_SIZE, buf5_double, 0, NULL, NULL) ))
1107         {
1108             vlog_error( "Failure %d at clWriteArray %d\n", error, testNumber );
1109             return error;
1110         }
1111 
1112         // execute the kernel
1113         size_t gDim[3] = { BUFFER_SIZE / (sizeof( cl_double ) * (1<<vectorSize)), 0, 0 };
1114         if( ((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, gDim, NULL, 0, NULL, NULL) )))
1115         {
1116             vlog_error( "Got Error # %d trying to execture kernel\n", error );
1117             return error;
1118         }
1119 
1120         // read the data back
1121         if( (error = clEnqueueReadBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf6_double, 0, NULL, NULL ) ))
1122         {
1123             vlog_error( "Failure %d at clReadArray %d\n", error, testNumber );
1124             return error;
1125         }
1126 
1127         // verify results
1128         double *test = (double*) buf6_double;
1129         double *a = (double*) buf1;
1130         double *b = (double*) buf2;
1131         for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
1132         {
1133             if( isnan(test[i]) && isnan(correct_double[testNumber][i] ) )
1134                 continue;
1135 
1136             // sign of zero must be correct
1137             if( ((uint64_t*) test)[i] != ((uint64_t*) correct_double[testNumber])[i] )
1138             {
1139                 switch( testNumber )
1140                 {
1141                         // Zeros for these should be positive
1142                     case 0:     vlog_error( "%ld) Error for %s %s: %a * %a + %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1143                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1144                     case 1:     vlog_error( "%ld) Error for %s %s: %a * %a - %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1145                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1146                     case 2:     vlog_error( "%ld) Error for %s %s: %a + %a * %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1147                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1148                     case 3:     vlog_error( "%ld) Error for %s %s: %a - %a * %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1149                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1150 
1151                         // Zeros for these should be negative
1152                     case 4:     vlog_error( "%ld) Error for %s %s: -(%a * %a + %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1153                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1154                     case 5:     vlog_error( "%ld) Error for %s %s: -(%a * %a - %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1155                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1156                     case 6:     vlog_error( "%ld) Error for %s %s: -(%a + %a * %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1157                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1158                     case 7:     vlog_error( "%ld) Error for %s %s: -(%a - %a * %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1159                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1160                     default:
1161                         vlog_error( "error: Unknown test number!\n" );
1162                         return -2;
1163                 }
1164             }
1165         }
1166 
1167         clReleaseKernel(k);
1168     }
1169 
1170     return error;
1171 }
1172