• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/rounding_mode.h"
18 #include "harness/ThreadPool.h"
19 #include "harness/testHarness.h"
20 #include "harness/kernelHelpers.h"
21 #include "harness/parseParameters.h"
22 #if defined(__APPLE__)
23 #include <sys/sysctl.h>
24 #endif
25 
26 #if defined( __linux__ )
27 #include <unistd.h>
28 #include <sys/syscall.h>
29 #include <linux/sysctl.h>
30 #endif
31 #if defined(__linux__)
32 #include <sys/param.h>
33 #include <libgen.h>
34 #endif
35 
36 #include "mingw_compat.h"
37 #if defined(__MINGW32__)
38 #include <sys/param.h>
39 #endif
40 
41 #include <stdarg.h>
42 #include <stdio.h>
43 #include <string.h>
44 #if !defined(_WIN32)
45 #include <libgen.h>
46 #include <sys/mman.h>
47 #endif
48 #include <time.h>
49 
50 #include "Sleep.h"
51 #include "basic_test_conversions.h"
52 
53 #if (defined(_WIN32) && defined (_MSC_VER))
54 // need for _controlfp_s and rouinding modes in RoundingMode
55 #include "harness/testHarness.h"
56 #endif
57 
58 #pragma mark -
59 #pragma mark globals
60 
61 #define BUFFER_SIZE     (1024*1024)
62 #define kPageSize       4096
63 #define EMBEDDED_REDUCTION_FACTOR 16
64 #define PERF_LOOP_COUNT 100
65 
66 #define      kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
67 
68 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
69 #include "fplib.h"
70     extern bool            qcom_sat;
71     extern roundingMode    qcom_rm;
72 #endif
73 
74 const char **   argList = NULL;
75 int             argCount = 0;
76 cl_context      gContext = NULL;
77 cl_command_queue      gQueue = NULL;
78 char            appName[64] = "ctest";
79 int             gStartTestNumber = -1;
80 int             gEndTestNumber = 0;
81 #if defined( __APPLE__ )
82 int             gTimeResults = 1;
83 #else
84 int             gTimeResults = 0;
85 #endif
86 int             gReportAverageTimes = 0;
87 void            *gIn = NULL;
88 void            *gRef = NULL;
89 void        *gAllowZ = NULL;
90 void            *gOut[ kCallStyleCount ] = { NULL };
91 cl_mem          gInBuffer;
92 cl_mem          gOutBuffers[ kCallStyleCount ];
93 size_t          gComputeDevices = 0;
94 uint32_t        gDeviceFrequency = 0;
95 int             gWimpyMode = 0;
96 int             gWimpyReductionFactor = 128;
97 int             gSkipTesting = 0;
98 int             gForceFTZ = 0;
99 int             gMultithread = 1;
100 int             gIsRTZ = 0;
101 uint32_t        gSimdSize = 1;
102 int             gHasDouble = 0;
103 int             gTestDouble = 1;
104 const char *    sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
105 const int       vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
106 int             gMinVectorSize = 0;
107 int             gMaxVectorSize = sizeof(vectorSizes) / sizeof( vectorSizes[0] );
108 static MTdata   gMTdata;
109 
110 #pragma mark -
111 #pragma mark Declarations
112 
113 static int ParseArgs( int argc, const char **argv );
114 static void PrintUsage( void );
115 test_status InitCL( cl_device_id device );
116 static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round );
117 static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d );
118 static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel );
119 static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount );
120 
121 void *FlushToZero( void );
122 void UnFlushToZero( void *);
123 
124 static cl_program CreateImplicitConvertProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );
125 static cl_program CreateStandardProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );
126 
127 
128 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
129 // (that's x87 default state).  This causes problems with the tests that
130 // convert long and ulong to float and double or otherwise deal with values
131 // that need more precision than 53-bit. So, set the x87 to 64-bit precision.
Force64BitFPUPrecision(void)132 static inline void Force64BitFPUPrecision(void)
133 {
134 #if __MINGW32__
135     // The usual method is to use _controlfp as follows:
136     //     #include <float.h>
137     //     _controlfp(_PC_64, _MCW_PC);
138     //
139     // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
140     // divergent code just use inline assembly which works for both.
141     unsigned short int orig_cw = 0;
142     unsigned short int new_cw = 0;
143     __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
144     new_cw = orig_cw | 0x0300;   // set precision to 64-bit
145     __asm__ __volatile__ ("fldcw  %0"::"m" (new_cw));
146 #else
147     /* Implement for other platforms if needed */
148 #endif
149 }
150 
test_conversions(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)151 int test_conversions( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
152 {
153     int error, i, testNumber = -1;
154     int startMinVectorSize = gMinVectorSize;
155     Type inType, outType;
156     RoundingMode round;
157     SaturationMode sat;
158 
159     if( argCount )
160     {
161         for( i = 0; i < argCount; i++ )
162         {
163             if( GetTestCase( argList[i], &outType, &inType, &sat, &round ) )
164             {
165                 vlog_error( "\n\t\t**** ERROR:  Unable to parse function name %s.  Skipping....  *****\n\n", argList[i] );
166                 continue;
167             }
168 
169             // skip double if we don't have it
170             if( !gTestDouble && (inType == kdouble || outType == kdouble ) )
171             {
172                 if( gHasDouble )
173                 {
174                     vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
175                     vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
176                 }
177 
178                 continue;
179             }
180 
181             // skip longs on embedded
182             if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) )
183             {
184                 continue;
185             }
186 
187             // Skip the implicit converts if the rounding mode is not default or test is saturated
188             if( 0 == startMinVectorSize )
189             {
190                 if( sat || round != kDefaultRoundingMode )
191                     gMinVectorSize = 1;
192                 else
193                     gMinVectorSize = 0;
194             }
195 
196             if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) )
197             {
198                 vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] );
199             }
200         }
201     }
202     else
203     {
204         for( outType = (Type)0; outType < kTypeCount; outType = (Type)(outType+1) )
205         {
206             for( inType = (Type)0; inType < kTypeCount; inType = (Type)(inType+1) )
207             {
208                 // skip longs on embedded
209                 if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) )
210                 {
211                     continue;
212                 }
213 
214                 for( sat = (SaturationMode)0; sat < kSaturationModeCount; sat = (SaturationMode)(sat+1) )
215                 {
216                     //skip illegal saturated conversions to float type
217                     if( kSaturated == sat && ( outType == kfloat || outType == kdouble ) )
218                     {
219                         continue;
220                     }
221 
222                     for( round = (RoundingMode)0; round < kRoundingModeCount; round = (RoundingMode)(round+1) )
223                     {
224                         if( ++testNumber < gStartTestNumber )
225                         {
226                             //     vlog( "%d) skipping convert_%sn%s%s( %sn )\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
227                             continue;
228                         }
229                         else
230                         {
231                             if( gEndTestNumber > 0 && testNumber >= gEndTestNumber  )
232                             {
233                                 goto exit;
234                             }
235                         }
236 
237                         vlog( "%d) Testing convert_%sn%s%s( %sn ):\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
238 
239                         // skip double if we don't have it
240                         if( ! gTestDouble && (inType == kdouble || outType == kdouble ) )
241                         {
242                             if( gHasDouble )
243                             {
244                                 vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
245                                 vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
246                             }
247                             continue;
248                         }
249 
250                         // Skip the implicit converts if the rounding mode is not default or test is saturated
251                         if( 0 == startMinVectorSize )
252                         {
253                             if( sat || round != kDefaultRoundingMode )
254                                 gMinVectorSize = 1;
255                             else
256                                 gMinVectorSize = 0;
257                         }
258 
259                         if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) )
260                         {
261                             vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] );
262                         }
263                     }
264                 }
265             }
266         }
267     }
268 
269 exit:
270     return gFailCount;
271 }
272 
273 test_definition test_list[] = {
274     ADD_TEST( conversions ),
275 };
276 
277 const int test_num = ARRAY_SIZE( test_list );
278 
279 #pragma mark -
280 
main(int argc,const char ** argv)281 int main (int argc, const char **argv )
282 {
283     int error;
284     cl_uint seed = (cl_uint) time( NULL );
285 
286     argc = parseCustomParam(argc, argv);
287     if (argc == -1)
288     {
289         return 1;
290     }
291 
292     if( (error = ParseArgs( argc, argv )) )
293         return error;
294 
295     //Turn off sleep so our tests run to completion
296     PreventSleep();
297     atexit( ResumeSleep );
298 
299     if(!gMultithread)
300         SetThreadCount(1);
301 
302 #if defined(_MSC_VER) && defined(_M_IX86)
303     // VS2005 (and probably others, since long double got deprecated) sets
304     // the x87 to 53-bit precision. This causes problems with the tests
305     // that convert long and ulong to float and double, since they deal
306     // with values that need more precision than that. So, set the x87
307     // to 64-bit precision.
308     unsigned int ignored;
309     _controlfp_s(&ignored, _PC_64, _MCW_PC);
310 #endif
311 
312     vlog( "===========================================================\n" );
313     vlog( "Random seed: %u\n", seed );
314     gMTdata = init_genrand( seed );
315 
316     const char* arg[] = {argv[0]};
317     int ret = runTestHarnessWithCheck( 1, arg, test_num, test_list, true, 0, InitCL );
318 
319     free_mtdata( gMTdata );
320     if (gQueue)
321     {
322         error = clFinish(gQueue);
323         if (error) vlog_error("clFinish failed: %d\n", error);
324     }
325 
326     clReleaseMemObject(gInBuffer);
327 
328     for( int i = 0; i < kCallStyleCount; i++ ) {
329         clReleaseMemObject(gOutBuffers[i]);
330     }
331     clReleaseCommandQueue(gQueue);
332     clReleaseContext(gContext);
333 
334     return ret;
335 }
336 
337 #pragma mark -
338 #pragma mark setup
339 
ParseArgs(int argc,const char ** argv)340 static int ParseArgs( int argc, const char **argv )
341 {
342     int i;
343     argList = (const char **)calloc( argc - 1, sizeof( char*) );
344     argCount = 0;
345 
346     if( NULL == argList && argc > 1 )
347         return -1;
348 
349 #if (defined( __APPLE__ ) || defined(__linux__) || defined (__MINGW32__))
350     { // Extract the app name
351         char baseName[ MAXPATHLEN ];
352         strncpy( baseName, argv[0], MAXPATHLEN );
353         char *base = basename( baseName );
354         if( NULL != base )
355         {
356             strncpy( appName, base, sizeof( appName )  );
357             appName[ sizeof( appName ) -1 ] = '\0';
358         }
359     }
360 #elif defined (_WIN32)
361     {
362         char fname[_MAX_FNAME + _MAX_EXT + 1];
363         char ext[_MAX_EXT];
364 
365         errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
366                                    fname, _MAX_FNAME, ext, _MAX_EXT );
367         if (err == 0) { // no error
368             strcat (fname, ext); //just cat them, size of frame can keep both
369             strncpy (appName, fname, sizeof(appName));
370             appName[ sizeof( appName ) -1 ] = '\0';
371         }
372     }
373 #endif
374 
375     vlog( "\n%s", appName );
376     for( i = 1; i < argc; i++ )
377     {
378         const char *arg = argv[i];
379         if( NULL == arg )
380             break;
381 
382         vlog( "\t%s", arg );
383         if( arg[0] == '-' )
384         {
385             arg++;
386             while( *arg != '\0' )
387             {
388                 switch( *arg )
389                 {
390                     case 'd':
391                         gTestDouble ^= 1;
392                         break;
393                     case 'l':
394                         gSkipTesting ^= 1;
395                         break;
396                     case 'm':
397                         gMultithread ^= 1;
398                         break;
399                     case 'w':
400                         gWimpyMode ^= 1;
401                         break;
402                     case '[':
403                         parseWimpyReductionFactor(arg, gWimpyReductionFactor);
404                         break;
405                     case 'z':
406                         gForceFTZ ^= 1;
407                         break;
408                     case 't':
409                         gTimeResults ^= 1;
410                         break;
411                     case 'a':
412                         gReportAverageTimes ^= 1;
413                         break;
414                     case '1':
415                         if( arg[1] == '6' )
416                         {
417                             gMinVectorSize = 6;
418                             gMaxVectorSize = 7;
419                             arg++;
420                         }
421                         else
422                         {
423                             gMinVectorSize = 0;
424                             gMaxVectorSize = 2;
425                         }
426                         break;
427 
428                     case '2':
429                         gMinVectorSize = 2;
430                         gMaxVectorSize = 3;
431                         break;
432 
433                     case '3':
434                         gMinVectorSize = 3;
435                         gMaxVectorSize = 4;
436                         break;
437 
438                     case '4':
439                         gMinVectorSize = 4;
440                         gMaxVectorSize = 5;
441                         break;
442 
443                     case '8':
444                         gMinVectorSize = 5;
445                         gMaxVectorSize = 6;
446                         break;
447 
448                     default:
449                         vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
450                         PrintUsage();
451                         return -1;
452                 }
453                 arg++;
454             }
455         }
456         else
457         {
458             char *t = NULL;
459             long number = strtol( arg, &t, 0 );
460             if( t != arg )
461             {
462                 if( gStartTestNumber != -1 )
463                     gEndTestNumber = gStartTestNumber + (int) number;
464                 else
465                     gStartTestNumber = (int) number;
466             }
467             else
468             {
469                 argList[ argCount ] = arg;
470                 argCount++;
471             }
472         }
473     }
474 
475     // Check for the wimpy mode environment variable
476     if (getenv("CL_WIMPY_MODE")) {
477       vlog( "\n" );
478       vlog( "*** Detected CL_WIMPY_MODE env                          ***\n" );
479       gWimpyMode = 1;
480     }
481 
482     vlog( "\n" );
483 
484     vlog( "Test binary built %s %s\n", __DATE__, __TIME__ );
485 
486     PrintArch();
487 
488     if( gWimpyMode )
489     {
490         vlog( "\n" );
491         vlog( "*** WARNING: Testing in Wimpy mode!                     ***\n" );
492         vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
493         vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" );
494         vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor);
495     }
496 
497     return 0;
498 }
499 
PrintUsage(void)500 static void PrintUsage( void )
501 {
502     int i;
503     vlog( "%s [-wz#]: <optional: test names>\n", appName );
504     vlog( "\ttest names:\n" );
505     vlog( "\t\tdestFormat<_sat><_round>_sourceFormat\n" );
506     vlog( "\t\t\tPossible format types are:\n\t\t\t\t" );
507     for( i = 0; i < kTypeCount; i++ )
508         vlog( "%s, ", gTypeNames[i] );
509     vlog( "\n\n\t\t\tPossible saturation values are: (empty) and _sat\n" );
510     vlog( "\t\t\tPossible rounding values are:\n\t\t\t\t(empty), " );
511     for( i = 1; i < kRoundingModeCount; i++ )
512         vlog( "%s, ", gRoundingModeNames[i] );
513     vlog( "\n\t\t\tExamples:\n" );
514     vlog( "\t\t\t\tulong_short   converts short to ulong\n" );
515     vlog( "\t\t\t\tchar_sat_rte_float   converts float to char with saturated clipping in round to nearest rounding mode\n\n" );
516     vlog( "\toptions:\n" );
517     vlog( "\t\t-d\tToggle testing of double precision.  On by default if cl_khr_fp64 is enabled, ignored otherwise.\n" );
518     vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" );
519     vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" );
520     vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" );
521     vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
522     vlog( "\t\t-z\tToggle flush to zero mode  (Default: per device)\n" );
523     vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" );
524     vlog( "\n" );
525     vlog( "You may also pass the number of the test on which to start.\nA second number can be then passed to indicate how many tests to run\n\n" );
526 }
527 
528 
GetTestCase(const char * name,Type * outType,Type * inType,SaturationMode * sat,RoundingMode * round)529 static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round )
530 {
531     int i;
532 
533     //Find the return type
534     for( i = 0; i < kTypeCount; i++ )
535         if( name == strstr( name, gTypeNames[i] ) )
536         {
537             *outType = (Type)i;
538             name += strlen( gTypeNames[i] );
539 
540             break;
541         }
542 
543     if( i == kTypeCount )
544         return -1;
545 
546     // Check to see if _sat appears next
547     *sat = (SaturationMode)0;
548     for( i = 1; i < kSaturationModeCount; i++ )
549         if( name == strstr( name, gSaturationNames[i] ) )
550         {
551             *sat = (SaturationMode)i;
552             name += strlen( gSaturationNames[i] );
553             break;
554         }
555 
556     *round = (RoundingMode)0;
557     for( i = 1; i < kRoundingModeCount; i++ )
558         if( name == strstr( name, gRoundingModeNames[i] ) )
559         {
560             *round = (RoundingMode)i;
561             name += strlen( gRoundingModeNames[i] );
562             break;
563         }
564 
565     if( *name != '_' )
566         return -2;
567     name++;
568 
569     for( i = 0; i < kTypeCount; i++ )
570         if( name == strstr( name, gTypeNames[i] ) )
571         {
572             *inType = (Type)i;
573             name += strlen( gTypeNames[i] );
574 
575             break;
576         }
577 
578     if( i == kTypeCount )
579         return -3;
580 
581     if( *name != '\0' )
582         return -4;
583 
584     return 0;
585 }
586 
587 #pragma mark -
588 #pragma mark OpenCL
589 
InitCL(cl_device_id device)590 test_status InitCL( cl_device_id device )
591 {
592     int error, i;
593     size_t configSize = sizeof( gComputeDevices );
594 
595     if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) )
596         gComputeDevices = 1;
597 
598     configSize = sizeof( gDeviceFrequency );
599     if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) )
600         gDeviceFrequency = 0;
601 
602     cl_device_fp_config floatCapabilities = 0;
603     if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities,  NULL)))
604         floatCapabilities = 0;
605     if(0 == (CL_FP_DENORM & floatCapabilities) )
606         gForceFTZ ^= 1;
607 
608     if( 0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST ) )
609     {
610         char profileStr[128] = "";
611         // Verify that we are an embedded profile device
612         if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( profileStr ), profileStr, NULL ) ) )
613         {
614             vlog_error( "FAILURE: Could not get device profile: error %d\n", error );
615             return TEST_FAIL;
616         }
617 
618         if( strcmp( profileStr, "EMBEDDED_PROFILE" ) )
619         {
620             vlog_error( "FAILURE: non-embedded profile device does not support CL_FP_ROUND_TO_NEAREST\n" );
621             return TEST_FAIL;
622         }
623 
624         if( 0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO ) )
625         {
626             vlog_error( "FAILURE: embedded profile device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" );
627             return TEST_FAIL;
628         }
629 
630         gIsRTZ = 1;
631     }
632 
633     else if(is_extension_available(device, "cl_khr_fp64"))
634     {
635         gHasDouble = 1;
636     }
637     gTestDouble &= gHasDouble;
638 
639     //detect whether profile of the device is embedded
640     char profile[1024] = "";
641     if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ) ) ){}
642     else if( strstr(profile, "EMBEDDED_PROFILE" ) )
643     {
644         gIsEmbedded = 1;
645         if( !is_extension_available(device, "cles_khr_int64" ) )
646             gHasLong = 0;
647     }
648 
649 
650     gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error );
651     if( NULL == gContext || error )
652     {
653         vlog_error( "clCreateContext failed. (%d)\n", error );
654         return TEST_FAIL;
655     }
656 
657     gQueue = clCreateCommandQueue(gContext, device, 0, &error);
658     if( NULL == gQueue || error )
659     {
660         vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
661         return TEST_FAIL;
662     }
663 
664     //Allocate buffers
665     //FIXME: use clProtectedArray for guarded allocations?
666     gIn   = malloc( BUFFER_SIZE + 2 * kPageSize );
667     gAllowZ = malloc( BUFFER_SIZE + 2 * kPageSize );
668     gRef  = malloc( BUFFER_SIZE + 2 * kPageSize );
669     for( i = 0; i < kCallStyleCount; i++ )
670     {
671         gOut[i] = malloc( BUFFER_SIZE + 2 * kPageSize );
672         if( NULL == gOut[i] )
673             return TEST_FAIL;
674     }
675 
676     // setup input buffers
677     gInBuffer = clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error);
678     if( gInBuffer == NULL || error)
679     {
680         vlog_error( "clCreateBuffer failed for input (%d)\n", error );
681         return TEST_FAIL;
682     }
683 
684     // setup output buffers
685     for( i = 0; i < kCallStyleCount; i++ )
686     {
687         gOutBuffers[i] = clCreateBuffer(  gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error );
688         if( gOutBuffers[i] == NULL || error )
689         {
690             vlog_error( "clCreateArray failed for output (%d)\n", error );
691             return TEST_FAIL;
692         }
693     }
694 
695 
696     gMTdata = init_genrand( gRandomSeed );
697 
698 
699     char c[1024];
700     static const char *no_yes[] = { "NO", "YES" };
701     vlog( "\nCompute Device info:\n" );
702     clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL);
703     vlog( "\tDevice Name: %s\n", c );
704     clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
705     vlog( "\tVendor: %s\n", c );
706     clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL);
707     vlog( "\tDevice Version: %s\n", c );
708     clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
709     vlog( "\tCL C Version: %s\n", c );
710     clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL);
711     vlog( "\tDriver Version: %s\n", c );
712     vlog( "\tProcessing with %ld devices\n", gComputeDevices );
713     vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency );
714     vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
715     vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ] );
716     vlog( "\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ] );
717     vlog( "\tHas Double? %s\n", no_yes[0 != gHasDouble] );
718     if( gHasDouble )
719         vlog( "\tTest Double? %s\n", no_yes[0 != gTestDouble] );
720     vlog( "\tHas Long? %s\n", no_yes[0 != gHasLong] );
721     vlog( "\tTesting vector sizes: " );
722     for( i = gMinVectorSize; i < gMaxVectorSize; i++ )
723         vlog("\t%d", vectorSizes[i]);
724     vlog( "\n" );
725     return TEST_PASS;
726 }
727 
RunKernel(cl_kernel kernel,void * inBuf,void * outBuf,size_t blockCount)728 static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount )
729 {
730     // The global dimensions are just the blockCount to execute since we haven't set up multiple queues for multiple devices.
731     int error;
732 
733     error = clSetKernelArg(kernel, 0, sizeof( inBuf ), &inBuf);
734     error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
735 
736     if( error )
737     {
738         vlog_error( "FAILED -- could not set kernel args (%d)\n", error );
739         return error;
740     }
741 
742     if( (error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL)))
743     {
744         vlog_error( "FAILED -- could not execute kernel (%d)\n", error );
745         return error;
746     }
747 
748     return 0;
749 }
750 
751 #if ! defined( __APPLE__ )
752 void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
753 #endif
754 
755 #if defined( __APPLE__ )
756 #include <mach/mach_time.h>
757 #endif
758 
759 uint64_t GetTime( void );
GetTime(void)760 uint64_t GetTime( void )
761 {
762 #if defined( __APPLE__ )
763     return mach_absolute_time();
764 #elif defined(_MSC_VER)
765     return  ReadTime();
766 #else
767     //mach_absolute_time is a high precision timer with precision < 1 microsecond.
768 #warning need accurate clock here.  Times are invalid.
769     return 0;
770 #endif
771 }
772 
773 
774 #if defined (_MSC_VER)
775 /* function is defined in "compat.h" */
776 #else
777 double SubtractTime( uint64_t endTime, uint64_t startTime );
SubtractTime(uint64_t endTime,uint64_t startTime)778 double SubtractTime( uint64_t endTime, uint64_t startTime )
779 {
780     uint64_t diff = endTime - startTime;
781     static double conversion = 0.0;
782 
783     if( 0.0 == conversion )
784     {
785 #if defined( __APPLE__ )
786         mach_timebase_info_data_t info = {0,0};
787         kern_return_t   err = mach_timebase_info( &info );
788         if( 0 == err )
789             conversion = 1e-9 * (double) info.numer / (double) info.denom;
790 #else
791         // This function consumes output from GetTime() above, and converts the time to secionds.
792 #warning need accurate ticks to seconds conversion factor here. Times are invalid.
793 #endif
794     }
795 
796     // strictly speaking we should also be subtracting out timer latency here
797     return conversion * (double) diff;
798 }
799 #endif
800 
801 typedef struct CalcReferenceValuesInfo
802 {
803     struct WriteInputBufferInfo *parent;        // pointer back to the parent WriteInputBufferInfo struct
804     cl_kernel                   kernel;         // the kernel for this vector size
805     cl_program                  program;        // the program for this vector size
806     cl_uint                     vectorSize;     // the vector size for this callback chain
807     void                        *p;             // the pointer to mapped result data for this vector size
808     cl_int                      result;
809 }CalcReferenceValuesInfo;
810 
811 typedef struct WriteInputBufferInfo
812 {
813     volatile cl_event           calcReferenceValues;   // user event which signals when main thread is done calculating reference values
814     volatile cl_event           doneBarrier;     // user event which signals when worker threads are done
815     cl_uint                     count;           // the number of elements in the array
816     Type                        outType;         // the data type of the conversion result
817     Type                        inType;          // the data type of the conversion input
818     volatile int                barrierCount;
819     CalcReferenceValuesInfo     calcInfo[kCallStyleCount];
820 }WriteInputBufferInfo;
821 
822 cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
RoundUpToNextPowerOfTwo(cl_uint x)823 cl_uint RoundUpToNextPowerOfTwo( cl_uint x )
824 {
825     if( 0 == (x & (x-1)))
826         return x;
827 
828     while( x & (x-1) )
829        x &= x-1;
830 
831     return x + x;
832 }
833 
834 void CL_CALLBACK WriteInputBufferComplete( cl_event, cl_int, void * );
835 
836 typedef struct DataInitInfo
837 {
838     cl_ulong        start;
839     cl_uint         size;
840     Type            outType;
841     Type            inType;
842     SaturationMode  sat;
843     RoundingMode    round;
844     MTdata          *d;
845 }DataInitInfo;
846 
847 cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p );
InitData(cl_uint job_id,cl_uint thread_id,void * p)848 cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p )
849 {
850     DataInitInfo *info = (DataInitInfo*) p;
851 
852     gInitFunctions[ info->inType ]( (char*)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat, info->round,
853                                    info->outType, info->start + job_id * info->size, info->size, info->d[thread_id] );
854     return CL_SUCCESS;
855 }
856 
setAllowZ(uint8_t * allow,uint32_t * x,cl_uint count)857 static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
858 {
859     cl_uint i;
860     for (i = 0; i < count; ++i)
861     allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
862 }
863 
864 cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p );
PrepareReference(cl_uint job_id,cl_uint thread_id,void * p)865 cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
866 {
867     DataInitInfo *info = (DataInitInfo*) p;
868     cl_uint count = info->size;
869     Type inType = info->inType;
870     Type outType = info->outType;
871     RoundingMode round = info->round;
872     size_t j;
873 
874     Force64BitFPUPrecision();
875 
876     void *s = (cl_uchar*) gIn + job_id * count * gTypeSizes[info->inType];
877     void *a = (cl_uchar*) gAllowZ + job_id * count;
878     void *d = (cl_uchar*) gRef + job_id * count * gTypeSizes[info->outType];
879 
880     if (outType != inType)
881     {
882         //create the reference while we wait
883         Convert f = gConversions[ outType ][ inType ];
884         if( info->sat )
885             f = gSaturatedConversions[ outType ][ inType ];
886 
887 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
888         /* ARM VFP doesn't have hardware instruction for converting from 64-bit
889          * integer to float types, hence GCC ARM uses the floating-point
890          * emulation code despite which -mfloat-abi setting it is. But the
891          * emulation code in libgcc.a has only one rounding mode (round to
892          * nearest even in this case) and ignores the user rounding mode setting
893          * in hardware. As a result setting rounding modes in hardware won't
894          * give correct rounding results for type covert from 64-bit integer to
895          * float using GCC for ARM compiler so for testing different rounding
896          * modes, we need to use alternative reference function. ARM64 does have
897          * an instruction, however we cannot guarantee the compiler will use it.
898          * On all ARM architechures use emulation to calculate reference.*/
899         switch (round)
900         {
901             /* conversions to floating-point type use the current rounding mode.
902              * The only default floating-point rounding mode supported is round to nearest even
903              * i.e the current rounding mode will be _rte for floating-point types. */
904             case kDefaultRoundingMode:
905                     qcom_rm = qcomRTE;
906                     break;
907             case kRoundToNearestEven:
908                     qcom_rm = qcomRTE;
909                     break;
910             case kRoundUp:
911                     qcom_rm = qcomRTP;
912                     break;
913             case kRoundDown:
914                     qcom_rm = qcomRTN;
915                     break;
916             case kRoundTowardZero:
917                     qcom_rm = qcomRTZ;
918                     break;
919             default:
920                     vlog_error("ERROR: undefined rounding mode %d\n", round);
921                     break;
922         }
923         qcom_sat =  info->sat;
924 #endif
925 
926         RoundingMode oldRound = set_round( round, outType );
927         f( d, s, count );
928         set_round( oldRound, outType );
929 
930     // Decide if we allow a zero result in addition to the correctly rounded one
931         memset(a, 0, count);
932     if (gForceFTZ) {
933         if (inType == kfloat)
934         setAllowZ((uint8_t*)a, (uint32_t*)s, count);
935         if (outType == kfloat)
936         setAllowZ((uint8_t*)a, (uint32_t*)d, count);
937     }
938     }
939     else
940     {
941         // Copy the input to the reference
942         memcpy(d, s, info->size * gTypeSizes[inType]);
943     }
944 
945     //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
946     if( info->outType != kfloat && info->outType != kdouble )
947     {
948         if( inType == kfloat )
949         {
950             float *inp = (float*) s;
951             for( j = 0; j < count; j++ )
952             {
953                 if( isnan( inp[j] ) )
954                     memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
955             }
956         }
957         if( inType == kdouble )
958         {
959             double *inp = (double*) s;
960             for( j = 0; j < count; j++ )
961             {
962                 if( isnan( inp[j] ) )
963                     memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
964             }
965         }
966     }
967     else if( inType == kfloat || inType == kdouble )
968     {  // outtype and intype is float or double.  NaN conversions for float <-> double can be any NaN
969         if( inType == kfloat && outType == kdouble )
970         {
971             float *inp = (float*) s;
972             for( j = 0; j < count; j++ )
973             {
974                 if( isnan( inp[j] ) )
975                     ((double*) d)[j] = NAN;
976             }
977         }
978         if( inType == kdouble && outType == kfloat )
979         {
980             double *inp = (double*) s;
981             for( j = 0; j < count; j++ )
982             {
983                 if( isnan( inp[j] ) )
984                     ((float*) d)[j] = NAN;
985             }
986         }
987     }
988 
989     return CL_SUCCESS;
990 }
991 
DoTest(cl_device_id device,Type outType,Type inType,SaturationMode sat,RoundingMode round,MTdata d)992 static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d )
993 {
994 #ifdef __APPLE__
995     cl_ulong wall_start = mach_absolute_time();
996 #endif
997 
998     DataInitInfo  init_info = { 0, 0, outType, inType, sat, round, NULL };
999     WriteInputBufferInfo writeInputBufferInfo;
1000     int vectorSize;
1001     int error = 0;
1002     cl_uint threads = GetThreadCount();
1003     uint64_t i;
1004 
1005     gTestCount++;
1006     size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] );
1007     size_t step = blockCount;
1008     uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]);
1009     cl_event writeInputBuffer = NULL;
1010 
1011     memset( &writeInputBufferInfo, 0, sizeof( writeInputBufferInfo ) );
1012     init_info.d = (MTdata*)malloc( threads * sizeof( MTdata ) );
1013     if( NULL == init_info.d )
1014     {
1015         vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
1016         return -1;
1017     }
1018     for( i = 0; i < threads; i++ )
1019     {
1020         init_info.d[i] = init_genrand( genrand_int32( d ) );
1021         if( NULL == init_info.d[i] )
1022         {
1023             vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
1024             return -1;
1025         }
1026     }
1027 
1028     writeInputBufferInfo.outType = outType;
1029     writeInputBufferInfo.inType = inType;
1030 
1031     for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1032     {
1033         writeInputBufferInfo.calcInfo[vectorSize].program = MakeProgram( outType, inType, sat, round, vectorSize,
1034                                                                         &writeInputBufferInfo.calcInfo[vectorSize].kernel );
1035         if( NULL == writeInputBufferInfo.calcInfo[vectorSize].program )
1036         {
1037             gFailCount++;
1038             return -1;
1039         }
1040         if( NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel )
1041         {
1042             gFailCount++;
1043             vlog_error( "\t\tFAILED -- Failed to create kernel.\n" );
1044             return -2;
1045         }
1046 
1047         writeInputBufferInfo.calcInfo[vectorSize].parent = &writeInputBufferInfo;
1048         writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
1049         writeInputBufferInfo.calcInfo[vectorSize].result = -1;
1050     }
1051 
1052     if( gSkipTesting )
1053         goto exit;
1054 
1055     // Patch up rounding mode if default is RTZ
1056     // We leave the part above in default rounding mode so that the right kernel is compiled.
1057     if( round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat) )
1058         init_info.round = round = kRoundTowardZero;
1059 
1060     // Figure out how many elements are in a work block
1061 
1062     // we handle 64-bit types a bit differently.
1063     if( 8*gTypeSizes[ inType ] > 32 )
1064         lastCase = 0x100000000ULL;
1065 
1066     if ( !gWimpyMode && gIsEmbedded )
1067       step = blockCount * EMBEDDED_REDUCTION_FACTOR;
1068 
1069     if ( gWimpyMode )
1070         step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
1071     vlog( "Testing... " );
1072     fflush(stdout);
1073     for( i = 0; i < (uint64_t)lastCase; i += step )
1074     {
1075 
1076         if( 0 == ( i & ((lastCase >> 3) -1))) {
1077             vlog(".");
1078             fflush(stdout);
1079         }
1080 
1081         cl_uint count = (uint32_t) MIN( blockCount, lastCase - i );
1082         writeInputBufferInfo.count = count;
1083 
1084         // Crate a user event to represent the status of the reference value computation completion
1085         writeInputBufferInfo.calcReferenceValues = clCreateUserEvent( gContext, &error);
1086         if( error || NULL == writeInputBufferInfo.calcReferenceValues )
1087         {
1088             vlog_error( "ERROR: Unable to create user event. (%d)\n", error );
1089             gFailCount++;
1090             goto exit;
1091         }
1092 
1093         // retain for consumption by MapOutputBufferComplete
1094         for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1095         {
1096             if( (error = clRetainEvent(writeInputBufferInfo.calcReferenceValues) ))
1097             {
1098                 vlog_error( "ERROR: Unable to retain user event. (%d)\n", error );
1099                 gFailCount++;
1100                 goto exit;
1101             }
1102         }
1103 
1104         // Crate a user event to represent when the callbacks are done verifying correctness
1105         writeInputBufferInfo.doneBarrier = clCreateUserEvent( gContext, &error);
1106         if( error || NULL == writeInputBufferInfo.calcReferenceValues )
1107         {
1108             vlog_error( "ERROR: Unable to create user event for barrier. (%d)\n", error );
1109             gFailCount++;
1110             goto exit;
1111         }
1112 
1113         // retain for use by the callback that calls this
1114         if( (error = clRetainEvent(writeInputBufferInfo.doneBarrier) ))
1115         {
1116             vlog_error( "ERROR: Unable to retain user event doneBarrier. (%d)\n", error );
1117             gFailCount++;
1118             goto exit;
1119         }
1120 
1121         //      Call this in a multithreaded manner
1122         //      gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d );
1123         cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
1124         init_info.start = i;
1125         init_info.size = count / chunks;
1126         if( init_info.size < 16384 )
1127         {
1128             chunks = RoundUpToNextPowerOfTwo(threads);
1129             init_info.size = count / chunks;
1130             if( init_info.size < 16384 )
1131             {
1132                 init_info.size = count;
1133                 chunks = 1;
1134             }
1135         }
1136         ThreadPool_Do(InitData, chunks, &init_info);
1137 
1138         // Copy the results to the device
1139         writeInputBuffer = NULL;
1140         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, count * gTypeSizes[inType], gIn, 0, NULL, &writeInputBuffer )))
1141         {
1142             vlog_error( "ERROR: clEnqueueWriteBuffer failed. (%d)\n", error );
1143             gFailCount++;
1144             goto exit;
1145         }
1146 
1147         // Setup completion callback for the write, which will enqueue the rest of the work
1148         // This is somewhat gratuitous.  Because this is an in order queue, we didn't really need to
1149         // do this work in a callback. We could have done it from the main thread.  Here we are
1150         // verifying that the implementation can enqueue work from a callback, while at the same time
1151         // also checking to make sure that the conversions work.
1152         //
1153         // Because the verification code is also moved to a callback, it is hoped that implementations will
1154         // achieve a test performance improvement because they can verify the results in parallel.  If the
1155         // implementation serializes callbacks however, that won't happen.   Consider it some motivation
1156         // to do the right thing! :-)
1157         if( (error = clSetEventCallback( writeInputBuffer, CL_COMPLETE, WriteInputBufferComplete, &writeInputBufferInfo)) )
1158         {
1159             vlog_error( "ERROR: clSetEventCallback failed. (%d)\n", error );
1160             gFailCount++;
1161             goto exit;
1162         }
1163 
1164         // The event can't be destroyed until the callback is called, so we can release it now.
1165         if( (error = clReleaseEvent(writeInputBuffer) ))
1166         {
1167             vlog_error( "ERROR: clReleaseEvent failed. (%d)\n", error );
1168             gFailCount++;
1169             goto exit;
1170         }
1171 
1172         // Make sure the work is actually running, so we don't deadlock
1173         if( (error = clFlush( gQueue ) ) )
1174         {
1175             vlog_error( "clFlush failed with error %d\n", error );
1176             gFailCount++;
1177             goto exit;
1178         }
1179 
1180         ThreadPool_Do(PrepareReference, chunks, &init_info);
1181 
1182         // signal we are done calculating the reference results
1183         if( (error = clSetUserEventStatus( writeInputBufferInfo.calcReferenceValues, CL_COMPLETE ) ) )
1184         {
1185             vlog_error( "Error:  Failed to set user event status to CL_COMPLETE:  %d\n", error );
1186             gFailCount++;
1187             goto exit;
1188         }
1189 
1190         // Wait for the event callbacks to finish verifying correctness.
1191         if( (error = clWaitForEvents( 1, (cl_event*) &writeInputBufferInfo.doneBarrier ) ))
1192         {
1193             vlog_error( "Error:  Failed to wait for barrier:  %d\n", error );
1194             gFailCount++;
1195             goto exit;
1196         }
1197 
1198         if( (error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues ) ))
1199         {
1200             vlog_error( "Error:  Failed to release calcReferenceValues:  %d\n", error );
1201             gFailCount++;
1202             goto exit;
1203         }
1204 
1205         if( (error = clReleaseEvent(writeInputBufferInfo.doneBarrier ) ))
1206         {
1207             vlog_error( "Error:  Failed to release done barrier:  %d\n", error );
1208             gFailCount++;
1209             goto exit;
1210         }
1211 
1212 
1213         for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1214         {
1215             if( ( error = writeInputBufferInfo.calcInfo[ vectorSize ].result ))
1216             {
1217                 switch( inType )
1218                 {
1219                     case kuchar:
1220                     case kchar:
1221                         vlog( "Input value: 0x%2.2x ", ((unsigned char*)gIn)[error - 1] );
1222                         break;
1223                     case kushort:
1224                     case kshort:
1225                         vlog( "Input value: 0x%4.4x ", ((unsigned short*)gIn)[error - 1] );
1226                         break;
1227                     case kuint:
1228                     case kint:
1229                         vlog( "Input value: 0x%8.8x ", ((unsigned int*)gIn)[error - 1] );
1230                         break;
1231                     case kfloat:
1232                         vlog( "Input value: %a ", ((float*)gIn)[error - 1] );
1233                         break;
1234                         break;
1235                     case kulong:
1236                     case klong:
1237                         vlog( "Input value: 0x%16.16llx ", ((unsigned long long*)gIn)[error - 1] );
1238                         break;
1239                     case kdouble:
1240                         vlog( "Input value: %a ", ((double*)gIn)[error - 1]);
1241                         break;
1242                     default:
1243                         vlog_error( "Internal error at %s: %d\n", __FILE__, __LINE__ );
1244                         abort();
1245                         break;
1246                 }
1247 
1248                 // tell the user which conversion it was.
1249                 if( 0 == vectorSize )
1250                     vlog( " (implicit scalar conversion from %s to %s)\n", gTypeNames[ inType ], gTypeNames[ outType ] );
1251                 else
1252                     vlog( " (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], sizeNames[vectorSize], gSaturationNames[ sat ],
1253                                                             gRoundingModeNames[ round ], gTypeNames[inType], sizeNames[vectorSize] );
1254 
1255                 gFailCount++;
1256                 goto exit;
1257             }
1258         }
1259     }
1260 
1261     log_info( "done.\n" );
1262 
1263     if( gTimeResults )
1264     {
1265         //Kick off tests for the various vector lengths
1266         for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1267         {
1268             size_t workItemCount = blockCount / vectorSizes[vectorSize];
1269             if( vectorSizes[vectorSize] * gTypeSizes[outType] < 4 )
1270                 workItemCount /= 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
1271 
1272             double sum = 0.0;
1273             double bestTime = INFINITY;
1274             cl_uint k;
1275             for( k = 0; k < PERF_LOOP_COUNT; k++ )
1276             {
1277                 uint64_t startTime = GetTime();
1278                 if( (error = RunKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
1279                 {
1280                     gFailCount++;
1281                     goto exit;
1282                 }
1283 
1284                 // Make sure OpenCL is done
1285                 if( (error = clFinish(gQueue) ) )
1286                 {
1287                     vlog_error( "Error %d at clFinish\n", error );
1288                     goto exit;
1289                 }
1290 
1291                 uint64_t endTime = GetTime();
1292                 double time = SubtractTime( endTime, startTime );
1293                 sum += time;
1294                 if( time < bestTime )
1295                     bestTime = time;
1296 
1297             }
1298 
1299             if( gReportAverageTimes )
1300                 bestTime = sum / PERF_LOOP_COUNT;
1301             double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (workItemCount * vectorSizes[vectorSize]);
1302             if( 0 == vectorSize )
1303                 vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "implicit convert %s -> %s", gTypeNames[ inType ], gTypeNames[ outType ] );
1304             else
1305                 vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "convert_%s%s%s%s( %s%s )", gTypeNames[ outType ], sizeNames[vectorSize], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType], sizeNames[vectorSize] );
1306         }
1307     }
1308 
1309     if( gWimpyMode )
1310         vlog( "\tWimp pass" );
1311     else
1312         vlog( "\tpassed" );
1313 
1314 #ifdef __APPLE__
1315     // record the run time
1316     vlog( "\t(%f s)", 1e-9 * ( mach_absolute_time() - wall_start ) );
1317 #endif
1318     vlog( "\n\n" );
1319     fflush( stdout );
1320 
1321 
1322 exit:
1323     //clean up
1324     for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1325     {
1326         clReleaseProgram( writeInputBufferInfo.calcInfo[vectorSize].program );
1327         clReleaseKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel );
1328     }
1329 
1330     if( init_info.d )
1331     {
1332         for( i = 0; i < threads; i++ )
1333             free_mtdata(init_info.d[i]);
1334         free(init_info.d);
1335     }
1336 
1337     return error;
1338 }
1339 
1340 void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data );
1341 
1342 // Note: not called reentrantly
WriteInputBufferComplete(cl_event e,cl_int status,void * data)1343 void CL_CALLBACK WriteInputBufferComplete( cl_event e, cl_int status, void *data )
1344 {
1345     WriteInputBufferInfo *info = (WriteInputBufferInfo*) data;
1346     cl_uint count = info->count;
1347     int vectorSize;
1348 
1349     if( CL_SUCCESS != status )
1350     {
1351         vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1352         gFailCount++;
1353         return;
1354     }
1355 
1356     info->barrierCount = gMaxVectorSize - gMinVectorSize;
1357 
1358     // now that we know that the write buffer is complete, enqueue callbacks to wait for the main thread to
1359     // finish calculating the reference results.
1360     for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1361     {
1362         size_t workItemCount = (count + vectorSizes[vectorSize] - 1) / ( vectorSizes[vectorSize]);
1363         cl_event mapComplete = NULL;
1364 
1365         if( (status = RunKernel( info->calcInfo[ vectorSize ].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
1366         {
1367             gFailCount++;
1368             return;
1369         }
1370 
1371         info->calcInfo[vectorSize].p = clEnqueueMapBuffer( gQueue, gOutBuffers[ vectorSize ], CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
1372                                                           0, count * gTypeSizes[ info->outType ], 0, NULL, &mapComplete, &status);
1373         {
1374             if( status )
1375             {
1376                 vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1377                 gFailCount++;
1378                 return;
1379             }
1380         }
1381 
1382         if( (status = clSetEventCallback( mapComplete, CL_COMPLETE, MapResultValuesComplete, info->calcInfo + vectorSize)))
1383         {
1384             vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1385             gFailCount++;
1386             return;
1387         }
1388 
1389         if( (status = clReleaseEvent(mapComplete)))
1390         {
1391             vlog_error( "ERROR: clReleaseEvent calback failed in WriteInputBufferComplete for vector size %d with status: %d\n", vectorSize, status );
1392             gFailCount++;
1393             return;
1394         }
1395     }
1396 
1397     // Make sure the work starts moving -- otherwise we may deadlock
1398     if( (status = clFlush(gQueue)))
1399     {
1400         vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1401         gFailCount++;
1402         return;
1403     }
1404 
1405     // e was already released by the main thread. It should be destroyed automatically soon after we exit.
1406 }
1407 
1408 void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data );
1409 
1410 // Note: May be called reentrantly
MapResultValuesComplete(cl_event e,cl_int status,void * data)1411 void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data )
1412 {
1413     CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data;
1414     cl_event calcReferenceValues = info->parent->calcReferenceValues;
1415 
1416     if( CL_SUCCESS != status )
1417     {
1418         vlog_error( "ERROR: MapResultValuesComplete calback failed with status: %d\n", status );
1419         gFailCount++;       // not thread safe -- being lazy here
1420         clReleaseEvent(calcReferenceValues);
1421         return;
1422     }
1423 
1424     // we know that the map is done, wait for the main thread to finish calculating the reference values
1425     if( (status = clSetEventCallback( calcReferenceValues, CL_COMPLETE, CalcReferenceValuesComplete, data )))
1426     {
1427         vlog_error( "ERROR: clSetEventCallback failed in MapResultValuesComplete with status: %d\n", status );
1428         gFailCount++;       // not thread safe -- being lazy here
1429     }
1430 
1431     // this thread no longer needs its reference to info->calcReferenceValues, so release it
1432     if( (status = clReleaseEvent(calcReferenceValues) ))
1433     {
1434         vlog_error( "ERROR: clReleaseEvent(info->calcReferenceValues) failed with status: %d\n", status );
1435         gFailCount++;       // not thread safe -- being lazy here
1436     }
1437 
1438     // no need to flush since we didn't enqueue anything
1439 
1440     // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after we exit.
1441 }
1442 
1443 
CalcReferenceValuesComplete(cl_event e,cl_int status,void * data)1444 void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data )
1445 {
1446     CalcReferenceValuesInfo     *info = (CalcReferenceValuesInfo*) data;
1447     cl_uint                     vectorSize = info->vectorSize;
1448     cl_uint                     count = info->parent->count;
1449     Type                        outType = info->parent->outType;        // the data type of the conversion result
1450     Type                        inType = info->parent->inType;          // the data type of the conversion input
1451     size_t                      j;
1452     cl_int                      error;
1453     cl_event                    doneBarrier = info->parent->doneBarrier;
1454 
1455     // report spurious error condition
1456     if( CL_SUCCESS != status )
1457     {
1458         vlog_error( "ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", status );
1459         gFailCount++;       // lazy about thread safety here
1460         return;
1461     }
1462 
1463     // Now we know that both results have been mapped back from the device, and the
1464     // main thread is done calculating the reference results. It is now time to check
1465     // the results.
1466 
1467     // verify results
1468     void *mapped = info->p;
1469 
1470     //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
1471     if( outType != kfloat && outType != kdouble )
1472     {
1473         if( inType == kfloat )
1474         {
1475             float *inp = (float*) gIn;
1476             for( j = 0; j < count; j++ )
1477             {
1478                 if( isnan( inp[j] ) )
1479                     memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
1480             }
1481         }
1482         if( inType == kdouble )
1483         {
1484             double *inp = (double*) gIn;
1485             for( j = 0; j < count; j++ )
1486             {
1487                 if( isnan( inp[j] ) )
1488                     memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
1489             }
1490         }
1491     }
1492     else if( inType == kfloat || inType == kdouble )
1493     {  // outtype and intype is float or double.  NaN conversions for float <-> double can be any NaN
1494         if( inType == kfloat && outType == kdouble )
1495         {
1496             float *inp = (float*) gIn;
1497             double *outp = (double*) mapped;
1498             for( j = 0; j < count; j++ )
1499             {
1500                 if( isnan( inp[j] ) && isnan(outp[j]) )
1501                     outp[j] = NAN;
1502             }
1503         }
1504         if( inType == kdouble && outType == kfloat )
1505         {
1506             double *inp = (double*) gIn;
1507             float *outp = (float*) mapped;
1508             for( j = 0; j < count; j++ )
1509             {
1510                 if( isnan( inp[j] ) && isnan(outp[j]) )
1511                     outp[j] = NAN;
1512             }
1513         }
1514     }
1515 
1516     if( memcmp( mapped, gRef, count * gTypeSizes[ outType ] ) )
1517         info->result = gCheckResults[outType]( mapped, gRef, gAllowZ, count, vectorSizes[vectorSize] );
1518     else
1519         info->result = 0;
1520 
1521     // Fill the output buffer with junk and release it
1522     {
1523         cl_uint pattern =  0xffffdead;
1524         memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
1525         if((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[ vectorSize ], mapped, 0, NULL, NULL)))
1526         {
1527             vlog_error( "ERROR: clEnqueueUnmapMemObject failed in CalcReferenceValuesComplete  (%d)\n", error );
1528             gFailCount++;
1529         }
1530     }
1531 
1532     if( 1 == ThreadPool_AtomicAdd( &info->parent->barrierCount, -1) )
1533     {
1534         if( (status = clSetUserEventStatus( doneBarrier, CL_COMPLETE) ))
1535         {
1536             vlog_error( "ERROR: clSetUserEventStatus failed in CalcReferenceValuesComplete (err: %d). We're probably going to deadlock.\n", status );
1537             gFailCount++;
1538             return;
1539         }
1540 
1541         if( (status = clReleaseEvent( doneBarrier ) ) )
1542         {
1543             vlog_error( "ERROR: clReleaseEvent failed in CalcReferenceValuesComplete (err: %d).\n", status );
1544             gFailCount++;
1545             return;
1546         }
1547     }
1548 
1549 
1550     // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after
1551     // all the calls to CalcReferenceValuesComplete exit.
1552 }
1553 
MakeProgram(Type outType,Type inType,SaturationMode sat,RoundingMode round,int vectorSize,cl_kernel * outKernel)1554 static cl_program   MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel )
1555 {
1556     cl_program program;
1557     char testName[256];
1558     int error = 0;
1559     const char **strings;
1560     size_t stringCount = 0;
1561 
1562     // Create the program. This is a bit complicated because we are trying to avoid byte and short stores.
1563     if (0 == vectorSize)
1564     {
1565         char inName[32];
1566         char outName[32];
1567         const char *programSource[] =
1568         {
1569             "", // optional pragma
1570             "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
1571             "{\n"
1572             "   size_t i = get_global_id(0);\n"
1573             "   dest[i] =  src[i];\n"
1574             "}\n"
1575         };
1576         stringCount = sizeof(programSource) / sizeof(programSource[0]);
1577         strings = programSource;
1578 
1579         if (outType == kdouble || inType == kdouble)
1580             programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
1581 
1582         //create the type name
1583         strncpy(inName, gTypeNames[inType], sizeof(inName));
1584         strncpy(outName, gTypeNames[outType], sizeof(outName));
1585         sprintf(testName, "test_implicit_%s_%s", outName, inName);
1586         vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType], gTypeNames[outType]);
1587         fflush(stdout);
1588     }
1589     else
1590     {
1591         int vectorSizetmp = vectorSizes[vectorSize];
1592 
1593         char convertString[128];
1594         char inName[32];
1595         char outName[32];
1596         const char *programSource[] =
1597         {
1598             "", // optional pragma
1599             "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
1600             "{\n"
1601             "   size_t i = get_global_id(0);\n"
1602             "   dest[i] = ", convertString, "( src[i] );\n"
1603             "}\n"
1604         };
1605         const char *programSourceV3[] =
1606         {
1607             "", // optional pragma
1608             "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
1609             "{\n"
1610             "   size_t i = get_global_id(0);\n"
1611             "   if( i + 1 < get_global_size(0))\n"
1612             "       vstore3( ", convertString, "( vload3( i, src)), i, dest );\n"
1613             "   else\n"
1614             "   {\n"
1615             "       ", inName, "3 in;\n"
1616             "       ", outName, "3 out;\n"
1617             "       if( 0 == (i & 1) )\n"
1618             "           in.y = src[3*i+1];\n"
1619             "       in.x = src[3*i];\n"
1620             "       out = ", convertString, "( in ); \n"
1621             "       dest[3*i] = out.x;\n"
1622             "       if( 0 == (i & 1) )\n"
1623             "           dest[3*i+1] = out.y;\n"
1624             "   }\n"
1625             "}\n"
1626         };
1627         stringCount = 3 == vectorSizetmp ? sizeof(programSourceV3) / sizeof(programSourceV3[0]) :
1628             sizeof(programSource) / sizeof(programSource[0]);
1629         strings = 3 == vectorSizetmp ? programSourceV3 : programSource;
1630 
1631         if (outType == kdouble || inType == kdouble) {
1632             programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
1633             programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
1634         }
1635 
1636         //create the type name
1637         switch (vectorSizetmp)
1638         {
1639         case 1:
1640             strncpy(inName, gTypeNames[inType], sizeof(inName));
1641             strncpy(outName, gTypeNames[outType], sizeof(outName));
1642             snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
1643             snprintf(testName, 256, "test_%s_%s", convertString, inName);
1644             vlog("Building %s( %s ) test\n", convertString, inName);
1645             break;
1646         case 3:
1647             strncpy(inName, gTypeNames[inType], sizeof(inName));
1648             strncpy(outName, gTypeNames[outType], sizeof(outName));
1649             snprintf(convertString, sizeof(convertString), "convert_%s3%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
1650             snprintf(testName, 256, "test_%s_%s3", convertString, inName);
1651             vlog("Building %s( %s3 ) test\n", convertString, inName);
1652             break;
1653         default:
1654             snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType], vectorSizetmp);
1655             snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType], vectorSizetmp);
1656             snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
1657             snprintf(testName, 256, "test_%s_%s", convertString, inName);
1658             vlog("Building %s( %s ) test\n", convertString, inName);
1659             break;
1660         }
1661 
1662         fflush(stdout);
1663     }
1664     *outKernel = NULL;
1665 
1666     const char *flags = NULL;
1667     if( gForceFTZ )
1668         flags = "-cl-denorms-are-zero";
1669 
1670     // build it
1671     error = create_single_kernel_helper(gContext, &program, outKernel, (cl_uint)stringCount, strings, testName, flags);
1672     if (error)
1673     {
1674         char    buffer[2048] = "";
1675 
1676         vlog_error("Failed to build kernel/program.\n", error);
1677         clReleaseProgram(program);
1678         return NULL;
1679     }
1680 
1681     return program;
1682 }
1683