1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/rounding_mode.h"
18 #include "harness/ThreadPool.h"
19 #include "harness/testHarness.h"
20 #include "harness/kernelHelpers.h"
21 #include "harness/parseParameters.h"
22 #if defined(__APPLE__)
23 #include <sys/sysctl.h>
24 #endif
25
26 #if defined( __linux__ )
27 #include <unistd.h>
28 #include <sys/syscall.h>
29 #include <linux/sysctl.h>
30 #endif
31 #if defined(__linux__)
32 #include <sys/param.h>
33 #include <libgen.h>
34 #endif
35
36 #include "mingw_compat.h"
37 #if defined(__MINGW32__)
38 #include <sys/param.h>
39 #endif
40
41 #include <stdarg.h>
42 #include <stdio.h>
43 #include <string.h>
44 #if !defined(_WIN32)
45 #include <libgen.h>
46 #include <sys/mman.h>
47 #endif
48 #include <time.h>
49
50 #include "Sleep.h"
51 #include "basic_test_conversions.h"
52
53 #if (defined(_WIN32) && defined (_MSC_VER))
54 // need for _controlfp_s and rouinding modes in RoundingMode
55 #include "harness/testHarness.h"
56 #endif
57
58 #pragma mark -
59 #pragma mark globals
60
61 #define BUFFER_SIZE (1024*1024)
62 #define kPageSize 4096
63 #define EMBEDDED_REDUCTION_FACTOR 16
64 #define PERF_LOOP_COUNT 100
65
66 #define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
67
68 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
69 #include "fplib.h"
70 extern bool qcom_sat;
71 extern roundingMode qcom_rm;
72 #endif
73
74 const char ** argList = NULL;
75 int argCount = 0;
76 cl_context gContext = NULL;
77 cl_command_queue gQueue = NULL;
78 char appName[64] = "ctest";
79 int gStartTestNumber = -1;
80 int gEndTestNumber = 0;
81 #if defined( __APPLE__ )
82 int gTimeResults = 1;
83 #else
84 int gTimeResults = 0;
85 #endif
86 int gReportAverageTimes = 0;
87 void *gIn = NULL;
88 void *gRef = NULL;
89 void *gAllowZ = NULL;
90 void *gOut[ kCallStyleCount ] = { NULL };
91 cl_mem gInBuffer;
92 cl_mem gOutBuffers[ kCallStyleCount ];
93 size_t gComputeDevices = 0;
94 uint32_t gDeviceFrequency = 0;
95 int gWimpyMode = 0;
96 int gWimpyReductionFactor = 128;
97 int gSkipTesting = 0;
98 int gForceFTZ = 0;
99 int gMultithread = 1;
100 int gIsRTZ = 0;
101 uint32_t gSimdSize = 1;
102 int gHasDouble = 0;
103 int gTestDouble = 1;
104 const char * sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
105 const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
106 int gMinVectorSize = 0;
107 int gMaxVectorSize = sizeof(vectorSizes) / sizeof( vectorSizes[0] );
108 static MTdata gMTdata;
109
110 #pragma mark -
111 #pragma mark Declarations
112
113 static int ParseArgs( int argc, const char **argv );
114 static void PrintUsage( void );
115 test_status InitCL( cl_device_id device );
116 static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round );
117 static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d );
118 static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel );
119 static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount );
120
121 void *FlushToZero( void );
122 void UnFlushToZero( void *);
123
124 static cl_program CreateImplicitConvertProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );
125 static cl_program CreateStandardProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, char testName[256], cl_int *error );
126
127
128 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
129 // (that's x87 default state). This causes problems with the tests that
130 // convert long and ulong to float and double or otherwise deal with values
131 // that need more precision than 53-bit. So, set the x87 to 64-bit precision.
Force64BitFPUPrecision(void)132 static inline void Force64BitFPUPrecision(void)
133 {
134 #if __MINGW32__
135 // The usual method is to use _controlfp as follows:
136 // #include <float.h>
137 // _controlfp(_PC_64, _MCW_PC);
138 //
139 // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
140 // divergent code just use inline assembly which works for both.
141 unsigned short int orig_cw = 0;
142 unsigned short int new_cw = 0;
143 __asm__ __volatile__ ("fstcw %0":"=m" (orig_cw));
144 new_cw = orig_cw | 0x0300; // set precision to 64-bit
145 __asm__ __volatile__ ("fldcw %0"::"m" (new_cw));
146 #else
147 /* Implement for other platforms if needed */
148 #endif
149 }
150
test_conversions(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)151 int test_conversions( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
152 {
153 int error, i, testNumber = -1;
154 int startMinVectorSize = gMinVectorSize;
155 Type inType, outType;
156 RoundingMode round;
157 SaturationMode sat;
158
159 if( argCount )
160 {
161 for( i = 0; i < argCount; i++ )
162 {
163 if( GetTestCase( argList[i], &outType, &inType, &sat, &round ) )
164 {
165 vlog_error( "\n\t\t**** ERROR: Unable to parse function name %s. Skipping.... *****\n\n", argList[i] );
166 continue;
167 }
168
169 // skip double if we don't have it
170 if( !gTestDouble && (inType == kdouble || outType == kdouble ) )
171 {
172 if( gHasDouble )
173 {
174 vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
175 vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
176 }
177
178 continue;
179 }
180
181 // skip longs on embedded
182 if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) )
183 {
184 continue;
185 }
186
187 // Skip the implicit converts if the rounding mode is not default or test is saturated
188 if( 0 == startMinVectorSize )
189 {
190 if( sat || round != kDefaultRoundingMode )
191 gMinVectorSize = 1;
192 else
193 gMinVectorSize = 0;
194 }
195
196 if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) )
197 {
198 vlog_error( "\t *** convert_%sn%s%s( %sn ) FAILED ** \n", gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] );
199 }
200 }
201 }
202 else
203 {
204 for( outType = (Type)0; outType < kTypeCount; outType = (Type)(outType+1) )
205 {
206 for( inType = (Type)0; inType < kTypeCount; inType = (Type)(inType+1) )
207 {
208 // skip longs on embedded
209 if( !gHasLong && (inType == klong || outType == klong || inType == kulong || outType == kulong) )
210 {
211 continue;
212 }
213
214 for( sat = (SaturationMode)0; sat < kSaturationModeCount; sat = (SaturationMode)(sat+1) )
215 {
216 //skip illegal saturated conversions to float type
217 if( kSaturated == sat && ( outType == kfloat || outType == kdouble ) )
218 {
219 continue;
220 }
221
222 for( round = (RoundingMode)0; round < kRoundingModeCount; round = (RoundingMode)(round+1) )
223 {
224 if( ++testNumber < gStartTestNumber )
225 {
226 // vlog( "%d) skipping convert_%sn%s%s( %sn )\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
227 continue;
228 }
229 else
230 {
231 if( gEndTestNumber > 0 && testNumber >= gEndTestNumber )
232 {
233 goto exit;
234 }
235 }
236
237 vlog( "%d) Testing convert_%sn%s%s( %sn ):\n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
238
239 // skip double if we don't have it
240 if( ! gTestDouble && (inType == kdouble || outType == kdouble ) )
241 {
242 if( gHasDouble )
243 {
244 vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[ outType ], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType] );
245 vlog( "\t\tcl_khr_fp64 enabled, but double testing turned off.\n" );
246 }
247 continue;
248 }
249
250 // Skip the implicit converts if the rounding mode is not default or test is saturated
251 if( 0 == startMinVectorSize )
252 {
253 if( sat || round != kDefaultRoundingMode )
254 gMinVectorSize = 1;
255 else
256 gMinVectorSize = 0;
257 }
258
259 if( ( error = DoTest( device, outType, inType, sat, round, gMTdata ) ) )
260 {
261 vlog_error( "\t *** %d) convert_%sn%s%s( %sn ) FAILED ** \n", testNumber, gTypeNames[outType], gSaturationNames[sat], gRoundingModeNames[round], gTypeNames[inType] );
262 }
263 }
264 }
265 }
266 }
267 }
268
269 exit:
270 return gFailCount;
271 }
272
273 test_definition test_list[] = {
274 ADD_TEST( conversions ),
275 };
276
277 const int test_num = ARRAY_SIZE( test_list );
278
279 #pragma mark -
280
main(int argc,const char ** argv)281 int main (int argc, const char **argv )
282 {
283 int error;
284 cl_uint seed = (cl_uint) time( NULL );
285
286 argc = parseCustomParam(argc, argv);
287 if (argc == -1)
288 {
289 return 1;
290 }
291
292 if( (error = ParseArgs( argc, argv )) )
293 return error;
294
295 //Turn off sleep so our tests run to completion
296 PreventSleep();
297 atexit( ResumeSleep );
298
299 if(!gMultithread)
300 SetThreadCount(1);
301
302 #if defined(_MSC_VER) && defined(_M_IX86)
303 // VS2005 (and probably others, since long double got deprecated) sets
304 // the x87 to 53-bit precision. This causes problems with the tests
305 // that convert long and ulong to float and double, since they deal
306 // with values that need more precision than that. So, set the x87
307 // to 64-bit precision.
308 unsigned int ignored;
309 _controlfp_s(&ignored, _PC_64, _MCW_PC);
310 #endif
311
312 vlog( "===========================================================\n" );
313 vlog( "Random seed: %u\n", seed );
314 gMTdata = init_genrand( seed );
315
316 const char* arg[] = {argv[0]};
317 int ret = runTestHarnessWithCheck( 1, arg, test_num, test_list, true, 0, InitCL );
318
319 free_mtdata( gMTdata );
320 if (gQueue)
321 {
322 error = clFinish(gQueue);
323 if (error) vlog_error("clFinish failed: %d\n", error);
324 }
325
326 clReleaseMemObject(gInBuffer);
327
328 for( int i = 0; i < kCallStyleCount; i++ ) {
329 clReleaseMemObject(gOutBuffers[i]);
330 }
331 clReleaseCommandQueue(gQueue);
332 clReleaseContext(gContext);
333
334 return ret;
335 }
336
337 #pragma mark -
338 #pragma mark setup
339
ParseArgs(int argc,const char ** argv)340 static int ParseArgs( int argc, const char **argv )
341 {
342 int i;
343 argList = (const char **)calloc( argc - 1, sizeof( char*) );
344 argCount = 0;
345
346 if( NULL == argList && argc > 1 )
347 return -1;
348
349 #if (defined( __APPLE__ ) || defined(__linux__) || defined (__MINGW32__))
350 { // Extract the app name
351 char baseName[ MAXPATHLEN ];
352 strncpy( baseName, argv[0], MAXPATHLEN );
353 char *base = basename( baseName );
354 if( NULL != base )
355 {
356 strncpy( appName, base, sizeof( appName ) );
357 appName[ sizeof( appName ) -1 ] = '\0';
358 }
359 }
360 #elif defined (_WIN32)
361 {
362 char fname[_MAX_FNAME + _MAX_EXT + 1];
363 char ext[_MAX_EXT];
364
365 errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
366 fname, _MAX_FNAME, ext, _MAX_EXT );
367 if (err == 0) { // no error
368 strcat (fname, ext); //just cat them, size of frame can keep both
369 strncpy (appName, fname, sizeof(appName));
370 appName[ sizeof( appName ) -1 ] = '\0';
371 }
372 }
373 #endif
374
375 vlog( "\n%s", appName );
376 for( i = 1; i < argc; i++ )
377 {
378 const char *arg = argv[i];
379 if( NULL == arg )
380 break;
381
382 vlog( "\t%s", arg );
383 if( arg[0] == '-' )
384 {
385 arg++;
386 while( *arg != '\0' )
387 {
388 switch( *arg )
389 {
390 case 'd':
391 gTestDouble ^= 1;
392 break;
393 case 'l':
394 gSkipTesting ^= 1;
395 break;
396 case 'm':
397 gMultithread ^= 1;
398 break;
399 case 'w':
400 gWimpyMode ^= 1;
401 break;
402 case '[':
403 parseWimpyReductionFactor(arg, gWimpyReductionFactor);
404 break;
405 case 'z':
406 gForceFTZ ^= 1;
407 break;
408 case 't':
409 gTimeResults ^= 1;
410 break;
411 case 'a':
412 gReportAverageTimes ^= 1;
413 break;
414 case '1':
415 if( arg[1] == '6' )
416 {
417 gMinVectorSize = 6;
418 gMaxVectorSize = 7;
419 arg++;
420 }
421 else
422 {
423 gMinVectorSize = 0;
424 gMaxVectorSize = 2;
425 }
426 break;
427
428 case '2':
429 gMinVectorSize = 2;
430 gMaxVectorSize = 3;
431 break;
432
433 case '3':
434 gMinVectorSize = 3;
435 gMaxVectorSize = 4;
436 break;
437
438 case '4':
439 gMinVectorSize = 4;
440 gMaxVectorSize = 5;
441 break;
442
443 case '8':
444 gMinVectorSize = 5;
445 gMaxVectorSize = 6;
446 break;
447
448 default:
449 vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
450 PrintUsage();
451 return -1;
452 }
453 arg++;
454 }
455 }
456 else
457 {
458 char *t = NULL;
459 long number = strtol( arg, &t, 0 );
460 if( t != arg )
461 {
462 if( gStartTestNumber != -1 )
463 gEndTestNumber = gStartTestNumber + (int) number;
464 else
465 gStartTestNumber = (int) number;
466 }
467 else
468 {
469 argList[ argCount ] = arg;
470 argCount++;
471 }
472 }
473 }
474
475 // Check for the wimpy mode environment variable
476 if (getenv("CL_WIMPY_MODE")) {
477 vlog( "\n" );
478 vlog( "*** Detected CL_WIMPY_MODE env ***\n" );
479 gWimpyMode = 1;
480 }
481
482 vlog( "\n" );
483
484 vlog( "Test binary built %s %s\n", __DATE__, __TIME__ );
485
486 PrintArch();
487
488 if( gWimpyMode )
489 {
490 vlog( "\n" );
491 vlog( "*** WARNING: Testing in Wimpy mode! ***\n" );
492 vlog( "*** Wimpy mode is not sufficient to verify correctness. ***\n" );
493 vlog( "*** It gives warm fuzzy feelings and then nevers calls. ***\n\n" );
494 vlog("*** Wimpy Reduction Factor: %-27u ***\n\n", gWimpyReductionFactor);
495 }
496
497 return 0;
498 }
499
PrintUsage(void)500 static void PrintUsage( void )
501 {
502 int i;
503 vlog( "%s [-wz#]: <optional: test names>\n", appName );
504 vlog( "\ttest names:\n" );
505 vlog( "\t\tdestFormat<_sat><_round>_sourceFormat\n" );
506 vlog( "\t\t\tPossible format types are:\n\t\t\t\t" );
507 for( i = 0; i < kTypeCount; i++ )
508 vlog( "%s, ", gTypeNames[i] );
509 vlog( "\n\n\t\t\tPossible saturation values are: (empty) and _sat\n" );
510 vlog( "\t\t\tPossible rounding values are:\n\t\t\t\t(empty), " );
511 for( i = 1; i < kRoundingModeCount; i++ )
512 vlog( "%s, ", gRoundingModeNames[i] );
513 vlog( "\n\t\t\tExamples:\n" );
514 vlog( "\t\t\t\tulong_short converts short to ulong\n" );
515 vlog( "\t\t\t\tchar_sat_rte_float converts float to char with saturated clipping in round to nearest rounding mode\n\n" );
516 vlog( "\toptions:\n" );
517 vlog( "\t\t-d\tToggle testing of double precision. On by default if cl_khr_fp64 is enabled, ignored otherwise.\n" );
518 vlog( "\t\t-l\tToggle link check mode. When on, testing is skipped, and we just check to see that the kernels build. (Off by default.)\n" );
519 vlog( "\t\t-m\tToggle Multithreading. (On by default.)\n" );
520 vlog( "\t\t-w\tToggle wimpy mode. When wimpy mode is on, we run a very small subset of the tests for each fn. NOT A VALID TEST! (Off by default.)\n" );
521 vlog(" \t\t-[2^n]\tSet wimpy reduction factor, recommended range of n is 1-12, default factor(%u)\n", gWimpyReductionFactor);
522 vlog( "\t\t-z\tToggle flush to zero mode (Default: per device)\n" );
523 vlog( "\t\t-#\tTest just vector size given by #, where # is an element of the set {1,2,3,4,8,16}\n" );
524 vlog( "\n" );
525 vlog( "You may also pass the number of the test on which to start.\nA second number can be then passed to indicate how many tests to run\n\n" );
526 }
527
528
GetTestCase(const char * name,Type * outType,Type * inType,SaturationMode * sat,RoundingMode * round)529 static int GetTestCase( const char *name, Type *outType, Type *inType, SaturationMode *sat, RoundingMode *round )
530 {
531 int i;
532
533 //Find the return type
534 for( i = 0; i < kTypeCount; i++ )
535 if( name == strstr( name, gTypeNames[i] ) )
536 {
537 *outType = (Type)i;
538 name += strlen( gTypeNames[i] );
539
540 break;
541 }
542
543 if( i == kTypeCount )
544 return -1;
545
546 // Check to see if _sat appears next
547 *sat = (SaturationMode)0;
548 for( i = 1; i < kSaturationModeCount; i++ )
549 if( name == strstr( name, gSaturationNames[i] ) )
550 {
551 *sat = (SaturationMode)i;
552 name += strlen( gSaturationNames[i] );
553 break;
554 }
555
556 *round = (RoundingMode)0;
557 for( i = 1; i < kRoundingModeCount; i++ )
558 if( name == strstr( name, gRoundingModeNames[i] ) )
559 {
560 *round = (RoundingMode)i;
561 name += strlen( gRoundingModeNames[i] );
562 break;
563 }
564
565 if( *name != '_' )
566 return -2;
567 name++;
568
569 for( i = 0; i < kTypeCount; i++ )
570 if( name == strstr( name, gTypeNames[i] ) )
571 {
572 *inType = (Type)i;
573 name += strlen( gTypeNames[i] );
574
575 break;
576 }
577
578 if( i == kTypeCount )
579 return -3;
580
581 if( *name != '\0' )
582 return -4;
583
584 return 0;
585 }
586
587 #pragma mark -
588 #pragma mark OpenCL
589
InitCL(cl_device_id device)590 test_status InitCL( cl_device_id device )
591 {
592 int error, i;
593 size_t configSize = sizeof( gComputeDevices );
594
595 if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, configSize, &gComputeDevices, NULL )) )
596 gComputeDevices = 1;
597
598 configSize = sizeof( gDeviceFrequency );
599 if( (error = clGetDeviceInfo( device, CL_DEVICE_MAX_CLOCK_FREQUENCY, configSize, &gDeviceFrequency, NULL )) )
600 gDeviceFrequency = 0;
601
602 cl_device_fp_config floatCapabilities = 0;
603 if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities, NULL)))
604 floatCapabilities = 0;
605 if(0 == (CL_FP_DENORM & floatCapabilities) )
606 gForceFTZ ^= 1;
607
608 if( 0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST ) )
609 {
610 char profileStr[128] = "";
611 // Verify that we are an embedded profile device
612 if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof( profileStr ), profileStr, NULL ) ) )
613 {
614 vlog_error( "FAILURE: Could not get device profile: error %d\n", error );
615 return TEST_FAIL;
616 }
617
618 if( strcmp( profileStr, "EMBEDDED_PROFILE" ) )
619 {
620 vlog_error( "FAILURE: non-embedded profile device does not support CL_FP_ROUND_TO_NEAREST\n" );
621 return TEST_FAIL;
622 }
623
624 if( 0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO ) )
625 {
626 vlog_error( "FAILURE: embedded profile device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" );
627 return TEST_FAIL;
628 }
629
630 gIsRTZ = 1;
631 }
632
633 else if(is_extension_available(device, "cl_khr_fp64"))
634 {
635 gHasDouble = 1;
636 }
637 gTestDouble &= gHasDouble;
638
639 //detect whether profile of the device is embedded
640 char profile[1024] = "";
641 if( (error = clGetDeviceInfo( device, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ) ) ){}
642 else if( strstr(profile, "EMBEDDED_PROFILE" ) )
643 {
644 gIsEmbedded = 1;
645 if( !is_extension_available(device, "cles_khr_int64" ) )
646 gHasLong = 0;
647 }
648
649
650 gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error );
651 if( NULL == gContext || error )
652 {
653 vlog_error( "clCreateContext failed. (%d)\n", error );
654 return TEST_FAIL;
655 }
656
657 gQueue = clCreateCommandQueue(gContext, device, 0, &error);
658 if( NULL == gQueue || error )
659 {
660 vlog_error( "clCreateCommandQueue failed. (%d)\n", error );
661 return TEST_FAIL;
662 }
663
664 //Allocate buffers
665 //FIXME: use clProtectedArray for guarded allocations?
666 gIn = malloc( BUFFER_SIZE + 2 * kPageSize );
667 gAllowZ = malloc( BUFFER_SIZE + 2 * kPageSize );
668 gRef = malloc( BUFFER_SIZE + 2 * kPageSize );
669 for( i = 0; i < kCallStyleCount; i++ )
670 {
671 gOut[i] = malloc( BUFFER_SIZE + 2 * kPageSize );
672 if( NULL == gOut[i] )
673 return TEST_FAIL;
674 }
675
676 // setup input buffers
677 gInBuffer = clCreateBuffer(gContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error);
678 if( gInBuffer == NULL || error)
679 {
680 vlog_error( "clCreateBuffer failed for input (%d)\n", error );
681 return TEST_FAIL;
682 }
683
684 // setup output buffers
685 for( i = 0; i < kCallStyleCount; i++ )
686 {
687 gOutBuffers[i] = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE, NULL, &error );
688 if( gOutBuffers[i] == NULL || error )
689 {
690 vlog_error( "clCreateArray failed for output (%d)\n", error );
691 return TEST_FAIL;
692 }
693 }
694
695
696 gMTdata = init_genrand( gRandomSeed );
697
698
699 char c[1024];
700 static const char *no_yes[] = { "NO", "YES" };
701 vlog( "\nCompute Device info:\n" );
702 clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(c), c, NULL);
703 vlog( "\tDevice Name: %s\n", c );
704 clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(c), c, NULL);
705 vlog( "\tVendor: %s\n", c );
706 clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(c), c, NULL);
707 vlog( "\tDevice Version: %s\n", c );
708 clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
709 vlog( "\tCL C Version: %s\n", c );
710 clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(c), c, NULL);
711 vlog( "\tDriver Version: %s\n", c );
712 vlog( "\tProcessing with %ld devices\n", gComputeDevices );
713 vlog( "\tDevice Frequency: %d MHz\n", gDeviceFrequency );
714 vlog( "\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
715 vlog( "\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ] );
716 vlog( "\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ] );
717 vlog( "\tHas Double? %s\n", no_yes[0 != gHasDouble] );
718 if( gHasDouble )
719 vlog( "\tTest Double? %s\n", no_yes[0 != gTestDouble] );
720 vlog( "\tHas Long? %s\n", no_yes[0 != gHasLong] );
721 vlog( "\tTesting vector sizes: " );
722 for( i = gMinVectorSize; i < gMaxVectorSize; i++ )
723 vlog("\t%d", vectorSizes[i]);
724 vlog( "\n" );
725 return TEST_PASS;
726 }
727
RunKernel(cl_kernel kernel,void * inBuf,void * outBuf,size_t blockCount)728 static int RunKernel( cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount )
729 {
730 // The global dimensions are just the blockCount to execute since we haven't set up multiple queues for multiple devices.
731 int error;
732
733 error = clSetKernelArg(kernel, 0, sizeof( inBuf ), &inBuf);
734 error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
735
736 if( error )
737 {
738 vlog_error( "FAILED -- could not set kernel args (%d)\n", error );
739 return error;
740 }
741
742 if( (error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount, NULL, 0, NULL, NULL)))
743 {
744 vlog_error( "FAILED -- could not execute kernel (%d)\n", error );
745 return error;
746 }
747
748 return 0;
749 }
750
751 #if ! defined( __APPLE__ )
752 void memset_pattern4(void *dest, const void *src_pattern, size_t bytes );
753 #endif
754
755 #if defined( __APPLE__ )
756 #include <mach/mach_time.h>
757 #endif
758
759 uint64_t GetTime( void );
GetTime(void)760 uint64_t GetTime( void )
761 {
762 #if defined( __APPLE__ )
763 return mach_absolute_time();
764 #elif defined(_MSC_VER)
765 return ReadTime();
766 #else
767 //mach_absolute_time is a high precision timer with precision < 1 microsecond.
768 #warning need accurate clock here. Times are invalid.
769 return 0;
770 #endif
771 }
772
773
774 #if defined (_MSC_VER)
775 /* function is defined in "compat.h" */
776 #else
777 double SubtractTime( uint64_t endTime, uint64_t startTime );
SubtractTime(uint64_t endTime,uint64_t startTime)778 double SubtractTime( uint64_t endTime, uint64_t startTime )
779 {
780 uint64_t diff = endTime - startTime;
781 static double conversion = 0.0;
782
783 if( 0.0 == conversion )
784 {
785 #if defined( __APPLE__ )
786 mach_timebase_info_data_t info = {0,0};
787 kern_return_t err = mach_timebase_info( &info );
788 if( 0 == err )
789 conversion = 1e-9 * (double) info.numer / (double) info.denom;
790 #else
791 // This function consumes output from GetTime() above, and converts the time to secionds.
792 #warning need accurate ticks to seconds conversion factor here. Times are invalid.
793 #endif
794 }
795
796 // strictly speaking we should also be subtracting out timer latency here
797 return conversion * (double) diff;
798 }
799 #endif
800
801 typedef struct CalcReferenceValuesInfo
802 {
803 struct WriteInputBufferInfo *parent; // pointer back to the parent WriteInputBufferInfo struct
804 cl_kernel kernel; // the kernel for this vector size
805 cl_program program; // the program for this vector size
806 cl_uint vectorSize; // the vector size for this callback chain
807 void *p; // the pointer to mapped result data for this vector size
808 cl_int result;
809 }CalcReferenceValuesInfo;
810
811 typedef struct WriteInputBufferInfo
812 {
813 volatile cl_event calcReferenceValues; // user event which signals when main thread is done calculating reference values
814 volatile cl_event doneBarrier; // user event which signals when worker threads are done
815 cl_uint count; // the number of elements in the array
816 Type outType; // the data type of the conversion result
817 Type inType; // the data type of the conversion input
818 volatile int barrierCount;
819 CalcReferenceValuesInfo calcInfo[kCallStyleCount];
820 }WriteInputBufferInfo;
821
822 cl_uint RoundUpToNextPowerOfTwo( cl_uint x );
RoundUpToNextPowerOfTwo(cl_uint x)823 cl_uint RoundUpToNextPowerOfTwo( cl_uint x )
824 {
825 if( 0 == (x & (x-1)))
826 return x;
827
828 while( x & (x-1) )
829 x &= x-1;
830
831 return x + x;
832 }
833
834 void CL_CALLBACK WriteInputBufferComplete( cl_event, cl_int, void * );
835
836 typedef struct DataInitInfo
837 {
838 cl_ulong start;
839 cl_uint size;
840 Type outType;
841 Type inType;
842 SaturationMode sat;
843 RoundingMode round;
844 MTdata *d;
845 }DataInitInfo;
846
847 cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p );
InitData(cl_uint job_id,cl_uint thread_id,void * p)848 cl_int InitData( cl_uint job_id, cl_uint thread_id, void *p )
849 {
850 DataInitInfo *info = (DataInitInfo*) p;
851
852 gInitFunctions[ info->inType ]( (char*)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat, info->round,
853 info->outType, info->start + job_id * info->size, info->size, info->d[thread_id] );
854 return CL_SUCCESS;
855 }
856
setAllowZ(uint8_t * allow,uint32_t * x,cl_uint count)857 static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
858 {
859 cl_uint i;
860 for (i = 0; i < count; ++i)
861 allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
862 }
863
864 cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p );
PrepareReference(cl_uint job_id,cl_uint thread_id,void * p)865 cl_int PrepareReference( cl_uint job_id, cl_uint thread_id, void *p )
866 {
867 DataInitInfo *info = (DataInitInfo*) p;
868 cl_uint count = info->size;
869 Type inType = info->inType;
870 Type outType = info->outType;
871 RoundingMode round = info->round;
872 size_t j;
873
874 Force64BitFPUPrecision();
875
876 void *s = (cl_uchar*) gIn + job_id * count * gTypeSizes[info->inType];
877 void *a = (cl_uchar*) gAllowZ + job_id * count;
878 void *d = (cl_uchar*) gRef + job_id * count * gTypeSizes[info->outType];
879
880 if (outType != inType)
881 {
882 //create the reference while we wait
883 Convert f = gConversions[ outType ][ inType ];
884 if( info->sat )
885 f = gSaturatedConversions[ outType ][ inType ];
886
887 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
888 /* ARM VFP doesn't have hardware instruction for converting from 64-bit
889 * integer to float types, hence GCC ARM uses the floating-point
890 * emulation code despite which -mfloat-abi setting it is. But the
891 * emulation code in libgcc.a has only one rounding mode (round to
892 * nearest even in this case) and ignores the user rounding mode setting
893 * in hardware. As a result setting rounding modes in hardware won't
894 * give correct rounding results for type covert from 64-bit integer to
895 * float using GCC for ARM compiler so for testing different rounding
896 * modes, we need to use alternative reference function. ARM64 does have
897 * an instruction, however we cannot guarantee the compiler will use it.
898 * On all ARM architechures use emulation to calculate reference.*/
899 switch (round)
900 {
901 /* conversions to floating-point type use the current rounding mode.
902 * The only default floating-point rounding mode supported is round to nearest even
903 * i.e the current rounding mode will be _rte for floating-point types. */
904 case kDefaultRoundingMode:
905 qcom_rm = qcomRTE;
906 break;
907 case kRoundToNearestEven:
908 qcom_rm = qcomRTE;
909 break;
910 case kRoundUp:
911 qcom_rm = qcomRTP;
912 break;
913 case kRoundDown:
914 qcom_rm = qcomRTN;
915 break;
916 case kRoundTowardZero:
917 qcom_rm = qcomRTZ;
918 break;
919 default:
920 vlog_error("ERROR: undefined rounding mode %d\n", round);
921 break;
922 }
923 qcom_sat = info->sat;
924 #endif
925
926 RoundingMode oldRound = set_round( round, outType );
927 f( d, s, count );
928 set_round( oldRound, outType );
929
930 // Decide if we allow a zero result in addition to the correctly rounded one
931 memset(a, 0, count);
932 if (gForceFTZ) {
933 if (inType == kfloat)
934 setAllowZ((uint8_t*)a, (uint32_t*)s, count);
935 if (outType == kfloat)
936 setAllowZ((uint8_t*)a, (uint32_t*)d, count);
937 }
938 }
939 else
940 {
941 // Copy the input to the reference
942 memcpy(d, s, info->size * gTypeSizes[inType]);
943 }
944
945 //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
946 if( info->outType != kfloat && info->outType != kdouble )
947 {
948 if( inType == kfloat )
949 {
950 float *inp = (float*) s;
951 for( j = 0; j < count; j++ )
952 {
953 if( isnan( inp[j] ) )
954 memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
955 }
956 }
957 if( inType == kdouble )
958 {
959 double *inp = (double*) s;
960 for( j = 0; j < count; j++ )
961 {
962 if( isnan( inp[j] ) )
963 memset( (char*) d + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
964 }
965 }
966 }
967 else if( inType == kfloat || inType == kdouble )
968 { // outtype and intype is float or double. NaN conversions for float <-> double can be any NaN
969 if( inType == kfloat && outType == kdouble )
970 {
971 float *inp = (float*) s;
972 for( j = 0; j < count; j++ )
973 {
974 if( isnan( inp[j] ) )
975 ((double*) d)[j] = NAN;
976 }
977 }
978 if( inType == kdouble && outType == kfloat )
979 {
980 double *inp = (double*) s;
981 for( j = 0; j < count; j++ )
982 {
983 if( isnan( inp[j] ) )
984 ((float*) d)[j] = NAN;
985 }
986 }
987 }
988
989 return CL_SUCCESS;
990 }
991
DoTest(cl_device_id device,Type outType,Type inType,SaturationMode sat,RoundingMode round,MTdata d)992 static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMode sat, RoundingMode round, MTdata d )
993 {
994 #ifdef __APPLE__
995 cl_ulong wall_start = mach_absolute_time();
996 #endif
997
998 DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
999 WriteInputBufferInfo writeInputBufferInfo;
1000 int vectorSize;
1001 int error = 0;
1002 cl_uint threads = GetThreadCount();
1003 uint64_t i;
1004
1005 gTestCount++;
1006 size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] );
1007 size_t step = blockCount;
1008 uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]);
1009 cl_event writeInputBuffer = NULL;
1010
1011 memset( &writeInputBufferInfo, 0, sizeof( writeInputBufferInfo ) );
1012 init_info.d = (MTdata*)malloc( threads * sizeof( MTdata ) );
1013 if( NULL == init_info.d )
1014 {
1015 vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
1016 return -1;
1017 }
1018 for( i = 0; i < threads; i++ )
1019 {
1020 init_info.d[i] = init_genrand( genrand_int32( d ) );
1021 if( NULL == init_info.d[i] )
1022 {
1023 vlog_error( "ERROR: Unable to allocate storage for random number generator!\n" );
1024 return -1;
1025 }
1026 }
1027
1028 writeInputBufferInfo.outType = outType;
1029 writeInputBufferInfo.inType = inType;
1030
1031 for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1032 {
1033 writeInputBufferInfo.calcInfo[vectorSize].program = MakeProgram( outType, inType, sat, round, vectorSize,
1034 &writeInputBufferInfo.calcInfo[vectorSize].kernel );
1035 if( NULL == writeInputBufferInfo.calcInfo[vectorSize].program )
1036 {
1037 gFailCount++;
1038 return -1;
1039 }
1040 if( NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel )
1041 {
1042 gFailCount++;
1043 vlog_error( "\t\tFAILED -- Failed to create kernel.\n" );
1044 return -2;
1045 }
1046
1047 writeInputBufferInfo.calcInfo[vectorSize].parent = &writeInputBufferInfo;
1048 writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
1049 writeInputBufferInfo.calcInfo[vectorSize].result = -1;
1050 }
1051
1052 if( gSkipTesting )
1053 goto exit;
1054
1055 // Patch up rounding mode if default is RTZ
1056 // We leave the part above in default rounding mode so that the right kernel is compiled.
1057 if( round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat) )
1058 init_info.round = round = kRoundTowardZero;
1059
1060 // Figure out how many elements are in a work block
1061
1062 // we handle 64-bit types a bit differently.
1063 if( 8*gTypeSizes[ inType ] > 32 )
1064 lastCase = 0x100000000ULL;
1065
1066 if ( !gWimpyMode && gIsEmbedded )
1067 step = blockCount * EMBEDDED_REDUCTION_FACTOR;
1068
1069 if ( gWimpyMode )
1070 step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
1071 vlog( "Testing... " );
1072 fflush(stdout);
1073 for( i = 0; i < (uint64_t)lastCase; i += step )
1074 {
1075
1076 if( 0 == ( i & ((lastCase >> 3) -1))) {
1077 vlog(".");
1078 fflush(stdout);
1079 }
1080
1081 cl_uint count = (uint32_t) MIN( blockCount, lastCase - i );
1082 writeInputBufferInfo.count = count;
1083
1084 // Crate a user event to represent the status of the reference value computation completion
1085 writeInputBufferInfo.calcReferenceValues = clCreateUserEvent( gContext, &error);
1086 if( error || NULL == writeInputBufferInfo.calcReferenceValues )
1087 {
1088 vlog_error( "ERROR: Unable to create user event. (%d)\n", error );
1089 gFailCount++;
1090 goto exit;
1091 }
1092
1093 // retain for consumption by MapOutputBufferComplete
1094 for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1095 {
1096 if( (error = clRetainEvent(writeInputBufferInfo.calcReferenceValues) ))
1097 {
1098 vlog_error( "ERROR: Unable to retain user event. (%d)\n", error );
1099 gFailCount++;
1100 goto exit;
1101 }
1102 }
1103
1104 // Crate a user event to represent when the callbacks are done verifying correctness
1105 writeInputBufferInfo.doneBarrier = clCreateUserEvent( gContext, &error);
1106 if( error || NULL == writeInputBufferInfo.calcReferenceValues )
1107 {
1108 vlog_error( "ERROR: Unable to create user event for barrier. (%d)\n", error );
1109 gFailCount++;
1110 goto exit;
1111 }
1112
1113 // retain for use by the callback that calls this
1114 if( (error = clRetainEvent(writeInputBufferInfo.doneBarrier) ))
1115 {
1116 vlog_error( "ERROR: Unable to retain user event doneBarrier. (%d)\n", error );
1117 gFailCount++;
1118 goto exit;
1119 }
1120
1121 // Call this in a multithreaded manner
1122 // gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d );
1123 cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
1124 init_info.start = i;
1125 init_info.size = count / chunks;
1126 if( init_info.size < 16384 )
1127 {
1128 chunks = RoundUpToNextPowerOfTwo(threads);
1129 init_info.size = count / chunks;
1130 if( init_info.size < 16384 )
1131 {
1132 init_info.size = count;
1133 chunks = 1;
1134 }
1135 }
1136 ThreadPool_Do(InitData, chunks, &init_info);
1137
1138 // Copy the results to the device
1139 writeInputBuffer = NULL;
1140 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, count * gTypeSizes[inType], gIn, 0, NULL, &writeInputBuffer )))
1141 {
1142 vlog_error( "ERROR: clEnqueueWriteBuffer failed. (%d)\n", error );
1143 gFailCount++;
1144 goto exit;
1145 }
1146
1147 // Setup completion callback for the write, which will enqueue the rest of the work
1148 // This is somewhat gratuitous. Because this is an in order queue, we didn't really need to
1149 // do this work in a callback. We could have done it from the main thread. Here we are
1150 // verifying that the implementation can enqueue work from a callback, while at the same time
1151 // also checking to make sure that the conversions work.
1152 //
1153 // Because the verification code is also moved to a callback, it is hoped that implementations will
1154 // achieve a test performance improvement because they can verify the results in parallel. If the
1155 // implementation serializes callbacks however, that won't happen. Consider it some motivation
1156 // to do the right thing! :-)
1157 if( (error = clSetEventCallback( writeInputBuffer, CL_COMPLETE, WriteInputBufferComplete, &writeInputBufferInfo)) )
1158 {
1159 vlog_error( "ERROR: clSetEventCallback failed. (%d)\n", error );
1160 gFailCount++;
1161 goto exit;
1162 }
1163
1164 // The event can't be destroyed until the callback is called, so we can release it now.
1165 if( (error = clReleaseEvent(writeInputBuffer) ))
1166 {
1167 vlog_error( "ERROR: clReleaseEvent failed. (%d)\n", error );
1168 gFailCount++;
1169 goto exit;
1170 }
1171
1172 // Make sure the work is actually running, so we don't deadlock
1173 if( (error = clFlush( gQueue ) ) )
1174 {
1175 vlog_error( "clFlush failed with error %d\n", error );
1176 gFailCount++;
1177 goto exit;
1178 }
1179
1180 ThreadPool_Do(PrepareReference, chunks, &init_info);
1181
1182 // signal we are done calculating the reference results
1183 if( (error = clSetUserEventStatus( writeInputBufferInfo.calcReferenceValues, CL_COMPLETE ) ) )
1184 {
1185 vlog_error( "Error: Failed to set user event status to CL_COMPLETE: %d\n", error );
1186 gFailCount++;
1187 goto exit;
1188 }
1189
1190 // Wait for the event callbacks to finish verifying correctness.
1191 if( (error = clWaitForEvents( 1, (cl_event*) &writeInputBufferInfo.doneBarrier ) ))
1192 {
1193 vlog_error( "Error: Failed to wait for barrier: %d\n", error );
1194 gFailCount++;
1195 goto exit;
1196 }
1197
1198 if( (error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues ) ))
1199 {
1200 vlog_error( "Error: Failed to release calcReferenceValues: %d\n", error );
1201 gFailCount++;
1202 goto exit;
1203 }
1204
1205 if( (error = clReleaseEvent(writeInputBufferInfo.doneBarrier ) ))
1206 {
1207 vlog_error( "Error: Failed to release done barrier: %d\n", error );
1208 gFailCount++;
1209 goto exit;
1210 }
1211
1212
1213 for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1214 {
1215 if( ( error = writeInputBufferInfo.calcInfo[ vectorSize ].result ))
1216 {
1217 switch( inType )
1218 {
1219 case kuchar:
1220 case kchar:
1221 vlog( "Input value: 0x%2.2x ", ((unsigned char*)gIn)[error - 1] );
1222 break;
1223 case kushort:
1224 case kshort:
1225 vlog( "Input value: 0x%4.4x ", ((unsigned short*)gIn)[error - 1] );
1226 break;
1227 case kuint:
1228 case kint:
1229 vlog( "Input value: 0x%8.8x ", ((unsigned int*)gIn)[error - 1] );
1230 break;
1231 case kfloat:
1232 vlog( "Input value: %a ", ((float*)gIn)[error - 1] );
1233 break;
1234 break;
1235 case kulong:
1236 case klong:
1237 vlog( "Input value: 0x%16.16llx ", ((unsigned long long*)gIn)[error - 1] );
1238 break;
1239 case kdouble:
1240 vlog( "Input value: %a ", ((double*)gIn)[error - 1]);
1241 break;
1242 default:
1243 vlog_error( "Internal error at %s: %d\n", __FILE__, __LINE__ );
1244 abort();
1245 break;
1246 }
1247
1248 // tell the user which conversion it was.
1249 if( 0 == vectorSize )
1250 vlog( " (implicit scalar conversion from %s to %s)\n", gTypeNames[ inType ], gTypeNames[ outType ] );
1251 else
1252 vlog( " (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType], sizeNames[vectorSize], gSaturationNames[ sat ],
1253 gRoundingModeNames[ round ], gTypeNames[inType], sizeNames[vectorSize] );
1254
1255 gFailCount++;
1256 goto exit;
1257 }
1258 }
1259 }
1260
1261 log_info( "done.\n" );
1262
1263 if( gTimeResults )
1264 {
1265 //Kick off tests for the various vector lengths
1266 for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1267 {
1268 size_t workItemCount = blockCount / vectorSizes[vectorSize];
1269 if( vectorSizes[vectorSize] * gTypeSizes[outType] < 4 )
1270 workItemCount /= 4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
1271
1272 double sum = 0.0;
1273 double bestTime = INFINITY;
1274 cl_uint k;
1275 for( k = 0; k < PERF_LOOP_COUNT; k++ )
1276 {
1277 uint64_t startTime = GetTime();
1278 if( (error = RunKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
1279 {
1280 gFailCount++;
1281 goto exit;
1282 }
1283
1284 // Make sure OpenCL is done
1285 if( (error = clFinish(gQueue) ) )
1286 {
1287 vlog_error( "Error %d at clFinish\n", error );
1288 goto exit;
1289 }
1290
1291 uint64_t endTime = GetTime();
1292 double time = SubtractTime( endTime, startTime );
1293 sum += time;
1294 if( time < bestTime )
1295 bestTime = time;
1296
1297 }
1298
1299 if( gReportAverageTimes )
1300 bestTime = sum / PERF_LOOP_COUNT;
1301 double clocksPerOp = bestTime * (double) gDeviceFrequency * gComputeDevices * gSimdSize * 1e6 / (workItemCount * vectorSizes[vectorSize]);
1302 if( 0 == vectorSize )
1303 vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "implicit convert %s -> %s", gTypeNames[ inType ], gTypeNames[ outType ] );
1304 else
1305 vlog_perf( clocksPerOp, LOWER_IS_BETTER, "clocks / element", "convert_%s%s%s%s( %s%s )", gTypeNames[ outType ], sizeNames[vectorSize], gSaturationNames[ sat ], gRoundingModeNames[round], gTypeNames[inType], sizeNames[vectorSize] );
1306 }
1307 }
1308
1309 if( gWimpyMode )
1310 vlog( "\tWimp pass" );
1311 else
1312 vlog( "\tpassed" );
1313
1314 #ifdef __APPLE__
1315 // record the run time
1316 vlog( "\t(%f s)", 1e-9 * ( mach_absolute_time() - wall_start ) );
1317 #endif
1318 vlog( "\n\n" );
1319 fflush( stdout );
1320
1321
1322 exit:
1323 //clean up
1324 for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1325 {
1326 clReleaseProgram( writeInputBufferInfo.calcInfo[vectorSize].program );
1327 clReleaseKernel( writeInputBufferInfo.calcInfo[vectorSize].kernel );
1328 }
1329
1330 if( init_info.d )
1331 {
1332 for( i = 0; i < threads; i++ )
1333 free_mtdata(init_info.d[i]);
1334 free(init_info.d);
1335 }
1336
1337 return error;
1338 }
1339
1340 void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data );
1341
1342 // Note: not called reentrantly
WriteInputBufferComplete(cl_event e,cl_int status,void * data)1343 void CL_CALLBACK WriteInputBufferComplete( cl_event e, cl_int status, void *data )
1344 {
1345 WriteInputBufferInfo *info = (WriteInputBufferInfo*) data;
1346 cl_uint count = info->count;
1347 int vectorSize;
1348
1349 if( CL_SUCCESS != status )
1350 {
1351 vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1352 gFailCount++;
1353 return;
1354 }
1355
1356 info->barrierCount = gMaxVectorSize - gMinVectorSize;
1357
1358 // now that we know that the write buffer is complete, enqueue callbacks to wait for the main thread to
1359 // finish calculating the reference results.
1360 for( vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
1361 {
1362 size_t workItemCount = (count + vectorSizes[vectorSize] - 1) / ( vectorSizes[vectorSize]);
1363 cl_event mapComplete = NULL;
1364
1365 if( (status = RunKernel( info->calcInfo[ vectorSize ].kernel, gInBuffer, gOutBuffers[ vectorSize ], workItemCount )) )
1366 {
1367 gFailCount++;
1368 return;
1369 }
1370
1371 info->calcInfo[vectorSize].p = clEnqueueMapBuffer( gQueue, gOutBuffers[ vectorSize ], CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
1372 0, count * gTypeSizes[ info->outType ], 0, NULL, &mapComplete, &status);
1373 {
1374 if( status )
1375 {
1376 vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1377 gFailCount++;
1378 return;
1379 }
1380 }
1381
1382 if( (status = clSetEventCallback( mapComplete, CL_COMPLETE, MapResultValuesComplete, info->calcInfo + vectorSize)))
1383 {
1384 vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1385 gFailCount++;
1386 return;
1387 }
1388
1389 if( (status = clReleaseEvent(mapComplete)))
1390 {
1391 vlog_error( "ERROR: clReleaseEvent calback failed in WriteInputBufferComplete for vector size %d with status: %d\n", vectorSize, status );
1392 gFailCount++;
1393 return;
1394 }
1395 }
1396
1397 // Make sure the work starts moving -- otherwise we may deadlock
1398 if( (status = clFlush(gQueue)))
1399 {
1400 vlog_error( "ERROR: WriteInputBufferComplete calback failed with status: %d\n", status );
1401 gFailCount++;
1402 return;
1403 }
1404
1405 // e was already released by the main thread. It should be destroyed automatically soon after we exit.
1406 }
1407
1408 void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data );
1409
1410 // Note: May be called reentrantly
MapResultValuesComplete(cl_event e,cl_int status,void * data)1411 void CL_CALLBACK MapResultValuesComplete( cl_event e, cl_int status, void *data )
1412 {
1413 CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data;
1414 cl_event calcReferenceValues = info->parent->calcReferenceValues;
1415
1416 if( CL_SUCCESS != status )
1417 {
1418 vlog_error( "ERROR: MapResultValuesComplete calback failed with status: %d\n", status );
1419 gFailCount++; // not thread safe -- being lazy here
1420 clReleaseEvent(calcReferenceValues);
1421 return;
1422 }
1423
1424 // we know that the map is done, wait for the main thread to finish calculating the reference values
1425 if( (status = clSetEventCallback( calcReferenceValues, CL_COMPLETE, CalcReferenceValuesComplete, data )))
1426 {
1427 vlog_error( "ERROR: clSetEventCallback failed in MapResultValuesComplete with status: %d\n", status );
1428 gFailCount++; // not thread safe -- being lazy here
1429 }
1430
1431 // this thread no longer needs its reference to info->calcReferenceValues, so release it
1432 if( (status = clReleaseEvent(calcReferenceValues) ))
1433 {
1434 vlog_error( "ERROR: clReleaseEvent(info->calcReferenceValues) failed with status: %d\n", status );
1435 gFailCount++; // not thread safe -- being lazy here
1436 }
1437
1438 // no need to flush since we didn't enqueue anything
1439
1440 // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after we exit.
1441 }
1442
1443
CalcReferenceValuesComplete(cl_event e,cl_int status,void * data)1444 void CL_CALLBACK CalcReferenceValuesComplete( cl_event e, cl_int status, void *data )
1445 {
1446 CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo*) data;
1447 cl_uint vectorSize = info->vectorSize;
1448 cl_uint count = info->parent->count;
1449 Type outType = info->parent->outType; // the data type of the conversion result
1450 Type inType = info->parent->inType; // the data type of the conversion input
1451 size_t j;
1452 cl_int error;
1453 cl_event doneBarrier = info->parent->doneBarrier;
1454
1455 // report spurious error condition
1456 if( CL_SUCCESS != status )
1457 {
1458 vlog_error( "ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n", status );
1459 gFailCount++; // lazy about thread safety here
1460 return;
1461 }
1462
1463 // Now we know that both results have been mapped back from the device, and the
1464 // main thread is done calculating the reference results. It is now time to check
1465 // the results.
1466
1467 // verify results
1468 void *mapped = info->p;
1469
1470 //Patch up NaNs conversions to integer to zero -- these can be converted to any integer
1471 if( outType != kfloat && outType != kdouble )
1472 {
1473 if( inType == kfloat )
1474 {
1475 float *inp = (float*) gIn;
1476 for( j = 0; j < count; j++ )
1477 {
1478 if( isnan( inp[j] ) )
1479 memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
1480 }
1481 }
1482 if( inType == kdouble )
1483 {
1484 double *inp = (double*) gIn;
1485 for( j = 0; j < count; j++ )
1486 {
1487 if( isnan( inp[j] ) )
1488 memset( (char*) mapped + j * gTypeSizes[ outType ], 0, gTypeSizes[ outType ] );
1489 }
1490 }
1491 }
1492 else if( inType == kfloat || inType == kdouble )
1493 { // outtype and intype is float or double. NaN conversions for float <-> double can be any NaN
1494 if( inType == kfloat && outType == kdouble )
1495 {
1496 float *inp = (float*) gIn;
1497 double *outp = (double*) mapped;
1498 for( j = 0; j < count; j++ )
1499 {
1500 if( isnan( inp[j] ) && isnan(outp[j]) )
1501 outp[j] = NAN;
1502 }
1503 }
1504 if( inType == kdouble && outType == kfloat )
1505 {
1506 double *inp = (double*) gIn;
1507 float *outp = (float*) mapped;
1508 for( j = 0; j < count; j++ )
1509 {
1510 if( isnan( inp[j] ) && isnan(outp[j]) )
1511 outp[j] = NAN;
1512 }
1513 }
1514 }
1515
1516 if( memcmp( mapped, gRef, count * gTypeSizes[ outType ] ) )
1517 info->result = gCheckResults[outType]( mapped, gRef, gAllowZ, count, vectorSizes[vectorSize] );
1518 else
1519 info->result = 0;
1520
1521 // Fill the output buffer with junk and release it
1522 {
1523 cl_uint pattern = 0xffffdead;
1524 memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
1525 if((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[ vectorSize ], mapped, 0, NULL, NULL)))
1526 {
1527 vlog_error( "ERROR: clEnqueueUnmapMemObject failed in CalcReferenceValuesComplete (%d)\n", error );
1528 gFailCount++;
1529 }
1530 }
1531
1532 if( 1 == ThreadPool_AtomicAdd( &info->parent->barrierCount, -1) )
1533 {
1534 if( (status = clSetUserEventStatus( doneBarrier, CL_COMPLETE) ))
1535 {
1536 vlog_error( "ERROR: clSetUserEventStatus failed in CalcReferenceValuesComplete (err: %d). We're probably going to deadlock.\n", status );
1537 gFailCount++;
1538 return;
1539 }
1540
1541 if( (status = clReleaseEvent( doneBarrier ) ) )
1542 {
1543 vlog_error( "ERROR: clReleaseEvent failed in CalcReferenceValuesComplete (err: %d).\n", status );
1544 gFailCount++;
1545 return;
1546 }
1547 }
1548
1549
1550 // e was already released by WriteInputBufferComplete. It should be destroyed automatically soon after
1551 // all the calls to CalcReferenceValuesComplete exit.
1552 }
1553
MakeProgram(Type outType,Type inType,SaturationMode sat,RoundingMode round,int vectorSize,cl_kernel * outKernel)1554 static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel )
1555 {
1556 cl_program program;
1557 char testName[256];
1558 int error = 0;
1559 const char **strings;
1560 size_t stringCount = 0;
1561
1562 // Create the program. This is a bit complicated because we are trying to avoid byte and short stores.
1563 if (0 == vectorSize)
1564 {
1565 char inName[32];
1566 char outName[32];
1567 const char *programSource[] =
1568 {
1569 "", // optional pragma
1570 "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
1571 "{\n"
1572 " size_t i = get_global_id(0);\n"
1573 " dest[i] = src[i];\n"
1574 "}\n"
1575 };
1576 stringCount = sizeof(programSource) / sizeof(programSource[0]);
1577 strings = programSource;
1578
1579 if (outType == kdouble || inType == kdouble)
1580 programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
1581
1582 //create the type name
1583 strncpy(inName, gTypeNames[inType], sizeof(inName));
1584 strncpy(outName, gTypeNames[outType], sizeof(outName));
1585 sprintf(testName, "test_implicit_%s_%s", outName, inName);
1586 vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType], gTypeNames[outType]);
1587 fflush(stdout);
1588 }
1589 else
1590 {
1591 int vectorSizetmp = vectorSizes[vectorSize];
1592
1593 char convertString[128];
1594 char inName[32];
1595 char outName[32];
1596 const char *programSource[] =
1597 {
1598 "", // optional pragma
1599 "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
1600 "{\n"
1601 " size_t i = get_global_id(0);\n"
1602 " dest[i] = ", convertString, "( src[i] );\n"
1603 "}\n"
1604 };
1605 const char *programSourceV3[] =
1606 {
1607 "", // optional pragma
1608 "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n"
1609 "{\n"
1610 " size_t i = get_global_id(0);\n"
1611 " if( i + 1 < get_global_size(0))\n"
1612 " vstore3( ", convertString, "( vload3( i, src)), i, dest );\n"
1613 " else\n"
1614 " {\n"
1615 " ", inName, "3 in;\n"
1616 " ", outName, "3 out;\n"
1617 " if( 0 == (i & 1) )\n"
1618 " in.y = src[3*i+1];\n"
1619 " in.x = src[3*i];\n"
1620 " out = ", convertString, "( in ); \n"
1621 " dest[3*i] = out.x;\n"
1622 " if( 0 == (i & 1) )\n"
1623 " dest[3*i+1] = out.y;\n"
1624 " }\n"
1625 "}\n"
1626 };
1627 stringCount = 3 == vectorSizetmp ? sizeof(programSourceV3) / sizeof(programSourceV3[0]) :
1628 sizeof(programSource) / sizeof(programSource[0]);
1629 strings = 3 == vectorSizetmp ? programSourceV3 : programSource;
1630
1631 if (outType == kdouble || inType == kdouble) {
1632 programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
1633 programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
1634 }
1635
1636 //create the type name
1637 switch (vectorSizetmp)
1638 {
1639 case 1:
1640 strncpy(inName, gTypeNames[inType], sizeof(inName));
1641 strncpy(outName, gTypeNames[outType], sizeof(outName));
1642 snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
1643 snprintf(testName, 256, "test_%s_%s", convertString, inName);
1644 vlog("Building %s( %s ) test\n", convertString, inName);
1645 break;
1646 case 3:
1647 strncpy(inName, gTypeNames[inType], sizeof(inName));
1648 strncpy(outName, gTypeNames[outType], sizeof(outName));
1649 snprintf(convertString, sizeof(convertString), "convert_%s3%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
1650 snprintf(testName, 256, "test_%s_%s3", convertString, inName);
1651 vlog("Building %s( %s3 ) test\n", convertString, inName);
1652 break;
1653 default:
1654 snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType], vectorSizetmp);
1655 snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType], vectorSizetmp);
1656 snprintf(convertString, sizeof(convertString), "convert_%s%s%s", outName, gSaturationNames[sat], gRoundingModeNames[round]);
1657 snprintf(testName, 256, "test_%s_%s", convertString, inName);
1658 vlog("Building %s( %s ) test\n", convertString, inName);
1659 break;
1660 }
1661
1662 fflush(stdout);
1663 }
1664 *outKernel = NULL;
1665
1666 const char *flags = NULL;
1667 if( gForceFTZ )
1668 flags = "-cl-denorms-are-zero";
1669
1670 // build it
1671 error = create_single_kernel_helper(gContext, &program, outKernel, (cl_uint)stringCount, strings, testName, flags);
1672 if (error)
1673 {
1674 char buffer[2048] = "";
1675
1676 vlog_error("Failed to build kernel/program.\n", error);
1677 clReleaseProgram(program);
1678 return NULL;
1679 }
1680
1681 return program;
1682 }
1683