// (removed: code-browser navigation chrome left over from page extraction)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "testBase.h"
17 #include "harness/conversions.h"
18 #ifndef _WIN32
19 #include <unistd.h>
20 #endif
21 
22 #define INT_TEST_VALUE 402258822
23 #define LONG_TEST_VALUE 515154531254381446LL
24 
25 
// Skeleton for global-memory atomic test kernels.  Element [0] is the
// prologue (sprintf'd with the destination and oldValues element types);
// the per-test operation core is spliced between the two pieces, and
// element [1] closes the kernel body.
const char *atomic_global_pattern[] = {
    "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
    "{\n"
    "    int  tid = get_global_id(0);\n"
    "\n"
    ,
    "\n"
    "}\n" };
34 
// Skeleton for local-memory atomic test kernels.  Element [0] copies the
// seeded values from finalDest into __local destMemory and barriers before
// the spliced-in test core runs; element [1] barriers again and copies the
// local results back out to finalDest.  Every thread performs the copies
// redundantly, which is harmless since all threads write identical data.
const char *atomic_local_pattern[] = {
    "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
    "{\n"
    "    int  tid = get_global_id(0);\n"
    "     int  dstItemIdx;\n"
    "\n"
    "    // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
    "     for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    "        destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    ,
    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
    "    // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
    "     for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    "        finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
    "}\n" };
52 
53 
// Default number of test threads; parenthesized so the macro expands safely
// inside larger expressions (e.g. `x % TEST_COUNT`).
#define TEST_COUNT (128 * 1024)
55 
56 
// Per-test configuration: start values plus optional callbacks used by
// test_atomic_function() to size and validate results.  For each data width
// a test supplies an ExpectedValue*Fn (compare the final destination against
// a computed total), a VerifyRefs*Fn (inspect the per-thread "old value"
// list), or both; unused slots are NULL.
struct TestFns
{
    cl_int    mIntStartValue;   // initial destination contents for 32-bit runs
    cl_long    mLongStartValue; // initial destination contents for 64-bit runs

    // Optional: how many destination items the kernel produces (1 when NULL)
    size_t    (*NumResultsFn)( size_t threadSize, ExplicitType dataType );

    // Integer versions
    cl_int    (*ExpectedValueIntFn)( size_t size, cl_int *startRefValues, size_t whichDestValue );
    void    (*GenerateRefsIntFn)( size_t size, cl_int *startRefValues, MTdata d );
    bool    (*VerifyRefsIntFn)( size_t size, cl_int *refValues, cl_int finalValue );

    // Long versions
    cl_long    (*ExpectedValueLongFn)( size_t size, cl_long *startRefValues, size_t whichDestValue );
    void    (*GenerateRefsLongFn)( size_t size, cl_long *startRefValues, MTdata d );
    bool    (*VerifyRefsLongFn)( size_t size, cl_long *refValues, cl_long finalValue );

    // Float versions
    cl_float    (*ExpectedValueFloatFn)( size_t size, cl_float *startRefValues, size_t whichDestValue );
    void        (*GenerateRefsFloatFn)( size_t size, cl_float *startRefValues, MTdata d );
    bool        (*VerifyRefsFloatFn)( size_t size, cl_float *refValues, cl_float finalValue );
};
79 
check_atomic_support(cl_device_id device,bool extended,bool isLocal,ExplicitType dataType)80 bool check_atomic_support( cl_device_id device, bool extended, bool isLocal, ExplicitType dataType )
81 {
82     const char *extensionNames[8] = {
83         "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
84         "cl_khr_local_int32_base_atomics",  "cl_khr_local_int32_extended_atomics",
85         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics",
86         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics"       // this line intended to be the same as the last one
87     };
88 
89     size_t index = 0;
90     if( extended )
91         index += 1;
92     if( isLocal )
93         index += 2;
94 
95     Version version = get_device_cl_version(device);
96 
97     switch (dataType)
98     {
99         case kInt:
100         case kUInt:
101             if( version >= Version(1,1) )
102                 return 1;
103             break;
104         case kLong:
105         case kULong:
106             index += 4;
107             break;
108         case kFloat:  // this has to stay separate since the float atomics arent in the 1.0 extensions
109             return version >= Version(1,1);
110         default:
111             log_error( "ERROR:  Unsupported data type (%d) in check_atomic_support\n", dataType );
112             return 0;
113     }
114 
115     return is_extension_available( device, extensionNames[index] );
116 }
117 
test_atomic_function(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,const char * programCore,TestFns testFns,bool extended,bool isLocal,ExplicitType dataType,bool matchGroupSize)118 int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
119                          TestFns testFns,
120                          bool extended, bool isLocal, ExplicitType dataType, bool matchGroupSize )
121 {
122     clProgramWrapper program;
123     clKernelWrapper kernel;
124     int error;
125     size_t threads[1];
126     clMemWrapper streams[2];
127     void *refValues, *startRefValues;
128     size_t threadSize, groupSize;
129     const char *programLines[4];
130     char pragma[ 512 ];
131     char programHeader[ 512 ];
132     MTdata d;
133     size_t typeSize = get_explicit_type_size( dataType );
134 
135 
136     // Verify we can run first
137     bool isUnsigned = ( dataType == kULong ) || ( dataType == kUInt );
138     if( !check_atomic_support( deviceID, extended, isLocal, dataType ) )
139     {
140         // Only print for the signed (unsigned comes right after, and if signed isn't supported, unsigned isn't either)
141         if( dataType == kFloat )
142             log_info( "\t%s float not supported\n", isLocal ? "Local" : "Global" );
143         else if( !isUnsigned )
144             log_info( "\t%s %sint%d not supported\n", isLocal ? "Local" : "Global", isUnsigned ? "u" : "", (int)typeSize * 8 );
145         // Since we don't support the operation, they implicitly pass
146         return 0;
147     }
148     else
149     {
150         if( dataType == kFloat )
151             log_info( "\t%s float%s...", isLocal ? "local" : "global", isLocal ? " " : "" );
152         else
153             log_info( "\t%s %sint%d%s%s...", isLocal ? "local" : "global", isUnsigned ? "u" : "",
154                      (int)typeSize * 8, isUnsigned ? "" : " ", isLocal ? " " : "" );
155     }
156 
157     //// Set up the kernel code
158 
159     // Create the pragma line for this kernel
160     bool isLong = ( dataType == kLong || dataType == kULong );
161     sprintf( pragma, "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
162             isLong ? "" : (isLocal ? "_local" : "_global"), isLong ? "64" : "32",
163             extended ? "extended" : "base" );
164 
165     // Now create the program header
166     const char *typeName = get_explicit_type_name( dataType );
167     if( isLocal )
168         sprintf( programHeader, atomic_local_pattern[ 0 ], typeName, typeName, typeName );
169     else
170         sprintf( programHeader, atomic_global_pattern[ 0 ], typeName, typeName );
171 
172     // Set up our entire program now
173     programLines[ 0 ] = pragma;
174     programLines[ 1 ] = programHeader;
175     programLines[ 2 ] = programCore;
176     programLines[ 3 ] = ( isLocal ) ? atomic_local_pattern[ 1 ] : atomic_global_pattern[ 1 ];
177 
178     if( create_single_kernel_helper( context, &program, &kernel, 4, programLines, "test_atomic_fn" ) )
179     {
180         return -1;
181     }
182 
183     //// Set up to actually run
184     threadSize = num_elements;
185 
186     error = get_max_common_work_group_size( context, kernel, threadSize, &groupSize );
187     test_error( error, "Unable to get thread group max size" );
188 
189     if( matchGroupSize )
190         // HACK because xchg and cmpxchg apparently are limited by hardware
191         threadSize = groupSize;
192 
193     if( isLocal )
194     {
195         size_t maxSizes[3] = {0, 0, 0};
196         error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(size_t), maxSizes, 0);
197         test_error( error, "Unable to obtain max work item sizes for the device" );
198 
199         size_t workSize;
200         error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof( workSize ), &workSize, NULL );
201         test_error( error, "Unable to obtain max work group size for device and kernel combo" );
202 
203         // "workSize" is limited to that of the first dimension as only a 1DRange is executed.
204         if( maxSizes[0] < workSize )
205         {
206             workSize = maxSizes[0];
207         }
208 
209         threadSize = groupSize = workSize;
210     }
211 
212 
213     log_info( "\t(thread count %d, group size %d)\n", (int)threadSize, (int)groupSize );
214 
215     refValues = (cl_int *)malloc( typeSize * threadSize );
216 
217     if( testFns.GenerateRefsIntFn != NULL )
218     {
219         // We have a ref generator provided
220         d = init_genrand( gRandomSeed );
221         startRefValues = malloc( typeSize * threadSize );
222         if( typeSize == 4 )
223             testFns.GenerateRefsIntFn( threadSize, (cl_int *)startRefValues, d );
224         else
225             testFns.GenerateRefsLongFn( threadSize, (cl_long *)startRefValues, d );
226         free_mtdata(d);
227         d = NULL;
228     }
229     else
230         startRefValues = NULL;
231 
232     // If we're given a num_results function, we need to determine how many result objects we need. If
233     // we don't have it, we assume it's just 1
234     size_t numDestItems = ( testFns.NumResultsFn != NULL ) ? testFns.NumResultsFn( threadSize, dataType ) : 1;
235 
236     char * destItems = new char[ typeSize * numDestItems ];
237     if( destItems == NULL )
238     {
239         log_error( "ERROR: Unable to allocate memory!\n" );
240         return -1;
241     }
242     void * startValue = ( typeSize == 4 ) ? (void *)&testFns.mIntStartValue : (void *)&testFns.mLongStartValue;
243     for( size_t i = 0; i < numDestItems; i++ )
244         memcpy( destItems + i * typeSize, startValue, typeSize );
245 
246     streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), typeSize * numDestItems, destItems, NULL);
247     if (!streams[0])
248     {
249         log_error("ERROR: Creating output array failed!\n");
250         return -1;
251     }
252     streams[1] = clCreateBuffer(context, (cl_mem_flags)(( startRefValues != NULL ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE )), typeSize * threadSize, startRefValues, NULL);
253     if (!streams[1])
254     {
255         log_error("ERROR: Creating reference array failed!\n");
256         return -1;
257     }
258 
259     /* Set the arguments */
260     error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
261     test_error( error, "Unable to set indexed kernel arguments" );
262     error = clSetKernelArg( kernel, 1, sizeof( streams[1] ), &streams[1] );
263     test_error( error, "Unable to set indexed kernel arguments" );
264 
265     if( isLocal )
266     {
267         error = clSetKernelArg( kernel, 2, typeSize * numDestItems, NULL );
268         test_error( error, "Unable to set indexed local kernel argument" );
269 
270         cl_int numDestItemsInt = (cl_int)numDestItems;
271         error = clSetKernelArg( kernel, 3, sizeof( cl_int ), &numDestItemsInt );
272         test_error( error, "Unable to set indexed kernel argument" );
273     }
274 
275     /* Run the kernel */
276     threads[0] = threadSize;
277     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, &groupSize, 0, NULL, NULL );
278     test_error( error, "Unable to execute test kernel" );
279 
280     error = clEnqueueReadBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
281     test_error( error, "Unable to read result value!" );
282 
283     error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize * threadSize, refValues, 0, NULL, NULL );
284     test_error( error, "Unable to read reference values!" );
285 
286     // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
287     // have one, it's because we're comparing ref values only
288     if( testFns.ExpectedValueIntFn != NULL )
289     {
290         for( size_t i = 0; i < numDestItems; i++ )
291         {
292             char expected[ 8 ];
293             cl_int intVal;
294             cl_long longVal;
295             if( typeSize == 4 )
296             {
297                 // Int version
298                 intVal = testFns.ExpectedValueIntFn( threadSize, (cl_int *)startRefValues, i );
299                 memcpy( expected, &intVal, sizeof( intVal ) );
300             }
301             else
302             {
303                 // Long version
304                 longVal = testFns.ExpectedValueLongFn( threadSize, (cl_long *)startRefValues, i );
305                 memcpy( expected, &longVal, sizeof( longVal ) );
306             }
307 
308             if( memcmp( expected, destItems + i * typeSize, typeSize ) != 0 )
309             {
310                 if( typeSize == 4 )
311                 {
312                     cl_int *outValue = (cl_int *)( destItems + i * typeSize );
313                     log_error( "ERROR: Result %ld from kernel does not validate! (should be %d, was %d)\n", i, intVal, *outValue );
314                     cl_int *startRefs = (cl_int *)startRefValues;
315                     cl_int *refs = (cl_int *)refValues;
316                     for( i = 0; i < threadSize; i++ )
317                     {
318                         if( startRefs != NULL )
319                             log_info( " --- %ld - %d --- %d\n", i, startRefs[i], refs[i] );
320                         else
321                             log_info( " --- %ld --- %d\n", i, refs[i] );
322                     }
323                 }
324                 else
325                 {
326                     cl_long *outValue = (cl_long *)( destItems + i * typeSize );
327                     log_error( "ERROR: Result %ld from kernel does not validate! (should be %lld, was %lld)\n", i, longVal, *outValue );
328                     cl_long *startRefs = (cl_long *)startRefValues;
329                     cl_long *refs = (cl_long *)refValues;
330                     for( i = 0; i < threadSize; i++ )
331                     {
332                         if( startRefs != NULL )
333                             log_info( " --- %ld - %lld --- %lld\n", i, startRefs[i], refs[i] );
334                         else
335                             log_info( " --- %ld --- %lld\n", i, refs[i] );
336                     }
337                 }
338                 return -1;
339             }
340         }
341     }
342 
343     if( testFns.VerifyRefsIntFn != NULL )
344     {
345         /* Use the verify function to also check the results */
346         if( dataType == kFloat )
347         {
348             cl_float *outValue = (cl_float *)destItems;
349             if( !testFns.VerifyRefsFloatFn( threadSize, (cl_float *)refValues, *outValue ) != 0 )
350             {
351                 log_error( "ERROR: Reference values did not validate!\n" );
352                 return -1;
353             }
354         }
355         else if( typeSize == 4 )
356         {
357             cl_int *outValue = (cl_int *)destItems;
358             if( !testFns.VerifyRefsIntFn( threadSize, (cl_int *)refValues, *outValue ) != 0 )
359             {
360                 log_error( "ERROR: Reference values did not validate!\n" );
361                 return -1;
362             }
363         }
364         else
365         {
366             cl_long *outValue = (cl_long *)destItems;
367             if( !testFns.VerifyRefsLongFn( threadSize, (cl_long *)refValues, *outValue ) != 0 )
368             {
369                 log_error( "ERROR: Reference values did not validate!\n" );
370                 return -1;
371             }
372         }
373     }
374     else if( testFns.ExpectedValueIntFn == NULL )
375     {
376         log_error( "ERROR: Test doesn't check total or refs; no values are verified!\n" );
377         return -1;
378     }
379 
380 
381     /* Re-write the starting value */
382     for( size_t i = 0; i < numDestItems; i++ )
383         memcpy( destItems + i * typeSize, startValue, typeSize );
384     error = clEnqueueWriteBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
385     test_error( error, "Unable to write starting values!" );
386 
387     /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
388     threads[0] = 1;
389     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, threads, 0, NULL, NULL );
390     test_error( error, "Unable to execute test kernel" );
391 
392     error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize, refValues, 0, NULL, NULL );
393     test_error( error, "Unable to read reference values!" );
394 
395     if( memcmp( refValues, destItems, typeSize ) != 0 )
396     {
397         if( typeSize == 4 )
398         {
399             cl_int *s = (cl_int *)destItems;
400             cl_int *r = (cl_int *)refValues;
401             log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
402                       " (should have been %d, returned %d)!\n", *s, *r );
403         }
404         else
405         {
406             cl_long *s = (cl_long *)destItems;
407             cl_long *r = (cl_long *)refValues;
408             log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
409                       " (should have been %lld, returned %lld)!\n", *s, *r );
410         }
411         return -1;
412     }
413 
414     delete [] destItems;
415     free( refValues );
416     if( startRefValues != NULL )
417         free( startRefValues );
418 
419     return 0;
420 }
421 
test_atomic_function_set(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,const char * programCore,TestFns testFns,bool extended,bool matchGroupSize,bool usingAtomicPrefix)422 int test_atomic_function_set(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
423                              TestFns testFns,
424                              bool extended, bool matchGroupSize, bool usingAtomicPrefix )
425 {
426     log_info("    Testing %s functions...\n", usingAtomicPrefix ? "atomic_" : "atom_");
427 
428     int errors = 0;
429     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kInt, matchGroupSize );
430     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kUInt, matchGroupSize );
431     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kInt, matchGroupSize );
432     errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kUInt, matchGroupSize );
433 
434     // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64 bit functions still use the "atom" prefix.
435     // The argument usingAtomicPrefix is set to true if programCore was generated with the "atomic" prefix.
436     if (!usingAtomicPrefix) {
437       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kLong, matchGroupSize );
438       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kULong, matchGroupSize );
439       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kLong, matchGroupSize );
440       errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kULong, matchGroupSize );
441     }
442 
443     return errors;
444 }
445 
446 #pragma mark ---- add
447 
// Kernel cores for the add test: record the first add's "old" value, then
// perform three more adds so each thread contributes (tid + 3) * 4 in total.
const char atom_add_core[] =
"    oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
"    atom_add( &destMemory[0], tid + 3 );\n"
"   atom_add( &destMemory[0], tid + 3 );\n"
"   atom_add( &destMemory[0], tid + 3 );\n";

// Same operation using the OpenCL 1.1 "atomic_" prefix.
const char atomic_add_core[] =
"    oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
"    atomic_add( &destMemory[0], tid + 3 );\n"
"   atomic_add( &destMemory[0], tid + 3 );\n"
"   atomic_add( &destMemory[0], tid + 3 );\n";
459 
test_atomic_add_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)460 cl_int test_atomic_add_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
461 {
462     cl_int total = 0;
463     for( size_t i = 0; i < size; i++ )
464         total += ( (cl_int)i + 3 ) * 4;
465     return total;
466 }
467 
test_atomic_add_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)468 cl_long test_atomic_add_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
469 {
470     cl_long total = 0;
471     for( size_t i = 0; i < size; i++ )
472         total += ( ( i + 3 ) * 4 );
473     return total;
474 }
475 
test_atomic_add(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)476 int test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
477 {
478     TestFns set = { 0, 0LL, NULL, test_atomic_add_result_int, NULL, NULL, test_atomic_add_result_long, NULL, NULL };
479 
480     if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
481         return -1;
482     if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
483       return -1;
484     return 0;
485 }
486 
487 #pragma mark ---- sub
488 
// Kernel cores for the subtract test: each thread subtracts (tid + 3) once
// and records the pre-op value.  Both prefix spellings are provided.
const char atom_sub_core[] = "    oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";

const char atomic_sub_core[] = "    oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
492 
test_atomic_sub_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)493 cl_int test_atomic_sub_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
494 {
495     cl_int total = INT_TEST_VALUE;
496     for( size_t i = 0; i < size; i++ )
497         total -= (cl_int)i + 3;
498     return total;
499 }
500 
test_atomic_sub_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)501 cl_long test_atomic_sub_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
502 {
503     cl_long total = LONG_TEST_VALUE;
504     for( size_t i = 0; i < size; i++ )
505         total -= i + 3;
506     return total;
507 }
508 
test_atomic_sub(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)509 int test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
510 {
511     TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_sub_result_int, NULL, NULL, test_atomic_sub_result_long, NULL, NULL };
512 
513     if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false  ) != 0 )
514         return -1;
515     if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true  ) != 0 )
516         return -1;
517     return 0;
518 }
519 
520 #pragma mark ---- xchg
521 
// Kernel cores for the exchange test: each thread swaps its id into the
// destination and records the previous ("old") value.
const char atom_xchg_core[] = "    oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";

const char atomic_xchg_core[] = "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
// NOTE(review): source text is identical to atomic_xchg_core — the float
// behavior comes from formatting the kernel pattern with the float type,
// not from this string.
const char atomic_xchg_float_core[] = "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
526 
test_atomic_xchg_verify_int(size_t size,cl_int * refValues,cl_int finalValue)527 bool test_atomic_xchg_verify_int( size_t size, cl_int *refValues, cl_int finalValue )
528 {
529     /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
530     char *valids;
531     size_t i;
532     char originalValidCount = 0;
533 
534     valids = (char *)malloc( sizeof( char ) * size );
535     memset( valids, 0, sizeof( char ) * size );
536 
537     for( i = 0; i < size; i++ )
538     {
539         if( refValues[ i ] == INT_TEST_VALUE )
540         {
541             // Special initial value
542             originalValidCount++;
543             continue;
544         }
545         if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
546         {
547             log_error( "ERROR: Reference value %ld outside of valid range! (%d)\n", i, refValues[ i ] );
548             return false;
549         }
550         valids[ refValues[ i ] ] ++;
551     }
552 
553     /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
554      the final value outputted */
555     if( valids[ finalValue ] > 0 )
556     {
557         log_error( "ERROR: Final value %d was also in ref list!\n", finalValue );
558         return false;
559     }
560     else
561         valids[ finalValue ] = 1;    // So the following loop will be okay
562 
563     /* Now check that every entry has one and only one count */
564     if( originalValidCount != 1 )
565     {
566         log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
567         return false;
568     }
569     for( i = 0; i < size; i++ )
570     {
571         if( valids[ i ] != 1 )
572         {
573             log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
574             for( size_t j = 0; j < size; j++ )
575                 log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
576             return false;
577         }
578     }
579 
580     free( valids );
581     return true;
582 }
583 
test_atomic_xchg_verify_long(size_t size,cl_long * refValues,cl_long finalValue)584 bool test_atomic_xchg_verify_long( size_t size, cl_long *refValues, cl_long finalValue )
585 {
586     /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
587     char *valids;
588     size_t i;
589     char originalValidCount = 0;
590 
591     valids = (char *)malloc( sizeof( char ) * size );
592     memset( valids, 0, sizeof( char ) * size );
593 
594     for( i = 0; i < size; i++ )
595     {
596         if( refValues[ i ] == LONG_TEST_VALUE )
597         {
598             // Special initial value
599             originalValidCount++;
600             continue;
601         }
602         if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
603         {
604             log_error( "ERROR: Reference value %ld outside of valid range! (%lld)\n", i, refValues[ i ] );
605             return false;
606         }
607         valids[ refValues[ i ] ] ++;
608     }
609 
610     /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
611      the final value outputted */
612     if( valids[ finalValue ] > 0 )
613     {
614         log_error( "ERROR: Final value %lld was also in ref list!\n", finalValue );
615         return false;
616     }
617     else
618         valids[ finalValue ] = 1;    // So the following loop will be okay
619 
620     /* Now check that every entry has one and only one count */
621     if( originalValidCount != 1 )
622     {
623         log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
624         return false;
625     }
626     for( i = 0; i < size; i++ )
627     {
628         if( valids[ i ] != 1 )
629         {
630             log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
631             for( size_t j = 0; j < size; j++ )
632                 log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
633             return false;
634         }
635     }
636 
637     free( valids );
638     return true;
639 }
640 
test_atomic_xchg_verify_float(size_t size,cl_float * refValues,cl_float finalValue)641 bool test_atomic_xchg_verify_float( size_t size, cl_float *refValues, cl_float finalValue )
642 {
643     /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
644     char *valids;
645     size_t i;
646     char originalValidCount = 0;
647 
648     valids = (char *)malloc( sizeof( char ) * size );
649     memset( valids, 0, sizeof( char ) * size );
650 
651     for( i = 0; i < size; i++ )
652     {
653         cl_int *intRefValue = (cl_int *)( &refValues[ i ] );
654         if( *intRefValue == INT_TEST_VALUE )
655         {
656             // Special initial value
657             originalValidCount++;
658             continue;
659         }
660         if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
661         {
662             log_error( "ERROR: Reference value %ld outside of valid range! (%a)\n", i, refValues[ i ] );
663             return false;
664         }
665         valids[ (int)refValues[ i ] ] ++;
666     }
667 
668     /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
669      the final value outputted */
670     if( valids[ (int)finalValue ] > 0 )
671     {
672         log_error( "ERROR: Final value %a was also in ref list!\n", finalValue );
673         return false;
674     }
675     else
676         valids[ (int)finalValue ] = 1;    // So the following loop will be okay
677 
678     /* Now check that every entry has one and only one count */
679     if( originalValidCount != 1 )
680     {
681         log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
682         return false;
683     }
684     for( i = 0; i < size; i++ )
685     {
686         if( valids[ i ] != 1 )
687         {
688             log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
689             for( size_t j = 0; j < size; j++ )
690                 log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
691             return false;
692         }
693     }
694 
695     free( valids );
696     return true;
697 }
698 
int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Exchange has no single deterministic final value, so this set uses the
    // dedicated verify callbacks (test_atomic_xchg_verify_*) rather than
    // expected-result callbacks.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, NULL, NULL, test_atomic_xchg_verify_int, NULL, NULL, test_atomic_xchg_verify_long, NULL, NULL, test_atomic_xchg_verify_float };

    // Extension (atom_) spelling first, then the core (atomic_) spelling.
    int failures = test_atomic_function_set( deviceID, context, queue, num_elements, atom_xchg_core, set, false, true, /*usingAtomicPrefix*/ false );
    failures |= test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xchg_core, set, false, true, /*usingAtomicPrefix*/ true );

    // Float exchange only exists with the atomic_ prefix; run it with both
    // settings of the second flag, matching the integer coverage above.
    failures |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, false, kFloat, true );
    failures |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, true, kFloat, true );

    return failures;
}
711 
712 
#pragma mark ---- min

// Kernel snippet: every thread atomically takes the min of destMemory[0] and its
// reference value, storing the value returned by the atomic (extension atom_ spelling).
const char atom_min_core[] = "    oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";

// Same operation using the core atomic_ spelling.
const char atomic_min_core[] = "    oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
718 
cl_int test_atomic_min_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Expected atomic-min outcome: the smallest reference value, starting from
    // the 0x7fffffff seed the destination is initialized with.
    cl_int smallest = 0x7fffffffL;
    for( size_t idx = 0; idx < size; ++idx )
    {
        if( smallest > startRefValues[ idx ] )
            smallest = startRefValues[ idx ];
    }
    return smallest;
}
729 
void test_atomic_min_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
    // Bias references into [0x3fffffff, 0x7ffffffd]: positive, and always below
    // the 0x7fffffff start value so the min can only move downward.
    size_t idx = 0;
    while( idx < size )
    {
        startRefValues[ idx ] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
        ++idx;
    }
}
735 
cl_long test_atomic_min_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // 64-bit variant: smallest reference value, seeded with LONG_MAX.
    cl_long smallest = 0x7fffffffffffffffLL;
    for( size_t idx = 0; idx < size; ++idx )
    {
        if( smallest > startRefValues[ idx ] )
            smallest = startRefValues[ idx ];
    }
    return smallest;
}
746 
void test_atomic_min_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
    // Build up-to-47-bit non-negative values: 32 random low bits OR-ed with a
    // 31-bit random value shifted up by 16.
    for( size_t idx = 0; idx < size; ++idx )
    {
        startRefValues[ idx ] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
    }
}
752 
int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Seed with INT_MAX / LONG_MAX so every generated reference can lower it.
    TestFns set = { 0x7fffffffL, 0x7fffffffffffffffLL, NULL, test_atomic_min_result_int, test_atomic_min_gen_int, NULL, test_atomic_min_result_long, test_atomic_min_gen_long, NULL };

    // Run the extension (atom_) spelling, then the core (atomic_) spelling.
    const char *kernelCores[] = { atom_min_core, atomic_min_core };
    for( int v = 0; v < 2; v++ )
    {
        if( test_atomic_function_set( deviceID, context, queue, num_elements, kernelCores[ v ], set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ v == 1 ) != 0 )
            return -1;
    }
    return 0;
}
763 
764 
#pragma mark ---- max

// Kernel snippet: every thread atomically takes the max of destMemory[0] and its
// reference value, storing the value returned by the atomic (extension atom_ spelling).
const char atom_max_core[] = "    oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";

// Same operation using the core atomic_ spelling.
const char atomic_max_core[] = "    oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
770 
cl_int test_atomic_max_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Expected atomic-max outcome: the largest reference value. The start value
    // is 0, and the generator below only produces positive references.
    cl_int largest = 0;
    for( size_t idx = 0; idx < size; ++idx )
    {
        if( largest < startRefValues[ idx ] )
            largest = startRefValues[ idx ];
    }
    return largest;
}
781 
void test_atomic_max_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
    // References in [0x3fffffff, 0x7ffffffd]: strictly positive, so the max can
    // only move upward from the 0 start value.
    size_t idx = 0;
    while( idx < size )
    {
        startRefValues[ idx ] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
        ++idx;
    }
}
787 
cl_long test_atomic_max_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // 64-bit variant: largest reference value, seeded with 0.
    cl_long largest = 0;
    for( size_t idx = 0; idx < size; ++idx )
    {
        if( largest < startRefValues[ idx ] )
            largest = startRefValues[ idx ];
    }
    return largest;
}
798 
void test_atomic_max_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
    // Up-to-47-bit non-negative values: 32 random low bits OR-ed with a 31-bit
    // random value shifted up by 16.
    for( size_t idx = 0; idx < size; ++idx )
    {
        startRefValues[ idx ] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
    }
}
804 
int test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Seed with 0 so every positive generated reference can raise it.
    TestFns set = { 0, 0, NULL, test_atomic_max_result_int, test_atomic_max_gen_int, NULL, test_atomic_max_result_long, test_atomic_max_gen_long, NULL };

    // Run the extension (atom_) spelling, then the core (atomic_) spelling.
    const char *kernelCores[] = { atom_max_core, atomic_max_core };
    for( int v = 0; v < 2; v++ )
    {
        if( test_atomic_function_set( deviceID, context, queue, num_elements, kernelCores[ v ], set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ v == 1 ) != 0 )
            return -1;
    }
    return 0;
}
815 
816 
#pragma mark ---- inc

// Kernel snippet: each thread atomically increments destMemory[0] by one and
// stores the value the atomic returns (extension atom_ spelling).
const char atom_inc_core[] = "    oldValues[tid] = atom_inc( &destMemory[0] );\n";

// Same increment using the core atomic_ spelling.
const char atomic_inc_core[] = "    oldValues[tid] = atomic_inc( &destMemory[0] );\n";
822 
cl_int test_atomic_inc_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Each of the 'size' threads bumps the start value by exactly one.
    return (cl_int)size + INT_TEST_VALUE;
}
827 
cl_long test_atomic_inc_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // 64-bit variant: one increment per thread on top of the start value.
    return (cl_long)size + LONG_TEST_VALUE;
}
832 
int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // No generator needed: inc takes no operand, only the expected totals.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_inc_result_int, NULL, NULL, test_atomic_inc_result_long, NULL, NULL };

    // Run the extension (atom_) spelling, then the core (atomic_) spelling.
    const char *kernelCores[] = { atom_inc_core, atomic_inc_core };
    for( int v = 0; v < 2; v++ )
    {
        if( test_atomic_function_set( deviceID, context, queue, num_elements, kernelCores[ v ], set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ v == 1 ) != 0 )
            return -1;
    }
    return 0;
}
843 
844 
#pragma mark ---- dec

// Kernel snippet: each thread atomically decrements destMemory[0] by one and
// stores the value the atomic returns (extension atom_ spelling).
const char atom_dec_core[] = "    oldValues[tid] = atom_dec( &destMemory[0] );\n";

// Same decrement using the core atomic_ spelling.
const char atomic_dec_core[] = "    oldValues[tid] = atomic_dec( &destMemory[0] );\n";
850 
cl_int test_atomic_dec_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // Each of the 'size' threads lowers the start value by exactly one.
    return INT_TEST_VALUE - (cl_int)size;
}
855 
cl_long test_atomic_dec_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // 64-bit variant: one decrement per thread from the start value.
    return LONG_TEST_VALUE - (cl_long)size;
}
860 
int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // No generator needed: dec takes no operand, only the expected totals.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_dec_result_int, NULL, NULL, test_atomic_dec_result_long, NULL, NULL };

    // Run the extension (atom_) spelling, then the core (atomic_) spelling.
    const char *kernelCores[] = { atom_dec_core, atomic_dec_core };
    for( int v = 0; v < 2; v++ )
    {
        if( test_atomic_function_set( deviceID, context, queue, num_elements, kernelCores[ v ], set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ v == 1 ) != 0 )
            return -1;
    }
    return 0;
}
871 
872 
#pragma mark ---- cmpxchg

/* We test cmpxchg by implementing (the long way) atom_add */
// Each thread retries a compare-exchange until it lands, adding (tid + 2) to
// destMemory[0]; the final successfully-swapped-out value is recorded. The sum
// of all (tid + 2) is deterministic, so the result can be verified exactly.
const char atom_cmpxchg_core[] =
"    int oldValue, origValue, newValue;\n"
"    do { \n"
"        origValue = destMemory[0];\n"
"        newValue = origValue + tid + 2;\n"
"        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
"    } while( oldValue != origValue );\n"
"    oldValues[tid] = oldValue;\n"
;

// 64-bit version of the same cmpxchg-retry loop (extension atom_ spelling).
const char atom_cmpxchg64_core[] =
"    long oldValue, origValue, newValue;\n"
"    do { \n"
"        origValue = destMemory[0];\n"
"        newValue = origValue + tid + 2;\n"
"        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
"    } while( oldValue != origValue );\n"
"    oldValues[tid] = oldValue;\n"
;

// 32-bit cmpxchg-retry loop using the core atomic_ spelling.
const char atomic_cmpxchg_core[] =
"    int oldValue, origValue, newValue;\n"
"    do { \n"
"        origValue = destMemory[0];\n"
"        newValue = origValue + tid + 2;\n"
"        oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
"    } while( oldValue != origValue );\n"
"    oldValues[tid] = oldValue;\n"
;
905 
cl_int test_atomic_cmpxchg_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
    // The cmpxchg kernel emulates an atomic add of (tid + 2) per thread.
    cl_int expected = INT_TEST_VALUE;
    for( size_t tid = 0; tid < size; ++tid )
        expected += (cl_int)tid + 2;
    return expected;
}
913 
cl_long test_atomic_cmpxchg_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
    // 64-bit variant: sum of (tid + 2) over all threads, on top of the start value.
    cl_long expected = LONG_TEST_VALUE;
    for( size_t tid = 0; tid < size; ++tid )
        expected += tid + 2;
    return expected;
}
921 
int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // cmpxchg is verified by building an atomic add out of a compare-exchange
    // retry loop; the expected totals come from the _result callbacks.
    TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_cmpxchg_result_int, NULL, NULL, test_atomic_cmpxchg_result_long, NULL, NULL };

    int errors = 0;

    log_info("    Testing atom_ functions...\n");
    // For each kernel, cover signed and unsigned with the flag off, then on.
    for( int flag = 0; flag < 2; flag++ )
    {
        errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, flag != 0, kInt, true );
        errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, flag != 0, kUInt, true );
    }
    for( int flag = 0; flag < 2; flag++ )
    {
        errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, flag != 0, kLong, true );
        errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, flag != 0, kULong, true );
    }

    log_info("    Testing atomic_ functions...\n");
    for( int flag = 0; flag < 2; flag++ )
    {
        errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, flag != 0, kInt, true );
        errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, flag != 0, kUInt, true );
    }

    return errors ? -1 : 0;
}
950 
951 #pragma mark -------- Bitwise functions
952 
size_t test_bitwise_num_results( size_t threadCount, ExplicitType dataType )
{
    // One destination value per group of (bits-in-type) threads, rounded up,
    // since each thread of a group touches a distinct bit of the shared word.
    const size_t bitsPerValue = 8 * get_explicit_type_size( dataType );
    return ( threadCount + bitsPerValue - 1 ) / bitsPerValue;
}
959 
#pragma mark ---- and

// Kernel snippet: each thread clears its own bit (tid modulo the type width)
// in its group's destination word via an atomic AND with the inverted bit mask
// (extension atom_ spelling).
const char atom_and_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;

// Same single-bit-clear operation using the core atomic_ spelling.
const char atomic_and_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;
977 
978 
cl_int test_atomic_and_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    // Expected value of destination word 'whichResult' after each of the 'size'
    // threads has atomically AND-ed away its own bit (32 threads per 32-bit word).
    size_t numThreads = ( (size_t)size + 31 ) / 32;
    if( whichResult < numThreads - 1 )
        return 0;    // fully covered words have every bit cleared

    // Last item doesn't get and'ed on every bit, so we have to mask away:
    // only the low (size % 32) bits of the final word are cleared.
    size_t numBits = (size_t)size - whichResult * 32;
    cl_uint bits = 0xffffffffU;
    for( size_t i = 0; i < numBits; i++ )
        bits &= ~( 1U << i );    // 1U: left-shifting signed 1 by 31 is undefined behavior

    return (cl_int)bits;
}
993 
cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    // Expected value of destination word 'whichResult' after each of the 'size'
    // threads has atomically AND-ed away its own bit (64 threads per 64-bit word).
    size_t numThreads = ( (size_t)size + 63 ) / 64;
    if( whichResult < numThreads - 1 )
        return 0;    // fully covered words have every bit cleared

    // Last item doesn't get and'ed on every bit, so we have to mask away:
    // only the low (size % 64) bits of the final word are cleared.
    size_t numBits = (size_t)size - whichResult * 64;
    cl_ulong bits = 0xffffffffffffffffULL;
    for( size_t i = 0; i < numBits; i++ )
        bits &= ~( 1ULL << i );    // was ~(1 << i): an int-width shift is wrong/UB for i >= 31
                                   // (compare the correct 1LL << i in test_atomic_or_result_long)

    return (cl_long)bits;
}
1008 
int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Start all-ones so each thread's AND clears exactly one bit; the number of
    // destination words comes from test_bitwise_num_results.
    TestFns set = { 0xffffffff, 0xffffffffffffffffLL, test_bitwise_num_results,
        test_atomic_and_result_int, NULL, NULL, test_atomic_and_result_long, NULL, NULL };

    // Run the extension (atom_) spelling, then the core (atomic_) spelling.
    const char *kernelCores[] = { atom_and_core, atomic_and_core };
    for( int v = 0; v < 2; v++ )
    {
        if( test_atomic_function_set( deviceID, context, queue, num_elements, kernelCores[ v ], set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ v == 1 ) != 0 )
            return -1;
    }
    return 0;
}
1020 
1021 
#pragma mark ---- or

// Kernel snippet: each thread sets its own bit (tid modulo the type width) in
// its group's destination word via an atomic OR (extension atom_ spelling).
const char atom_or_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;

// Same single-bit-set operation using the core atomic_ spelling.
const char atomic_or_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  whichResult = tid / numBits;\n"
"    int  bitIndex = tid - ( whichResult * numBits );\n"
"\n"
"    oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;
1039 
cl_int test_atomic_or_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    // Expected value of destination word 'whichResult' after each of the 'size'
    // threads has atomically OR-ed in its own bit (32 threads per 32-bit word).
    size_t numThreads = ( (size_t)size + 31 ) / 32;
    if( whichResult < numThreads - 1 )
        return (cl_int)0xffffffffU;    // fully covered words have every bit set

    // Only the low (size % 32) bits of the final word get OR-ed in.
    size_t numBits = (size_t)size - whichResult * 32;
    cl_uint bits = 0;
    for( size_t i = 0; i < numBits; i++ )
        bits |= 1U << i;    // 1U: left-shifting signed 1 by 31 is undefined behavior

    return (cl_int)bits;
}
1054 
cl_long test_atomic_or_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    // Expected value of destination word 'whichResult' after each of the 'size'
    // threads has atomically OR-ed in its own bit (64 threads per 64-bit word).
    size_t wordCount = ( (size_t)size + 63 ) / 64;
    if( whichResult < wordCount - 1 )
        return 0x0ffffffffffffffffLL;    // fully covered words end up all ones

    // Only the low (size % 64) bits of the final word get OR-ed in.
    size_t coveredBits = (size_t)size - whichResult * 64;
    cl_long accum = 0;
    for( size_t bit = 0; bit < coveredBits; ++bit )
        accum |= ( 1LL << bit );

    return accum;
}
1069 
int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Start all-zero so each thread's OR sets exactly one bit; the number of
    // destination words comes from test_bitwise_num_results.
    TestFns set = { 0, 0LL, test_bitwise_num_results, test_atomic_or_result_int, NULL, NULL, test_atomic_or_result_long, NULL, NULL };

    // Run the extension (atom_) spelling, then the core (atomic_) spelling.
    const char *kernelCores[] = { atom_or_core, atomic_or_core };
    for( int v = 0; v < 2; v++ )
    {
        if( test_atomic_function_set( deviceID, context, queue, num_elements, kernelCores[ v ], set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ v == 1 ) != 0 )
            return -1;
    }
    return 0;
}
1080 
1081 
#pragma mark ---- xor

// Kernel snippet: every thread toggles bit (tid mod type-width) of the single
// shared destination word via an atomic XOR (extension atom_ spelling).
const char atom_xor_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  bitIndex = tid & ( numBits - 1 );\n"
"\n"
"    oldValues[tid] = atom_xor( &destMemory[0], 1 << bitIndex );\n"
;

// Same single-bit-toggle operation using the core atomic_ spelling.
const char atomic_xor_core[] =
"    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
"    int  bitIndex = tid & ( numBits - 1 );\n"
"\n"
"    oldValues[tid] = atomic_xor( &destMemory[0], 1 << bitIndex );\n"
;
1097 
cl_int test_atomic_xor_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
    // Fold one single-bit toggle per thread (bit = tid mod 32) into the start value.
    cl_uint total = 0x2f08ab41U;
    for( size_t i = 0; i < size; i++ )
        total ^= 1U << ( i & 31 );    // 1U: left-shifting signed 1 by 31 is undefined behavior
    return (cl_int)total;
}
1105 
cl_long test_atomic_xor_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
    // Fold one single-bit toggle per thread (bit = tid mod 64) into the start value.
    cl_long expected = 0x2f08ab418ba0541LL;
    for( size_t tid = 0; tid < size; ++tid )
        expected ^= 1LL << ( tid & 63 );
    return expected;
}
1113 
int test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
    // Arbitrary nonzero seeds; the expected results replay the same toggles.
    TestFns set = { 0x2f08ab41, 0x2f08ab418ba0541LL, NULL, test_atomic_xor_result_int, NULL, NULL, test_atomic_xor_result_long, NULL, NULL };

    // Run the extension (atom_) spelling, then the core (atomic_) spelling.
    const char *kernelCores[] = { atom_xor_core, atomic_xor_core };
    for( int v = 0; v < 2; v++ )
    {
        if( test_atomic_function_set( deviceID, context, queue, num_elements, kernelCores[ v ], set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ v == 1 ) != 0 )
            return -1;
    }
    return 0;
}
1124 
1125 
1126 
1127 
1128