//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef _COMMON_H_
#define _COMMON_H_

#include "harness/testHarness.h"
#include "harness/typeWrappers.h"
#include "harness/ThreadPool.h"

#include "host_atomics.h"

#include <vector>
#include <sstream>

#define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
#define MAX_HOST_THREADS GetThreadCount()

#define EXECUTE_TEST(error, test)\
    error |= test;\
    if(error && !gContinueOnError)\
        return error;

enum TExplicitAtomicType
{
    TYPE_ATOMIC_INT,
    TYPE_ATOMIC_UINT,
    TYPE_ATOMIC_LONG,
    TYPE_ATOMIC_ULONG,
    TYPE_ATOMIC_FLOAT,
    TYPE_ATOMIC_DOUBLE,
    TYPE_ATOMIC_INTPTR_T,
    TYPE_ATOMIC_UINTPTR_T,
    TYPE_ATOMIC_SIZE_T,
    TYPE_ATOMIC_PTRDIFF_T,
    TYPE_ATOMIC_FLAG
};

enum TExplicitMemoryScopeType
{
    MEMORY_SCOPE_EMPTY,
    MEMORY_SCOPE_WORK_GROUP,
    MEMORY_SCOPE_DEVICE,
    MEMORY_SCOPE_ALL_DEVICES, // Alias for MEMORY_SCOPE_ALL_SVM_DEVICES
    MEMORY_SCOPE_ALL_SVM_DEVICES
};

extern bool gHost; // temporary flag for testing native host threads (test verification)
extern bool gOldAPI; // temporary flag for testing with the old API (OpenCL 1.2)
extern bool gContinueOnError; // execute all cases even when errors are detected
extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
extern bool gNoGenericAddressSpace; // disable cases with the generic address space
extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
extern bool gDebug; // print OpenCL kernel code
extern int gInternalIterations; // internal test iterations per atomic operation, sufficient to verify atomicity
extern int gMaxDeviceThreads; // maximum number of threads executed on the OCL device
extern cl_device_atomic_capabilities gAtomicMemCap,
    gAtomicFenceCap; // atomic memory and fence capabilities for this device

extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);

extern cl_int getSupportedMemoryOrdersAndScopes(
    cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
    std::vector<TExplicitMemoryScopeType> &memoryScopes);

class AtomicTypeInfo
{
public:
    TExplicitAtomicType _type;
    AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
    cl_uint Size(cl_device_id device);
    const char* AtomicTypeName();
    const char* RegularTypeName();
    const char* AddSubOperandTypeName();
    int IsSupported(cl_device_id device);
};

template<typename HostDataType>
class AtomicTypeExtendedInfo : public AtomicTypeInfo
{
public:
    AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
    HostDataType MinValue();
    HostDataType MaxValue();
    HostDataType SpecialValue(cl_uchar x)
    {
        HostDataType tmp;
        cl_uchar *ptr = (cl_uchar*)&tmp;
        for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
            ptr[i] = x;
        return tmp;
    }
    HostDataType SpecialValue(cl_ushort x)
    {
        HostDataType tmp;
        cl_ushort *ptr = (cl_ushort*)&tmp;
        for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
            ptr[i] = x;
        return tmp;
    }
};
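
// Illustrative note: SpecialValue replicates the given pattern across every
// byte (or 16-bit word) of the host data type, e.g. for HostDataType == cl_uint,
// SpecialValue((cl_uchar)0xAA) yields 0xAAAAAAAA and
// SpecialValue((cl_ushort)0x1234) yields 0x12341234, giving tests a
// recognizable bit pattern regardless of the type's width.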

class CTest {
public:
    virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
};

template<typename HostAtomicType, typename HostDataType>
class CBasicTest : CTest
{
public:
    typedef struct {
        CBasicTest *test;
        cl_uint tid;
        cl_uint threadCount;
        volatile HostAtomicType *destMemory;
        HostDataType *oldValues;
    } THostThreadContext;
    static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
    {
        THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
        threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
        return 0;
    }
    CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
        _maxDeviceThreads(MAX_DEVICE_THREADS),
        _dataType(dataType), _useSVM(useSVM), _startValue(255),
        _localMemory(false), _declaredInProgram(false),
        _usedInFunction(false), _genericAddrSpace(false),
        _oldValueCheck(true), _localRefValues(false),
        _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
    {
    }
    virtual ~CBasicTest()
    {
        if(_passCount)
            log_info(" %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
    }
    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
    {
        return 1;
    }
    virtual cl_uint NumNonAtomicVariablesPerThread()
    {
        return 1;
    }
    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
    {
        return false;
    }
    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
    {
        return false;
    }
    virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
    {
        return false;
    }
    virtual std::string PragmaHeader(cl_device_id deviceID);
    virtual std::string ProgramHeader(cl_uint maxNumDestItems);
    virtual std::string FunctionCode();
    virtual std::string KernelCode(cl_uint maxNumDestItems);
    virtual std::string ProgramCore() = 0;
    virtual std::string SingleTestName()
    {
        std::string testName = LocalMemory() ? "local" : "global";
        testName += " ";
        testName += DataType().AtomicTypeName();
        if(DeclaredInProgram())
        {
            testName += " declared in program";
        }
        if(DeclaredInProgram() && UsedInFunction())
            testName += ",";
        if(UsedInFunction())
        {
            testName += " used in ";
            if(GenericAddrSpace())
                testName += "generic ";
            testName += "function";
        }
        return testName;
    }
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
    int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        int error = 0;
        UsedInFunction(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        UsedInFunction(true);
        GenericAddrSpace(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(true);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(false);
        return error;
    }
    int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        int error = 0;
        DeclaredInProgram(false);
        EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
        if(!UseSVM())
        {
            DeclaredInProgram(true);
            EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
        }
        return error;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        int error = 0;
        if(_maxDeviceThreads > 0 && !UseSVM())
        {
            LocalMemory(true);
            EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        if(_maxDeviceThreads+MaxHostThreads() > 0)
        {
            LocalMemory(false);
            EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        return error;
    }
    virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
    {
        if(sizeof(HostAtomicType) != DataType().Size(deviceID))
        {
            log_info("Invalid test: Host atomic type size (%u) is different than the OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
            return -1;
        }
        if(sizeof(HostAtomicType) != sizeof(HostDataType))
        {
            log_info("Invalid test: Host atomic type size (%u) is different than the corresponding host data type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
            return -1;
        }
        // Verify we can run first
        if(UseSVM() && !gUseHostPtr)
        {
            cl_device_svm_capabilities caps;
            cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
            test_error(error, "clGetDeviceInfo failed");
            if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
            {
                log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
                // implicit pass
                return 0;
            }
        }
        if(!DataType().IsSupported(deviceID))
        {
            log_info("\t%s not supported\n", DataType().AtomicTypeName());
            // implicit pass or host test (debug feature)
            if(UseSVM())
                return 0;
            _maxDeviceThreads = 0;
        }
        if(_maxDeviceThreads+MaxHostThreads() == 0)
            return 0;
        return ExecuteForEachParameterSet(deviceID, context, queue);
    }
    virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
    {
        log_info("Empty thread function %u\n", (cl_uint)tid);
    }
    AtomicTypeExtendedInfo<HostDataType> DataType() const
    {
        return AtomicTypeExtendedInfo<HostDataType>(_dataType);
    }
    cl_uint _maxDeviceThreads;
    virtual cl_uint MaxHostThreads()
    {
        if(UseSVM() || gHost)
            return MAX_HOST_THREADS;
        else
            return 0;
    }

    int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
                          TExplicitMemoryOrderType memoryOrder)
    {
        /*
            Differentiation between atomic fence and other atomic operations
            does not need to occur here.

            The initialisation of this test checks that the minimum required
            capabilities are supported by this device.

            The following switches allow the test to skip if optional
            capabilities are not supported by the device.
        */
        switch (memoryScope)
        {
            case MEMORY_SCOPE_EMPTY: {
                break;
            }
            case MEMORY_SCOPE_WORK_GROUP: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_DEVICE: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_ALL_DEVICES: // fallthrough
            case MEMORY_SCOPE_ALL_SVM_DEVICES: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory scope\n");
                break;
            }
        }

        switch (memoryOrder)
        {
            case MEMORY_ORDER_EMPTY: {
                break;
            }
            case MEMORY_ORDER_RELAXED: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_ACQUIRE:
            case MEMORY_ORDER_RELEASE:
            case MEMORY_ORDER_ACQ_REL: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_SEQ_CST: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory order\n");
                break;
            }
        }

        return 0;
    }
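
    // Illustrative note: e.g. CheckCapabilities(MEMORY_SCOPE_DEVICE,
    // MEMORY_ORDER_SEQ_CST) returns TEST_SKIPPED_ITSELF unless gAtomicMemCap
    // reports both CL_DEVICE_ATOMIC_SCOPE_DEVICE and
    // CL_DEVICE_ATOMIC_ORDER_SEQ_CST; callers treat that value as "skip this
    // order/scope combination", not as a failure.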
    virtual bool SVMDataBufferAllSVMConsistent() {return false;}
    bool UseSVM() {return _useSVM;}
    void StartValue(HostDataType startValue) {_startValue = startValue;}
    HostDataType StartValue() {return _startValue;}
    void LocalMemory(bool local) {_localMemory = local;}
    bool LocalMemory() {return _localMemory;}
    void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
    bool DeclaredInProgram() {return _declaredInProgram;}
    void UsedInFunction(bool usedInFunction) {_usedInFunction = usedInFunction;}
    bool UsedInFunction() {return _usedInFunction;}
    void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
    bool GenericAddrSpace() {return _genericAddrSpace;}
    void OldValueCheck(bool check) {_oldValueCheck = check;}
    bool OldValueCheck() {return _oldValueCheck;}
    void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
    bool LocalRefValues() {return _localRefValues;}
    void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
    cl_uint MaxGroupSize() {return _maxGroupSize;}
    void CurrentGroupSize(cl_uint currentGroupSize)
    {
        if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
            _currentGroupSize = MaxGroupSize();
        else
            _currentGroupSize = currentGroupSize;
    }
    cl_uint CurrentGroupSize() {return _currentGroupSize;}
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if(threadCount == 0)
            return 0;
        if(LocalMemory())
            return 1;
        return threadCount/CurrentGroupSize();
    }
    cl_int Iterations() {return _iterations;}
    std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
private:
    const TExplicitAtomicType _dataType;
    const bool _useSVM;
    HostDataType _startValue;
    bool _localMemory;
    bool _declaredInProgram;
    bool _usedInFunction;
    bool _genericAddrSpace;
    bool _oldValueCheck;
    bool _localRefValues;
    cl_uint _maxGroupSize;
    cl_uint _currentGroupSize;
    cl_uint _passCount;
    const cl_int _iterations;
};
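
// A minimal sketch (illustration only, not part of the suite) of how a
// concrete test is typically built on top of CBasicTest: derive from it (or
// from one of the memory order/scope variants below), emit the OpenCL body in
// ProgramCore() and describe the expected result in ExpectedValue(). The class
// name and the kernel line are hypothetical.
//
//   template<typename HostAtomicType, typename HostDataType>
//   class CExampleFetchAdd : public CBasicTest<HostAtomicType, HostDataType>
//   {
//   public:
//       CExampleFetchAdd(TExplicitAtomicType dataType, bool useSVM)
//           : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM) {}
//       virtual std::string ProgramCore()
//       {
//           // each work-item adds its id to destMemory[0] and records the old value
//           return "  oldValues[tid] = atomic_fetch_add(&destMemory[0], tid);\n";
//       }
//       virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
//                                  HostDataType *startRefValues, cl_uint whichDestValue)
//       {
//           expected = this->StartValue();
//           for(cl_uint i = 0; i < threadCount; i++)
//               expected += i;
//           return true;
//       }
//   };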

template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
{
public:
    using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
    CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
    {
    }
    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
    {
        std::string header;
        if(gOldAPI)
        {
            std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
            header +=
                "#define atomic_store_explicit(x,y,o"+s+") atomic_store(x,y)\n"
                "#define atomic_load_explicit(x,o"+s+") atomic_load(x)\n"
                "#define atomic_exchange_explicit(x,y,o"+s+") atomic_exchange(x,y)\n"
                "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
                "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_weak(x,y,z)\n"
                "#define atomic_fetch_add_explicit(x,y,o"+s+") atomic_fetch_add(x,y)\n"
                "#define atomic_fetch_sub_explicit(x,y,o"+s+") atomic_fetch_sub(x,y)\n"
                "#define atomic_fetch_or_explicit(x,y,o"+s+") atomic_fetch_or(x,y)\n"
                "#define atomic_fetch_xor_explicit(x,y,o"+s+") atomic_fetch_xor(x,y)\n"
                "#define atomic_fetch_and_explicit(x,y,o"+s+") atomic_fetch_and(x,y)\n"
                "#define atomic_fetch_min_explicit(x,y,o"+s+") atomic_fetch_min(x,y)\n"
                "#define atomic_fetch_max_explicit(x,y,o"+s+") atomic_fetch_max(x,y)\n"
                "#define atomic_flag_test_and_set_explicit(x,o"+s+") atomic_flag_test_and_set(x)\n"
                "#define atomic_flag_clear_explicit(x,o"+s+") atomic_flag_clear(x)\n";
        }
        return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
    }
    virtual std::string SingleTestName()
    {
        std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if(MemoryOrder() != MEMORY_ORDER_EMPTY)
        {
            testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
        }
        if(MemoryScope() != MEMORY_SCOPE_EMPTY)
        {
            testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
        }
        return testName;
    }
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        if(LocalMemory() &&
            MemoryScope() != MEMORY_SCOPE_EMPTY &&
            MemoryScope() != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used for global memory
            return 0;
        if(MemoryScope() == MEMORY_SCOPE_DEVICE)
            MaxGroupSize(16); // increase the number of groups by forcing a smaller group size
        else
            MaxGroupSize(0); // group size limited by device capabilities

        if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
            return 0; // skip test - not applicable

        return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        // repeat the test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL 3.0 and later some orderings and scopes are optional, so
        // query for the supported ones.
        test_error_ret(
            getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
            "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for(unsigned si = 0; si < memoryScope.size(); si++)
            {
                if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                    continue;
                MemoryOrder(memoryOrder[oi]);
                MemoryScope(memoryScope[si]);
                EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
            }
        }
        return error;
    }
    void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
    TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
    std::string MemoryOrderStr()
    {
        if(MemoryOrder() != MEMORY_ORDER_EMPTY)
            return std::string(", ")+get_memory_order_type_name(MemoryOrder());
        return "";
    }
    void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
    TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
    std::string MemoryScopeStr()
    {
        if(MemoryScope() != MEMORY_SCOPE_EMPTY)
            return std::string(", ")+get_memory_scope_type_name(MemoryScope());
        return "";
    }
    std::string MemoryOrderScopeStr()
    {
        return MemoryOrderStr()+MemoryScopeStr();
    }
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
            return 1;
        return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
    }
    virtual cl_uint MaxHostThreads()
    {
        // Do not run host threads unless the memory scope is
        // memory_scope_all_svm_devices (or host threads are forced with gHost).
        if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
            || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
        {
            return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
        }
        else
        {
            return 0;
        }
    }
private:
    TExplicitMemoryOrderType _memoryOrder;
    TExplicitMemoryScopeType _memoryScope;
};
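
// Illustrative note: MemoryOrderStr()/MemoryScopeStr() produce the extra
// arguments appended to the *_explicit built-ins in the generated kernels;
// e.g. with MemoryOrder() == MEMORY_ORDER_SEQ_CST and MemoryScope() ==
// MEMORY_SCOPE_DEVICE, MemoryOrderScopeStr() yields
// ", memory_order_seq_cst, memory_scope_device" (assuming the name helpers
// return the standard OpenCL C spellings), while SingleTestName() strips the
// leading "memory_" prefix when building the human-readable test name.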

template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
{
public:
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;

    CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
    {
    }
    virtual std::string SingleTestName()
    {
        std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if(MemoryOrder() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
        if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
        if(MemoryScope() != MEMORY_SCOPE_EMPTY)
            testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
        return testName;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        // repeat the test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL 3.0 and later some orderings and scopes are optional, so
        // query for the supported ones.
        test_error_ret(
            getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
            "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
            {
                for(unsigned si = 0; si < memoryScope.size(); si++)
                {
                    if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryOrder[oi] != memoryOrder[o2i])
                        continue; // both memory order arguments must be set (or neither)
                    if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                        continue; // a memory scope without a memory order is not allowed
                    MemoryOrder(memoryOrder[oi]);
                    MemoryOrder2(memoryOrder[o2i]);
                    MemoryScope(memoryScope[si]);

                    if (CheckCapabilities(MemoryScope(), MemoryOrder())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    if (CheckCapabilities(MemoryScope(), MemoryOrder2())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
                }
            }
        }
        return error;
    }
    void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
    TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
    std::string MemoryOrderFailStr()
    {
        if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
            return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
        return "";
    }
    std::string MemoryOrderScope()
    {
        return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
    }
private:
    TExplicitMemoryOrderType _memoryOrder2;
};

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
{
    std::string pragma;

    if(gOldAPI)
    {
        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
    }
    // Create the pragma lines for this kernel
    if(DataType().Size(deviceID) == 8)
    {
        pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
    }
    if(_dataType == TYPE_ATOMIC_DOUBLE)
        pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
    return pragma;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
{
    // Create the program header
    std::string header;
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string argListForKernel;
    std::string argListForFunction;
    std::string argListNoTypes;
    std::string functionPrototype;
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";

    if(gOldAPI)
    {
        header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
            "#define atomic_store(x,y) (*(x) = y)\n"
            "#define atomic_load(x) (*(x))\n"
            "#define ATOMIC_VAR_INIT(x) (x)\n"
            "#define ATOMIC_FLAG_INIT 0\n"
            "#define atomic_init(x,y) atomic_store(x,y)\n";
        if(aTypeName == "atomic_float")
            header += "#define atomic_exchange(x,y) atomic_xchg(x,y)\n";
        else if(aTypeName == "atomic_double")
            header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
                "{\n"
                " long tmp = *(long*)&y, res;\n"
                " volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long*)x;\n"
                " res = atom_xchg(tmpA,tmp);\n"
                " return *(double*)&res;\n"
                "}\n";
        else
            header += "#define atomic_exchange(x,y) atom_xchg(x,y)\n";
        if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
            header +=
                "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
                "{\n"
                " "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
                " if(old == *expected)\n"
                "  return true;\n"
                " *expected = old;\n"
                " return false;\n"
                "}\n"
                "#define atomic_compare_exchange_weak atomic_compare_exchange_strong\n";
        header +=
            "#define atomic_fetch_add(x,y) atom_add(x,y)\n"
            "#define atomic_fetch_sub(x,y) atom_sub(x,y)\n"
            "#define atomic_fetch_or(x,y) atom_or(x,y)\n"
            "#define atomic_fetch_xor(x,y) atom_xor(x,y)\n"
            "#define atomic_fetch_and(x,y) atom_and(x,y)\n"
            "#define atomic_fetch_min(x,y) atom_min(x,y)\n"
            "#define atomic_fetch_max(x,y) atom_max(x,y)\n"
            "#define atomic_flag_test_and_set(x) atomic_exchange(x,1)\n"
            "#define atomic_flag_clear(x) atomic_store(x,0)\n"
            "\n";
    }
    if(!LocalMemory() && DeclaredInProgram())
    {
        // additional atomic variable for results copying (the last thread does this)
        header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
        // atomic variables declared in program scope - test data
        std::stringstream ss;
        ss << maxNumDestItems;
        header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
        ss.str("");
        ss << _startValue;
        for(cl_uint i = 0; i < maxNumDestItems; i++)
        {
            if(aTypeName == "atomic_flag")
                header += "  ATOMIC_FLAG_INIT";
            else
                header += "  ATOMIC_VAR_INIT("+ss.str()+")";
            if(i+1 < maxNumDestItems)
                header += ",";
            header += "\n";
        }
        header +=
            "};\n"
            "\n";
    }
    return header;
}
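
// Illustrative note: for a global-memory test with DeclaredInProgram() set,
// ProgramHeader() emits program-scope test data; e.g. for atomic_int,
// maxNumDestItems == 2 and _startValue == 255 the generated fragment is,
// schematically:
//
//   __global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);
//   __global volatile atomic_int destMemory[2] = {
//     ATOMIC_VAR_INIT(255),
//     ATOMIC_VAR_INIT(255)
//   };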

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
{
    if(!UsedInFunction())
        return "";
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
    if(!GenericAddrSpace())
        code += addressSpace;
    code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
        " *oldValues";
    if(LocalRefValues())
        code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
    code += ")\n"
        "{\n";
    code += ProgramCore();
    code += "}\n"
        "\n";
    return code;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
{
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";

    // prepare the list of arguments for the kernel
    if(LocalMemory())
    {
        code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
            " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
    }
    else
    {
        code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
            ", __global "+cTypeName+" *oldValues";
    }
    if(LocalRefValues())
        code += std::string(", __local ")+cTypeName+" *localValues";
    code += ")\n"
        "{\n";
    if(LocalMemory() && DeclaredInProgram())
    {
        // local atomics declared in kernel scope
        std::stringstream ss;
        ss << maxNumDestItems;
        code += std::string("  __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
    }
    code += "  uint tid = get_global_id(0);\n"
        "\n";
    if(LocalMemory())
    {
        // memory_order_relaxed is sufficient for these initialization operations
        // as the barrier below will act as a fence, providing an order to the
        // operations. memory_scope_work_group is sufficient as local memory is
        // only visible within the work-group.
        code += R"(
  // initialize atomics not reachable from host (first thread
  // is doing this, other threads are waiting on barrier)
  if(get_local_id(0) == 0)
    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
    {)";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
      if(finalDest[dstItemIdx])
        atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);
      else
        atomic_flag_clear_explicit(destMemory+dstItemIdx,
                                   memory_order_relaxed,
                                   memory_scope_work_group);)";
        }
        else
        {
            code += R"(
      atomic_store_explicit(destMemory+dstItemIdx,
                            finalDest[dstItemIdx],
                            memory_order_relaxed,
                            memory_scope_work_group);)";
        }
        code +=
            "    }\n"
            "  barrier(CLK_LOCAL_MEM_FENCE);\n"
            "\n";
    }
    if (LocalRefValues())
    {
        code +=
            "  // Copy input reference values into local memory\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code +=
                "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
                "    localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
        }
        code +=
            "  barrier(CLK_LOCAL_MEM_FENCE);\n"
            "\n";
    }
    if (UsedInFunction())
        code += std::string("  test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
            (LocalRefValues() ? ", localValues" : "")+");\n";
    else
        code += ProgramCore();
    code += "\n";
    if (LocalRefValues())
    {
        code +=
            "  // Copy local reference values into output array\n"
            "  barrier(CLK_LOCAL_MEM_FENCE);\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code +=
                "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
                "    oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
        }
        code += "\n";
    }
    if(LocalMemory() || DeclaredInProgram())
    {
        code += "  // Copy final values to host reachable buffer\n";
        if(LocalMemory())
            code +=
                "  barrier(CLK_LOCAL_MEM_FENCE);\n"
                "  if(get_local_id(0) == 0) // first thread in workgroup\n";
        else
            // global atomics declared in program scope
            code += R"(
  if(atomic_fetch_add_explicit(&finishedThreads, 1u,
                               memory_order_relaxed,
                               memory_scope_work_group)
     == get_global_size(0)-1) // last finished thread
)";
        code +=
            "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
        if(aTypeName == "atomic_flag")
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);)";
        }
        else
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_load_explicit(destMemory+dstItemIdx,
                             memory_order_relaxed,
                             memory_scope_work_group);)";
        }
    }
    code += "}\n"
        "\n";
    return code;
}
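
// Illustrative note: the overall shape of a generated kernel (local memory
// with LocalRefValues(), TYPE/atomic_TYPE standing for the regular and atomic
// type names) is, schematically:
//
//   __kernel void test_atomic_kernel(uint threadCount, uint numDestItems,
//                                    __global TYPE *finalDest, __global TYPE *oldValues,
//                                    volatile __local atomic_TYPE *destMemory,
//                                    __local TYPE *localValues)
//   {
//     uint tid = get_global_id(0);
//     /* first work-item initializes destMemory, then barrier */
//     /* copy oldValues -> localValues, then barrier */
//     /* ProgramCore() or test_atomic_function(...) performs the atomics */
//     /* copy localValues back to oldValues */
//     /* first work-item copies destMemory -> finalDest */
//   }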

template <typename HostAtomicType, typename HostDataType>
int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
    int error;
    clProgramWrapper program;
    clKernelWrapper kernel;
    size_t threadNum[1];
    clMemWrapper streams[2];
    std::vector<HostAtomicType> destItems;
    HostAtomicType *svmAtomicBuffer = 0;
    std::vector<HostDataType> refValues, startRefValues;
    HostDataType *svmDataBuffer = 0;
    cl_uint deviceThreadCount, hostThreadCount, threadCount;
    size_t groupSize = 0;
    std::string programSource;
    const char *programLine;
    MTdata d;
    size_t typeSize = DataType().Size(deviceID);

    deviceThreadCount = _maxDeviceThreads;
    hostThreadCount = MaxHostThreads();
    threadCount = deviceThreadCount+hostThreadCount;

    //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
    log_info("\t%s...\n", SingleTestName().c_str());

    if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }
    if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }

    // set up work sizes based on device capabilities and test configuration
    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
    test_error(error, "Unable to obtain max work group size for device");
    CurrentGroupSize((cl_uint)groupSize);
    if(CurrentGroupSize() > deviceThreadCount)
        CurrentGroupSize(deviceThreadCount);
    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
        deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
    threadCount = deviceThreadCount+hostThreadCount;

    // If we're given a NumResults function, we need to determine how many result objects we need.
    // This is the first estimate of the maximum number of threads (the exact thread count is not known here)
    // - needed for program source code generation (arrays of atomics declared in the program)
    cl_uint numDestItems = NumResults(threadCount, deviceID);

    if(deviceThreadCount > 0)
    {
        // This loop iteratively halves the work-group size and regenerates the
        // kernel with the reduced size until we find a size that is admissible
        // for the kernel being run, or until we reach the trivial case of 1
        // (which was separately verified to be accurate for the kernel being
        // run).
        while ((CurrentGroupSize() > 1))
        {
            // Re-generate the kernel code with the current group size
            if (kernel) clReleaseKernel(kernel);
            if (program) clReleaseProgram(program);
            programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
                + FunctionCode() + KernelCode(numDestItems);
            programLine = programSource.c_str();
            if (create_single_kernel_helper_with_build_options(
                    context, &program, &kernel, 1, &programLine,
                    "test_atomic_kernel", gOldAPI ? "" : nullptr))
            {
                return -1;
            }
            // Get the work group size for the new kernel
            error = clGetKernelWorkGroupInfo(kernel, deviceID,
                                             CL_KERNEL_WORK_GROUP_SIZE,
                                             sizeof(groupSize), &groupSize, NULL);
            test_error(error,
                       "Unable to obtain max work group size for device and "
                       "kernel combo");

            if (LocalMemory())
            {
                cl_ulong usedLocalMemory;
                cl_ulong totalLocalMemory;
                cl_uint maxWorkGroupSize;

                error = clGetKernelWorkGroupInfo(
                    kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
                    sizeof(usedLocalMemory), &usedLocalMemory, NULL);
                test_error(error, "clGetKernelWorkGroupInfo failed");

                error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
                                        sizeof(totalLocalMemory),
                                        &totalLocalMemory, NULL);
                test_error(error, "clGetDeviceInfo failed");

                // We know each work-group will use typeSize * deviceThreadCount
                // bytes of local memory, so pick the largest work-group size
                // that fits in the local memory not already used by the kernel.
                maxWorkGroupSize =
                    ((totalLocalMemory - usedLocalMemory) / typeSize);

                if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
            }
            if (CurrentGroupSize() <= groupSize)
                break;
            else
                CurrentGroupSize(CurrentGroupSize() / 2);
        }
        if(CurrentGroupSize() > deviceThreadCount)
            CurrentGroupSize(deviceThreadCount);
        if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
            deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
        threadCount = deviceThreadCount+hostThreadCount;
    }
    if (gDebug)
    {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
    }
    if(deviceThreadCount > 0)
        log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
    if(hostThreadCount > 0)
        log_info("\t\t(host threads %u)\n", hostThreadCount);

    refValues.resize(threadCount*NumNonAtomicVariablesPerThread());

    // Generate reference data if we have a ref generator provided
    d = init_genrand(gRandomSeed);
    startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
    if(GenerateRefs(threadCount, &startRefValues[0], d))
    {
        // copy ref values for host threads
        memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
    }
    else
    {
        startRefValues.resize(0);
    }
    free_mtdata(d);
    d = NULL;

    // If we're given a NumResults function, we need to determine how many result objects we need.
    // If we don't have one, we assume it's just 1.
    // This is the final value (the exact thread count is known at this point).
    numDestItems = NumResults(threadCount, deviceID);

    destItems.resize(numDestItems);
    for(cl_uint i = 0; i < numDestItems; i++)
        destItems[i] = _startValue;

    // Create the main buffer with atomic variables (array size depends on the particular test)
    if(UseSVM())
    {
        if(gUseHostPtr)
            svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
        else
            svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
        if(!svmAtomicBuffer)
        {
            log_error("ERROR: clSVMAlloc failed!\n");
            return -1;
        }
        memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
        streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                                    typeSize * numDestItems, svmAtomicBuffer, NULL);
    }
    else
    {
        streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                    typeSize * numDestItems, &destItems[0], NULL);
    }
    if (!streams[0])
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }
    // Create the buffer for per-thread input/output data
    if(UseSVM())
    {
        if(gUseHostPtr)
            svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
        else
            svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
        if(!svmDataBuffer)
        {
            log_error("ERROR: clSVMAlloc failed!\n");
            return -1;
        }
        if(startRefValues.size())
            memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
        streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                                    typeSize * threadCount
                                        * NumNonAtomicVariablesPerThread(),
                                    svmDataBuffer, NULL);
    }
    else
    {
        streams[1] = clCreateBuffer(
            context,
            ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
            typeSize * threadCount * NumNonAtomicVariablesPerThread(),
            startRefValues.size() ? &startRefValues[0] : 0, NULL);
    }
    if (!streams[1])
    {
        log_error("ERROR: Creating reference array failed!\n");
        return -1;
    }
    if(deviceThreadCount > 0)
    {
        cl_uint argInd = 0;
        /* Set the arguments */
        error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
        test_error(error, "Unable to set kernel argument");
        error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
        test_error(error, "Unable to set indexed kernel argument");
        error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
        test_error(error, "Unable to set indexed kernel arguments");
        error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
        test_error(error, "Unable to set indexed kernel arguments");
        if(LocalMemory())
        {
            error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
            test_error(error, "Unable to set indexed local kernel argument");
        }
        if(LocalRefValues())
        {
            error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
            test_error(error, "Unable to set indexed kernel argument");
        }
    }
    /* Configure host threads */
    std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
    for(unsigned int t = 0; t < hostThreadCount; t++)
    {
        hostThreadContexts[t].test = this;
        hostThreadContexts[t].tid = deviceThreadCount+t;
        hostThreadContexts[t].threadCount = threadCount;
        hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
        hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
    }

    if(deviceThreadCount > 0)
    {
        /* Run the kernel */
        threadNum[0] = deviceThreadCount;
        groupSize = CurrentGroupSize();
        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
        test_error(error, "Unable to execute test kernel");
        /* start device threads */
        error = clFlush(queue);
        test_error(error, "clFlush failed");
    }

    /* Start host threads and wait for them to finish */
    if(hostThreadCount > 0)
        ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);

    if(UseSVM())
    {
        error = clFinish(queue);
        test_error(error, "clFinish failed");
        memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
        memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
    }
    else
    {
        if(deviceThreadCount > 0)
        {
            error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
            test_error(error, "Unable to read result value!");
            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
            test_error(error, "Unable to read reference values!");
        }
    }
    bool dataVerified = false;
    // If we have an ExpectedValue function, we need to generate a final value to compare against.
    // If we don't have one, it's because we're comparing reference values only.
    for(cl_uint i = 0; i < numDestItems; i++)
    {
        HostDataType expected;

        if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
            break; // no expected value function provided

        if(expected != destItems[i])
        {
            std::stringstream logLine;
            logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
            log_error("%s", logLine.str().c_str());
            for(i = 0; i < threadCount; i++)
            {
                logLine.str("");
                logLine << " --- " << i << " - ";
                if(startRefValues.size())
                    logLine << startRefValues[i] << " -> " << refValues[i];
                else
                    logLine << refValues[i];
                logLine << " --- ";
                if(i < numDestItems)
                    logLine << destItems[i];
                logLine << "\n";
                log_info("%s", logLine.str().c_str());
            }
            if(!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
        dataVerified = true;
    }

    bool dataCorrect = false;
    /* Use the verify function (if provided) to also check the results */
    if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
    {
        if(!dataCorrect)
        {
            log_error("ERROR: Reference values did not validate!\n");
            std::stringstream logLine;
            for(cl_uint i = 0; i < threadCount; i++)
                for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
                {
                    logLine.str("");
                    logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
                    if(j == 0 && i < numDestItems)
                        logLine << destItems[i];
                    logLine << "\n";
                    log_info("%s", logLine.str().c_str());
                }
            if(!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
    }
    else if(!dataVerified)
    {
        log_error("ERROR: Test doesn't check the total or the refs; no values are verified!\n");
        return -1;
    }

    if(OldValueCheck() &&
        !(DeclaredInProgram() && !LocalMemory())) // don't test program scope global atomics;
        // the 'old' value has been overwritten by the previous clEnqueueNDRangeKernel
    {
        /* Re-write the starting value */
        for(size_t i = 0; i < numDestItems; i++)
            destItems[i] = _startValue;
        refValues[0] = 0;
        if(deviceThreadCount > 0)
        {
            error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
            test_error(error, "Unable to write starting values!");

            /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
            threadNum[0] = 1;
            error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
            test_error(error, "Unable to execute test kernel");

            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
            test_error(error, "Unable to read reference values!");
        }
        else
        {
            /* Start host thread */
            HostFunction(0, 1, &destItems[0], &refValues[0]);
        }

        if(refValues[0] != _startValue)//destItems[0])
        {
            std::stringstream logLine;
            logLine << "ERROR: atomic function operated correctly but did NOT return the correct 'old' value"
                " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
            log_error("%s", logLine.str().c_str());
            if(!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
    }
    if(UseSVM())
    {
        // the buffer objects must be released before the SVM buffers are freed
        error = clReleaseMemObject(streams[0]);
        streams[0] = 0;
        test_error(error, "clReleaseMemObject failed");
        if(gUseHostPtr)
            free(svmAtomicBuffer);
        else
            clSVMFree(context, svmAtomicBuffer);
        error = clReleaseMemObject(streams[1]);
        streams[1] = 0;
        test_error(error, "clReleaseMemObject failed");
        if(gUseHostPtr)
            free(svmDataBuffer);
        else
            clSVMFree(context, svmDataBuffer);
    }
    _passCount++;
    return 0;
}

#endif //_COMMON_H_