1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "TestNonUniformWorkGroup.h"
17 #include <vector>
18 #include <sstream>
19 #define NL "\n"
20
21 size_t TestNonUniformWorkGroup::_maxLocalWorkgroupSize = 0;
22 bool TestNonUniformWorkGroup::_strictMode = false;
23
24 // Main Kernel source code
25 static const char *KERNEL_FUNCTION =
26 NL "#define MAX_DIMS 3"
27 NL "typedef struct _DataContainerAttrib"
28 NL "{"
29 NL " unsigned long get_global_size[MAX_DIMS];"
30 NL " unsigned long get_global_offset[MAX_DIMS];"
31 NL " unsigned long get_local_size[MAX_DIMS];"
32 NL " unsigned long get_enqueued_local_size[MAX_DIMS];"
33 NL " unsigned long get_global_id[MAX_DIMS];"
34 NL " unsigned long get_local_id[MAX_DIMS];"
35 NL " unsigned long get_group_id[MAX_DIMS];"
36 NL " unsigned long get_num_groups[MAX_DIMS];"
37 NL " unsigned long get_work_dim;"
38 NL " unsigned short test_local_barrier_result_bool;"
39 NL " unsigned short test_global_barrier_result_bool;"
40 NL " unsigned short test_local_atomic_result_value;"
41 NL "}DataContainerAttrib;"
42
43 NL "enum Error{"
44 NL " ERR_GLOBAL_SIZE=0,"
45 NL " ERR_GLOBAL_WORK_OFFSET,"
46 NL " ERR_LOCAL_SIZE,"
47 NL " ERR_GLOBAL_ID,"
48 NL " ERR_LOCAL_ID,"
49 NL " ERR_ENQUEUED_LOCAL_SIZE,"
50 NL " ERR_NUM_GROUPS,"
51 NL " ERR_GROUP_ID,"
52 NL " ERR_WORK_DIM,"
53 NL " ERR_GLOBAL_BARRIER,"
54 NL " ERR_LOCAL_BARRIER,"
55 NL " ERR_GLOBAL_ATOMIC,"
56 NL " ERR_LOCAL_ATOMIC,"
57 NL " ERR_STRICT_MODE,"
58 NL " ERR_BUILD_STATUS,"
59 NL " ERR_UNKNOWN,"
60 NL " ERR_DIFFERENT,"
61 NL " _LAST_ELEM"
62 NL "};"
63
64 NL "uint getGlobalIndex (uint gid2, uint gid1, uint gid0) {"
65 NL " return gid2*get_global_size(0)*get_global_size(1) + gid1*get_global_size(0) + gid0;"
66 NL "}"
67
68 NL "int getRegionIndex () {"
69 NL " uint gid0 = get_global_id(0) - get_global_offset(0);"
70 NL " uint gid1 = get_global_id(1) - get_global_offset(1);"
71 NL " uint gid2 = get_global_id(2) - get_global_offset(2);"
72 NL " if (gid0 == 0 && gid1 == 0 && gid2 == 0) {"
73 NL " return 0;"
74 NL " } else if (gid0 == get_global_size(0) - 1 && gid1 == 0 && gid2 == 0) {"
75 NL " return 1;"
76 NL " } else if (gid0 == 0 && gid1 == get_global_size(1) - 1 && gid2 == 0) {"
77 NL " return 2;"
78 NL " } else if (gid0 == get_global_size(0) - 1 && gid1 == get_global_size(1) - 1 && gid2 == 0) {"
79 NL " return 3;"
80 NL " } else if (gid0 == 0 && gid1 == 0 && gid2 == get_global_size(2) - 1) {"
81 NL " return 4;"
82 NL " } else if (gid0 == get_global_size(0) - 1 && gid1 == 0 && gid2 == get_global_size(2) - 1) {"
83 NL " return 5;"
84 NL " } else if (gid0 == 0 && gid1 == get_global_size(1) - 1 && gid2 == get_global_size(2) - 1) {"
85 NL " return 6;"
86 NL " } else if (gid0 == get_global_size(0) - 1 && gid1 == get_global_size(1) - 1 && gid2 == get_global_size(2) - 1) {"
87 NL " return 7;"
88 NL " }"
89 NL " return -1;"
90 NL "}"
91
92 NL "void getLocalSize(__global DataContainerAttrib *results) {"
93 NL " for (unsigned short i = 0; i < MAX_DIMS; i++) {"
94 NL " results->get_local_size[i] = get_local_size(i);"
95 NL " }"
96 NL "}"
97
98 NL "#ifdef TESTBASIC"
99 // values set by this function will be checked on the host side
100 NL "void testBasicHost(__global DataContainerAttrib *results) {"
101 NL " for (unsigned short i = 0; i < MAX_DIMS; i++) {"
102 NL " results->get_global_size[i] = get_global_size(i);"
103 NL " results->get_global_offset[i] = get_global_offset(i);"
104 NL " results->get_enqueued_local_size[i] = get_enqueued_local_size(i);"
105 NL " results->get_global_id[i] = get_global_id(i);"
106 NL " results->get_local_id[i] = get_local_id(i);"
107 NL " results->get_group_id[i] = get_group_id(i);"
108 NL " results->get_num_groups[i] = get_num_groups(i);"
109 NL " }"
110 NL " results->get_work_dim = get_work_dim();"
111 NL "}"
112 // values set by this function are checked on the kernel side
113 NL "void testBasicKernel(__global unsigned int *errorCounterBuffer, __local DataContainerAttrib *resultsForThread0) {"
114 NL " uint lid0 = get_local_id(0);"
115 NL " uint lid1 = get_local_id(1);"
116 NL " uint lid2 = get_local_id(2);"
117 NL " if (lid0 == 0 && lid1 == 0 && lid2 == 0) {"
118 NL " for (unsigned short i = 0; i < MAX_DIMS; i++) {"
119 NL " resultsForThread0->get_global_size[i] = get_global_size(i);"
120 NL " resultsForThread0->get_global_offset[i] = get_global_offset(i);"
121 NL " resultsForThread0->get_enqueued_local_size[i] = get_enqueued_local_size(i);"
122 NL " resultsForThread0->get_group_id[i] = get_group_id(i);"
123 NL " resultsForThread0->get_num_groups[i] = get_num_groups(i);"
124 NL " }"
125 NL " resultsForThread0->get_work_dim = get_work_dim();"
126 NL " }"
127 NL " barrier(CLK_LOCAL_MEM_FENCE);"
128 // verifies built in functions on the kernel side
129 NL " if (lid0 != 0 || lid1 != 0 || lid2 != 0) {"
130 NL " for (unsigned short i = 0; i < MAX_DIMS; i++) {"
131 NL " if (resultsForThread0->get_global_size[i] != get_global_size(i)) {"
132 NL " atomic_inc(&errorCounterBuffer[ERR_GLOBAL_SIZE]);"
133 NL " }"
134 NL " if (resultsForThread0->get_global_offset[i] != get_global_offset(i)) {"
135 NL " atomic_inc(&errorCounterBuffer[ERR_GLOBAL_WORK_OFFSET]);"
136 NL " }"
137 NL " if (resultsForThread0->get_enqueued_local_size[i] != get_enqueued_local_size(i)) {"
138 NL " atomic_inc(&errorCounterBuffer[ERR_ENQUEUED_LOCAL_SIZE]);"
139 NL " }"
140 NL " if (resultsForThread0->get_group_id[i] != get_group_id(i)) {"
141 NL " atomic_inc(&errorCounterBuffer[ERR_GROUP_ID]);"
142 NL " }"
143 NL " if (resultsForThread0->get_num_groups[i] != get_num_groups(i)) {"
144 NL " atomic_inc(&errorCounterBuffer[ERR_NUM_GROUPS]);"
145 NL " }"
146 NL " }"
147 NL " if (resultsForThread0->get_work_dim != get_work_dim()) {"
148 NL " atomic_inc(&errorCounterBuffer[ERR_WORK_DIM]);"
149 NL " }"
150 NL " }"
151 NL "}"
152 NL "#endif"
153
154 NL "#ifdef TESTBARRIERS"
155 NL "void testBarriers(__global unsigned int *errorCounterBuffer, __local unsigned int *testLocalBuffer, __global unsigned int *testGlobalBuffer) {"
156 NL " uint gid0 = get_global_id(0);"
157 NL " uint gid1 = get_global_id(1);"
158 NL " uint gid2 = get_global_id(2);"
159 NL " uint lid0 = get_local_id(0);"
160 NL " uint lid1 = get_local_id(1);"
161 NL " uint lid2 = get_local_id(2);"
162 NL
163 NL " uint globalIndex = getGlobalIndex(gid2-get_global_offset(2), gid1-get_global_offset(1), gid0-get_global_offset(0));"
164 NL " uint localIndex = lid2*get_local_size(0)*get_local_size(1) + lid1*get_local_size(0) + lid0;"
165 NL " testLocalBuffer[localIndex] = 0;"
166 NL " testGlobalBuffer[globalIndex] = 0;"
167 NL " uint maxLocalIndex = get_local_size(0)*get_local_size(1)*get_local_size(2)-1;"
168 NL " uint nextLocalIndex = (localIndex>=maxLocalIndex)?0:(localIndex+1);"
169 NL " uint next_lid0 = (lid0+1>=get_local_size(0))?0:lid0+1;"
170 NL " uint next_lid1 = (lid1+1>=get_local_size(1))?0:lid1+1;"
171 NL " uint next_lid2 = (lid2+1>=get_local_size(2))?0:lid2+1;"
172 NL " uint nextGlobalIndexInLocalWorkGroup = getGlobalIndex (get_group_id(2)*get_enqueued_local_size(2)+next_lid2, get_group_id(1)*get_enqueued_local_size(1)+next_lid1, get_group_id(0)*get_enqueued_local_size(0)+next_lid0);"
173 // testing local barriers
174 NL " testLocalBuffer[localIndex] = localIndex;"
175 NL " barrier(CLK_LOCAL_MEM_FENCE);"
176 NL " uint temp = testLocalBuffer[nextLocalIndex];"
177 NL " if (temp != nextLocalIndex) {"
178 NL " atomic_inc(&errorCounterBuffer[ERR_LOCAL_BARRIER]);"
179 NL " }"
180 // testing global barriers
181 NL " testGlobalBuffer[globalIndex] = globalIndex;"
182 NL " barrier(CLK_GLOBAL_MEM_FENCE);"
183 NL " uint temp2 = testGlobalBuffer[nextGlobalIndexInLocalWorkGroup];"
184 NL " if (temp2 != nextGlobalIndexInLocalWorkGroup) {"
185 NL " atomic_inc(&errorCounterBuffer[ERR_GLOBAL_BARRIER]);"
186 NL " }"
187 NL "}"
188 NL "#endif"
189
190 NL "#ifdef TESTATOMICS"
191 NL "void testAtomics(__global unsigned int *globalAtomicTestVariable, __local unsigned int *localAtomicTestVariable) {"
192 NL " uint gid0 = get_global_id(0);"
193 NL " uint gid1 = get_global_id(1);"
194 NL " uint gid2 = get_global_id(2);"
195 NL
196 NL " uint globalIndex = getGlobalIndex(gid2-get_global_offset(2), gid1-get_global_offset(1), gid0-get_global_offset(0));"
197 // testing atomic function on local memory
198 NL " atomic_inc(localAtomicTestVariable);"
199 NL " barrier(CLK_LOCAL_MEM_FENCE);"
200 // testing atomic function on global memory
201 NL " atomic_inc(globalAtomicTestVariable);"
202 NL "}"
203 NL "#endif"
204
205 NL "#ifdef RWGSX"
206 NL "#ifdef RWGSY"
207 NL "#ifdef RWGSZ"
208 NL "__attribute__((reqd_work_group_size(RWGSX, RWGSY, RWGSZ)))"
209 NL "#endif"
210 NL "#endif"
211 NL "#endif"
212 NL "__kernel void testKernel(__global DataContainerAttrib *results, __local unsigned int *testLocalBuffer,"
213 NL " __global unsigned int *testGlobalBuffer, __global unsigned int *globalAtomicTestVariable, __global unsigned int *errorCounterBuffer) {"
214 NL " uint gid0 = get_global_id(0);"
215 NL " uint gid1 = get_global_id(1);"
216 NL " uint gid2 = get_global_id(2);"
217 NL
218 NL " uint globalIndex = getGlobalIndex(gid2-get_global_offset(2), gid1-get_global_offset(1), gid0-get_global_offset(0));"
219 NL " int regionIndex = getRegionIndex();"
220 NL " if (regionIndex >= 0) {"
221 NL " getLocalSize(&results[regionIndex]);"
222 NL " }"
223 NL "#ifdef TESTBASIC"
224 NL " if (regionIndex >= 0) {"
225 NL " testBasicHost(&results[regionIndex]);"
226 NL " }"
227 NL " __local DataContainerAttrib resultsForThread0;"
228 NL " testBasicKernel(errorCounterBuffer, &resultsForThread0);"
229 NL "#endif"
230 NL "#ifdef TESTBARRIERS"
231 NL " testBarriers(errorCounterBuffer, testLocalBuffer, testGlobalBuffer);"
232 NL "#endif"
233 NL "#ifdef TESTATOMICS"
234 NL " __local unsigned int localAtomicTestVariable;"
235 NL " localAtomicTestVariable = 0;"
236 NL " barrier(CLK_LOCAL_MEM_FENCE);"
237 NL " testAtomics(globalAtomicTestVariable, &localAtomicTestVariable);"
238 NL " barrier(CLK_LOCAL_MEM_FENCE);"
239 NL " if (localAtomicTestVariable != get_local_size(0) * get_local_size(1) * get_local_size(2)) {"
240 NL " atomic_inc(&errorCounterBuffer[ERR_LOCAL_ATOMIC]);"
241 NL " }"
242 NL "#endif"
243 NL "}"
244 NL ;
245
TestNonUniformWorkGroup(const cl_device_id & device,const cl_context & context,const cl_command_queue & queue,const cl_uint dims,size_t * globalSize,const size_t * localSize,const size_t * buffersSize,const size_t * globalWorkOffset,const size_t * reqdWorkGroupSize)246 TestNonUniformWorkGroup::TestNonUniformWorkGroup(
247 const cl_device_id &device, const cl_context &context,
248 const cl_command_queue &queue, const cl_uint dims, size_t *globalSize,
249 const size_t *localSize, const size_t *buffersSize,
250 const size_t *globalWorkOffset, const size_t *reqdWorkGroupSize)
251 : _device(device), _context(context), _queue(queue), _dims(dims)
252 {
253
254 if (globalSize == NULL || dims < 1 || dims > 3)
255 {
256 // throw std::invalid_argument("globalSize is NULL value.");
257 // This is method of informing that parameters are wrong.
258 // It would be checked by prepareDevice() function.
259 // This is used because of lack of exception support.
260 _globalSize[0] = 0;
261 return;
262 }
263
264 // For OpenCL-3.0 support for non-uniform workgroups is optional, it's still
265 // useful to run these tests since we can verify the behavior of the
266 // get_enqueued_local_size() builtin for uniform workgroups, so we round up
267 // the global size to insure uniform workgroups on those 3.0 devices.
268 // We only need to do this when localSize is non-null, otherwise the driver
269 // will select a value for localSize which will be uniform on devices that
270 // don't support non-uniform work-groups.
271 if (nullptr != localSize && get_device_cl_version(device) >= Version(3, 0))
272 {
273 // Query for the non-uniform work-group support.
274 cl_bool are_non_uniform_sub_groups_supported{ CL_FALSE };
275 auto error =
276 clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT,
277 sizeof(are_non_uniform_sub_groups_supported),
278 &are_non_uniform_sub_groups_supported, nullptr);
279 if (error)
280 {
281 print_error(error,
282 "clGetDeviceInfo failed for "
283 "CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT");
284 // This signals an error to the caller (see above).
285 _globalSize[0] = 0;
286 return;
287 }
288
289 // If non-uniform work-groups are not supported round up the global
290 // sizes so workgroups are uniform and we have at least one.
291 if (CL_FALSE == are_non_uniform_sub_groups_supported)
292 {
293 log_info(
294 "WARNING: Non-uniform work-groups are not supported on this "
295 "device.\n Running test with uniform work-groups.\n");
296 for (unsigned dim = 0; dim < dims; ++dim)
297 {
298 auto global_size_before = globalSize[dim];
299 auto global_size_rounded = global_size_before
300 + (localSize[dim] - global_size_before % localSize[dim]);
301 globalSize[dim] = global_size_rounded;
302 log_info("Rounding globalSize[%d] = %d -> %d\n", dim,
303 global_size_before, global_size_rounded);
304 }
305 }
306 }
307
308 cl_uint i;
309 _globalWorkOffset_IsNull = true;
310 _localSize_IsNull = true;
311
312 setGlobalWorkgroupSize(globalSize);
313 setLocalWorkgroupSize(globalSize, localSize);
314 for (i = _dims; i < MAX_DIMS; i++)
315 {
316 _globalSize[i] = 1;
317 }
318
319 for (i = 0; i < MAX_DIMS; i++)
320 {
321 _globalWorkOffset[i] = 0;
322 }
323
324 if (globalWorkOffset)
325 {
326 _globalWorkOffset_IsNull = false;
327 for (i = 0; i < _dims; i++)
328 {
329 _globalWorkOffset[i] = globalWorkOffset[i];
330 }
331 }
332
333 for (i = 0; i < MAX_DIMS; i++)
334 {
335 _enqueuedLocalSize[i] = 1;
336 }
337
338 if (localSize)
339 {
340 _localSize_IsNull = false;
341 for (i = 0; i < _dims; i++)
342 {
343 _enqueuedLocalSize[i] = _localSize[i];
344 }
345 }
346
347 if (reqdWorkGroupSize)
348 {
349 for (i = 0; i < _dims; i++)
350 {
351 _reqdWorkGroupSize[i] = reqdWorkGroupSize[i];
352 }
353 for (i = _dims; i < MAX_DIMS; i++)
354 {
355 _reqdWorkGroupSize[i] = 1;
356 }
357 }
358 else
359 {
360 _reqdWorkGroupSize[0] = 0;
361 _reqdWorkGroupSize[1] = 0;
362 _reqdWorkGroupSize[2] = 0;
363 }
364
365 _testRange = Range::ALL;
366
367 _numOfGlobalWorkItems = _globalSize[0] * _globalSize[1] * _globalSize[2];
368
369 DataContainerAttrib temp = { { 0, 0, 0 } };
370
371 // array with results from each region
372 _resultsRegionArray.resize(NUMBER_OF_REGIONS, temp);
373 _referenceRegionArray.resize(NUMBER_OF_REGIONS, temp);
374 }
375
~TestNonUniformWorkGroup()376 TestNonUniformWorkGroup::~TestNonUniformWorkGroup () {
377 if (_err.checkError()) {
378 _err.showStats();
379 }
380 }
381
setLocalWorkgroupSize(const size_t * globalSize,const size_t * localSize)382 void TestNonUniformWorkGroup::setLocalWorkgroupSize (const size_t *globalSize, const size_t *localSize)
383 {
384 cl_uint i;
385 // Enforce localSize should not exceed globalSize
386 if (localSize) {
387 for (i = 0; i < _dims; i++) {
388 if ((globalSize[i] < localSize[i])) {
389 _localSize[i] = globalSize[i];
390 }else{
391 _localSize[i] = localSize[i];
392 }
393 }
394 }
395 }
396
setGlobalWorkgroupSize(const size_t * globalSize)397 void TestNonUniformWorkGroup::setGlobalWorkgroupSize (const size_t *globalSize)
398 {
399 cl_uint i;
400 for (i = 0; i < _dims; i++) {
401 _globalSize[i] = globalSize[i];
402 }
403 }
404
verifyData(DataContainerAttrib * reference,DataContainerAttrib * results,short regionNumber)405 void TestNonUniformWorkGroup::verifyData (DataContainerAttrib * reference, DataContainerAttrib * results, short regionNumber) {
406
407 std::ostringstream tmp;
408 std::string errorLocation;
409
410 if (_testRange & Range::BASIC) {
411 for (unsigned short i = 0; i < MAX_DIMS; i++) {
412 tmp.str("");
413 tmp.clear();
414 tmp << "region number: " << regionNumber << " for dim: " << i;
415 errorLocation = tmp.str();
416
417 if (results->get_global_size[i] != reference->get_global_size[i]) {
418 _err.show(Error::ERR_GLOBAL_SIZE, errorLocation, results->get_global_size[i], reference->get_global_size[i]);
419 }
420
421 if (results->get_global_offset[i] != reference->get_global_offset[i]) {
422 _err.show(Error::ERR_GLOBAL_WORK_OFFSET, errorLocation, results->get_global_offset[i], reference->get_global_offset[i]);
423 }
424
425 if (results->get_local_size[i] != reference->get_local_size[i] || results->get_local_size[i] > _maxWorkItemSizes[i]) {
426 _err.show(Error::ERR_LOCAL_SIZE, errorLocation, results->get_local_size[i], reference->get_local_size[i]);
427 }
428
429 if (results->get_enqueued_local_size[i] != reference->get_enqueued_local_size[i] || results->get_enqueued_local_size[i] > _maxWorkItemSizes[i]) {
430 _err.show(Error::ERR_ENQUEUED_LOCAL_SIZE, errorLocation, results->get_enqueued_local_size[i], reference->get_enqueued_local_size[i]);
431 }
432
433 if (results->get_num_groups[i] != reference->get_num_groups[i]) {
434 _err.show(Error::ERR_NUM_GROUPS, errorLocation, results->get_num_groups[i], reference->get_num_groups[i]);
435 }
436 }
437 }
438
439 tmp.str("");
440 tmp.clear();
441 tmp << "region number: " << regionNumber;
442 errorLocation = tmp.str();
443 if (_testRange & Range::BASIC) {
444 if (results->get_work_dim != reference->get_work_dim) {
445 _err.show(Error::ERR_WORK_DIM, errorLocation, results->get_work_dim, reference->get_work_dim);
446 }
447 }
448 }
449
calculateExpectedValues()450 void TestNonUniformWorkGroup::calculateExpectedValues () {
451 size_t nonRemainderGlobalSize[MAX_DIMS];
452 size_t numberOfPossibleRegions[MAX_DIMS];
453
454 nonRemainderGlobalSize[0] = _globalSize[0] - (_globalSize[0] % _enqueuedLocalSize[0]);
455 nonRemainderGlobalSize[1] = _globalSize[1] - (_globalSize[1] % _enqueuedLocalSize[1]);
456 nonRemainderGlobalSize[2] = _globalSize[2] - (_globalSize[2] % _enqueuedLocalSize[2]);
457
458 numberOfPossibleRegions[0] = (_globalSize[0]>1)?2:1;
459 numberOfPossibleRegions[1] = (_globalSize[1]>1)?2:1;
460 numberOfPossibleRegions[2] = (_globalSize[2]>1)?2:1;
461
462 for (cl_ushort i = 0; i < NUMBER_OF_REGIONS; ++i) {
463
464 if (i & 0x01 && numberOfPossibleRegions[0] == 1) {
465 continue;
466 }
467
468 if (i & 0x02 && numberOfPossibleRegions[1] == 1) {
469 continue;
470 }
471
472 if (i & 0x04 && numberOfPossibleRegions[2] == 1) {
473 continue;
474 }
475
476 for (cl_ushort dim = 0; dim < MAX_DIMS; ++dim) {
477 _referenceRegionArray[i].get_global_size[dim] = static_cast<unsigned long>(_globalSize[dim]);
478 _referenceRegionArray[i].get_global_offset[dim] = static_cast<unsigned long>(_globalWorkOffset[dim]);
479 _referenceRegionArray[i].get_enqueued_local_size[dim] = static_cast<unsigned long>(_enqueuedLocalSize[dim]);
480 _referenceRegionArray[i].get_local_size[dim] = static_cast<unsigned long>(_enqueuedLocalSize[dim]);
481 _referenceRegionArray[i].get_num_groups[dim] = static_cast<unsigned long>(ceil(static_cast<float>(_globalSize[dim]) / _enqueuedLocalSize[dim]));
482 }
483 _referenceRegionArray[i].get_work_dim = _dims;
484
485 if (i & 0x01) {
486 _referenceRegionArray[i].get_local_size[0] = static_cast<unsigned long>((_globalSize[0] - 1) % _enqueuedLocalSize[0] + 1);
487 }
488
489 if (i & 0x02) {
490 _referenceRegionArray[i].get_local_size[1] = static_cast<unsigned long>((_globalSize[1] - 1) % _enqueuedLocalSize[1] + 1);
491 }
492
493 if (i & 0x04) {
494 _referenceRegionArray[i].get_local_size[2] = static_cast<unsigned long>((_globalSize[2] - 1) % _enqueuedLocalSize[2] + 1);
495 }
496 }
497 }
498
getMaxLocalWorkgroupSize(const cl_device_id & device)499 size_t TestNonUniformWorkGroup::getMaxLocalWorkgroupSize (const cl_device_id &device) {
500 int err;
501
502 if (TestNonUniformWorkGroup::_maxLocalWorkgroupSize == 0) {
503 err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
504 sizeof(TestNonUniformWorkGroup::_maxLocalWorkgroupSize), &TestNonUniformWorkGroup::_maxLocalWorkgroupSize, NULL);
505 }
506
507 return TestNonUniformWorkGroup::_maxLocalWorkgroupSize;
508 }
509
enableStrictMode(bool state)510 void TestNonUniformWorkGroup::enableStrictMode(bool state) {
511 TestNonUniformWorkGroup::_strictMode = state;
512 }
513
prepareDevice()514 int TestNonUniformWorkGroup::prepareDevice () {
515 int err;
516 cl_uint device_max_dimensions;
517 cl_uint i;
518
519 if (_globalSize[0] == 0)
520 {
521 log_error("Some arguments passed into constructor were wrong.\n");
522 return -1;
523 }
524
525 err = clGetDeviceInfo(_device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
526 sizeof(device_max_dimensions), &device_max_dimensions, NULL);
527 test_error(err, "clGetDeviceInfo failed");
528
529 err = clGetDeviceInfo(_device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
530 sizeof(_maxWorkItemSizes), _maxWorkItemSizes, NULL);
531
532 test_error(err, "clGetDeviceInfo failed");
533
534 // Trim the local size to the limitations of what the device supports in each dimension.
535 for (i = 0; i < _dims; i++) {
536 if(_enqueuedLocalSize[i] > _maxWorkItemSizes[i]) {
537 _enqueuedLocalSize[i] = _maxWorkItemSizes[i];
538 }
539 }
540
541 if(_localSize_IsNull == false)
542 calculateExpectedValues();
543
544 std::string buildOptions{};
545 if(_reqdWorkGroupSize[0] != 0 && _reqdWorkGroupSize[1] != 0 && _reqdWorkGroupSize[2] != 0) {
546 std::ostringstream tmp(" ");
547 tmp << " -D RWGSX=" << _reqdWorkGroupSize[0]
548 << " -D RWGSY=" << _reqdWorkGroupSize[1]
549 << " -D RWGSZ=" << _reqdWorkGroupSize[2] << " ";
550 buildOptions += tmp.str();
551 }
552
553 if (_testRange & Range::BASIC)
554 buildOptions += " -D TESTBASIC";
555 if (_testRange & Range::ATOMICS)
556 buildOptions += " -D TESTATOMICS";
557 if (_testRange & Range::BARRIERS)
558 buildOptions += " -D TESTBARRIERS";
559
560 err = create_single_kernel_helper_with_build_options (_context, &_program, &_testKernel, 1,
561 &KERNEL_FUNCTION, "testKernel", buildOptions.c_str());
562 if (err)
563 {
564 log_error("Error %d in line: %d of file %s\n", err, __LINE__, __FILE__);
565 return -1;
566 }
567
568 return 0;
569 }
570
verifyResults()571 int TestNonUniformWorkGroup::verifyResults () {
572 if (_localSize_IsNull) {
573 // for global work groups where local work group size is not defined (set to NULL in clEnqueueNDRangeKernel)
574 // we need to check what optimal size was chosen by device
575 // we assumed that local size value for work item 0 is right for the rest work items
576 _enqueuedLocalSize[0] = static_cast<size_t>(_resultsRegionArray[0].get_local_size[0]);
577 _enqueuedLocalSize[1] = static_cast<size_t>(_resultsRegionArray[0].get_local_size[1]);
578 _enqueuedLocalSize[2] = static_cast<size_t>(_resultsRegionArray[0].get_local_size[2]);
579 calculateExpectedValues();
580
581 // strict mode verification
582 if(_strictMode) {
583 size_t localWorkGroupSize = _enqueuedLocalSize[0]*_enqueuedLocalSize[1]*_enqueuedLocalSize[2];
584 if (localWorkGroupSize != TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device))
585 _err.show(Error::ERR_STRICT_MODE, "",localWorkGroupSize, TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device));
586 }
587
588 log_info ("Local work group size calculated by driver: %s\n", showArray(_enqueuedLocalSize, _dims).c_str());
589 }
590
591 for (cl_ushort i = 0; i < NUMBER_OF_REGIONS; ++i) {
592 verifyData(&_referenceRegionArray[i], &_resultsRegionArray[i], i);
593 }
594
595 if (_testRange & Range::ATOMICS) {
596 if (_globalAtomicTestValue != _numOfGlobalWorkItems) {
597 _err.show(Error::ERR_GLOBAL_ATOMIC);
598 }
599 }
600
601 if (_err.checkError())
602 return -1;
603
604 return 0;
605 }
606
showArray(const size_t * arr,cl_uint dims)607 std::string showArray (const size_t *arr, cl_uint dims) {
608 std::ostringstream tmpStringStream ("");
609
610 tmpStringStream << "{";
611 for (cl_uint i=0; i < dims; i++) {
612 tmpStringStream << arr[i];
613 if (i+1 < dims)
614 tmpStringStream << ", ";
615 }
616 tmpStringStream << "}";
617
618 return tmpStringStream.str();
619 }
620
showTestInfo()621 void TestNonUniformWorkGroup::showTestInfo () {
622 std::string tmpString;
623 log_info ("T E S T P A R A M E T E R S :\n");
624 log_info ("\tNumber of dimensions:\t%d\n", _dims);
625
626 tmpString = showArray(_globalSize, _dims);
627
628 log_info("\tGlobal work group size:\t%s\n", tmpString.c_str());
629
630 if (!_localSize_IsNull) {
631 tmpString = showArray(_enqueuedLocalSize, _dims);
632 } else {
633 tmpString = "NULL";
634 }
635 log_info("\tLocal work group size:\t%s\n", tmpString.c_str());
636
637 if (!_globalWorkOffset_IsNull) {
638 tmpString = showArray(_globalWorkOffset, _dims);
639 } else {
640 tmpString = "NULL";
641 }
642 log_info("\tGlobal work group offset:\t%s\n", tmpString.c_str());
643
644 if (_reqdWorkGroupSize[0] != 0 && _reqdWorkGroupSize[1] != 0 && _reqdWorkGroupSize[2] != 0) {
645 tmpString = showArray(_reqdWorkGroupSize, _dims);
646 } else {
647 tmpString = "attribute disabled";
648 }
649 log_info ("\treqd_work_group_size attribute:\t%s\n", tmpString.c_str());
650
651 tmpString = "";
652 if(_testRange & Range::BASIC)
653 tmpString += "basic";
654 if(_testRange & Range::ATOMICS) {
655 if(tmpString != "") tmpString += ", ";
656 tmpString += "atomics";
657 }
658 if(_testRange & Range::BARRIERS) {
659 if(tmpString != "") tmpString += ", ";
660 tmpString += "barriers";
661 }
662 log_info ("\tTest range:\t%s\n", tmpString.c_str());
663 if(_strictMode) {
664 log_info ("\tStrict mode:\tON\n");
665 if (!_localSize_IsNull) {
666 log_info ("\tATTENTION: strict mode applies only NULL local work group size\n");
667 } else {
668 log_info ("\t\tExpected value of local work group size is %ld.\n",
669 TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device));
670 }
671
672 }
673 }
674
adjustLocalArraySize(size_t localArraySize)675 size_t TestNonUniformWorkGroup::adjustLocalArraySize (size_t localArraySize) {
676 // In case if localArraySize is too big, sometimes we can not run kernel because of lack
677 // of resources due to kernel itself requires some local memory to run
678 int err;
679
680 cl_ulong kernelLocalMemSize = 0;
681 err = clGetKernelWorkGroupInfo(_testKernel, _device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernelLocalMemSize), &kernelLocalMemSize, NULL);
682 test_error(err, "clGetKernelWorkGroupInfo failed");
683
684 cl_ulong deviceLocalMemSize = 0;
685 err = clGetDeviceInfo(_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(deviceLocalMemSize), &deviceLocalMemSize, NULL);
686 test_error(err, "clGetDeviceInfo failed");
687
688 if (kernelLocalMemSize + localArraySize > deviceLocalMemSize) {
689 size_t adjustedLocalArraySize = deviceLocalMemSize - kernelLocalMemSize;
690 log_info("localArraySize was adjusted from %lu to %lu\n", localArraySize, adjustedLocalArraySize);
691 localArraySize = adjustedLocalArraySize;
692 }
693
694 return localArraySize;
695 }
696
adjustGlobalBufferSize(size_t globalBufferSize)697 size_t TestNonUniformWorkGroup::adjustGlobalBufferSize(size_t globalBufferSize) {
698 // In case if global buffer size is too big, sometimes we can not run kernel because of lack
699 // of resources due to kernel itself requires some global memory to run
700 int err;
701
702 cl_ulong deviceMaxAllocObjSize = 0;
703 err = clGetDeviceInfo(_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(deviceMaxAllocObjSize), &deviceMaxAllocObjSize, NULL);
704 test_error(err, "clGetDeviceInfo failed");
705
706 size_t adjustedGlobalBufferSize = globalBufferSize;
707 if (deviceMaxAllocObjSize < globalBufferSize) {
708 adjustedGlobalBufferSize = deviceMaxAllocObjSize;
709 log_info("globalBufferSize was adjusted from %lu to %lu\n", globalBufferSize, adjustedGlobalBufferSize);
710 }
711
712 return adjustedGlobalBufferSize;
713 }
714
runKernel()715 int TestNonUniformWorkGroup::runKernel () {
716 int err;
717
718 // TEST INFO
719 showTestInfo();
720
721 size_t localArraySize = (_localSize_IsNull)?TestNonUniformWorkGroup::getMaxLocalWorkgroupSize(_device):(_enqueuedLocalSize[0]*_enqueuedLocalSize[1]*_enqueuedLocalSize[2]);
722 clMemWrapper resultsRegionArray = clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _resultsRegionArray.size() * sizeof(DataContainerAttrib), &_resultsRegionArray.front(), &err);
723 test_error(err, "clCreateBuffer failed");
724
725 size_t *localSizePtr = (_localSize_IsNull)?NULL:_enqueuedLocalSize;
726 size_t *globalWorkOffsetPtr = (_globalWorkOffset_IsNull)?NULL:_globalWorkOffset;
727
728 err = clSetKernelArg(_testKernel, 0, sizeof(resultsRegionArray), &resultsRegionArray);
729 test_error(err, "clSetKernelArg failed");
730
731 //creating local buffer
732 localArraySize = adjustLocalArraySize(localArraySize*sizeof(unsigned int));
733 err = clSetKernelArg(_testKernel, 1, localArraySize, NULL);
734 test_error(err, "clSetKernelArg failed");
735
736 size_t globalBufferSize = adjustGlobalBufferSize(_numOfGlobalWorkItems*sizeof(cl_uint));
737 clMemWrapper testGlobalArray = clCreateBuffer(_context, CL_MEM_READ_WRITE, globalBufferSize, NULL, &err);
738 test_error(err, "clCreateBuffer failed");
739
740 err = clSetKernelArg(_testKernel, 2, sizeof(testGlobalArray), &testGlobalArray);
741 test_error(err, "clSetKernelArg failed");
742
743 _globalAtomicTestValue = 0;
744 clMemWrapper globalAtomicTestVariable = clCreateBuffer(_context, (CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR), sizeof(_globalAtomicTestValue), &_globalAtomicTestValue, &err);
745 test_error(err, "clCreateBuffer failed");
746
747 err = clSetKernelArg(_testKernel, 3, sizeof(globalAtomicTestVariable), &globalAtomicTestVariable);
748 test_error(err, "clSetKernelArg failed");
749
750 clMemWrapper errorArray = clCreateBuffer(_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _err.errorArrayCounterSize(), _err.errorArrayCounter(), &err);
751 test_error(err, "clCreateBuffer failed");
752
753 err = clSetKernelArg(_testKernel, 4, sizeof(errorArray), &errorArray);
754 test_error(err, "clSetKernelArg failed");
755
756 err = clEnqueueNDRangeKernel(_queue, _testKernel, _dims, globalWorkOffsetPtr, _globalSize,
757 localSizePtr, 0, NULL, NULL);
758 test_error(err, "clEnqueueNDRangeKernel failed");
759
760
761 err = clFinish(_queue);
762 test_error(err, "clFinish failed");
763
764 err = clEnqueueReadBuffer(_queue, globalAtomicTestVariable, CL_TRUE, 0, sizeof(unsigned int), &_globalAtomicTestValue, 0, NULL, NULL);
765 test_error(err, "clEnqueueReadBuffer failed");
766
767 if (_err.checkError()) {
768 return -1;
769 }
770
771 // synchronization of main buffer
772 err = clEnqueueReadBuffer(_queue, resultsRegionArray, CL_TRUE, 0, _resultsRegionArray.size() * sizeof(DataContainerAttrib), &_resultsRegionArray.front(), 0, NULL, NULL);
773 test_error(err, "clEnqueueReadBuffer failed");
774
775 err = clEnqueueReadBuffer(_queue, errorArray, CL_TRUE, 0, _err.errorArrayCounterSize(), _err.errorArrayCounter(), 0, NULL, NULL);
776 test_error(err, "clEnqueueReadBuffer failed");
777 // Synchronization of errors occurred in kernel into general error stats
778 _err.synchronizeStatsMap();
779
780 return 0;
781 }
782
runTestNonUniformWorkGroup(const cl_uint dims,size_t * globalSize,const size_t * localSize,int range)783 void SubTestExecutor::runTestNonUniformWorkGroup(const cl_uint dims,
784 size_t *globalSize,
785 const size_t *localSize,
786 int range)
787 {
788 runTestNonUniformWorkGroup(dims, globalSize, localSize, NULL, NULL, range);
789 }
790
runTestNonUniformWorkGroup(const cl_uint dims,size_t * globalSize,const size_t * localSize,const size_t * globalWorkOffset,const size_t * reqdWorkGroupSize,int range)791 void SubTestExecutor::runTestNonUniformWorkGroup(
792 const cl_uint dims, size_t *globalSize, const size_t *localSize,
793 const size_t *globalWorkOffset, const size_t *reqdWorkGroupSize, int range)
794 {
795
796
797 int err;
798 ++_overallCounter;
799 TestNonUniformWorkGroup test(_device, _context, _queue, dims, globalSize,
800 localSize, NULL, globalWorkOffset,
801 reqdWorkGroupSize);
802
803 test.setTestRange(range);
804 err = test.prepareDevice();
805 if (err)
806 {
807 log_error("Error: prepare device\n");
808 ++_failCounter;
809 return;
810 }
811
812 err = test.runKernel();
813 if (err)
814 {
815 log_error("Error: run kernel\n");
816 ++_failCounter;
817 return;
818 }
819
820 err = test.verifyResults();
821 if (err)
822 {
823 log_error("Error: verify results\n");
824 ++_failCounter;
825 return;
826 }
827 }
828
calculateWorkGroupSize(size_t & maxWgSize,int testRange)829 int SubTestExecutor::calculateWorkGroupSize(size_t &maxWgSize, int testRange) {
830 int err;
831
832 clProgramWrapper program;
833 clKernelWrapper testKernel;
834 std::string buildOptions{};
835
836 if (testRange & Range::BASIC)
837 buildOptions += " -D TESTBASIC";
838 if (testRange & Range::ATOMICS)
839 buildOptions += " -D TESTATOMICS";
840 if (testRange & Range::BARRIERS)
841 buildOptions += " -D TESTBARRIERS";
842
843 err = create_single_kernel_helper_with_build_options (_context, &program, &testKernel, 1,
844 &KERNEL_FUNCTION, "testKernel", buildOptions.c_str());
845 if (err)
846 {
847 log_error("Error %d in line: %d of file %s\n", err, __LINE__, __FILE__);
848 return err;
849 }
850
851 err = clGetKernelWorkGroupInfo (testKernel, _device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxWgSize), &maxWgSize, NULL);
852 test_error(err, "clGetKernelWorkGroupInfo failed");
853
854 TestNonUniformWorkGroup::setMaxLocalWorkgroupSize(maxWgSize);
855
856 return 0;
857 }
858
status()859 int SubTestExecutor::status() {
860
861 if (_failCounter>0) {
862 log_error ("%d subtest(s) (of %d) failed\n", _failCounter, _overallCounter);
863 return -1;
864 } else {
865 log_info ("All %d subtest(s) passed\n", _overallCounter);
866 return 0;
867 }
868 }
869
870