1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22
23 #include "procs.h"
24
25 #define ITERATIONS 4
26 #define DEBUG 0
27
28 // If the environment variable DO_NOT_LIMIT_THREAD_SIZE is not set, the test will limit the maximum total
29 // global dimensions tested to this value.
30 #define MAX_TOTAL_GLOBAL_THREADS_FOR_TEST (1<<24)
31 int limit_size = 0;
32
33 static int
get_maximums(cl_kernel kernel,cl_context context,size_t * max_workgroup_size_result,cl_ulong * max_allcoation_result,cl_ulong * max_physical_result)34 get_maximums(cl_kernel kernel, cl_context context,
35 size_t *max_workgroup_size_result,
36 cl_ulong *max_allcoation_result,
37 cl_ulong *max_physical_result) {
38 int err = 0;
39 cl_uint i;
40 cl_device_id *devices;
41
42 // Get all the devices in the device group
43 size_t num_devices_returned;
44 err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &num_devices_returned);
45 if(err != CL_SUCCESS)
46 {
47 log_error("clGetContextInfo() failed (%d).\n", err);
48 return -10;
49 }
50 devices = (cl_device_id *)malloc(num_devices_returned);
51 err = clGetContextInfo(context, CL_CONTEXT_DEVICES, num_devices_returned, devices, NULL);
52 if(err != CL_SUCCESS)
53 {
54 log_error("clGetContextInfo() failed (%d).\n", err);
55 return -10;
56 }
57 num_devices_returned /= sizeof(cl_device_id);
58 if (num_devices_returned > 1) log_info("%d devices in device group.\n", (int)num_devices_returned);
59 if (num_devices_returned < 1) {
60 log_error("0 devices found for this kernel.\n");
61 return -1;
62 }
63
64 // Iterate over them and find the maximum local workgroup size
65 size_t max_workgroup_size = 0;
66 size_t current_workgroup_size = 0;
67 cl_ulong max_allocation = 0;
68 cl_ulong current_allocation = 0;
69 cl_ulong max_physical = 0;
70 cl_ulong current_physical = 0;
71
72 for (i=0; i<num_devices_returned; i++) {
73 // Max workgroup size for this kernel on this device
74 err = clGetKernelWorkGroupInfo(kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(current_workgroup_size), ¤t_workgroup_size, NULL);
75 if(err != CL_SUCCESS)
76 {
77 log_error("clGetKernelWorkGroupInfo() failed (%d) for device %d.\n", err, i);
78 return -10;
79 }
80 if (max_workgroup_size == 0)
81 max_workgroup_size = current_workgroup_size;
82 else if (current_workgroup_size < max_workgroup_size)
83 max_workgroup_size = current_workgroup_size;
84
85 // Get the maximum allocation size
86 err = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(current_allocation), ¤t_allocation, NULL);
87 if(err != CL_SUCCESS)
88 {
89 log_error("clGetDeviceConfigInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed (%d) for device %d.\n", err, i);
90 return -10;
91 }
92 if (max_allocation == 0)
93 max_allocation = current_allocation;
94 else if (current_allocation < max_allocation)
95 max_allocation = current_allocation;
96
97 // Get the maximum physical size
98 err = clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(current_physical), ¤t_physical, NULL);
99 if(err != CL_SUCCESS)
100 {
101 log_error("clGetDeviceConfigInfo(CL_DEVICE_GLOBAL_MEM_SIZE) failed (%d) for device %d.\n", err, i);
102 return -10;
103 }
104 if (max_physical == 0)
105 max_physical = current_physical;
106 else if (current_physical < max_allocation)
107 max_physical = current_physical;
108 }
109 free(devices);
110
111 log_info("Device maximums: max local workgroup size:%d, max allocation size: %g MB, max physical memory %gMB\n",
112 (int)max_workgroup_size, (double)(max_allocation/1024.0/1024.0), (double)(max_physical/1024.0/1024.0));
113 *max_workgroup_size_result = max_workgroup_size;
114 *max_allcoation_result = max_allocation;
115 *max_physical_result = max_physical;
116 return 0;
117 }
118
119 static const char *thread_dimension_kernel_code_atomic_long =
120 "\n"
121 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
122 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
123 "__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
124 " uint final_x_size, uint final_y_size, uint final_z_size,\n"
125 " ulong start_address, ulong end_address)\n"
126 "{\n"
127 " uint error = 0;\n"
128 " if (get_global_id(0) >= final_x_size)\n"
129 " error = 64;\n"
130 " if (get_global_id(1) >= final_y_size)\n"
131 " error = 128;\n"
132 " if (get_global_id(2) >= final_z_size)\n"
133 " error = 256;\n"
134 "\n"
135 " unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n"
136 " (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n"
137 " if ((t_address >= start_address) && (t_address < end_address))\n"
138 " atom_add(&dst[t_address-start_address], 1u);\n"
139 " if (error)\n"
140 " atom_or(&dst[t_address-start_address], error);\n"
141 "\n"
142 "}\n";
143
144 static const char *thread_dimension_kernel_code_not_atomic_long =
145 "\n"
146 "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
147 " uint final_x_size, uint final_y_size, uint final_z_size,\n"
148 " ulong start_address, ulong end_address)\n"
149 "{\n"
150 " uint error = 0;\n"
151 " if (get_global_id(0) >= final_x_size)\n"
152 " error = 64;\n"
153 " if (get_global_id(1) >= final_y_size)\n"
154 " error = 128;\n"
155 " if (get_global_id(2) >= final_z_size)\n"
156 " error = 256;\n"
157 "\n"
158 " unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n"
159 " (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n"
160 " if ((t_address >= start_address) && (t_address < end_address))\n"
161 " dst[t_address-start_address]++;\n"
162 " if (error)\n"
163 " dst[t_address-start_address]|=error;\n"
164 "\n"
165 "}\n";
166
167 static const char *thread_dimension_kernel_code_atomic_not_long =
168 "\n"
169 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
170 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
171 "__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
172 " uint final_x_size, uint final_y_size, uint final_z_size,\n"
173 " uint start_address, uint end_address)\n"
174 "{\n"
175 " uint error = 0;\n"
176 " if (get_global_id(0) >= final_x_size)\n"
177 " error = 64;\n"
178 " if (get_global_id(1) >= final_y_size)\n"
179 " error = 128;\n"
180 " if (get_global_id(2) >= final_z_size)\n"
181 " error = 256;\n"
182 "\n"
183 " unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n"
184 " (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n"
185 " if ((t_address >= start_address) && (t_address < end_address))\n"
186 " atom_add(&dst[t_address-start_address], 1u);\n"
187 " if (error)\n"
188 " atom_or(&dst[t_address-start_address], error);\n"
189 "\n"
190 "}\n";
191
192 static const char *thread_dimension_kernel_code_not_atomic_not_long =
193 "\n"
194 "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
195 " uint final_x_size, uint final_y_size, uint final_z_size,\n"
196 " uint start_address, uint end_address)\n"
197 "{\n"
198 " uint error = 0;\n"
199 " if (get_global_id(0) >= final_x_size)\n"
200 " error = 64;\n"
201 " if (get_global_id(1) >= final_y_size)\n"
202 " error = 128;\n"
203 " if (get_global_id(2) >= final_z_size)\n"
204 " error = 256;\n"
205 "\n"
206 " unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n"
207 " (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n"
208 " if ((t_address >= start_address) && (t_address < end_address))\n"
209 " dst[t_address-start_address]++;\n"
210 " if (error)\n"
211 " dst[t_address-start_address]|=error;\n"
212 "\n"
213 "}\n";
214
215 char dim_str[128];
216 char *
print_dimensions(size_t x,size_t y,size_t z,cl_uint dim)217 print_dimensions(size_t x, size_t y, size_t z, cl_uint dim) {
218 // Not thread safe...
219 if (dim == 1) {
220 snprintf(dim_str, 128, "[%d]", (int)x);
221 } else if (dim == 2) {
222 snprintf(dim_str, 128, "[%d x %d]", (int)x, (int)y);
223 } else if (dim == 3) {
224 snprintf(dim_str, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z);
225 } else {
226 snprintf(dim_str, 128, "INVALID DIM: %d", dim);
227 }
228 return dim_str;
229 }
230
231 char dim_str2[128];
232 char *
print_dimensions2(size_t x,size_t y,size_t z,cl_uint dim)233 print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim) {
234 // Not thread safe...
235 if (dim == 1) {
236 snprintf(dim_str2, 128, "[%d]", (int)x);
237 } else if (dim == 2) {
238 snprintf(dim_str2, 128, "[%d x %d]", (int)x, (int)y);
239 } else if (dim == 3) {
240 snprintf(dim_str2, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z);
241 } else {
242 snprintf(dim_str2, 128, "INVALID DIM: %d", dim);
243 }
244 return dim_str2;
245 }
246
247
248 /*
249 This tests thread dimensions by executing a kernel across a range of dimensions.
250 Each kernel instance does an atomic write into a specific location in a buffer to
251 ensure that the correct dimensions are run. To handle large dimensions, the kernel
252 masks its execution region internally. This allows a small (128MB) buffer to be used
253 for very large executions by running the kernel multiple times.
254 */
run_test(cl_context context,cl_command_queue queue,cl_kernel kernel,cl_mem array,cl_uint memory_size,cl_uint dimensions,cl_uint final_x_size,cl_uint final_y_size,cl_uint final_z_size,cl_uint local_x_size,cl_uint local_y_size,cl_uint local_z_size,int explict_local)255 int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, cl_mem array, cl_uint memory_size, cl_uint dimensions,
256 cl_uint final_x_size, cl_uint final_y_size, cl_uint final_z_size,
257 cl_uint local_x_size, cl_uint local_y_size, cl_uint local_z_size,
258 int explict_local)
259 {
260 cl_uint errors = 0;
261 size_t global_size[3], local_size[3];
262 global_size[0] = final_x_size; local_size[0] = local_x_size;
263 global_size[1] = final_y_size; local_size[1] = local_y_size;
264 global_size[2] = final_z_size; local_size[2] = local_z_size;
265
266 cl_ulong start_valid_memory_address = 0;
267 cl_ulong end_valid_memory_address = memory_size;
268 cl_ulong last_memory_address = (cl_ulong)final_x_size*(cl_ulong)final_y_size*(cl_ulong)final_z_size*sizeof(cl_uint);
269 if (end_valid_memory_address > last_memory_address)
270 end_valid_memory_address = last_memory_address;
271
272 int number_of_iterations_required = (int)ceil((double)last_memory_address/(double)memory_size);
273 log_info("\t\tTest requires %gMB (%d test iterations using an allocation of %gMB).\n",
274 (double)last_memory_address/(1024.0*1024.0), number_of_iterations_required, (double)memory_size/(1024.0*1024.0));
275 //log_info("Last memory address: %llu, memory_size: %llu\n", last_memory_address, memory_size);
276
277 while (end_valid_memory_address <= last_memory_address)
278 {
279 int err;
280 const int fill_pattern = 0x0;
281 err = clEnqueueFillBuffer(queue,
282 array,
283 (void*)&fill_pattern,
284 sizeof(fill_pattern),
285 0,
286 memory_size,
287 0,
288 NULL,
289 NULL);
290 if (err != CL_SUCCESS) {
291 print_error( err, "Failed to set fill buffer.");
292 return -3;
293 }
294
295 cl_ulong start_valid_index = start_valid_memory_address/sizeof(cl_uint);
296 cl_ulong end_valid_index = end_valid_memory_address/sizeof(cl_uint);
297
298 cl_uint start_valid_index_int = (cl_uint) start_valid_index;
299 cl_uint end_valid_index_int = (cl_uint) end_valid_index;
300
301 // Set the arguments
302 err = clSetKernelArg(kernel, 0, sizeof(array), &array);
303 err |= clSetKernelArg(kernel, 1, sizeof(final_x_size), &final_x_size);
304 err |= clSetKernelArg(kernel, 2, sizeof(final_y_size), &final_y_size);
305 err |= clSetKernelArg(kernel, 3, sizeof(final_z_size), &final_z_size);
306 if (gHasLong)
307 {
308 err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index), &start_valid_index);
309 err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index), &end_valid_index);
310 }
311 else
312 {
313 err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index_int), &start_valid_index_int);
314 err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index_int), &end_valid_index_int);
315 }
316
317 if (err != CL_SUCCESS) {
318 print_error( err, "Failed to set arguments.");
319 return -3;
320 }
321
322
323 // Execute the kernel
324 if (explict_local == 0) {
325 err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, NULL, 0, NULL, NULL);
326 if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, NULL local, %d dim, start address %llu, end address %llu.\n",
327 print_dimensions(global_size[0], global_size[1], global_size[2], dimensions),
328 dimensions, start_valid_memory_address, end_valid_memory_address);
329 } else {
330 err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, local_size, 0, NULL, NULL);
331 if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, local %s, %d dim, start address %llu, end address %llu.\n",
332 print_dimensions(global_size[0], global_size[1], global_size[2], dimensions), print_dimensions2(local_size[0], local_size[1], local_size[2], dimensions),
333 dimensions, start_valid_memory_address, end_valid_memory_address);
334 }
335 if (err == CL_OUT_OF_RESOURCES) {
336 log_info("WARNING: kernel reported CL_OUT_OF_RESOURCES, indicating the global dimensions are too large. Skipping this size.\n");
337 return 0;
338 }
339 if (err != CL_SUCCESS) {
340 print_error( err, "Failed to execute kernel\n");
341 return -3;
342 }
343
344 void* mapped = clEnqueueMapBuffer(queue, array, CL_TRUE, CL_MAP_READ, 0, memory_size, 0, NULL, NULL, &err );
345 if (err != CL_SUCCESS) {
346 print_error( err, "Failed to map results\n");
347 return -4;
348 }
349 cl_uint* data = (cl_uint*)mapped;
350
351 // Verify the data
352 cl_uint i;
353 cl_uint last_address = (cl_uint)(end_valid_memory_address - start_valid_memory_address)/(cl_uint)sizeof(cl_uint);
354 for (i=0; i<last_address; i++) {
355 if (i < last_address) {
356 if (data[i] != 1) {
357 errors++;
358 // log_info("%d expected 1 got %d\n", i, data[i]);
359 }
360 } else {
361 if (data[i] != 0) {
362 errors++;
363 log_info("%d expected 0 got %d\n", i, data[i]);
364 }
365 }
366 }
367
368 err = clEnqueueUnmapMemObject(queue, array, mapped, 0, NULL, NULL );
369 if (err != CL_SUCCESS) {
370 print_error( err, "Failed to unmap results\n");
371 return -4;
372 }
373
374 err = clFlush(queue);
375 if (err != CL_SUCCESS) {
376 print_error( err, "Failed to flush\n");
377 return -4;
378 }
379
380 // Increment the addresses
381 if (end_valid_memory_address == last_memory_address)
382 break;
383 start_valid_memory_address += memory_size;
384 end_valid_memory_address += memory_size;
385 if (end_valid_memory_address > last_memory_address)
386 end_valid_memory_address = last_memory_address;
387 }
388
389 if (errors)
390 log_error("%d errors.\n", errors);
391 return errors;
392 }
393
394
395
396
397 static cl_uint max_x_size=1, min_x_size=1, max_y_size=1, min_y_size=1, max_z_size=1, min_z_size=1;
398
set_min(cl_uint * x,cl_uint * y,cl_uint * z)399 static void set_min(cl_uint *x, cl_uint *y, cl_uint *z) {
400 if (*x < min_x_size)
401 *x = min_x_size;
402 if (*y < min_y_size)
403 *y = min_y_size;
404 if (*z < min_z_size)
405 *z = min_z_size;
406 if (*x > max_x_size)
407 *x = max_x_size;
408 if (*y > max_y_size)
409 *y = max_y_size;
410 if (*z > max_z_size)
411 *z = max_z_size;
412 }
413
414
415 int
test_thread_dimensions(cl_device_id device,cl_context context,cl_command_queue queue,cl_uint dimensions,cl_uint min_dim,cl_uint max_dim,cl_uint quick_test,cl_uint size_increase_per_iteration,int explicit_local)416 test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue queue, cl_uint dimensions, cl_uint min_dim, cl_uint max_dim, cl_uint quick_test, cl_uint size_increase_per_iteration, int explicit_local) {
417 cl_mem array;
418 cl_program program;
419 cl_kernel kernel;
420 int err;
421 cl_uint memory_size, max_memory_size;
422 size_t max_local_workgroup_size[3];
423 cl_uint device_max_dimensions;
424 int use_atomics = 1;
425 MTdata d;
426
427 if (getenv("CL_WIMPY_MODE") && !quick_test) {
428 log_info("CL_WIMPY_MODE enabled, skipping test\n");
429 return 0;
430 }
431
432 // Unconditionally test larger sizes for CL 1.1
433 log_info("Testing large global dimensions.\n");
434 limit_size = 0;
435
436 /* Check if atomics are supported. */
437 if (!is_extension_available(device, "cl_khr_global_int32_base_atomics")) {
438 log_info("WARNING: Base atomics not supported (cl_khr_global_int32_base_atomics). Test will not be guaranteed to catch overlaping thread dimensions.\n");
439 use_atomics = 0;
440 }
441
442 if (quick_test)
443 log_info("WARNING: Running quick test. This will only test the base dimensions (power of two) and base-1 with all local threads fixed in one dim.\n");
444
445 // Verify that we can test this many dimensions
446 err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(device_max_dimensions), &device_max_dimensions, NULL);
447 test_error(err, "clGetDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed");
448
449 if (dimensions > device_max_dimensions) {
450 log_info("Can not test %d dimensions when device only supports %d.\n", dimensions, device_max_dimensions);
451 return 0;
452 }
453
454 log_info("Setting random seed to 0.\n");
455
456 if (gHasLong) {
457 if (use_atomics) {
458 err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_long, "test_thread_dimension_atomic" );
459 } else {
460 err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_long, "test_thread_dimension_not_atomic" );
461 }
462 } else {
463 if (use_atomics) {
464 err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_not_long, "test_thread_dimension_atomic" );
465 } else {
466 err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_not_long, "test_thread_dimension_not_atomic" );
467 }
468 }
469 test_error( err, "Unable to create testing kernel" );
470
471 err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
472 test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
473
474 // Get the maximum sizes supported by this device
475 size_t max_workgroup_size = 0;
476 cl_ulong max_allocation = 0;
477 cl_ulong max_physical = 0;
478 int found_size = 0;
479
480 err = get_maximums(kernel, context,
481 &max_workgroup_size, &max_allocation, &max_physical);
482
483 // Make sure we don't try to allocate more than half the physical memory present.
484 if (max_allocation > (max_physical/2)) {
485 log_info("Limiting max allocation to half of the maximum physical memory (%gMB of %gMB physical).\n",
486 (max_physical/2/(1024.0*1024.0)), (max_physical/(1024.0*1024.0)));
487 max_allocation = max_physical/2;
488 }
489
490 // Limit the maximum we'll allocate for this test to 512 to be reasonable.
491 if (max_allocation > 1024*1024*512) {
492 log_info("Limiting max allocation to 512MB from device maximum allocation of %gMB.\n", (max_allocation/1024.0/1024.0));
493 max_allocation = 1024*1024*512;
494 }
495
496 max_memory_size = (cl_uint)(max_allocation);
497 if (max_memory_size > 512*1024*1024)
498 max_memory_size = 512*1024*1024;
499 memory_size = max_memory_size;
500
501 log_info("Memory allocation size to use is %gMB, max workgroup size is %d.\n", max_memory_size/(1024.0*1024.0), (int)max_workgroup_size);
502
503 while (!found_size && memory_size >= max_memory_size/8) {
504 array =
505 clCreateBuffer(context, CL_MEM_READ_WRITE, memory_size, NULL, &err);
506 if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE || err == CL_OUT_OF_HOST_MEMORY) {
507 memory_size -= max_memory_size/16;
508 continue;
509 }
510 if (err) {
511 print_error( err, "clCreateBuffer failed");
512 return -1;
513 }
514 found_size = 1;
515 }
516
517 if (!found_size) {
518 log_error("Failed to find a working size greater than 1/8th of the reported allocation size.\n");
519 return -1;
520 }
521
522 if (memory_size < max_memory_size) {
523 log_info("Note: failed to allocate %gMB, using %gMB instead.\n", max_memory_size/(1024.0*1024.0), memory_size/(1024.0*1024.0));
524 }
525
526 int errors = 0;
527 // Each dimension's size is multiplied by this amount on each iteration.
528 // uint size_increase_per_iteration = 4;
529 // 1 test at the specified size
530 // 2 tests with each dimensions +/- 1
531 // 2 tests with all dimensions +/- 1
532 // 2 random tests
533 cl_uint tests_per_size = 1 + 2*dimensions + 2 + 2;
534
535 // 1 test with 1 as the local threads in each dimensions
536 // 1 test with all the local threads in each dimension
537 // 2 random tests
538 cl_uint local_tests_per_size = 1 + dimensions + 2;
539 if (explicit_local == 0)
540 local_tests_per_size = 1;
541
542 max_x_size=1, min_x_size=1, max_y_size=1, min_y_size=1, max_z_size=1, min_z_size=1;
543
544 if (dimensions > 3) {
545 log_error("Invalid dimensions: %d\n", dimensions);
546 return -1;
547 }
548 max_x_size = max_dim;
549 min_x_size = min_dim;
550 if (dimensions > 1) {
551 max_y_size = max_dim;
552 min_y_size = min_dim;
553 }
554 if (dimensions > 2) {
555 max_z_size = max_dim;
556 min_z_size = min_dim;
557 }
558
559 log_info("Testing with dimensions up to %s.\n", print_dimensions(max_x_size, max_y_size, max_z_size, dimensions));
560 cl_uint x_size, y_size, z_size;
561
562 d = init_genrand( gRandomSeed );
563 z_size = min_z_size;
564 while (z_size <= max_z_size) {
565 y_size = min_y_size;
566 while (y_size <= max_y_size) {
567 x_size = min_x_size;
568 while (x_size <= max_x_size) {
569
570 log_info("Base test size %s:\n", print_dimensions(x_size, y_size, z_size, dimensions));
571
572 cl_uint sub_test;
573 cl_uint final_x_size, final_y_size, final_z_size;
574 for (sub_test = 0; sub_test < tests_per_size; sub_test++) {
575 final_x_size = x_size;
576 final_y_size = y_size;
577 final_z_size = z_size;
578
579 if (sub_test == 0) {
580 if (DEBUG) log_info("\tTesting with base dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
581 } else if (quick_test) {
582 // If we are in quick mode just do 1 run with x-1, y-1, and z-1.
583 if (sub_test > 1)
584 break;
585 final_x_size--;
586 final_y_size--;
587 final_z_size--;
588 set_min(&final_x_size, &final_y_size, &final_z_size);
589 if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
590 } else if (sub_test <= dimensions*2) {
591 int dim_to_change = (sub_test-1)%dimensions;
592 //log_info ("dim_to_change: %d (sub_test:%d) dimensions %d\n", dim_to_change,sub_test, dimensions);
593 int up_down = (sub_test > dimensions) ? 0 : 1;
594
595 if (dim_to_change == 0) {
596 final_x_size += (up_down) ? -1 : +1;
597 } else if (dim_to_change == 1) {
598 final_y_size += (up_down) ? -1 : +1;
599 } else if (dim_to_change == 2) {
600 final_z_size += (up_down) ? -1 : +1;
601 } else {
602 log_error("Invalid dim_to_change: %d\n", dim_to_change);
603 return -1;
604 }
605 set_min(&final_x_size, &final_y_size, &final_z_size);
606 if (DEBUG) log_info("\tTesting with one base dimension +/- 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
607 } else if (sub_test == (dimensions*2+1)) {
608 if (dimensions == 1)
609 continue;
610 final_x_size--;
611 final_y_size--;
612 final_z_size--;
613 set_min(&final_x_size, &final_y_size, &final_z_size);
614 if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
615 } else if (sub_test == (dimensions*2+2)) {
616 if (dimensions == 1)
617 continue;
618 final_x_size++;
619 final_y_size++;
620 final_z_size++;
621 set_min(&final_x_size, &final_y_size, &final_z_size);
622 if (DEBUG) log_info("\tTesting with all base dimensions + 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
623 } else {
624 final_x_size = (int)get_random_float(0, (x_size/size_increase_per_iteration), d)+x_size/size_increase_per_iteration;
625 final_y_size = (int)get_random_float(0, (y_size/size_increase_per_iteration), d)+y_size/size_increase_per_iteration;
626 final_z_size = (int)get_random_float(0, (z_size/size_increase_per_iteration), d)+z_size/size_increase_per_iteration;
627 set_min(&final_x_size, &final_y_size, &final_z_size);
628 if (DEBUG) log_info("\tTesting with random dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
629 }
630
631 if (limit_size && final_x_size*final_y_size*final_z_size >= MAX_TOTAL_GLOBAL_THREADS_FOR_TEST) {
632 log_info("Skipping size %s as it exceeds max test threads of %d.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions), MAX_TOTAL_GLOBAL_THREADS_FOR_TEST);
633 continue;
634 }
635
636 cl_uint local_test;
637 cl_uint local_x_size, local_y_size, local_z_size;
638 cl_uint previous_local_x_size=0, previous_local_y_size=0, previous_local_z_size=0;
639 for (local_test = 0; local_test < local_tests_per_size; local_test++) {
640
641 local_x_size = 1;
642 local_y_size = 1;
643 local_z_size = 1;
644
645 if (local_test == 0) {
646 } else if (local_test <= dimensions) {
647 int dim_to_change = (local_test-1)%dimensions;
648 if (dim_to_change == 0) {
649 local_x_size = (cl_uint)max_workgroup_size;
650 } else if (dim_to_change == 1) {
651 local_y_size = (cl_uint)max_workgroup_size;
652 } else if (dim_to_change == 2) {
653 local_z_size = (cl_uint)max_workgroup_size;
654 } else {
655 log_error("Invalid dim_to_change: %d\n", dim_to_change);
656 free_mtdata(d);
657 return -1;
658 }
659 } else {
660 local_x_size = (int)get_random_float(1, (int)max_workgroup_size, d);
661 while ((local_x_size > 1) && (final_x_size%local_x_size != 0))
662 local_x_size--;
663 int remainder = (int)floor((double)max_workgroup_size/local_x_size);
664 // Evenly prefer dimensions 2 and 1 first
665 if (local_test % 2) {
666 if (dimensions > 1) {
667 local_y_size = (int)get_random_float(1, (int)remainder, d);
668 while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
669 local_y_size--;
670 remainder = (int)floor((double)remainder/local_y_size);
671 }
672 if (dimensions > 2) {
673 local_z_size = (int)get_random_float(1, (int)remainder, d);
674 while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
675 local_z_size--;
676 }
677 } else {
678 if (dimensions > 2) {
679 local_z_size = (int)get_random_float(1, (int)remainder, d);
680 while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
681 local_z_size--;
682 remainder = (int)floor((double)remainder/local_z_size);
683 }
684 if (dimensions > 1) {
685 local_y_size = (int)get_random_float(1, (int)remainder, d);
686 while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
687 local_y_size--;
688 }
689 }
690 }
691
692 // Put all the threads in one dimension to speed up the test in quick mode.
693 if (quick_test) {
694 local_y_size = 1;
695 local_z_size = 1;
696 local_x_size = 1;
697 if (final_z_size > final_y_size && final_z_size > final_x_size)
698 local_z_size = (cl_uint)max_workgroup_size;
699 else if (final_y_size > final_x_size)
700 local_y_size = (cl_uint)max_workgroup_size;
701 else
702 local_x_size = (cl_uint)max_workgroup_size;
703 }
704
705 if (local_x_size > max_local_workgroup_size[0])
706 local_x_size = (int)max_local_workgroup_size[0];
707 if (dimensions > 1 && local_y_size > max_local_workgroup_size[1])
708 local_y_size = (int)max_local_workgroup_size[1];
709 if (dimensions > 2 && local_z_size > max_local_workgroup_size[2])
710 local_z_size = (int)max_local_workgroup_size[2];
711
712 // Cleanup the local dimensions
713 while ((local_x_size > 1) && (final_x_size%local_x_size != 0))
714 local_x_size--;
715 while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
716 local_y_size--;
717 while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
718 local_z_size--;
719 if ((previous_local_x_size == local_x_size) && (previous_local_y_size == local_y_size) && (previous_local_z_size == local_z_size))
720 continue;
721
722 if (explicit_local == 0) {
723 local_x_size = 0;
724 local_y_size = 0;
725 local_z_size = 0;
726 }
727
728 if (DEBUG) log_info("\t\tTesting local size %s.\n", print_dimensions(local_x_size, local_y_size, local_z_size, dimensions));
729
730 if (explicit_local == 0) {
731 log_info("\tTesting global %s local [NULL]...\n",
732 print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
733 } else {
734 log_info("\tTesting global %s local %s...\n",
735 print_dimensions(final_x_size, final_y_size, final_z_size, dimensions),
736 print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions));
737 }
738
739 // Avoid running with very small local sizes on very large global sizes
740 cl_uint total_local_size = local_x_size * local_y_size * local_z_size;
741 long total_global_size = final_x_size * final_y_size * final_z_size;
742 if (total_local_size < max_workgroup_size) {
743 if (total_global_size > 16384*16384) {
744 if (total_local_size < 64) {
745 log_info("Skipping test as local_size is small and it will take a long time.\n");
746 continue;
747 }
748 }
749 }
750
751 err = run_test(context, queue, kernel, array, memory_size, dimensions,
752 final_x_size, final_y_size, final_z_size,
753 local_x_size, local_y_size, local_z_size, explicit_local);
754
755 // If we failed to execute, then return so we don't crash.
756 if (err < 0) {
757 clReleaseMemObject(array);
758 clReleaseKernel(kernel);
759 clReleaseProgram(program);
760 free_mtdata(d);
761 return -1;
762 }
763
764 // Otherwise, if we had errors add them up.
765 if (err) {
766 log_error("Test global %s local %s failed.\n",
767 print_dimensions(final_x_size, final_y_size, final_z_size, dimensions),
768 print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions));
769 errors++;
770 clReleaseMemObject(array);
771 clReleaseKernel(kernel);
772 clReleaseProgram(program);
773 free_mtdata(d);
774 return -1;
775 }
776
777
778 previous_local_x_size = local_x_size;
779 previous_local_y_size = local_y_size;
780 previous_local_z_size = local_z_size;
781
782 // Only test one config in quick mode.
783 if (quick_test)
784 break;
785 } // local_test size
786 } // sub_test
787 // Increment the x_size
788 if (x_size == max_x_size)
789 break;
790 x_size *= size_increase_per_iteration;
791 if (x_size > max_x_size)
792 x_size = max_x_size;
793 } // x_size
794 // Increment the y_size
795 if (y_size == max_y_size)
796 break;
797 y_size *= size_increase_per_iteration;
798 if (y_size > max_y_size)
799 y_size = max_y_size;
800 } // y_size
801 // Increment the z_size
802 if (z_size == max_z_size)
803 break;
804 z_size *= size_increase_per_iteration;
805 if (z_size > max_z_size)
806 z_size = max_z_size;
807 } // z_size
808
809
810 free_mtdata(d);
811 clReleaseMemObject(array);
812 clReleaseKernel(kernel);
813 clReleaseProgram(program);
814 if (errors)
815 log_error("%d total errors.\n", errors);
816 return errors;
817
818 }
819
820 #define QUICK 1
821 #define FULL 0
822
test_quick_1d_explicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)823 int test_quick_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
824 {
825 return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*512, QUICK, 4, 1);
826 }
827
test_quick_2d_explicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)828 int test_quick_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
829 {
830 return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, QUICK, 16, 1);
831 }
832
test_quick_3d_explicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)833 int test_quick_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
834 {
835 return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, QUICK, 32, 1);
836 }
837
838
test_quick_1d_implicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)839 int test_quick_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
840 {
841 return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*256, QUICK, 4, 0);
842 }
843
test_quick_2d_implicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)844 int test_quick_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
845 {
846 return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, QUICK, 16, 0);
847 }
848
test_quick_3d_implicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)849 int test_quick_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
850 {
851 return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, QUICK, 32, 0);
852 }
853
854
test_full_1d_explicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)855 int test_full_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
856 {
857 return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*512, FULL, 4, 1);
858 }
859
test_full_2d_explicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)860 int test_full_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
861 {
862 return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, FULL, 16, 1);
863 }
864
test_full_3d_explicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)865 int test_full_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
866 {
867 return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, FULL, 32, 1);
868 }
869
870
test_full_1d_implicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)871 int test_full_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
872 {
873 return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*256, FULL, 4, 0);
874 }
875
test_full_2d_implicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)876 int test_full_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
877 {
878 return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, FULL, 16, 0);
879 }
880
test_full_3d_implicit_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)881 int test_full_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
882 {
883 return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, FULL, 32, 0);
884 }
885
886