1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17
18 #include <algorithm>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24
25 #include "../../test_common/harness/conversions.h"
26 #include "procs.h"
27
28 static const char *async_global_to_local_kernel2D =
29 "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
30 "%s\n" // optional pragma string
31 "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
32 "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
33 "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
34 "{\n"
35 " int i, j;\n"
36 // Zero the local storage first
37 " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
38 " for(j=0; j<numElementsPerLine; j++)\n"
39 " localBuffer[ (get_local_id( 0 "
40 ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
41 "(%s)(%s)0;\n"
42 // Do this to verify all kernels are done zeroing the local buffer before we
43 // try the copy
44 " barrier( CLK_LOCAL_MEM_FENCE );\n"
45 " event_t event;\n"
46 " event = async_work_group_copy_2D2D( (__local %s*)localBuffer, "
47 "(__global const "
48 "%s*)(src+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
49 "srcStride)), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, "
50 "srcStride, dstStride, 0 );\n"
51 // Wait for the copy to complete, then verify by manually copying to the
52 // dest
53 " wait_group_events( 1, &event );\n"
54 " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
55 " for(j=0; j<numElementsPerLine; j++)\n"
56 " dst[ (get_global_id( 0 "
57 ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
58 "localBuffer[ (get_local_id( 0 "
59 ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ];\n"
60 "}\n";
61
62 static const char *async_local_to_global_kernel2D =
63 "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
64 "%s\n" // optional pragma string
65 "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
66 "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
67 "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
68 "{\n"
69 " int i, j;\n"
70 // Zero the local storage first
71 " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
72 " for(j=0; j<numElementsPerLine; j++)\n"
73 " localBuffer[ (get_local_id( 0 "
74 ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = "
75 "(%s)(%s)0;\n"
76 // Do this to verify all kernels are done zeroing the local buffer before we
77 // try the copy
78 " barrier( CLK_LOCAL_MEM_FENCE );\n"
79 " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
80 " for(j=0; j<numElementsPerLine; j++)\n"
81 " localBuffer[ (get_local_id( 0 "
82 ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = src[ "
83 "(get_global_id( 0 )*lineCopiesPerWorkItem+i)*(numElementsPerLine + "
84 "srcStride)+j ];\n"
85 // Do this to verify all kernels are done copying to the local buffer before
86 // we try the copy
87 " barrier( CLK_LOCAL_MEM_FENCE );\n"
88 " event_t event;\n"
89 " event = async_work_group_copy_2D2D((__global "
90 "%s*)(dst+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
91 "dstStride)), (__local const %s*)localBuffer, (size_t)numElementsPerLine, "
92 "(size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0 );\n"
93 " wait_group_events( 1, &event );\n"
94 "}\n";
95
test_copy2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,ExplicitType vecType,int vecSize,int srcStride,int dstStride,bool localIsDst)96 int test_copy2D(cl_device_id deviceID, cl_context context,
97 cl_command_queue queue, const char *kernelCode,
98 ExplicitType vecType, int vecSize, int srcStride, int dstStride,
99 bool localIsDst)
100 {
101 int error;
102 clProgramWrapper program;
103 clKernelWrapper kernel;
104 clMemWrapper streams[2];
105 size_t threads[1], localThreads[1];
106 void *inBuffer, *outBuffer, *outBufferCopy;
107 MTdata d;
108 char vecNameString[64];
109 vecNameString[0] = 0;
110 if (vecSize == 1)
111 sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
112 else
113 sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
114 vecSize);
115
116 size_t elementSize = get_explicit_type_size(vecType) * vecSize;
117 log_info("Testing %s with srcStride = %d, dstStride = %d\n", vecNameString,
118 srcStride, dstStride);
119
120 cl_long max_local_mem_size;
121 error =
122 clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
123 sizeof(max_local_mem_size), &max_local_mem_size, NULL);
124 test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
125
126 cl_long max_global_mem_size;
127 error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
128 sizeof(max_global_mem_size), &max_global_mem_size,
129 NULL);
130 test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
131
132 cl_long max_alloc_size;
133 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
134 sizeof(max_alloc_size), &max_alloc_size, NULL);
135 test_error(error,
136 "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
137
138 if (max_alloc_size > max_global_mem_size / 2)
139 max_alloc_size = max_global_mem_size / 2;
140
141 unsigned int num_of_compute_devices;
142 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
143 sizeof(num_of_compute_devices),
144 &num_of_compute_devices, NULL);
145 test_error(error,
146 "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
147
148 char programSource[4096];
149 programSource[0] = 0;
150 char *programPtr;
151
152 sprintf(programSource, kernelCode,
153 vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
154 : "",
155 vecNameString, vecNameString, vecNameString, vecNameString,
156 get_explicit_type_name(vecType), vecNameString, vecNameString);
157 // log_info("program: %s\n", programSource);
158 programPtr = programSource;
159
160 error = create_single_kernel_helper(context, &program, &kernel, 1,
161 (const char **)&programPtr, "test_fn");
162 test_error(error, "Unable to create testing kernel");
163
164 size_t max_workgroup_size;
165 error = clGetKernelWorkGroupInfo(
166 kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
167 &max_workgroup_size, NULL);
168 test_error(
169 error,
170 "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
171
172 size_t max_local_workgroup_size[3];
173 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
174 sizeof(max_local_workgroup_size),
175 max_local_workgroup_size, NULL);
176 test_error(error,
177 "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
178
179 // Pick the minimum of the device and the kernel
180 if (max_workgroup_size > max_local_workgroup_size[0])
181 max_workgroup_size = max_local_workgroup_size[0];
182
183 size_t numElementsPerLine = 10;
184 size_t lineCopiesPerWorkItem = 13;
185 elementSize =
186 get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
187 size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize
188 * (numElementsPerLine + (localIsDst ? dstStride : srcStride));
189 size_t maxLocalWorkgroupSize =
190 (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
191
192 // Calculation can return 0 on embedded devices due to 1KB local mem limit
193 if (maxLocalWorkgroupSize == 0)
194 {
195 maxLocalWorkgroupSize = 1;
196 }
197
198 size_t localWorkgroupSize = maxLocalWorkgroupSize;
199 if (maxLocalWorkgroupSize > max_workgroup_size)
200 localWorkgroupSize = max_workgroup_size;
201
202 size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride)
203 / (numElementsPerLine + srcStride);
204 size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride)
205 / (numElementsPerLine + dstStride);
206 size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut);
207 size_t maxLocalWorkgroups =
208 maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
209
210 size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
211 - (localIsDst ? dstStride : srcStride);
212 size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
213 size_t totalLines =
214 numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
215 size_t inBufferSize = elementSize
216 * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride);
217 size_t outBufferSize = elementSize
218 * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride);
219 size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
220
221 inBuffer = (void *)malloc(inBufferSize);
222 outBuffer = (void *)malloc(outBufferSize);
223 outBufferCopy = (void *)malloc(outBufferSize);
224
225 cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt,
226 lineCopiesPerWorkgroup;
227 lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem;
228 numElementsPerLineInt = (int)numElementsPerLine;
229 lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize);
230
231 log_info(
232 "Global: %d, local %d, local buffer %db, global in buffer %db, "
233 "global out buffer %db, each work group will copy %d lines and each "
234 "work item item will copy %d lines.\n",
235 (int)globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize,
236 (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup,
237 lineCopiesPerWorkItemInt);
238
239 threads[0] = globalWorkgroupSize;
240 localThreads[0] = localWorkgroupSize;
241
242 d = init_genrand(gRandomSeed);
243 generate_random_data(
244 vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
245 generate_random_data(
246 vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
247 free_mtdata(d);
248 d = NULL;
249 memcpy(outBufferCopy, outBuffer, outBufferSize);
250
251 streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
252 inBuffer, &error);
253 test_error(error, "Unable to create input buffer");
254 streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
255 outBuffer, &error);
256 test_error(error, "Unable to create output buffer");
257
258 error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
259 test_error(error, "Unable to set kernel argument");
260 error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
261 test_error(error, "Unable to set kernel argument");
262 error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
263 test_error(error, "Unable to set kernel argument");
264 error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
265 &numElementsPerLineInt);
266 test_error(error, "Unable to set kernel argument");
267 error = clSetKernelArg(kernel, 4, sizeof(lineCopiesPerWorkgroup),
268 &lineCopiesPerWorkgroup);
269 test_error(error, "Unable to set kernel argument");
270 error = clSetKernelArg(kernel, 5, sizeof(lineCopiesPerWorkItemInt),
271 &lineCopiesPerWorkItemInt);
272 test_error(error, "Unable to set kernel argument");
273 error = clSetKernelArg(kernel, 6, sizeof(srcStride), &srcStride);
274 test_error(error, "Unable to set kernel argument");
275 error = clSetKernelArg(kernel, 7, sizeof(dstStride), &dstStride);
276 test_error(error, "Unable to set kernel argument");
277
278 // Enqueue
279 error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
280 localThreads, 0, NULL, NULL);
281 test_error(error, "Unable to queue kernel");
282
283 // Read
284 error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
285 outBuffer, 0, NULL, NULL);
286 test_error(error, "Unable to read results");
287
288 // Verify
289 int failuresPrinted = 0;
290 // Verify
291 size_t typeSize = get_explicit_type_size(vecType) * vecSize;
292 for (int i = 0;
293 i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize;
294 i += elementSize)
295 {
296 for (int j = 0; j < (int)numElementsPerLine * elementSize;
297 j += elementSize)
298 {
299 int inIdx = i * (numElementsPerLine + srcStride) + j;
300 int outIdx = i * (numElementsPerLine + dstStride) + j;
301 if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
302 typeSize)
303 != 0)
304 {
305 unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
306 unsigned char *outchar = (unsigned char *)outBuffer + outIdx;
307 char values[4096];
308 values[0] = 0;
309
310 if (failuresPrinted == 0)
311 {
312 // Print first failure message
313 log_error("ERROR: Results of copy did not validate!\n");
314 }
315 sprintf(values + strlen(values), "%d -> [", inIdx);
316 for (int k = 0; k < (int)elementSize; k++)
317 sprintf(values + strlen(values), "%2x ", inchar[k]);
318 sprintf(values + strlen(values), "] != [");
319 for (int k = 0; k < (int)elementSize; k++)
320 sprintf(values + strlen(values), "%2x ", outchar[k]);
321 sprintf(values + strlen(values), "]");
322 log_error("%s\n", values);
323 failuresPrinted++;
324 }
325
326 if (failuresPrinted > 5)
327 {
328 log_error("Not printing further failures...\n");
329 return -1;
330 }
331 }
332 if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1)
333 * elementSize)
334 {
335 int outIdx = i * (numElementsPerLine + dstStride)
336 + numElementsPerLine * elementSize;
337 if (memcmp(((char *)outBuffer) + outIdx,
338 ((char *)outBufferCopy) + outIdx,
339 dstStride * elementSize)
340 != 0)
341 {
342 if (failuresPrinted == 0)
343 {
344 // Print first failure message
345 log_error("ERROR: Results of copy did not validate!\n");
346 }
347 log_error(
348 "2D copy corrupted data in output buffer in the stride "
349 "offset of line %d\n",
350 i);
351 failuresPrinted++;
352 }
353 if (failuresPrinted > 5)
354 {
355 log_error("Not printing further failures...\n");
356 return -1;
357 }
358 }
359 }
360
361 free(inBuffer);
362 free(outBuffer);
363 free(outBufferCopy);
364
365 return failuresPrinted ? -1 : 0;
366 }
367
test_copy2D_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool localIsDst)368 int test_copy2D_all_types(cl_device_id deviceID, cl_context context,
369 cl_command_queue queue, const char *kernelCode,
370 bool localIsDst)
371 {
372 ExplicitType vecType[] = {
373 kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong,
374 kULong, kFloat, kDouble, kNumExplicitTypes
375 };
376 unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
377 unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
378 unsigned int size, typeIndex, srcStride, dstStride;
379
380 int errors = 0;
381
382 if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
383 {
384 log_info(
385 "Device does not support extended async copies. Skipping test.\n");
386 return 0;
387 }
388
389 for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
390 {
391 if (vecType[typeIndex] == kDouble
392 && !is_extension_available(deviceID, "cl_khr_fp64"))
393 continue;
394
395 if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
396 && !gHasLong)
397 continue;
398
399 for (size = 0; vecSizes[size] != 0; size++)
400 {
401 if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
402 <= 2) // small type
403 {
404 for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes)
405 / sizeof(smallTypesStrideSizes[0]);
406 srcStride++)
407 {
408 for (dstStride = 0;
409 dstStride < sizeof(smallTypesStrideSizes)
410 / sizeof(smallTypesStrideSizes[0]);
411 dstStride++)
412 {
413 if (test_copy2D(deviceID, context, queue, kernelCode,
414 vecType[typeIndex], vecSizes[size],
415 smallTypesStrideSizes[srcStride],
416 smallTypesStrideSizes[dstStride],
417 localIsDst))
418 {
419 errors++;
420 }
421 }
422 }
423 }
424 // not a small type, check only zero stride
425 else if (test_copy2D(deviceID, context, queue, kernelCode,
426 vecType[typeIndex], vecSizes[size], 0, 0,
427 localIsDst))
428 {
429 errors++;
430 }
431 }
432 }
433 if (errors) return -1;
434 return 0;
435 }
436
test_async_copy_global_to_local2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)437 int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context,
438 cl_command_queue queue, int num_elements)
439 {
440 return test_copy2D_all_types(deviceID, context, queue,
441 async_global_to_local_kernel2D, true);
442 }
443
test_async_copy_local_to_global2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)444 int test_async_copy_local_to_global2D(cl_device_id deviceID, cl_context context,
445 cl_command_queue queue, int num_elements)
446 {
447 return test_copy2D_all_types(deviceID, context, queue,
448 async_local_to_global_kernel2D, false);
449 }
450