• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17 
18 #include <algorithm>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24 
25 #include "../../test_common/harness/conversions.h"
26 #include "procs.h"
27 
28 static const char *async_global_to_local_kernel2D =
29     "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
30     "%s\n" // optional pragma string
31     "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
32     "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
33     "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
34     "{\n"
35     " int i, j;\n"
36     // Zero the local storage first
37     " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
38     "   for(j=0; j<numElementsPerLine; j++)\n"
39     "     localBuffer[ (get_local_id( 0 "
40     ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
41     "(%s)(%s)0;\n"
42     // Do this to verify all kernels are done zeroing the local buffer before we
43     // try the copy
44     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
45     "    event_t event;\n"
46     "    event = async_work_group_copy_2D2D( (__local %s*)localBuffer, "
47     "(__global const "
48     "%s*)(src+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
49     "srcStride)), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, "
50     "srcStride, dstStride, 0 );\n"
51     // Wait for the copy to complete, then verify by manually copying to the
52     // dest
53     "     wait_group_events( 1, &event );\n"
54     " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
55     "   for(j=0; j<numElementsPerLine; j++)\n"
56     "     dst[ (get_global_id( 0 "
57     ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = "
58     "localBuffer[ (get_local_id( 0 "
59     ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ];\n"
60     "}\n";
61 
62 static const char *async_local_to_global_kernel2D =
63     "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n"
64     "%s\n" // optional pragma string
65     "__kernel void test_fn( const __global %s *src, __global %s *dst, __local "
66     "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int "
67     "lineCopiesPerWorkItem, int srcStride, int dstStride )\n"
68     "{\n"
69     " int i, j;\n"
70     // Zero the local storage first
71     " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
72     "   for(j=0; j<numElementsPerLine; j++)\n"
73     "     localBuffer[ (get_local_id( 0 "
74     ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = "
75     "(%s)(%s)0;\n"
76     // Do this to verify all kernels are done zeroing the local buffer before we
77     // try the copy
78     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
79     " for(i=0; i<lineCopiesPerWorkItem; i++)\n"
80     "   for(j=0; j<numElementsPerLine; j++)\n"
81     "     localBuffer[ (get_local_id( 0 "
82     ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = src[ "
83     "(get_global_id( 0 )*lineCopiesPerWorkItem+i)*(numElementsPerLine + "
84     "srcStride)+j ];\n"
85     // Do this to verify all kernels are done copying to the local buffer before
86     // we try the copy
87     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
88     "    event_t event;\n"
89     "    event = async_work_group_copy_2D2D((__global "
90     "%s*)(dst+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + "
91     "dstStride)), (__local const %s*)localBuffer, (size_t)numElementsPerLine, "
92     "(size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0 );\n"
93     "    wait_group_events( 1, &event );\n"
94     "}\n";
95 
test_copy2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,ExplicitType vecType,int vecSize,int srcStride,int dstStride,bool localIsDst)96 int test_copy2D(cl_device_id deviceID, cl_context context,
97                 cl_command_queue queue, const char *kernelCode,
98                 ExplicitType vecType, int vecSize, int srcStride, int dstStride,
99                 bool localIsDst)
100 {
101     int error;
102     clProgramWrapper program;
103     clKernelWrapper kernel;
104     clMemWrapper streams[2];
105     size_t threads[1], localThreads[1];
106     void *inBuffer, *outBuffer, *outBufferCopy;
107     MTdata d;
108     char vecNameString[64];
109     vecNameString[0] = 0;
110     if (vecSize == 1)
111         sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
112     else
113         sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType),
114                 vecSize);
115 
116     size_t elementSize = get_explicit_type_size(vecType) * vecSize;
117     log_info("Testing %s with srcStride = %d, dstStride = %d\n", vecNameString,
118              srcStride, dstStride);
119 
120     cl_long max_local_mem_size;
121     error =
122         clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
123                         sizeof(max_local_mem_size), &max_local_mem_size, NULL);
124     test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
125 
126     cl_long max_global_mem_size;
127     error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
128                             sizeof(max_global_mem_size), &max_global_mem_size,
129                             NULL);
130     test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
131 
132     cl_long max_alloc_size;
133     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
134                             sizeof(max_alloc_size), &max_alloc_size, NULL);
135     test_error(error,
136                "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
137 
138     if (max_alloc_size > max_global_mem_size / 2)
139         max_alloc_size = max_global_mem_size / 2;
140 
141     unsigned int num_of_compute_devices;
142     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
143                             sizeof(num_of_compute_devices),
144                             &num_of_compute_devices, NULL);
145     test_error(error,
146                "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
147 
148     char programSource[4096];
149     programSource[0] = 0;
150     char *programPtr;
151 
152     sprintf(programSource, kernelCode,
153             vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable"
154                                : "",
155             vecNameString, vecNameString, vecNameString, vecNameString,
156             get_explicit_type_name(vecType), vecNameString, vecNameString);
157     // log_info("program: %s\n", programSource);
158     programPtr = programSource;
159 
160     error = create_single_kernel_helper(context, &program, &kernel, 1,
161                                         (const char **)&programPtr, "test_fn");
162     test_error(error, "Unable to create testing kernel");
163 
164     size_t max_workgroup_size;
165     error = clGetKernelWorkGroupInfo(
166         kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
167         &max_workgroup_size, NULL);
168     test_error(
169         error,
170         "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
171 
172     size_t max_local_workgroup_size[3];
173     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
174                             sizeof(max_local_workgroup_size),
175                             max_local_workgroup_size, NULL);
176     test_error(error,
177                "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
178 
179     // Pick the minimum of the device and the kernel
180     if (max_workgroup_size > max_local_workgroup_size[0])
181         max_workgroup_size = max_local_workgroup_size[0];
182 
183     size_t numElementsPerLine = 10;
184     size_t lineCopiesPerWorkItem = 13;
185     elementSize =
186         get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize);
187     size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize
188         * (numElementsPerLine + (localIsDst ? dstStride : srcStride));
189     size_t maxLocalWorkgroupSize =
190         (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
191 
192     // Calculation can return 0 on embedded devices due to 1KB local mem limit
193     if (maxLocalWorkgroupSize == 0)
194     {
195         maxLocalWorkgroupSize = 1;
196     }
197 
198     size_t localWorkgroupSize = maxLocalWorkgroupSize;
199     if (maxLocalWorkgroupSize > max_workgroup_size)
200         localWorkgroupSize = max_workgroup_size;
201 
202     size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride)
203         / (numElementsPerLine + srcStride);
204     size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride)
205         / (numElementsPerLine + dstStride);
206     size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut);
207     size_t maxLocalWorkgroups =
208         maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
209 
210     size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem
211         - (localIsDst ? dstStride : srcStride);
212     size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups);
213     size_t totalLines =
214         numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
215     size_t inBufferSize = elementSize
216         * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride);
217     size_t outBufferSize = elementSize
218         * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride);
219     size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize;
220 
221     inBuffer = (void *)malloc(inBufferSize);
222     outBuffer = (void *)malloc(outBufferSize);
223     outBufferCopy = (void *)malloc(outBufferSize);
224 
225     cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt,
226         lineCopiesPerWorkgroup;
227     lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem;
228     numElementsPerLineInt = (int)numElementsPerLine;
229     lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize);
230 
231     log_info(
232         "Global: %d, local %d, local buffer %db, global in buffer %db, "
233         "global out buffer %db, each work group will copy %d lines and each "
234         "work item item will copy %d lines.\n",
235         (int)globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize,
236         (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup,
237         lineCopiesPerWorkItemInt);
238 
239     threads[0] = globalWorkgroupSize;
240     localThreads[0] = localWorkgroupSize;
241 
242     d = init_genrand(gRandomSeed);
243     generate_random_data(
244         vecType, inBufferSize / get_explicit_type_size(vecType), d, inBuffer);
245     generate_random_data(
246         vecType, outBufferSize / get_explicit_type_size(vecType), d, outBuffer);
247     free_mtdata(d);
248     d = NULL;
249     memcpy(outBufferCopy, outBuffer, outBufferSize);
250 
251     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
252                                 inBuffer, &error);
253     test_error(error, "Unable to create input buffer");
254     streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
255                                 outBuffer, &error);
256     test_error(error, "Unable to create output buffer");
257 
258     error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
259     test_error(error, "Unable to set kernel argument");
260     error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
261     test_error(error, "Unable to set kernel argument");
262     error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
263     test_error(error, "Unable to set kernel argument");
264     error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
265                            &numElementsPerLineInt);
266     test_error(error, "Unable to set kernel argument");
267     error = clSetKernelArg(kernel, 4, sizeof(lineCopiesPerWorkgroup),
268                            &lineCopiesPerWorkgroup);
269     test_error(error, "Unable to set kernel argument");
270     error = clSetKernelArg(kernel, 5, sizeof(lineCopiesPerWorkItemInt),
271                            &lineCopiesPerWorkItemInt);
272     test_error(error, "Unable to set kernel argument");
273     error = clSetKernelArg(kernel, 6, sizeof(srcStride), &srcStride);
274     test_error(error, "Unable to set kernel argument");
275     error = clSetKernelArg(kernel, 7, sizeof(dstStride), &dstStride);
276     test_error(error, "Unable to set kernel argument");
277 
278     // Enqueue
279     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
280                                    localThreads, 0, NULL, NULL);
281     test_error(error, "Unable to queue kernel");
282 
283     // Read
284     error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
285                                 outBuffer, 0, NULL, NULL);
286     test_error(error, "Unable to read results");
287 
288     // Verify
289     int failuresPrinted = 0;
290     // Verify
291     size_t typeSize = get_explicit_type_size(vecType) * vecSize;
292     for (int i = 0;
293          i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize;
294          i += elementSize)
295     {
296         for (int j = 0; j < (int)numElementsPerLine * elementSize;
297              j += elementSize)
298         {
299             int inIdx = i * (numElementsPerLine + srcStride) + j;
300             int outIdx = i * (numElementsPerLine + dstStride) + j;
301             if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
302                        typeSize)
303                 != 0)
304             {
305                 unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
306                 unsigned char *outchar = (unsigned char *)outBuffer + outIdx;
307                 char values[4096];
308                 values[0] = 0;
309 
310                 if (failuresPrinted == 0)
311                 {
312                     // Print first failure message
313                     log_error("ERROR: Results of copy did not validate!\n");
314                 }
315                 sprintf(values + strlen(values), "%d -> [", inIdx);
316                 for (int k = 0; k < (int)elementSize; k++)
317                     sprintf(values + strlen(values), "%2x ", inchar[k]);
318                 sprintf(values + strlen(values), "] != [");
319                 for (int k = 0; k < (int)elementSize; k++)
320                     sprintf(values + strlen(values), "%2x ", outchar[k]);
321                 sprintf(values + strlen(values), "]");
322                 log_error("%s\n", values);
323                 failuresPrinted++;
324             }
325 
326             if (failuresPrinted > 5)
327             {
328                 log_error("Not printing further failures...\n");
329                 return -1;
330             }
331         }
332         if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1)
333                 * elementSize)
334         {
335             int outIdx = i * (numElementsPerLine + dstStride)
336                 + numElementsPerLine * elementSize;
337             if (memcmp(((char *)outBuffer) + outIdx,
338                        ((char *)outBufferCopy) + outIdx,
339                        dstStride * elementSize)
340                 != 0)
341             {
342                 if (failuresPrinted == 0)
343                 {
344                     // Print first failure message
345                     log_error("ERROR: Results of copy did not validate!\n");
346                 }
347                 log_error(
348                     "2D copy corrupted data in output buffer in the stride "
349                     "offset of line %d\n",
350                     i);
351                 failuresPrinted++;
352             }
353             if (failuresPrinted > 5)
354             {
355                 log_error("Not printing further failures...\n");
356                 return -1;
357             }
358         }
359     }
360 
361     free(inBuffer);
362     free(outBuffer);
363     free(outBufferCopy);
364 
365     return failuresPrinted ? -1 : 0;
366 }
367 
test_copy2D_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool localIsDst)368 int test_copy2D_all_types(cl_device_id deviceID, cl_context context,
369                           cl_command_queue queue, const char *kernelCode,
370                           bool localIsDst)
371 {
372     ExplicitType vecType[] = {
373         kChar,  kUChar, kShort,  kUShort,          kInt, kUInt, kLong,
374         kULong, kFloat, kDouble, kNumExplicitTypes
375     };
376     unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
377     unsigned int smallTypesStrideSizes[] = { 0, 10, 100 };
378     unsigned int size, typeIndex, srcStride, dstStride;
379 
380     int errors = 0;
381 
382     if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
383     {
384         log_info(
385             "Device does not support extended async copies. Skipping test.\n");
386         return 0;
387     }
388 
389     for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++)
390     {
391         if (vecType[typeIndex] == kDouble
392             && !is_extension_available(deviceID, "cl_khr_fp64"))
393             continue;
394 
395         if ((vecType[typeIndex] == kLong || vecType[typeIndex] == kULong)
396             && !gHasLong)
397             continue;
398 
399         for (size = 0; vecSizes[size] != 0; size++)
400         {
401             if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size]
402                 <= 2) // small type
403             {
404                 for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes)
405                          / sizeof(smallTypesStrideSizes[0]);
406                      srcStride++)
407                 {
408                     for (dstStride = 0;
409                          dstStride < sizeof(smallTypesStrideSizes)
410                              / sizeof(smallTypesStrideSizes[0]);
411                          dstStride++)
412                     {
413                         if (test_copy2D(deviceID, context, queue, kernelCode,
414                                         vecType[typeIndex], vecSizes[size],
415                                         smallTypesStrideSizes[srcStride],
416                                         smallTypesStrideSizes[dstStride],
417                                         localIsDst))
418                         {
419                             errors++;
420                         }
421                     }
422                 }
423             }
424             // not a small type, check only zero stride
425             else if (test_copy2D(deviceID, context, queue, kernelCode,
426                                  vecType[typeIndex], vecSizes[size], 0, 0,
427                                  localIsDst))
428             {
429                 errors++;
430             }
431         }
432     }
433     if (errors) return -1;
434     return 0;
435 }
436 
test_async_copy_global_to_local2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)437 int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context,
438                                       cl_command_queue queue, int num_elements)
439 {
440     return test_copy2D_all_types(deviceID, context, queue,
441                                  async_global_to_local_kernel2D, true);
442 }
443 
test_async_copy_local_to_global2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)444 int test_async_copy_local_to_global2D(cl_device_id deviceID, cl_context context,
445                                       cl_command_queue queue, int num_elements)
446 {
447     return test_copy2D_all_types(deviceID, context, queue,
448                                  async_local_to_global_kernel2D, false);
449 }
450