• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 
24 
25 
26 #include "procs.h"
27 #include "harness/conversions.h"
28 
29 static const char *async_strided_global_to_local_kernel =
30 "%s\n" // optional pragma string
31 "%s__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
32 "{\n"
33 " int i;\n"
34 // Zero the local storage first
35 " for(i=0; i<copiesPerWorkItem; i++)\n"
36 "   localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (%s)(%s)0;\n"
37 // Do this to verify all kernels are done zeroing the local buffer before we try the copy
38 " barrier( CLK_LOCAL_MEM_FENCE );\n"
39 " event_t event;\n"
40 " event = async_work_group_strided_copy( (__local %s*)localBuffer, (__global const %s*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, 0 );\n"
41 // Wait for the copy to complete, then verify by manually copying to the dest
42 " wait_group_events( 1, &event );\n"
43 " for(i=0; i<copiesPerWorkItem; i++)\n"
44 "   dst[ get_global_id( 0 )*copiesPerWorkItem*stride+i*stride ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ];\n"
45 "}\n" ;
46 
47 static const char *async_strided_local_to_global_kernel =
48 "%s\n" // optional pragma string
49 "%s__kernel void test_fn( const __global %s *src, __global %s *dst, __local %s *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem, int stride )\n"
50 "{\n"
51 " int i;\n"
52 // Zero the local storage first
53 " for(i=0; i<copiesPerWorkItem; i++)\n"
54 "   localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (%s)(%s)0;\n"
55 // Do this to verify all kernels are done zeroing the local buffer before we try the copy
56 " barrier( CLK_LOCAL_MEM_FENCE );\n"
57 " for(i=0; i<copiesPerWorkItem; i++)\n"
58 "   localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = src[ get_global_id( 0 )*copiesPerWorkItem*stride+i*stride ];\n"
59 // Do this to verify all kernels are done copying to the local buffer before we try the copy
60 " barrier( CLK_LOCAL_MEM_FENCE );\n"
61 " event_t event;\n"
62 " event = async_work_group_strided_copy((__global %s*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const %s*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, 0 );\n"
63 " wait_group_events( 1, &event );\n"
64 "}\n" ;
65 
66 
test_strided_copy(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,ExplicitType vecType,int vecSize,int stride)67 int test_strided_copy(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode, ExplicitType vecType, int vecSize, int stride)
68 {
69     int error;
70     clProgramWrapper program;
71     clKernelWrapper kernel;
72     clMemWrapper streams[ 2 ];
73     size_t threads[ 1 ], localThreads[ 1 ];
74     void *inBuffer, *outBuffer;
75     MTdata d;
76     char vecNameString[64]; vecNameString[0] = 0;
77 
78     if (vecSize == 1)
79         sprintf(vecNameString, "%s", get_explicit_type_name(vecType));
80     else
81         sprintf(vecNameString, "%s%d", get_explicit_type_name(vecType), vecSize);
82 
83 
84     log_info("Testing %s\n", vecNameString);
85 
86     cl_long max_local_mem_size;
87     error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(max_local_mem_size), &max_local_mem_size, NULL);
88     test_error( error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
89 
90     unsigned int num_of_compute_devices;
91     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_of_compute_devices), &num_of_compute_devices, NULL);
92     test_error( error, "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
93 
94     char programSource[4096]; programSource[0]=0;
95     char *programPtr;
96 
97     sprintf(programSource, kernelCode,
98         vecType == kDouble ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "",
99         "",
100         vecNameString, vecNameString, vecNameString, vecNameString, get_explicit_type_name(vecType), vecNameString, vecNameString);
101     //log_info("program: %s\n", programSource);
102     programPtr = programSource;
103 
104     error = create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "test_fn" );
105     test_error( error, "Unable to create testing kernel" );
106 
107     size_t max_workgroup_size;
108     error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL);
109     test_error (error, "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
110 
111     size_t max_local_workgroup_size[3];
112     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
113     test_error (error, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
114 
115   // Pick the minimum of the device and the kernel
116     if (max_workgroup_size > max_local_workgroup_size[0])
117         max_workgroup_size = max_local_workgroup_size[0];
118 
119     size_t elementSize = get_explicit_type_size(vecType)* ((vecSize == 3) ? 4 : vecSize);
120 
121     cl_ulong max_global_mem_size;
122     error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(max_global_mem_size), &max_global_mem_size, NULL);
123     test_error (error, "clGetDeviceInfo failed for CL_DEVICE_GLOBAL_MEM_SIZE");
124 
125     if (max_global_mem_size > (cl_ulong)SIZE_MAX) {
126       max_global_mem_size = (cl_ulong)SIZE_MAX;
127     }
128 
129     cl_bool unified_mem;
130     error = clGetDeviceInfo(deviceID, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(unified_mem), &unified_mem, NULL);
131     test_error (error, "clGetDeviceInfo failed for CL_DEVICE_HOST_UNIFIED_MEMORY");
132 
133     int number_of_global_mem_buffers = (unified_mem) ? 4 : 2;
134 
135     size_t numberOfCopiesPerWorkitem = 3;
136     size_t localStorageSpacePerWorkitem = numberOfCopiesPerWorkitem*elementSize;
137     size_t maxLocalWorkgroupSize = (((int)max_local_mem_size/2)/localStorageSpacePerWorkitem);
138 
139     size_t localWorkgroupSize = maxLocalWorkgroupSize;
140     if (maxLocalWorkgroupSize > max_workgroup_size)
141         localWorkgroupSize = max_workgroup_size;
142 
143     size_t localBufferSize = localWorkgroupSize*elementSize*numberOfCopiesPerWorkitem;
144     size_t numberOfLocalWorkgroups = 579;//1111;
145 
146     // Reduce the numberOfLocalWorkgroups so that no more than 1/2 of CL_DEVICE_GLOBAL_MEM_SIZE is consumed
147     // by the allocated buffer. This is done to avoid resource  errors resulting from address space fragmentation.
148     size_t numberOfLocalWorkgroupsLimit = max_global_mem_size / (2 * number_of_global_mem_buffers * localBufferSize * stride);
149     if (numberOfLocalWorkgroups > numberOfLocalWorkgroupsLimit) numberOfLocalWorkgroups = numberOfLocalWorkgroupsLimit;
150 
151     size_t globalBufferSize = numberOfLocalWorkgroups*localBufferSize*stride;
152     size_t globalWorkgroupSize = numberOfLocalWorkgroups*localWorkgroupSize;
153 
154     inBuffer = (void*)malloc(globalBufferSize);
155     outBuffer = (void*)malloc(globalBufferSize);
156     memset(outBuffer, 0, globalBufferSize);
157 
158     cl_int copiesPerWorkItemInt, copiesPerWorkgroup;
159     copiesPerWorkItemInt = (int)numberOfCopiesPerWorkitem;
160     copiesPerWorkgroup = (int)(numberOfCopiesPerWorkitem*localWorkgroupSize);
161 
162     log_info("Global: %d, local %d, local buffer %db, global buffer %db, copy stride %d, each work group will copy %d elements and each work item item will copy %d elements.\n",
163                 (int) globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize, (int)globalBufferSize, (int)stride, copiesPerWorkgroup, copiesPerWorkItemInt);
164 
165     threads[0] = globalWorkgroupSize;
166     localThreads[0] = localWorkgroupSize;
167 
168     d = init_genrand( gRandomSeed );
169     generate_random_data( vecType, globalBufferSize/get_explicit_type_size(vecType), d, inBuffer );
170     free_mtdata(d); d = NULL;
171 
172     streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, inBuffer, &error );
173     test_error( error, "Unable to create input buffer" );
174     streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, globalBufferSize, outBuffer, &error );
175     test_error( error, "Unable to create output buffer" );
176 
177     error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] );
178     test_error( error, "Unable to set kernel argument" );
179     error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[ 1 ] );
180     test_error( error, "Unable to set kernel argument" );
181     error = clSetKernelArg( kernel, 2, localBufferSize, NULL );
182     test_error( error, "Unable to set kernel argument" );
183     error = clSetKernelArg( kernel, 3, sizeof(copiesPerWorkgroup), &copiesPerWorkgroup );
184     test_error( error, "Unable to set kernel argument" );
185     error = clSetKernelArg( kernel, 4, sizeof(copiesPerWorkItemInt), &copiesPerWorkItemInt );
186     test_error( error, "Unable to set kernel argument" );
187     error = clSetKernelArg( kernel, 5, sizeof(stride), &stride );
188     test_error( error, "Unable to set kernel argument" );
189 
190     // Enqueue
191     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
192     test_error( error, "Unable to queue kernel" );
193 
194     // Read
195     error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, globalBufferSize, outBuffer, 0, NULL, NULL );
196     test_error( error, "Unable to read results" );
197 
198     // Verify
199     size_t typeSize = get_explicit_type_size(vecType)* vecSize;
200     for (int i=0; i<(int)globalBufferSize; i+=(int)elementSize*(int)stride)
201     {
202         if (memcmp( ((char *)inBuffer)+i, ((char *)outBuffer)+i, typeSize) != 0 )
203         {
204             unsigned char * inchar = (unsigned char*)inBuffer + i;
205             unsigned char * outchar = (unsigned char*)outBuffer + i;
206             char values[4096];
207             values[0] = 0;
208 
209             log_error( "ERROR: Results of copy did not validate!\n" );
210             sprintf(values + strlen( values), "%d -> [", i);
211             for (int j=0; j<(int)elementSize; j++)
212                 sprintf(values + strlen( values), "%2x ", inchar[j]);
213             sprintf(values + strlen(values), "] != [");
214             for (int j=0; j<(int)elementSize; j++)
215                 sprintf(values + strlen( values), "%2x ", outchar[j]);
216             sprintf(values + strlen(values), "]");
217             log_error("%s\n", values);
218             free(inBuffer);
219             free(outBuffer);
220             return -1;
221         }
222     }
223 
224     free(inBuffer);
225     free(outBuffer);
226 
227     return 0;
228 }
229 
test_strided_copy_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode)230 int test_strided_copy_all_types(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode)
231 {
232     ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
233     unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 };
234     unsigned int strideSizes[] = { 1, 3, 4, 5, 0 };
235     unsigned int size, typeIndex, stride;
236 
237     int errors = 0;
238 
239     for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ )
240     {
241         if( vecType[ typeIndex ] == kDouble && !is_extension_available( deviceID, "cl_khr_fp64" ) )
242             continue;
243 
244         if (( vecType[ typeIndex ] == kLong || vecType[ typeIndex ] == kULong ) && !gHasLong )
245             continue;
246 
247         for( size = 0; vecSizes[ size ] != 0; size++ )
248         {
249             for( stride = 0; strideSizes[ stride ] != 0; stride++)
250             {
251                 if (test_strided_copy( deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], strideSizes[stride] ))
252                 {
253                     errors++;
254                 }
255             }
256         }
257     }
258     if (errors)
259         return -1;
260     return 0;
261 }
262 
263 
264 
265 
test_async_strided_copy_global_to_local(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)266 int test_async_strided_copy_global_to_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
267 {
268     return test_strided_copy_all_types( deviceID, context, queue, async_strided_global_to_local_kernel );
269 }
270 
test_async_strided_copy_local_to_global(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)271 int test_async_strided_copy_local_to_global(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
272 {
273     return test_strided_copy_all_types( deviceID, context, queue, async_strided_local_to_global_kernel );
274 }
275 
276