• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <algorithm>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <limits.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <vector>
24 
25 #include <CL/cl_half.h>
26 
27 #include "procs.h"
28 #include "harness/conversions.h"
29 #include "harness/errorHelpers.h"
30 #include "harness/stringHelpers.h"
31 #include "harness/typeWrappers.h"
32 
33 // Outputs debug information for stores
34 #define DEBUG 0
35 // Forces stores/loads to be done with offsets = tid
36 #define LINEAR_OFFSETS 0
37 #define NUM_LOADS    512
38 #define HFF(num) cl_half_from_float(num, halfRoundingMode)
39 #define HTF(num) cl_half_to_float(num)
40 
41 char pragma_str[128] = { 0 };
42 char mem_type[64] = { 0 };
43 char store_str[128] = { 0 };
44 char load_str[128] = { 0 };
45 
46 extern cl_half_rounding_mode halfRoundingMode;
47 
48 // clang-format off
49 static const char *store_pattern= "results[ tid ] = tmp;\n";
50 static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n";
51 static const char *load_pattern = "sSharedStorage[ i ] = src[ i ];\n";
52 static const char *load_patternV3 = "sSharedStorage[3*i] = src[ 3*i]; sSharedStorage[3*i+1] = src[3*i+1]; sSharedStorage[3*i+2] = src[3*i+2];\n";
53 static const char *kernel_pattern[] = {
54 pragma_str,
55 "#define STYPE %s\n"
56 "__kernel void test_fn( ", mem_type, " STYPE *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
57 "{\n"
58 "    int tid = get_global_id( 0 );\n"
59 "    %s%d tmp = vload%d( offsets[ tid ], ( (", mem_type, " STYPE *) src ) + alignmentOffsets[ tid ] );\n"
60 "    ", store_str,
61 "}\n"
62 };
63 
64 const char *pattern_local [] = {
65 pragma_str,
66 "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
67 "{\n"
68 "    int tid = get_global_id( 0 );\n"
69 "   int lid = get_local_id( 0 );\n"
70 "\n"
71 "    if( lid == 0 )\n"
72 "    {\n"
73 "        for( int i = 0; i < %d; i++ ) {\n"
74 "           ", load_str,
75 "        }\n"
76 "    }\n"
77 //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
78 //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
79 //  updated on all threads at that point
80 "   barrier( CLK_LOCAL_MEM_FENCE );\n"
81 "\n"
82 "    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
83 "    ", store_str,
84 "}\n" };
85 
86 const char *pattern_priv [] = {
87 pragma_str,
88 // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
89 // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
90 "#define PRIV_TYPE %s\n"
91 "#define PRIV_SIZE %d\n"
92 "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
93 "{\n"
94 "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
95 "    int tid = get_global_id( 0 );\n"
96 "\n"
97 "    for( int i = 0; i < PRIV_SIZE; i++ )\n"
98 "      sPrivateStorage[ i ] = src[ i ];\n"
99 //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
100 //  anybody else to sync up
101 "\n"
102 "    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
103 "    ", store_str,
104 "}\n"};
105 // clang-format on
106 
107 #pragma mark -------------------- vload harness --------------------------
108 
109 typedef void (*create_program_fn)(std::string &, size_t, ExplicitType, size_t,
110                                   size_t);
111 typedef int (*test_fn)(cl_device_id, cl_context, cl_command_queue, ExplicitType,
112                        unsigned int, create_program_fn, size_t);
113 
test_vload(cl_device_id device,cl_context context,cl_command_queue queue,ExplicitType type,unsigned int vecSize,create_program_fn createFn,size_t bufferSize)114 int test_vload(cl_device_id device, cl_context context, cl_command_queue queue,
115                ExplicitType type, unsigned int vecSize,
116                create_program_fn createFn, size_t bufferSize)
117 {
118     clProgramWrapper program;
119     clKernelWrapper kernel;
120     clMemWrapper streams[ 4 ];
121     MTdataHolder d(gRandomSeed);
122     const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;
123 
124     if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;
125 
126     size_t threads[ 1 ], localThreads[ 1 ];
127     clProtectedArray inBuffer( bufferSize );
128     cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
129     size_t numElements, typeSize, i;
130     unsigned int outVectorSize;
131 
132     pragma_str[0] = '\0';
133     if (type == kDouble)
134         std::snprintf(pragma_str, sizeof(pragma_str),
135                       "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
136     else if (type == kHalf)
137         std::snprintf(pragma_str, sizeof(pragma_str),
138                       "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
139 
140     typeSize = get_explicit_type_size( type );
141     numElements = bufferSize / ( typeSize * vecSize );
142     bufferSize = numElements * typeSize * vecSize;    // To account for rounding
143 
144     if (DEBUG) log_info("Testing: numLoads: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numLoads, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
145 
146     // Create some random input data and random offsets to load from
147     generate_random_data( type, numElements * vecSize, d, (void *)inBuffer );
148     for( i = 0; i < numLoads; i++ )
149     {
150         offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 1, d );
151         if( offsets[ i ] < numElements - 2 )
152             alignmentOffsets[ i ] = (cl_uint)random_in_range( 0, (int)vecSize - 1, d );
153         else
154             alignmentOffsets[ i ] = 0;
155         if (LINEAR_OFFSETS) offsets[i] = (cl_uint)i;
156     }
157     if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n");
158 
159     // 32-bit fixup
160     outVectorSize = vecSize;
161 
162     // Declare output buffers now
163     std::vector<char> outBuffer(numLoads * typeSize * outVectorSize);
164     std::vector<char> referenceBuffer(numLoads * typeSize * vecSize);
165 
166     // Create the program
167     std::string programSrc;
168     createFn( programSrc, numElements, type, vecSize, outVectorSize);
169 
170     // Create our kernel
171     const char *ptr = programSrc.c_str();
172     cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
173                                                &ptr, "test_fn");
174     test_error( error, "Unable to create testing kernel" );
175     if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
176 
177     // Get the number of args to differentiate the kernels with local storage. (They have 5)
178     cl_uint numArgs;
179     error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
180     test_error( error, "clGetKernelInfo failed");
181 
182     // Set up parameters
183     streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, bufferSize, (void *)inBuffer, &error );
184     test_error( error, "Unable to create kernel stream" );
185     streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(offsets[0]), offsets, &error );
186     test_error( error, "Unable to create kernel stream" );
187     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
188     test_error( error, "Unable to create kernel stream" );
189     streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
190                                 numLoads * typeSize * outVectorSize,
191                                 (void *)outBuffer.data(), &error);
192     test_error( error, "Unable to create kernel stream" );
193 
194     // Set parameters and run
195     if (numArgs == 5) {
196         // We need to set the size of the local storage
197         error = clSetKernelArg(kernel, 0, bufferSize, NULL);
198         test_error( error, "clSetKernelArg for buffer failed");
199         for( i = 0; i < 4; i++ )
200         {
201             error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
202             test_error( error, "Unable to set kernel argument" );
203         }
204     } else {
205         // No local storage
206         for( i = 0; i < 4; i++ )
207         {
208             error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
209             test_error( error, "Unable to set kernel argument" );
210         }
211     }
212 
213     threads[ 0 ] = numLoads;
214     error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
215     test_error( error, "Unable to get local thread size" );
216 
217     error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
218     test_error( error, "Unable to exec kernel" );
219 
220     // Get the results
221     error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
222                                 numLoads * typeSize * outVectorSize
223                                     * sizeof(cl_char),
224                                 (void *)outBuffer.data(), 0, NULL, NULL);
225     test_error( error, "Unable to read results" );
226 
227     // Create the reference results
228     referenceBuffer.assign(numLoads * typeSize * vecSize, 0);
229     for( i = 0; i < numLoads; i++ )
230     {
231         memcpy(&referenceBuffer[i * typeSize * vecSize],
232                ((char *)(void *)inBuffer)
233                    + ((offsets[i] * vecSize) + alignmentOffsets[i]) * typeSize,
234                typeSize * vecSize);
235     }
236 
237     // Validate the results now
238     char *expected = referenceBuffer.data();
239     char *actual = outBuffer.data();
240     char *in = (char *)(void *)inBuffer;
241 
242     if (DEBUG) {
243         log_info("Memory contents:\n");
244         char inString[1024];
245         char expectedString[1024], actualString[1024];
246         for (i=0; i<numElements; i++) {
247             if (i < numLoads) {
248                 log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
249                          GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
250                          GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ),
251                          offsets[i], alignmentOffsets[i]);
252                 if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*outVectorSize]), typeSize * vecSize) != 0)
253                     log_error(" << ERROR\n");
254                 else
255                     log_info("\n");
256             } else {
257                 log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
258                          GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
259                          GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ));
260             }
261         }
262     }
263 
264     for( i = 0; i < numLoads; i++ )
265     {
266         if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
267         {
268             char expectedString[ 1024 ], actualString[ 1024 ];
269             log_error( "ERROR: Data sample %d for vload of %s%d did not validate (expected {%s}, got {%s}, loaded from offset %d)\n",
270                       (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
271                       GetDataVectorString( actual, typeSize, vecSize, actualString ), (int)offsets[ i ] );
272             return 1;
273         }
274         expected += typeSize * vecSize;
275         actual += typeSize * outVectorSize;
276     }
277     return 0;
278 }
279 
280 template <test_fn test_func_ptr>
test_vset(cl_device_id device,cl_context context,cl_command_queue queue,create_program_fn createFn,size_t bufferSize)281 int test_vset(cl_device_id device, cl_context context, cl_command_queue queue,
282               create_program_fn createFn, size_t bufferSize)
283 {
284     std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
285                                           kInt,   kUInt,  kLong,  kULong,
286                                           kFloat, kHalf,  kDouble };
287     unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
288     const char *size_names[] = { "2", "3", "4", "8", "16"};
289     int error = 0;
290 
291     log_info("Testing with buffer size of %d.\n", (int)bufferSize);
292 
293     bool hasDouble = is_extension_available(device, "cl_khr_fp64");
294     bool hasHalf = is_extension_available(device, "cl_khr_fp16");
295 
296     for (unsigned typeIdx = 0; typeIdx < vecType.size(); typeIdx++)
297     {
298         if (vecType[typeIdx] == kDouble && !hasDouble)
299             continue;
300         else if (vecType[typeIdx] == kHalf && !hasHalf)
301             continue;
302         else if ((vecType[typeIdx] == kLong || vecType[typeIdx] == kULong)
303                  && !gHasLong)
304             continue;
305 
306         for (unsigned sizeIdx = 0; vecSizes[sizeIdx] != 0; sizeIdx++)
307         {
308             log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
309 
310             int error_this_type =
311                 test_func_ptr(device, context, queue, vecType[typeIdx],
312                               vecSizes[sizeIdx], createFn, bufferSize);
313             if (error_this_type) {
314                 error += error_this_type;
315                 log_error("Failure; skipping further sizes for this type.");
316                 break;
317             }
318         }
319     }
320     return error;
321 }
322 
323 #pragma mark -------------------- vload test cases --------------------------
324 
create_global_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)325 void create_global_load_code(std::string &destBuffer, size_t inBufferSize,
326                              ExplicitType type, size_t inVectorSize,
327                              size_t outVectorSize)
328 {
329     std::snprintf(mem_type, sizeof(mem_type), "__global");
330     std::snprintf(store_str, sizeof(store_str), store_patternV3);
331     const char *typeName = get_explicit_type_name(type);
332     std::string outTypeName = typeName;
333     if (inVectorSize != 3)
334     {
335         outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
336         std::snprintf(store_str, sizeof(store_str), store_pattern);
337     }
338 
339     std::string kernel_src = concat_kernel(
340         kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
341     destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
342                              typeName, (int)inVectorSize, (int)inVectorSize);
343 }
344 
test_vload_global(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)345 int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
346 {
347     return test_vset<test_vload>(device, context, queue,
348                                  create_global_load_code, 10240);
349 }
350 
create_local_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)351 void create_local_load_code(std::string &destBuffer, size_t inBufferSize,
352                             ExplicitType type, size_t inVectorSize,
353                             size_t outVectorSize)
354 {
355     std::snprintf(store_str, sizeof(store_str), store_patternV3);
356     std::snprintf(load_str, sizeof(load_str), load_patternV3);
357     const char *typeName = get_explicit_type_name(type);
358     std::string outTypeName = typeName;
359     std::string inTypeName = typeName;
360     if (inVectorSize != 3)
361     {
362         outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
363         inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
364         std::snprintf(store_str, sizeof(store_str), store_pattern);
365         std::snprintf(load_str, sizeof(load_str), load_pattern);
366     }
367 
368     std::string kernel_src = concat_kernel(
369         pattern_local, sizeof(pattern_local) / sizeof(pattern_local[0]));
370     destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), inTypeName.c_str(),
371                              outTypeName.c_str(), (int)inBufferSize, typeName,
372                              (int)inVectorSize, (int)inVectorSize, typeName);
373 }
374 
test_vload_local(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)375 int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
376 {
377     // Determine the max size of a local buffer that we can test against
378     cl_ulong localSize;
379     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
380     test_error( error, "Unable to get max size of local memory buffer" );
381     if (localSize > 10240) localSize = 10240;
382     if (localSize > 4096)
383         localSize -= 2048;
384     else
385         localSize /= 2;
386 
387     return test_vset<test_vload>(device, context, queue, create_local_load_code,
388                                  (size_t)localSize);
389 }
390 
create_constant_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)391 void create_constant_load_code(std::string &destBuffer, size_t inBufferSize,
392                                ExplicitType type, size_t inVectorSize,
393                                size_t outVectorSize)
394 {
395     std::snprintf(mem_type, sizeof(mem_type), "__constant");
396     std::snprintf(store_str, sizeof(store_str), store_patternV3);
397     const char *typeName = get_explicit_type_name(type);
398     std::string outTypeName = typeName;
399     if (inVectorSize != 3)
400     {
401         outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
402         std::snprintf(store_str, sizeof(store_str), store_pattern);
403     }
404 
405     std::string kernel_src = concat_kernel(
406         kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
407     destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
408                              typeName, (int)inVectorSize, (int)inVectorSize);
409 }
410 
test_vload_constant(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)411 int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
412 {
413     // Determine the max size of a local buffer that we can test against
414     cl_ulong maxSize;
415     int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
416     test_error( error, "Unable to get max size of constant memory buffer" );
417     if (maxSize > 10240) maxSize = 10240;
418     if (maxSize > 4096)
419         maxSize -= 2048;
420     else
421         maxSize /= 2;
422 
423     return test_vset<test_vload>(device, context, queue,
424                                  create_constant_load_code, (size_t)maxSize);
425 }
426 
create_private_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)427 void create_private_load_code(std::string &destBuffer, size_t inBufferSize,
428                               ExplicitType type, size_t inVectorSize,
429                               size_t outVectorSize)
430 {
431     std::snprintf(store_str, sizeof(store_str), store_patternV3);
432     const char *typeName = get_explicit_type_name(type);
433     std::string outTypeName = typeName;
434     std::string inTypeName = typeName;
435     int bufSize = (int)inBufferSize * 3;
436     if (inVectorSize != 3)
437     {
438         outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
439         inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
440         bufSize = (int)inBufferSize;
441         std::snprintf(store_str, sizeof(store_str), store_pattern);
442     }
443 
444     std::string kernel_src = concat_kernel(
445         pattern_priv, sizeof(pattern_priv) / sizeof(pattern_priv[0]));
446     destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), bufSize,
447                              inTypeName.c_str(), outTypeName.c_str(), typeName,
448                              (int)inVectorSize, (int)inVectorSize, typeName);
449 }
450 
test_vload_private(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)451 int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
452 {
453     // We have no idea how much actual private storage is available, so just pick a reasonable value,
454     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
455     return test_vset<test_vload>(device, context, queue,
456                                  create_private_load_code, 256);
457 }
458 
459 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
460 #pragma mark -------------------- vstore harness --------------------------
461 
test_vstore(cl_device_id device,cl_context context,cl_command_queue queue,ExplicitType type,unsigned int vecSize,create_program_fn createFn,size_t bufferSize)462 int test_vstore(cl_device_id device, cl_context context, cl_command_queue queue,
463                 ExplicitType type, unsigned int vecSize,
464                 create_program_fn createFn, size_t bufferSize)
465 {
466     clProgramWrapper program;
467     clKernelWrapper kernel;
468     clMemWrapper streams[ 3 ];
469     MTdataHolder d(gRandomSeed);
470 
471     size_t threads[ 1 ], localThreads[ 1 ];
472     size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;
473 
474     pragma_str[0] = '\0';
475     if (type == kDouble)
476         std::snprintf(pragma_str, sizeof(pragma_str),
477                       "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
478     else if (type == kHalf)
479         std::snprintf(pragma_str, sizeof(pragma_str),
480                       "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
481 
482     if (DEBUG)
483         bufferSize = (bufferSize < 128) ? bufferSize : 128;
484 
485     typeSize = get_explicit_type_size( type );
486     numElements = bufferSize / ( typeSize * vecSize );
487     bufferSize = numElements * typeSize * vecSize;    // To account for rounding
488     if( numStores > numElements * 2 / 3 )
489     {
490         // Note: unlike load, we have to restrict the # of stores here, since all offsets must be unique for our test
491         // (Plus, we leave some room for extra values to make sure didn't get written)
492         numStores = numElements * 2 / 3;
493         if( numStores < 1 )
494             numStores = 1;
495     }
496     if (DEBUG)
497         log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
498 
499     std::vector<cl_uint> offsets(numStores);
500     std::vector<char> inBuffer(numStores * typeSize * vecSize);
501 
502     clProtectedArray outBuffer( numElements * typeSize * vecSize );
503     std::vector<char> referenceBuffer(numElements * typeSize * vecSize);
504 
505     // Create some random input data and random offsets to load from
506     generate_random_data(type, numStores * vecSize, d, (void *)inBuffer.data());
507 
508     // Note: make sure no two offsets are the same, otherwise the output would depend on
509     // the order that threads ran in, and that would be next to impossible to verify
510     std::vector<char> flags(numElements);
511     flags.assign(flags.size(), 0);
512 
513     for (size_t i = 0; i < numStores; i++)
514     {
515         do
516         {
517             offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 2, d );    // Note: keep it one vec below the end for offset testing
518         } while( flags[ offsets[ i ] ] != 0 );
519         flags[ offsets[ i ] ] = -1;
520         if (LINEAR_OFFSETS)
521             offsets[i] = (int)i;
522     }
523     if (LINEAR_OFFSETS)
524         log_info("Offsets set to thread IDs to simplify output.\n");
525 
526     std::string programSrc;
527     createFn(programSrc, numElements, type, vecSize, vecSize);
528 
529     // Create our kernel
530     const char *ptr = programSrc.c_str();
531     cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
532                                                &ptr, "test_fn");
533     test_error( error, "Unable to create testing kernel" );
534     if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
535 
536     // Get the number of args to differentiate the kernels with local storage. (They have 5)
537     cl_uint numArgs;
538     error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
539     test_error( error, "clGetKernelInfo failed");
540 
541     // Set up parameters
542     streams[0] =
543         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
544                        numStores * typeSize * vecSize * sizeof(cl_char),
545                        (void *)inBuffer.data(), &error);
546     test_error( error, "Unable to create kernel stream" );
547     streams[1] =
548         clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
549                        numStores * sizeof(cl_uint), offsets.data(), &error);
550     test_error( error, "Unable to create kernel stream" );
551     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
552     test_error( error, "Unable to create kernel stream" );
553 
554     // Set parameters and run
555     if (numArgs == 5)
556     {
557         // We need to set the size of the local storage
558         error = clSetKernelArg(kernel, 0, bufferSize, NULL);
559         test_error( error, "clSetKernelArg for buffer failed");
560         for (size_t i = 0; i < 3; i++)
561         {
562             error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
563             test_error( error, "Unable to set kernel argument" );
564         }
565     }
566     else
567     {
568         // No local storage
569         for (size_t i = 0; i < 3; i++)
570         {
571             error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
572             if (error) log_info("%s\n", programSrc.c_str());
573             test_error( error, "Unable to set kernel argument" );
574         }
575     }
576 
577     threads[ 0 ] = numStores;
578     error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
579     test_error( error, "Unable to get local thread size" );
580 
581     // Run in a loop, changing the address offset from 0 to ( vecSize - 1 ) each time, since
582     // otherwise stores might overlap each other, and it'd be a nightmare to test!
583     for( cl_uint addressOffset = 0; addressOffset < vecSize; addressOffset++ )
584     {
585         if (DEBUG)
586             log_info("\tstore addressOffset is %d, executing with threads %d\n", addressOffset, (int)threads[0]);
587 
588         // Clear the results first
589         memset( outBuffer, 0, numElements * typeSize * vecSize );
590         error = clEnqueueWriteBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
591         test_error( error, "Unable to erase result stream" );
592 
593         // Set up the new offset and run
594         if (numArgs == 5)
595             error = clSetKernelArg( kernel, 3+1, sizeof( cl_uint ), &addressOffset );
596         else
597             error = clSetKernelArg( kernel, 3, sizeof( cl_uint ), &addressOffset );
598         test_error( error, "Unable to set address offset argument" );
599 
600         error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
601         test_error( error, "Unable to exec kernel" );
602 
603         // Get the results
604         error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
605         test_error( error, "Unable to read results" );
606 
607         // Create the reference results
608         referenceBuffer.assign(referenceBuffer.size(), 0);
609         for (size_t i = 0; i < numStores; i++)
610         {
611             memcpy(&referenceBuffer[((offsets[i] * vecSize) + addressOffset)
612                                     * typeSize],
613                    &inBuffer[i * typeSize * vecSize], typeSize * vecSize);
614         }
615 
616         // Validate the results now
617         char *expected = referenceBuffer.data();
618         char *actual = (char *)(void *)outBuffer;
619 
620         if (DEBUG)
621         {
622             log_info("Memory contents:\n");
623             char inString[1024];
624             char expectedString[1024], actualString[1024];
625             for (size_t i = 0; i < numElements; i++)
626             {
627                 if (i < numStores)
628                 {
629                     log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
630                              GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
631                              GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ),
632                              offsets[i]);
633                     if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*vecSize]), typeSize * vecSize) != 0)
634                         log_error(" << ERROR\n");
635                     else
636                         log_info("\n");
637                 }
638                 else
639                 {
640                     log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
641                              GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
642                              GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ));
643                 }
644             }
645         }
646 
647         for (size_t i = 0; i < numElements; i++)
648         {
649             if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
650             {
651                 char expectedString[ 1024 ], actualString[ 1024 ];
652                 log_error( "ERROR: Data sample %d for vstore of %s%d did not validate (expected {%s}, got {%s}",
653                           (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
654                           GetDataVectorString( actual, typeSize, vecSize, actualString ) );
655                 size_t j;
656                 for( j = 0; j < numStores; j++ )
657                 {
658                     if( offsets[ j ] == (cl_uint)i )
659                     {
660                         log_error( ", stored from store #%d (of %d, offset = %d) with address offset of %d", (int)j, (int)numStores, offsets[j], (int)addressOffset );
661                         break;
662                     }
663                 }
664                 if( j == numStores )
665                     log_error( ", supposed to be canary value" );
666                 log_error( ")\n" );
667                 return 1;
668             }
669             expected += typeSize * vecSize;
670             actual += typeSize * vecSize;
671         }
672     }
673     return 0;
674 }
675 
676 #pragma mark -------------------- vstore test cases --------------------------
677 
create_global_store_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t)678 void create_global_store_code(std::string &destBuffer, size_t inBufferSize,
679                               ExplicitType type, size_t inVectorSize,
680                               size_t /*unused*/)
681 {
682     // clang-format off
683     const char *pattern [] = {
684     pragma_str,
685     "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
686     "{\n"
687     "    int tid = get_global_id( 0 );\n"
688     "    vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
689     "}\n" };
690 
691     const char *patternV3 [] = {
692     pragma_str,
693     "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
694     "{\n"
695     "    int tid = get_global_id( 0 );\n"
696     "    if((tid&3) == 0) { // if \"tid\" is a multiple of 4 \n"
697     "      vstore3( srcValues[ 3*(tid>>2) ], offsets[ tid ], destBuffer + alignmentOffset );\n"
698     "    } else {\n"
699     "      vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
700     "    }\n"
701     "}\n" };
702     // clang-format on
703 
704     const char *typeName = get_explicit_type_name(type);
705     if(inVectorSize == 3) {
706         std::string kernel_src =
707             concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
708         destBuffer = str_sprintf(kernel_src, typeName, typeName, typeName);
709     }
710     else
711     {
712         std::string kernel_src =
713             concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
714         destBuffer = str_sprintf(kernel_src, typeName, (int)inVectorSize,
715                                  typeName, (int)inVectorSize);
716     }
717 }
718 
test_vstore_global(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)719 int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
720 {
721     return test_vset<test_vstore>(device, context, queue,
722                                   create_global_store_code, 10240);
723 }
724 
create_local_store_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t)725 void create_local_store_code(std::string &destBuffer, size_t inBufferSize,
726                              ExplicitType type, size_t inVectorSize,
727                              size_t /*unused*/)
728 {
729     // clang-format off
730     const char *pattern[] = {
731     pragma_str,
732     "#define LOC_TYPE %s\n"
733     "#define LOC_VTYPE %s%d\n"
734     "__kernel void test_fn(__local LOC_VTYPE *sSharedStorage, __global LOC_VTYPE *srcValues, __global uint *offsets, __global LOC_VTYPE *destBuffer, uint alignmentOffset )\n"
735     "{\n"
736     "    int tid = get_global_id( 0 );\n"
737     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
738     " sSharedStorage[ offsets[tid] ] = (LOC_VTYPE)(LOC_TYPE)0;\n"
739     " sSharedStorage[ offsets[tid] +1 ] =  sSharedStorage[ offsets[tid] ];\n"
740     "   barrier( CLK_LOCAL_MEM_FENCE );\n"
741     "\n"
742     "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local LOC_TYPE *)sSharedStorage ) + alignmentOffset );\n"
743     "\n"
744     // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
745     // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
746     "   barrier( CLK_LOCAL_MEM_FENCE );\n"
747     "\n"
748     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
749     // otherwise, local threads would be overwriting results from other local threads
750     "  int i;\n"
751     "  __local LOC_TYPE *sp = (__local LOC_TYPE*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
752     "  __global LOC_TYPE *dp = (__global LOC_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
753     "  for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
754     "       dp[i] = sp[i];\n"
755     "}\n" };
756 
757     const char *patternV3 [] = {
758     pragma_str,
759     "#define LOC_TYPE %s\n"
760     "__kernel void test_fn(__local LOC_TYPE *sSharedStorage, __global LOC_TYPE *srcValues, __global uint *offsets, __global LOC_TYPE *destBuffer, uint alignmentOffset )\n"
761     "{\n"
762     "    int tid = get_global_id( 0 );\n"
763     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
764     "    sSharedStorage[ 3*offsets[tid]   ] = (LOC_TYPE)0;\n"
765     "    sSharedStorage[ 3*offsets[tid] +1 ] =  \n"
766     "        sSharedStorage[ 3*offsets[tid] ];\n"
767     "    sSharedStorage[ 3*offsets[tid] +2 ] =  \n"
768     "        sSharedStorage[ 3*offsets[tid]];\n"
769     "    sSharedStorage[ 3*offsets[tid] +3 ] =  \n"
770     "        sSharedStorage[ 3*offsets[tid]];\n"
771     "    sSharedStorage[ 3*offsets[tid] +4 ] =  \n"
772     "        sSharedStorage[ 3*offsets[tid] ];\n"
773     "    sSharedStorage[ 3*offsets[tid] +5 ] =  \n"
774     "        sSharedStorage[ 3*offsets[tid]];\n"
775     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
776     "\n"
777     " vstore3( vload3(tid,srcValues), offsets[ tid ], sSharedStorage  + alignmentOffset );\n"
778     "\n"
779     // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
780     // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
781     "   barrier( CLK_LOCAL_MEM_FENCE );\n"
782     "\n"
783     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
784     // otherwise, local threads would be overwriting results from other local threads
785     "  int i;\n"
786     "  __local LOC_TYPE *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
787     "  __global LOC_TYPE *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
788     "  for( i = 0; i < 3; i++ ) \n"
789     "       dp[i] = sp[i];\n"
790     "}\n" };
791     // clang-format on
792 
793     const char *typeName = get_explicit_type_name(type);
794     if(inVectorSize == 3) {
795         std::string kernel_src =
796             concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
797         destBuffer = str_sprintf(kernel_src, typeName);
798     }
799     else
800     {
801         std::string kernel_src =
802             concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
803         destBuffer = str_sprintf(kernel_src, typeName, typeName,
804                                  (int)inVectorSize, (int)inVectorSize);
805     }
806 }
807 
test_vstore_local(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)808 int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
809 {
810     // Determine the max size of a local buffer that we can test against
811     cl_ulong localSize;
812     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
813     test_error( error, "Unable to get max size of local memory buffer" );
814     if (localSize > 10240) localSize = 10240;
815     if (localSize > 4096)
816         localSize -= 2048;
817     else
818         localSize /= 2;
819     return test_vset<test_vstore>(device, context, queue,
820                                   create_local_store_code, (size_t)localSize);
821 }
822 
create_private_store_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t)823 void create_private_store_code(std::string &destBuffer, size_t inBufferSize,
824                                ExplicitType type, size_t inVectorSize,
825                                size_t /*unused*/)
826 {
827     // clang-format off
828     const char *pattern [] = {
829     pragma_str,
830     "#define PRIV_TYPE %s\n"
831     "#define PRIV_VTYPE %s%d\n"
832     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
833     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
834     "\n"
835     "__kernel void test_fn( __global PRIV_VTYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
836     "{\n"
837     "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n"
838     "  int tid = get_global_id( 0 );\n"
839     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
840     " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
841     "\n"
842     "  vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
843     "\n"
844     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
845     // otherwise, local threads would be overwriting results from other local threads
846     "  uint i;\n"
847     "  __private PRIV_TYPE *sp = (__private PRIV_TYPE*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
848     "  __global PRIV_TYPE *dp = (__global PRIV_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
849     "  for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
850     "       dp[i] = sp[i];\n"
851     "}\n"};
852 
853     const char *patternV3  [] = {
854     pragma_str,
855     "#define PRIV_TYPE %s\n"
856     "#define PRIV_VTYPE %s3\n"
857     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
858     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
859     "\n"
860     "__kernel void test_fn( __global PRIV_TYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
861     "{\n"
862     "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n" // keep this %d
863     "  int tid = get_global_id( 0 );\n"
864     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
865     " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
866     "\n"
867     "  vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
868     "  uint i;\n"
869     "  __private PRIV_TYPE *sp = ((__private PRIV_TYPE*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
870     "  __global PRIV_TYPE *dp = ((__global PRIV_TYPE*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
871     "  for( i = 0; i < 3; i++ ) \n"
872     "       dp[i] = sp[i];\n"
873     "}\n"};
874     // clang-format on
875 
876     const char *typeName = get_explicit_type_name(type);
877     if(inVectorSize == 3) {
878         std::string kernel_src =
879             concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
880         destBuffer =
881             str_sprintf(kernel_src, typeName, typeName, (int)inBufferSize);
882     }
883     else
884     {
885         std::string kernel_src =
886             concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
887         destBuffer =
888             str_sprintf(kernel_src, typeName, typeName, (int)inVectorSize,
889                         (int)inBufferSize, (int)inVectorSize);
890     }
891 }
892 
test_vstore_private(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)893 int test_vstore_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
894 {
895     // We have no idea how much actual private storage is available, so just pick a reasonable value,
896     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
897     return test_vset<test_vstore>(device, context, queue,
898                                   create_private_store_code, 256);
899 }
900 
901 
902 
903