//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "harness/compat.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>


#include "procs.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
#include "harness/errorHelpers.h"

// Outputs debug information for stores
#define DEBUG 0
// Forces stores/loads to be done with offsets = tid
#define LINEAR_OFFSETS 0
#define NUM_LOADS    512

static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

#pragma mark -------------------- vload harness --------------------------

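// Each vload test case below supplies a generator of this type. The generator writes OpenCL C
// source for a "test_fn" kernel into destBuffer, specialized for the given element type and the
// requested input/output vector widths.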
typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize );

int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
               create_vload_program_fn createFn, size_t bufferSize, MTdata d )
{
    int error;

    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[ 4 ];
    const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;

    if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;

    size_t threads[ 1 ], localThreads[ 1 ];
    clProtectedArray inBuffer( bufferSize );
    char programSrc[ 10240 ];
    cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
    size_t numElements, typeSize, i;
    unsigned int outVectorSize;


    typeSize = get_explicit_type_size( type );
    numElements = bufferSize / ( typeSize * vecSize );
    bufferSize = numElements * typeSize * vecSize;    // To account for rounding

    if (DEBUG) log_info("Testing: numLoads: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numLoads, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);

    // Create some random input data and random offsets to load from
    generate_random_data( type, numElements * vecSize, d, (void *)inBuffer );
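    // offsets[ i ] picks which vecSize-wide element the i-th load starts from, while
    // alignmentOffsets[ i ] adds an extra shift of 0..vecSize-1 scalar elements so that loads are
    // not always naturally aligned. The last couple of elements keep a zero shift so the load stays in bounds.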
    for( i = 0; i < numLoads; i++ )
    {
        offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 1, d );
        if( offsets[ i ] < numElements - 2 )
            alignmentOffsets[ i ] = (cl_uint)random_in_range( 0, (int)vecSize - 1, d );
        else
            alignmentOffsets[ i ] = 0;
        if (LINEAR_OFFSETS) offsets[i] = (cl_uint)i;
    }
    if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n");

    // 32-bit fixup
    outVectorSize = vecSize;

    // Declare output buffers now
#if !(defined(_WIN32) && defined(_MSC_VER))
    char outBuffer[ numLoads * typeSize * outVectorSize ];
    char referenceBuffer[ numLoads * typeSize * vecSize ];
#else
    char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char));
    char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char));
#endif
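    // (MSVC does not support C99 variable-length arrays, so the Windows path allocates the
    // scratch buffers with _malloca instead.)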

    // Create the program


    createFn( programSrc, numElements, type, vecSize, outVectorSize);

    // Create our kernel
    const char *ptr = programSrc;

    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
    test_error( error, "Unable to create testing kernel" );
    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);

    // Get the number of args to differentiate the kernels with local storage. (They have 5)
    cl_uint numArgs;
    error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
    test_error( error, "clGetKernelInfo failed");

    // Set up parameters
    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, bufferSize, (void *)inBuffer, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(offsets[0]), offsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error );
    test_error( error, "Unable to create kernel stream" );

    // Set parameters and run
    if (numArgs == 5) {
        // We need to set the size of the local storage
        error = clSetKernelArg(kernel, 0, bufferSize, NULL);
        test_error( error, "clSetKernelArg for buffer failed");
        for( i = 0; i < 4; i++ )
        {
            error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    } else {
        // No local storage
        for( i = 0; i < 4; i++ )
        {
            error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    }

    threads[ 0 ] = numLoads;
    error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
    test_error( error, "Unable to get local thread size" );

    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
    test_error( error, "Unable to exec kernel" );

    // Get the results
    error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL );
    test_error( error, "Unable to read results" );


    // Create the reference results
    memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char));
    for( i = 0; i < numLoads; i++ )
    {
        memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize,
               typeSize * vecSize );
    }

    // Validate the results now
    char *expected = referenceBuffer;
    char *actual = outBuffer;
    char *in = (char *)(void *)inBuffer;

    if (DEBUG) {
        log_info("Memory contents:\n");
        for (i=0; i<numElements; i++) {
            char  inString[1024];
            char expectedString[ 1024 ], actualString[ 1024 ];
            if (i < numLoads) {
                log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                         GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                         GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ),
                         offsets[i], alignmentOffsets[i]);
                if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*outVectorSize]), typeSize * vecSize) != 0)
                    log_error(" << ERROR\n");
                else
                    log_info("\n");
            } else {
                log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                         GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                         GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ));
            }
        }
    }

    for( i = 0; i < numLoads; i++ )
    {
        if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
        {
            char expectedString[ 1024 ], actualString[ 1024 ];
            log_error( "ERROR: Data sample %d for vload of %s%d did not validate (expected {%s}, got {%s}, loaded from offset %d)\n",
                      (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
                      GetDataVectorString( actual, typeSize, vecSize, actualString ), (int)offsets[ i ] );
            return 1;
        }
        expected += typeSize * vecSize;
        actual += typeSize * outVectorSize;
    }

    return 0;
}

int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queue, create_vload_program_fn createFn, size_t bufferSize )
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
    const char *size_names[] = { "2", "3", "4", "8", "16"};
    unsigned int typeIdx, sizeIdx;
    int error = 0;
    MTdata mtData = init_genrand( gRandomSeed );

    log_info("Testing with buffer size of %d.\n", (int)bufferSize);

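    // Loop over every element type and vector width, skipping doubles when cl_khr_fp64 is not
    // available and 64-bit integer types when the device has no long support.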
    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
    {

        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
            continue;

        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
            continue;

        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
        {
            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);

            int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData );
            if (error_this_type) {
                error += error_this_type;
                log_error("Failure; skipping further sizes for this type.\n");
                break;
            }
        }
    }

    free_mtdata(mtData);

    return error;
}

#pragma mark -------------------- vload test cases --------------------------

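// Each generator below emits a kernel that vloadN's from the given address space at element offset
// offsets[ tid ], shifted by alignmentOffsets[ tid ] scalar elements, and writes the result to a
// global output buffer. The 3-element variants write the components out individually, since a
// vector-3 pointer indexes with the padded 4-element stride while the host-side reference data is
// tightly packed.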
void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
    "%s%s"
    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "    %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
    "   results[ tid ] = tmp;\n"
    "}\n";

    const char *patternV3 =
    "%s%s"
    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "    %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
    "   results[ 3*tid ] = tmp.s0;\n"
    "   results[ 3*tid+1 ] = tmp.s1;\n"
    "   results[ 3*tid+2 ] = tmp.s2;\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                type == kDouble ? doubleExtensionPragma : "",
                "",
                typeName, typeName, typeName, typeName );
    } else {
        sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "",
                "",
                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
                (int)inVectorSize, typeName );
    }
}

int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    return test_vloadset( device, context, queue, create_global_load_code, 10240 );
}


void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
    "%s%s"
    //"   __local %s%d sSharedStorage[ %d ];\n"
    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "   int lid = get_local_id( 0 );\n"
    "\n"
    "    if( lid == 0 )\n"
    "    {\n"
    "        for( int i = 0; i < %d; i++ )\n"
    "           sSharedStorage[ i ] = src[ i ];\n"
    "    }\n"
    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
    //  updated on all threads at that point
    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    "    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
    "   results[ tid ] = tmp;\n"
    "}\n";

    const char *patternV3 =
    "%s%s"
    //"   __local %s%d sSharedStorage[ %d ];\n"
    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "   int lid = get_local_id( 0 );\n"
    "\n"
    "    if( lid == 0 )\n"
    "    {\n"
    "        for( int i = 0; i < %d; i++ ) {\n"
    "           sSharedStorage[ 3*i   ] = src[ 3*i   ];\n"
    "           sSharedStorage[ 3*i +1] = src[ 3*i +1];\n"
    "           sSharedStorage[ 3*i +2] = src[ 3*i +2];\n"
    "        }\n"
    "    }\n"
    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
    //  updated on all threads at that point
    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    "    %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
    "   results[ 3*tid   ] = tmp.s0;\n"
    "   results[ 3*tid +1] = tmp.s1;\n"
    "   results[ 3*tid +2] = tmp.s2;\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                type == kDouble  ? doubleExtensionPragma : "",
                "",
                typeName, /*(int)inBufferSize,*/
                typeName, typeName,
                (int)inBufferSize,
                typeName, typeName );
    } else {
        sprintf( destBuffer, pattern,
                type == kDouble  ? doubleExtensionPragma : "",
                "",
                typeName, (int)inVectorSize, /*(int)inBufferSize,*/
                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
                (int)inBufferSize,
                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
    }
}

int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // Determine the max size of a local buffer that we can test against
    cl_ulong localSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
    test_error( error, "Unable to get max size of local memory buffer" );
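    // Cap the test buffer at the 10K used by the other address-space tests, and back off from the
    // reported limit so there is headroom for anything else the implementation keeps in local
    // memory (the exact margin here is a heuristic).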
    if( localSize > 10240 )
        localSize = 10240;
    if (localSize > 4096)
        localSize -= 2048;
    else
        localSize /= 2;

    return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize );
}


void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
    "%s%s"
    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "    %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
    "   results[ tid ] = tmp;\n"
    "}\n";

    const char *patternV3 =
    "%s%s"
    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "    %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
    "   results[ 3*tid   ] = tmp.s0;\n"
    "   results[ 3*tid+1 ] = tmp.s1;\n"
    "   results[ 3*tid+2 ] = tmp.s2;\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                type == kDouble ? doubleExtensionPragma : "",
                "",
                typeName, typeName,  typeName,
                typeName );
    } else {
        sprintf( destBuffer, pattern,
                type == kDouble ? doubleExtensionPragma : "",
                "",
                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
                (int)inVectorSize, typeName );
    }
}

int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // Determine the max size of a constant buffer that we can test against
    cl_ulong maxSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
    test_error( error, "Unable to get max size of constant memory buffer" );
    if( maxSize > 10240 )
        maxSize = 10240;
    if (maxSize > 4096)
        maxSize -= 2048;
    else
        maxSize /= 2;

    return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize );
}


void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
    "%s%s"
    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
    "#define PRIV_TYPE %s%d\n"
    "#define PRIV_SIZE %d\n"
    "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
    "{\n"
    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
    "    int tid = get_global_id( 0 );\n"
    "\n"
    "    for( int i = 0; i < %d; i++ )\n"
    "      sPrivateStorage[ i ] = src[ i ];\n"
    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
    //  anybody else to sync up
    "\n"
    "    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
    "   results[ tid ] = tmp;\n"
    "}\n";

    const char *patternV3 =
    "%s%s"
    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
    "#define PRIV_TYPE %s\n"
    "#define PRIV_SIZE %d\n"
    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
    "{\n"
    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
    "    int tid = get_global_id( 0 );\n"
    "\n"
    "    for( int i = 0; i < PRIV_SIZE; i++ )\n"
    "    {\n"
    "        sPrivateStorage[ i ] = src[ i ];\n"
    "    }\n"
    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
    //  anybody else to sync up
    "\n"
    "    %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
    "   results[ 3*tid   ] = tmp.s0;\n"
    "   results[ 3*tid+1 ] = tmp.s1;\n"
    "   results[ 3*tid+2 ] = tmp.s2;\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize ==3) {
        sprintf( destBuffer, patternV3,
                type == kDouble ? doubleExtensionPragma : "",
                "",
                typeName, 3*((int)inBufferSize),
                typeName, typeName,
                typeName );
        // log_info("Src is \"\n%s\n\"\n", destBuffer);
    } else {
        sprintf( destBuffer, pattern,
                type == kDouble ? doubleExtensionPragma : "",
                "",
                typeName, (int)inVectorSize, (int)inBufferSize,
                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
                (int)inBufferSize,
                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
    }
}

int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // We have no idea how much actual private storage is available, so just pick a reasonable value:
    // enough room for at least two 16-element longs, i.e. 2 * 16 * 8 bytes = 256 bytes
    return test_vloadset( device, context, queue, create_private_load_code, 256 );
}


///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma mark -------------------- vstore harness --------------------------

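// Counterpart of create_vload_program_fn for the store tests; the store kernels read and write the
// same vector width, so there is no separate output vector size parameter.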
typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize );

int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
                create_vstore_program_fn createFn, size_t bufferSize, MTdata d )
{
    int error;

    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[ 3 ];

    size_t threads[ 1 ], localThreads[ 1 ];

    size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;

    if (DEBUG)
        bufferSize = (bufferSize < 128) ? bufferSize : 128;

    typeSize = get_explicit_type_size( type );
    numElements = bufferSize / ( typeSize * vecSize );
    bufferSize = numElements * typeSize * vecSize;    // To account for rounding
    if( numStores > numElements * 2 / 3 )
    {
        // Note: unlike load, we have to restrict the # of stores here, since all offsets must be unique for our test
        // (Plus, we leave some room for extra values, to make sure nothing got written that shouldn't have been)
        numStores = numElements * 2 / 3;
        if( numStores < 1 )
            numStores = 1;
    }
    if (DEBUG)
        log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
#if !(defined(_WIN32) && defined(_MSC_VER))
    cl_uint offsets[ numStores ];
#else
    cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint));
#endif
    char programSrc[ 10240 ];
    size_t i;

#if !(defined(_WIN32) && defined(_MSC_VER))
    char inBuffer[ numStores * typeSize * vecSize ];
#else
    char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char));
#endif
    clProtectedArray outBuffer( numElements * typeSize * vecSize );
#if !(defined(_WIN32) && defined(_MSC_VER))
    char referenceBuffer[ numElements * typeSize * vecSize ];
#else
    char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char));
#endif

    // Create some random input data and random offsets to load from
    generate_random_data( type, numStores * vecSize, d, (void *)inBuffer );

    // Note: make sure no two offsets are the same, otherwise the output would depend on
    // the order that threads ran in, and that would be next to impossible to verify
#if !(defined(_WIN32) && defined(_MSC_VER))
    char flags[ numElements ];
#else
    char* flags = (char*)_malloca( numElements * sizeof(char));
#endif

    memset( flags, 0, numElements * sizeof(char) );
    for( i = 0; i < numStores; i++ )
    {
        do
        {
            offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 2, d );    // Note: keep it one vec below the end for offset testing
        } while( flags[ offsets[ i ] ] != 0 );
        flags[ offsets[ i ] ] = -1;
        if (LINEAR_OFFSETS)
            offsets[i] = (int)i;
    }
    if (LINEAR_OFFSETS)
        log_info("Offsets set to thread IDs to simplify output.\n");

    createFn( programSrc, numElements, type, vecSize );

    // Create our kernel
    const char *ptr = programSrc;
    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
    test_error( error, "Unable to create testing kernel" );
    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);

    // Get the number of args to differentiate the kernels with local storage. (They have 5)
    cl_uint numArgs;
    error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
    test_error( error, "clGetKernelInfo failed");

    // Set up parameters
    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
    test_error( error, "Unable to create kernel stream" );

    // Set parameters and run
    if (numArgs == 5)
    {
        // We need to set the size of the local storage
        error = clSetKernelArg(kernel, 0, bufferSize, NULL);
        test_error( error, "clSetKernelArg for buffer failed");
        for( i = 0; i < 3; i++ )
        {
            error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    }
    else
    {
        // No local storage
        for( i = 0; i < 3; i++ )
        {
            error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
            if (error)
                log_info("%s\n", programSrc);
            test_error( error, "Unable to set kernel argument" );
        }
    }

    threads[ 0 ] = numStores;
    error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
    test_error( error, "Unable to get local thread size" );

    // Run in a loop, changing the address offset from 0 to ( vecSize - 1 ) each time, since
    // otherwise stores might overlap each other, and it'd be a nightmare to test!
    for( cl_uint addressOffset = 0; addressOffset < vecSize; addressOffset++ )
    {
        if (DEBUG)
            log_info("\tstore addressOffset is %d, executing with threads %d\n", addressOffset, (int)threads[0]);

        // Clear the results first
        memset( outBuffer, 0, numElements * typeSize * vecSize );
        error = clEnqueueWriteBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
        test_error( error, "Unable to erase result stream" );

        // Set up the new offset and run
        if (numArgs == 5)
            error = clSetKernelArg( kernel, 3+1, sizeof( cl_uint ), &addressOffset );
        else
            error = clSetKernelArg( kernel, 3, sizeof( cl_uint ), &addressOffset );
        test_error( error, "Unable to set address offset argument" );

        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
        test_error( error, "Unable to exec kernel" );

        // Get the results
        error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
        test_error( error, "Unable to read results" );


        // Create the reference results
        memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) );
        for( i = 0; i < numStores; i++ )
        {
            memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize );
        }

        // Validate the results now
        char *expected = referenceBuffer;
        char *actual = (char *)(void *)outBuffer;

        if (DEBUG)
        {
            log_info("Memory contents:\n");
            for (i=0; i<numElements; i++)
            {
                char  inString[1024];
                char expectedString[ 1024 ], actualString[ 1024 ];
                if (i < numStores)
                {
                    log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                             GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                             GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ),
                             offsets[i]);
                    if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*vecSize]), typeSize * vecSize) != 0)
                        log_error(" << ERROR\n");
                    else
                        log_info("\n");
                }
                else
                {
                    log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                             GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                             GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ));
                }
            }
        }

        for( i = 0; i < numElements; i++ )
        {
            if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
            {
                char expectedString[ 1024 ], actualString[ 1024 ];
                log_error( "ERROR: Data sample %d for vstore of %s%d did not validate (expected {%s}, got {%s}",
                          (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
                          GetDataVectorString( actual, typeSize, vecSize, actualString ) );
                size_t j;
                for( j = 0; j < numStores; j++ )
                {
                    if( offsets[ j ] == (cl_uint)i )
                    {
                        log_error( ", stored from store #%d (of %d, offset = %d) with address offset of %d", (int)j, (int)numStores, offsets[j], (int)addressOffset );
                        break;
                    }
                }
                if( j == numStores )
                    log_error( ", supposed to be canary value" );
                log_error( ")\n" );
                return 1;
            }
            expected += typeSize * vecSize;
            actual += typeSize * vecSize;
        }
    }

    return 0;
}

int test_vstoreset(cl_device_id device, cl_context context, cl_command_queue queue, create_vstore_program_fn createFn, size_t bufferSize )
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
    const char *size_names[] = { "2", "3", "4", "8", "16"};
    unsigned int typeIdx, sizeIdx;
    int error = 0;
    MTdata d = init_genrand( gRandomSeed );

    log_info("Testing with buffer size of %d.\n", (int)bufferSize);

    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
    {
        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
            continue;

        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
            continue;

        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
        {
            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);

            int error_this_type = test_vstore( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, d );
            if (error_this_type)
            {
                log_error("Failure; skipping further sizes for this type.\n");
                error += error_this_type;
                break;
            }
        }
    }

    free_mtdata(d);
    return error;
}


#pragma mark -------------------- vstore test cases --------------------------

void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
    const char *pattern =
    "%s"
    "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "    vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
    "}\n";

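    // For the 3-element case the source data is tightly packed, but srcValues is declared as a
    // vector-3 pointer, which indexes with the padded 4-element stride. When tid is a multiple of 4,
    // element 3*(tid>>2) starts at scalar offset 3*tid and is suitably aligned, so it can be
    // dereferenced directly; otherwise the value is re-read with vload3 through a scalar pointer.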
    const char *patternV3 =
    "%s"
    "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    "    if((tid&3) == 0) { // if \"tid\" is a multiple of 4 \n"
    "      vstore3( srcValues[ 3*(tid>>2) ], offsets[ tid ], destBuffer + alignmentOffset );\n"
    "    } else {\n"
    "      vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
    "    }\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);

    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                type == kDouble ? doubleExtensionPragma : "",
                typeName, typeName, typeName);

    } else {
        sprintf( destBuffer, pattern,
                type == kDouble ? doubleExtensionPragma : "",
                typeName, (int)inVectorSize, typeName, (int)inVectorSize );
    }
    // if(inVectorSize == 3 || inVectorSize == 4) {
    //     log_info("\n----\n%s\n----\n", destBuffer);
    // }
}

int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    return test_vstoreset( device, context, queue, create_global_store_code, 10240 );
}


void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
    const char *pattern =
    "%s"
    "\n"
    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
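    // Two consecutive elements are cleared because a non-zero alignmentOffset shifts the store
    // (and the copy-out below) partway into the element after offsets[ tid ].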
    " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n"
    " sSharedStorage[ offsets[tid] +1 ] =  sSharedStorage[ offsets[tid] ];\n"
    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n"
    "\n"
    // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
    // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
    // otherwise, local threads would be overwriting results from other local threads
    "  int i;\n"
    "  __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
    "  for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
    "       dp[i] = sp[i];\n"
    "}\n";

    const char *patternV3 =
    "%s"
    "\n"
    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
    "{\n"
    "    int tid = get_global_id( 0 );\n"
    // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
    "    sSharedStorage[ 3*offsets[tid]   ] = (%s)0;\n"
    "    sSharedStorage[ 3*offsets[tid] +1 ] =  \n"
    "        sSharedStorage[ 3*offsets[tid] ];\n"
    "    sSharedStorage[ 3*offsets[tid] +2 ] =  \n"
    "        sSharedStorage[ 3*offsets[tid]];\n"
    "    sSharedStorage[ 3*offsets[tid] +3 ] =  \n"
    "        sSharedStorage[ 3*offsets[tid]];\n"
    "    sSharedStorage[ 3*offsets[tid] +4 ] =  \n"
    "        sSharedStorage[ 3*offsets[tid] ];\n"
    "    sSharedStorage[ 3*offsets[tid] +5 ] =  \n"
    "        sSharedStorage[ 3*offsets[tid]];\n"
    "    barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    " vstore3( vload3(tid,srcValues), offsets[ tid ], sSharedStorage  + alignmentOffset );\n"
    "\n"
    // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
    // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
    // otherwise, local threads would be overwriting results from other local threads
    "  int i;\n"
    "  __local %s *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
    "  __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
    "  for( i = 0; i < 3; i++ ) \n"
    "       dp[i] = sp[i];\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                type == kDouble ? doubleExtensionPragma : "",
                typeName,
                typeName,
                typeName,  typeName,
                typeName, typeName, typeName  );
    } else {
        sprintf( destBuffer, pattern,
                type == kDouble ? doubleExtensionPragma : "",
                typeName, (int)inVectorSize,
                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
                typeName, (int)inVectorSize, typeName,
                (int)inVectorSize, typeName, typeName,
                typeName, typeName, typeName  );
    }
    // log_info(destBuffer);
}

int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // Determine the max size of a local buffer that we can test against
    cl_ulong localSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
    test_error( error, "Unable to get max size of local memory buffer" );
    if( localSize > 10240 )
        localSize = 10240;
    if (localSize > 4096)
        localSize -= 2048;
    else
        localSize /= 2;
    return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize );
}


void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
    const char *pattern =
    "%s"
    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
    "\n"
    "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
    "{\n"
    "    __private %s%d sPrivateStorage[ %d ];\n"
    "    int tid = get_global_id( 0 );\n"
    // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
    " sPrivateStorage[tid] = (%s%d)(%s)0;\n"
    "\n"
    "   vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
    "\n"
    // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
    // otherwise, local threads would be overwriting results from other local threads
    "  uint i;\n"
    "  __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
    "  for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
    "       dp[i] = sp[i];\n"
    "}\n";


    const char *patternV3 =
    "%s"
    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
    "\n"
    "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n"
    "{\n"
    "    __private %s3 sPrivateStorage[ %d ];\n" // keep this %d
    "    int tid = get_global_id( 0 );\n"
    // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
    " sPrivateStorage[tid] = (%s3)(%s)0;\n"
    "\n"

    "   vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
    "\n"
    // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
    // otherwise, local threads would be overwriting results from other local threads
    "  uint i;\n"
    "  __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
    "  __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
    "  for( i = 0; i < 3; i++ ) \n"
    "       dp[i] = sp[i];\n"
    "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                type == kDouble ? doubleExtensionPragma : "",
                typeName,  typeName,
                typeName, (int)inBufferSize,
                typeName, typeName,
                typeName, typeName, typeName, typeName, typeName );
    } else {
        sprintf( destBuffer, pattern,
                type == kDouble ? doubleExtensionPragma : "",
                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
                typeName, (int)inVectorSize, (int)inBufferSize,
                typeName, (int)inVectorSize, typeName,
                (int)inVectorSize, typeName, typeName, typeName, typeName, typeName );
    }
}

int test_vstore_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // We have no idea how much actual private storage is available, so just pick a reasonable value:
    // enough room for at least two 16-element longs, i.e. 2 * 16 * 8 bytes = 256 bytes
    return test_vstoreset( device, context, queue, create_private_store_code, 256 );
}


