//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "harness/compat.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>


#include "procs.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
#include "harness/errorHelpers.h"

// Outputs debug information for stores
#define DEBUG 0
// Forces stores/loads to be done with offsets = tid
#define LINEAR_OFFSETS 0
#define NUM_LOADS 512

static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

#pragma mark -------------------- vload harness --------------------------

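// Source generators plugged into the vload harness below. Each generator writes a complete
// OpenCL C program containing a kernel named "test_fn" into destBuffer; test_vload() calls it
// with inBufferSize set to the number of whole vectors that fit in the test buffer, plus the
// element type and the input/output vector widths, and then builds the result.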
typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize );

int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
                create_vload_program_fn createFn, size_t bufferSize, MTdata d )
{
    int error;

    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[ 4 ];
    const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;

    if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;

    size_t threads[ 1 ], localThreads[ 1 ];
    clProtectedArray inBuffer( bufferSize );
    char programSrc[ 10240 ];
    cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
    size_t numElements, typeSize, i;
    unsigned int outVectorSize;


    typeSize = get_explicit_type_size( type );
    numElements = bufferSize / ( typeSize * vecSize );
    bufferSize = numElements * typeSize * vecSize; // To account for rounding

    if (DEBUG) log_info("Testing: numLoads: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numLoads, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);

    // Create some random input data and random offsets to load from
    generate_random_data( type, numElements * vecSize, d, (void *)inBuffer );
    for( i = 0; i < numLoads; i++ )
    {
        offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 1, d );
        if( offsets[ i ] < numElements - 2 )
            alignmentOffsets[ i ] = (cl_uint)random_in_range( 0, (int)vecSize - 1, d );
        else
            alignmentOffsets[ i ] = 0;
        if (LINEAR_OFFSETS) offsets[i] = (cl_uint)i;
    }
    if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n");

    // 32-bit fixup
    outVectorSize = vecSize;

    // Declare output buffers now
#if !(defined(_WIN32) && defined(_MSC_VER))
    char outBuffer[ numLoads * typeSize * outVectorSize ];
    char referenceBuffer[ numLoads * typeSize * vecSize ];
#else
    char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char));
    char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char));
#endif
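    // Note: the variable-length arrays above are not available under MSVC, hence the _malloca fallback.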

    // Create the program


    createFn( programSrc, numElements, type, vecSize, outVectorSize);

    // Create our kernel
    const char *ptr = programSrc;

    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
    test_error( error, "Unable to create testing kernel" );
    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);

    // Get the number of args to differentiate the kernels with local storage. (They have 5)
    cl_uint numArgs;
    error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
    test_error( error, "clGetKernelInfo failed");

    // Set up parameters
    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, bufferSize, (void *)inBuffer, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(offsets[0]), offsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error );
    test_error( error, "Unable to create kernel stream" );

    // Set parameters and run
    if (numArgs == 5) {
        // We need to set the size of the local storage
        error = clSetKernelArg(kernel, 0, bufferSize, NULL);
        test_error( error, "clSetKernelArg for buffer failed");
        for( i = 0; i < 4; i++ )
        {
            error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    } else {
        // No local storage
        for( i = 0; i < 4; i++ )
        {
            error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    }

    threads[ 0 ] = numLoads;
    error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
    test_error( error, "Unable to get local thread size" );

    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
    test_error( error, "Unable to exec kernel" );

    // Get the results
    error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL );
    test_error( error, "Unable to read results" );


    // Create the reference results
    memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char));
    for( i = 0; i < numLoads; i++ )
    {
        memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize,
                typeSize * vecSize );
    }

    // Validate the results now
    char *expected = referenceBuffer;
    char *actual = outBuffer;
    char *in = (char *)(void *)inBuffer;

    if (DEBUG) {
        log_info("Memory contents:\n");
        for (i=0; i<numElements; i++) {
            char inString[1024];
            char expectedString[ 1024 ], actualString[ 1024 ];
            if (i < numLoads) {
                log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                         GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                         GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ),
                         offsets[i], alignmentOffsets[i]);
                if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*outVectorSize]), typeSize * vecSize) != 0)
                    log_error(" << ERROR\n");
                else
                    log_info("\n");
            } else {
                log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                         GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                         GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ));
            }
        }
    }

    for( i = 0; i < numLoads; i++ )
    {
        if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
        {
            char expectedString[ 1024 ], actualString[ 1024 ];
            log_error( "ERROR: Data sample %d for vload of %s%d did not validate (expected {%s}, got {%s}, loaded from offset %d)\n",
                       (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
                       GetDataVectorString( actual, typeSize, vecSize, actualString ), (int)offsets[ i ] );
            return 1;
        }
        expected += typeSize * vecSize;
        actual += typeSize * outVectorSize;
    }

    return 0;
}

int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queue, create_vload_program_fn createFn, size_t bufferSize )
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
    const char *size_names[] = { "2", "3", "4", "8", "16"};
    unsigned int typeIdx, sizeIdx;
    int error = 0;
    MTdata mtData = init_genrand( gRandomSeed );

    log_info("Testing with buffer size of %d.\n", (int)bufferSize);

    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
    {

        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
            continue;

        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
            continue;

        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
        {
            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);

            int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData );
            if (error_this_type) {
                error += error_this_type;
                log_error("Failure; skipping further sizes for this type.\n");
                break;
            }
        }
    }

    free_mtdata(mtData);

    return error;
}

#pragma mark -------------------- vload test cases --------------------------

void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
        "%s%s"
        "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
        " results[ tid ] = tmp;\n"
        "}\n";

    const char *patternV3 =
        "%s%s"
        "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
        " results[ 3*tid ] = tmp.s0;\n"
        " results[ 3*tid+1 ] = tmp.s1;\n"
        " results[ 3*tid+2 ] = tmp.s2;\n"
        "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                 type == kDouble ? doubleExtensionPragma : "",
                 "",
                 typeName, typeName, typeName, typeName );
    } else {
        sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "",
                 "",
                 typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
                 (int)inVectorSize, typeName );
    }
}

int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    return test_vloadset( device, context, queue, create_global_load_code, 10240 );
}


void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
        "%s%s"
        //" __local %s%d sSharedStorage[ %d ];\n"
        "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " int lid = get_local_id( 0 );\n"
        "\n"
        " if( lid == 0 )\n"
        " {\n"
        " for( int i = 0; i < %d; i++ )\n"
        " sSharedStorage[ i ] = src[ i ];\n"
        " }\n"
        // Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
        // threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
        // updated on all threads at that point
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        " %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
        " results[ tid ] = tmp;\n"
        "}\n";

    const char *patternV3 =
        "%s%s"
        //" __local %s%d sSharedStorage[ %d ];\n"
        "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " int lid = get_local_id( 0 );\n"
        "\n"
        " if( lid == 0 )\n"
        " {\n"
        " for( int i = 0; i < %d; i++ ) {\n"
        " sSharedStorage[ 3*i ] = src[ 3*i ];\n"
        " sSharedStorage[ 3*i +1] = src[ 3*i +1];\n"
        " sSharedStorage[ 3*i +2] = src[ 3*i +2];\n"
        " }\n"
        " }\n"
        // Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
        // threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
        // updated on all threads at that point
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        " %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
        " results[ 3*tid ] = tmp.s0;\n"
        " results[ 3*tid +1] = tmp.s1;\n"
        " results[ 3*tid +2] = tmp.s2;\n"
        "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                 type == kDouble ? doubleExtensionPragma : "",
                 "",
                 typeName, /*(int)inBufferSize,*/
                 typeName, typeName,
                 (int)inBufferSize,
                 typeName, typeName );
    } else {
        sprintf( destBuffer, pattern,
                 type == kDouble ? doubleExtensionPragma : "",
                 "",
                 typeName, (int)inVectorSize, /*(int)inBufferSize,*/
                 typeName, (int)inVectorSize, typeName, (int)outVectorSize,
                 (int)inBufferSize,
                 typeName, (int)inVectorSize, (int)inVectorSize, typeName );
    }
}

int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // Determine the max size of a local buffer that we can test against
    cl_ulong localSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
    test_error( error, "Unable to get max size of local memory buffer" );
    if( localSize > 10240 )
        localSize = 10240;
    if (localSize > 4096)
        localSize -= 2048;
    else
        localSize /= 2;

    return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize );
}


void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
        "%s%s"
        "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
        " results[ tid ] = tmp;\n"
        "}\n";

    const char *patternV3 =
        "%s%s"
        "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
        " results[ 3*tid ] = tmp.s0;\n"
        " results[ 3*tid+1 ] = tmp.s1;\n"
        " results[ 3*tid+2 ] = tmp.s2;\n"
        "}\n";

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                 type == kDouble ? doubleExtensionPragma : "",
                 "",
                 typeName, typeName, typeName,
                 typeName );
    } else {
        sprintf( destBuffer, pattern,
                 type == kDouble ? doubleExtensionPragma : "",
                 "",
                 typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
                 (int)inVectorSize, typeName );
    }
}

int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    // Determine the max size of a constant buffer that we can test against
    cl_ulong maxSize;
    int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
    test_error( error, "Unable to get max size of constant memory buffer" );
    if( maxSize > 10240 )
        maxSize = 10240;
    if (maxSize > 4096)
        maxSize -= 2048;
    else
        maxSize /= 2;

    return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize );
}


void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
{
    const char *pattern =
        "%s%s"
        // Private memory is unique per thread, unlike local storage, which is shared across a work-group. That means
        // this test has to copy the entire test buffer into private storage ON EACH THREAD to be effective.
435 "#define PRIV_TYPE %s%d\n"
436 "#define PRIV_SIZE %d\n"
437 "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
438 "{\n"
439 " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
440 " int tid = get_global_id( 0 );\n"
441 "\n"
442 " for( int i = 0; i < %d; i++ )\n"
443 " sPrivateStorage[ i ] = src[ i ];\n"
444 // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
445 // anybody else to sync up
446 "\n"
447 " %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
448 " results[ tid ] = tmp;\n"
449 "}\n";
450
451 const char *patternV3 =
452 "%s%s"
        // Private memory is unique per thread, unlike local storage, which is shared across a work-group. That means
        // this test has to copy the entire test buffer into private storage ON EACH THREAD to be effective.
455 "#define PRIV_TYPE %s\n"
456 "#define PRIV_SIZE %d\n"
457 "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
458 "{\n"
459 " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
460 " int tid = get_global_id( 0 );\n"
461 "\n"
462 " for( int i = 0; i < PRIV_SIZE; i++ )\n"
463 " {\n"
464 " sPrivateStorage[ i ] = src[ i ];\n"
465 " }\n"
466 // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
467 // anybody else to sync up
468 "\n"
469 " %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
470 " results[ 3*tid ] = tmp.s0;\n"
471 " results[ 3*tid+1 ] = tmp.s1;\n"
472 " results[ 3*tid+2 ] = tmp.s2;\n"
473 "}\n";
474
475 const char *typeName = get_explicit_type_name(type);
476 if(inVectorSize ==3) {
477 sprintf( destBuffer, patternV3,
478 type == kDouble ? doubleExtensionPragma : "",
479 "",
480 typeName, 3*((int)inBufferSize),
481 typeName, typeName,
482 typeName );
483 // log_info("Src is \"\n%s\n\"\n", destBuffer);
484 } else {
485 sprintf( destBuffer, pattern,
486 type == kDouble ? doubleExtensionPragma : "",
487 "",
488 typeName, (int)inVectorSize, (int)inBufferSize,
489 typeName, (int)inVectorSize, typeName, (int)outVectorSize,
490 (int)inBufferSize,
491 typeName, (int)inVectorSize, (int)inVectorSize, typeName );
492 }
493 }
494
test_vload_private(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)495 int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
496 {
    // We have no idea how much actual private storage is available, so just pick a reasonable value:
    // enough to fit at least two 16-element vectors of an 8-byte type, i.e. 2 * 8 bytes * 16 = 256 bytes.
    return test_vloadset( device, context, queue, create_private_load_code, 256 );
}


///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma mark -------------------- vstore harness --------------------------

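// Source generators for the vstore harness below. As with the vload generators, each one writes a
// complete OpenCL C program containing a kernel named "test_fn" into destBuffer; test_vstore()
// passes the number of whole vectors in the output buffer as inBufferSize.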
typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize );

int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
                 create_vstore_program_fn createFn, size_t bufferSize, MTdata d )
{
    int error;

    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[ 3 ];

    size_t threads[ 1 ], localThreads[ 1 ];

    size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;

    if (DEBUG)
        bufferSize = (bufferSize < 128) ? bufferSize : 128;

    typeSize = get_explicit_type_size( type );
    numElements = bufferSize / ( typeSize * vecSize );
    bufferSize = numElements * typeSize * vecSize; // To account for rounding
    if( numStores > numElements * 2 / 3 )
    {
        // Note: unlike load, we have to restrict the # of stores here, since all offsets must be unique for our test
        // (plus, we leave some room for extra values to make sure they didn't get written)
        numStores = numElements * 2 / 3;
        if( numStores < 1 )
            numStores = 1;
    }
    if (DEBUG)
        log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
#if !(defined(_WIN32) && defined(_MSC_VER))
    cl_uint offsets[ numStores ];
#else
    cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint));
#endif
    char programSrc[ 10240 ];
    size_t i;

#if !(defined(_WIN32) && defined(_MSC_VER))
    char inBuffer[ numStores * typeSize * vecSize ];
#else
    char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char));
#endif
    clProtectedArray outBuffer( numElements * typeSize * vecSize );
#if !(defined(_WIN32) && defined(_MSC_VER))
    char referenceBuffer[ numElements * typeSize * vecSize ];
#else
    char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char));
#endif

    // Create some random input data and random offsets to load from
    generate_random_data( type, numStores * vecSize, d, (void *)inBuffer );

    // Note: make sure no two offsets are the same, otherwise the output would depend on
    // the order that threads ran in, and that would be next to impossible to verify
#if !(defined(_WIN32) && defined(_MSC_VER))
    char flags[ numElements ];
#else
    char* flags = (char*)_malloca( numElements * sizeof(char));
#endif

    memset( flags, 0, numElements * sizeof(char) );
    for( i = 0; i < numStores; i++ )
    {
        do
        {
            offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 2, d ); // Note: keep it one vec below the end for offset testing
        } while( flags[ offsets[ i ] ] != 0 );
        flags[ offsets[ i ] ] = -1;
        if (LINEAR_OFFSETS)
            offsets[i] = (int)i;
    }
    if (LINEAR_OFFSETS)
        log_info("Offsets set to thread IDs to simplify output.\n");

    createFn( programSrc, numElements, type, vecSize );

    // Create our kernel
    const char *ptr = programSrc;
    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
    test_error( error, "Unable to create testing kernel" );
    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);

    // Get the number of args to differentiate the kernels with local storage. (They have 5)
    cl_uint numArgs;
    error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
    test_error( error, "clGetKernelInfo failed");

    // Set up parameters
    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
    test_error( error, "Unable to create kernel stream" );

    // Set parameters and run
    if (numArgs == 5)
    {
        // We need to set the size of the local storage
        error = clSetKernelArg(kernel, 0, bufferSize, NULL);
        test_error( error, "clSetKernelArg for buffer failed");
        for( i = 0; i < 3; i++ )
        {
            error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    }
    else
    {
        // No local storage
        for( i = 0; i < 3; i++ )
        {
            error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
            if (error)
                log_info("%s\n", programSrc);
            test_error( error, "Unable to set kernel argument" );
        }
    }

    threads[ 0 ] = numStores;
    error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
    test_error( error, "Unable to get local thread size" );

    // Run in a loop, changing the address offset from 0 to ( vecSize - 1 ) each time, since
    // otherwise stores might overlap each other, and it'd be a nightmare to test!
    for( cl_uint addressOffset = 0; addressOffset < vecSize; addressOffset++ )
    {
        if (DEBUG)
            log_info("\tstore addressOffset is %d, executing with threads %d\n", addressOffset, (int)threads[0]);

        // Clear the results first
        memset( outBuffer, 0, numElements * typeSize * vecSize );
        error = clEnqueueWriteBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
        test_error( error, "Unable to erase result stream" );

        // Set up the new offset and run
        if (numArgs == 5)
            error = clSetKernelArg( kernel, 3+1, sizeof( cl_uint ), &addressOffset );
        else
            error = clSetKernelArg( kernel, 3, sizeof( cl_uint ), &addressOffset );
        test_error( error, "Unable to set address offset argument" );

        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
        test_error( error, "Unable to exec kernel" );

        // Get the results
        error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
        test_error( error, "Unable to read results" );


        // Create the reference results
        memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) );
        for( i = 0; i < numStores; i++ )
        {
            memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize );
        }

        // Validate the results now
        char *expected = referenceBuffer;
        char *actual = (char *)(void *)outBuffer;

        if (DEBUG)
        {
            log_info("Memory contents:\n");
            for (i=0; i<numElements; i++)
            {
                char inString[1024];
                char expectedString[ 1024 ], actualString[ 1024 ];
                if (i < numStores)
                {
                    log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                             GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                             GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ),
                             offsets[i]);
                    if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*vecSize]), typeSize * vecSize) != 0)
                        log_error(" << ERROR\n");
                    else
                        log_info("\n");
                }
                else
                {
                    log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                             GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                             GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ));
                }
            }
        }

        for( i = 0; i < numElements; i++ )
        {
            if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
            {
                char expectedString[ 1024 ], actualString[ 1024 ];
                log_error( "ERROR: Data sample %d for vstore of %s%d did not validate (expected {%s}, got {%s}",
                           (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
                           GetDataVectorString( actual, typeSize, vecSize, actualString ) );
                size_t j;
                for( j = 0; j < numStores; j++ )
                {
                    if( offsets[ j ] == (cl_uint)i )
                    {
                        log_error( ", stored from store #%d (of %d, offset = %d) with address offset of %d", (int)j, (int)numStores, offsets[j], (int)addressOffset );
                        break;
                    }
                }
                if( j == numStores )
                    log_error( ", supposed to be canary value" );
                log_error( ")\n" );
                return 1;
            }
            expected += typeSize * vecSize;
            actual += typeSize * vecSize;
        }
    }

    return 0;
}

int test_vstoreset(cl_device_id device, cl_context context, cl_command_queue queue, create_vstore_program_fn createFn, size_t bufferSize )
{
    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
    const char *size_names[] = { "2", "3", "4", "8", "16"};
    unsigned int typeIdx, sizeIdx;
    int error = 0;
    MTdata d = init_genrand( gRandomSeed );

    log_info("Testing with buffer size of %d.\n", (int)bufferSize);

    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
    {
        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
            continue;

        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
            continue;

        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
        {
            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);

            int error_this_type = test_vstore( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, d );
            if (error_this_type)
            {
                log_error("Failure; skipping further sizes for this type.\n");
                error += error_this_type;
                break;
            }
        }
    }

    free_mtdata(d);
    return error;
}


#pragma mark -------------------- vstore test cases --------------------------

void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
    const char *pattern =
        "%s"
        "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
        "}\n";

    const char *patternV3 =
        "%s"
        "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " if((tid&3) == 0) { // if \"tid\" is a multiple of 4 \n"
        " vstore3( srcValues[ 3*(tid>>2) ], offsets[ tid ], destBuffer + alignmentOffset );\n"
        " } else {\n"
        " vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
        " }\n"
        "}\n";

    const char *typeName = get_explicit_type_name(type);

    if(inVectorSize == 3) {
        sprintf( destBuffer, patternV3,
                 type == kDouble ? doubleExtensionPragma : "",
                 typeName, typeName, typeName);

    } else {
        sprintf( destBuffer, pattern,
                 type == kDouble ? doubleExtensionPragma : "",
                 typeName, (int)inVectorSize, typeName, (int)inVectorSize );
    }
    // if(inVectorSize == 3 || inVectorSize == 4) {
    //     log_info("\n----\n%s\n----\n", destBuffer);
    // }
}

int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
{
    return test_vstoreset( device, context, queue, create_global_store_code, 10240 );
}


void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
{
    const char *pattern =
        "%s"
        "\n"
        "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
        " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n"
        " sSharedStorage[ offsets[tid] +1 ] = sSharedStorage[ offsets[tid] ];\n"
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n"
        "\n"
        // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
        // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
        // otherwise, local threads would be overwriting results from other local threads
        " int i;\n"
        " __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
        " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
        " for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
        " dp[i] = sp[i];\n"
        "}\n";

    const char *patternV3 =
        "%s"
        "\n"
        "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
        " sSharedStorage[ 3*offsets[tid] ] = (%s)0;\n"
        " sSharedStorage[ 3*offsets[tid] +1 ] = \n"
        " sSharedStorage[ 3*offsets[tid] ];\n"
        " sSharedStorage[ 3*offsets[tid] +2 ] = \n"
        " sSharedStorage[ 3*offsets[tid]];\n"
        " sSharedStorage[ 3*offsets[tid] +3 ] = \n"
        " sSharedStorage[ 3*offsets[tid]];\n"
        " sSharedStorage[ 3*offsets[tid] +4 ] = \n"
        " sSharedStorage[ 3*offsets[tid] ];\n"
        " sSharedStorage[ 3*offsets[tid] +5 ] = \n"
        " sSharedStorage[ 3*offsets[tid]];\n"
857 " barrier( CLK_LOCAL_MEM_FENCE );\n"
858 "\n"
859 " vstore3( vload3(tid,srcValues), offsets[ tid ], sSharedStorage + alignmentOffset );\n"
860 "\n"
861 // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
862 // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
863 " barrier( CLK_LOCAL_MEM_FENCE );\n"
864 "\n"
865 // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
866 // otherwise, local threads would be overwriting results from other local threads
867 " int i;\n"
868 " __local %s *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
869 " __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
870 " for( i = 0; i < 3; i++ ) \n"
871 " dp[i] = sp[i];\n"
872 "}\n";
873
874 const char *typeName = get_explicit_type_name(type);
875 if(inVectorSize == 3) {
876 sprintf( destBuffer, patternV3,
877 type == kDouble ? doubleExtensionPragma : "",
878 typeName,
879 typeName,
880 typeName, typeName,
881 typeName, typeName, typeName );
882 } else {
883 sprintf( destBuffer, pattern,
884 type == kDouble ? doubleExtensionPragma : "",
885 typeName, (int)inVectorSize,
886 typeName, (int)inVectorSize, typeName, (int)inVectorSize,
887 typeName, (int)inVectorSize, typeName,
888 (int)inVectorSize, typeName, typeName,
889 typeName, typeName, typeName );
890 }
891 // log_info(destBuffer);
892 }
893
test_vstore_local(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)894 int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
895 {
896 // Determine the max size of a local buffer that we can test against
897 cl_ulong localSize;
898 int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
899 test_error( error, "Unable to get max size of local memory buffer" );
900 if( localSize > 10240 )
901 localSize = 10240;
902 if (localSize > 4096)
903 localSize -= 2048;
904 else
905 localSize /= 2;
906 return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize );
907 }
908
909
create_private_store_code(char * destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize)910 void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
911 {
912 const char *pattern =
913 "%s"
        // Private memory is unique per thread, unlike local storage, which is shared across a work-group. That means
        // this test has to copy the entire test buffer into private storage ON EACH THREAD to be effective.
916 "\n"
917 "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
918 "{\n"
919 " __private %s%d sPrivateStorage[ %d ];\n"
920 " int tid = get_global_id( 0 );\n"
        // We need to zero the private storage since any locations we don't write to will have garbage otherwise.
922 " sPrivateStorage[tid] = (%s%d)(%s)0;\n"
923 "\n"
924 " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
925 "\n"
        // Note: we only copy the relevant portion of our private storage over to the dest buffer, because
        // otherwise, work-items would be overwriting results from other work-items
928 " uint i;\n"
929 " __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
930 " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
931 " for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
932 " dp[i] = sp[i];\n"
933 "}\n";
934
935
936 const char *patternV3 =
937 "%s"
        // Private memory is unique per thread, unlike local storage, which is shared across a work-group. That means
        // this test has to copy the entire test buffer into private storage ON EACH THREAD to be effective.
940 "\n"
941 "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n"
942 "{\n"
943 " __private %s3 sPrivateStorage[ %d ];\n" // keep this %d
944 " int tid = get_global_id( 0 );\n"
        // We need to zero the private storage since any locations we don't write to will have garbage otherwise.
946 " sPrivateStorage[tid] = (%s3)(%s)0;\n"
947 "\n"
948
949 " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
950 "\n"
        // Note: we only copy the relevant portion of our private storage over to the dest buffer, because
        // otherwise, work-items would be overwriting results from other work-items
953 " uint i;\n"
954 " __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
955 " __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
956 " for( i = 0; i < 3; i++ ) \n"
957 " dp[i] = sp[i];\n"
958 "}\n";
959
960 const char *typeName = get_explicit_type_name(type);
961 if(inVectorSize == 3) {
962 sprintf( destBuffer, patternV3,
963 type == kDouble ? doubleExtensionPragma : "",
964 typeName, typeName,
965 typeName, (int)inBufferSize,
966 typeName, typeName,
967 typeName, typeName, typeName, typeName, typeName );
968 } else {
969 sprintf( destBuffer, pattern,
970 type == kDouble ? doubleExtensionPragma : "",
971 typeName, (int)inVectorSize, typeName, (int)inVectorSize,
972 typeName, (int)inVectorSize, (int)inBufferSize,
973 typeName, (int)inVectorSize, typeName,
974 (int)inVectorSize, typeName, typeName, typeName, typeName, typeName );
975 }
976 }
977
test_vstore_private(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)978 int test_vstore_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
979 {
    // We have no idea how much actual private storage is available, so just pick a reasonable value:
    // enough to fit at least two 16-element vectors of an 8-byte type, i.e. 2 * 8 bytes * 16 = 256 bytes.
    return test_vstoreset( device, context, queue, create_private_store_code, 256 );
}