1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)26 int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
27 cl_kernel *k, cl_program *p, bool relaxedMode)
28 {
29 const char *c[] = { "__kernel void math_kernel",
30 sizeNames[vectorSize],
31 "( __global int",
32 sizeNames[vectorSize],
33 "* out, __global float",
34 sizeNames[vectorSize],
35 "* in1, __global float",
36 sizeNames[vectorSize],
37 "* in2 )\n"
38 "{\n"
39 " size_t i = get_global_id(0);\n"
40 " out[i] = ",
41 name,
42 "( in1[i], in2[i] );\n"
43 "}\n" };
44
45 const char *c3[] = {
46 "__kernel void math_kernel",
47 sizeNames[vectorSize],
48 "( __global int* out, __global float* in, __global float* in2)\n"
49 "{\n"
50 " size_t i = get_global_id(0);\n"
51 " if( i + 1 < get_global_size(0) )\n"
52 " {\n"
53 " float3 f0 = vload3( 0, in + 3 * i );\n"
54 " float3 f1 = vload3( 0, in2 + 3 * i );\n"
55 " int3 i0 = ",
56 name,
57 "( f0, f1 );\n"
58 " vstore3( i0, 0, out + 3*i );\n"
59 " }\n"
60 " else\n"
61 " {\n"
62 " size_t parity = i & 1; // Figure out how many elements are "
63 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
64 "buffer size \n"
65 " float3 f0;\n"
66 " float3 f1;\n"
67 " switch( parity )\n"
68 " {\n"
69 " case 1:\n"
70 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
71 " f1 = (float3)( in2[3*i], NAN, NAN ); \n"
72 " break;\n"
73 " case 0:\n"
74 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
75 " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
76 " break;\n"
77 " }\n"
78 " int3 i0 = ",
79 name,
80 "( f0, f1 );\n"
81 " switch( parity )\n"
82 " {\n"
83 " case 0:\n"
84 " out[3*i+1] = i0.y; \n"
85 " // fall through\n"
86 " case 1:\n"
87 " out[3*i] = i0.x; \n"
88 " break;\n"
89 " }\n"
90 " }\n"
91 "}\n"
92 };
93
94 const char **kern = c;
95 size_t kernSize = sizeof(c) / sizeof(c[0]);
96
97 if (sizeValues[vectorSize] == 3)
98 {
99 kern = c3;
100 kernSize = sizeof(c3) / sizeof(c3[0]);
101 }
102
103 char testName[32];
104 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
105 sizeNames[vectorSize]);
106
107 return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
108 relaxedMode);
109 }
110
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)111 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
112 {
113 BuildKernelInfo *info = (BuildKernelInfo *)p;
114 cl_uint vectorSize = gMinVectorSizeIndex + job_id;
115 return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
116 info->kernels[vectorSize].data(),
117 &(info->programs[vectorSize]), info->relaxedMode);
118 }
119
120 // Thread specific data for a worker thread
121 struct ThreadInfo
122 {
123 // Input and output buffers for the thread
124 clMemWrapper inBuf;
125 clMemWrapper inBuf2;
126 Buffers outBuf;
127
128 MTdataHolder d;
129
130 // Per thread command queue to improve performance
131 clCommandQueueWrapper tQueue;
132 };
133
134 struct TestInfo
135 {
136 size_t subBufferSize; // Size of the sub-buffer in elements
137 const Func *f; // A pointer to the function info
138
139 // Programs for various vector sizes.
140 Programs programs;
141
142 // Thread-specific kernels for each vector size:
143 // k[vector_size][thread_id]
144 KernelMatrix k;
145
146 // Array of thread specific information
147 std::vector<ThreadInfo> tinfo;
148
149 cl_uint threadCount; // Number of worker threads
150 cl_uint jobCount; // Number of jobs
151 cl_uint step; // step between each chunk and the next.
152 cl_uint scale; // stride between individual test values
153 int ftz; // non-zero if running in flush to zero mode
154 bool relaxedMode; // True if test is running in relaxed mode, false
155 // otherwise.
156 };
157
158 // A table of more difficult cases to get right
159 const float specialValues[] = {
160 -NAN,
161 -INFINITY,
162 -FLT_MAX,
163 MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
164 MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
165 MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
166 MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
167 MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
168 MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
169 MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
170 MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
171 MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
172 MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
173 MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
174 MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
175 -1000.f,
176 -100.f,
177 -4.0f,
178 -3.5f,
179 -3.0f,
180 MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
181 -2.5f,
182 MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
183 -2.0f,
184 MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
185 -1.5f,
186 MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
187 MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
188 -1.0f,
189 MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
190 MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
191 -0.5f,
192 MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
193 MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
194 -0.25f,
195 MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
196 MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
197 -FLT_MIN,
198 MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
199 MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
200 MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
201 MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
202 MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
203 MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
204 MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
205 MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
206 MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
207 MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
208 -0.0f,
209
210 +NAN,
211 +INFINITY,
212 +FLT_MAX,
213 MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
214 MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
215 MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
216 MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
217 MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
218 MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
219 MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
220 MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
221 MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
222 MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
223 MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
224 MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
225 +1000.f,
226 +100.f,
227 +4.0f,
228 +3.5f,
229 +3.0f,
230 MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
231 2.5f,
232 MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
233 +2.0f,
234 MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
235 1.5f,
236 MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
237 MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
238 +1.0f,
239 MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
240 MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
241 +0.5f,
242 MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
243 MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
244 +0.25f,
245 MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
246 MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
247 +FLT_MIN,
248 MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
249 MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
250 MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
251 MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
252 MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
253 MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
254 MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
255 MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
256 MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
257 MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
258 +0.0f,
259 };
260
261 constexpr size_t specialValuesCount =
262 sizeof(specialValues) / sizeof(specialValues[0]);
263
Test(cl_uint job_id,cl_uint thread_id,void * data)264 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
265 {
266 TestInfo *job = (TestInfo *)data;
267 size_t buffer_elements = job->subBufferSize;
268 size_t buffer_size = buffer_elements * sizeof(cl_float);
269 cl_uint base = job_id * (cl_uint)job->step;
270 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
271 fptr func = job->f->func;
272 int ftz = job->ftz;
273 bool relaxedMode = job->relaxedMode;
274 MTdata d = tinfo->d;
275 cl_int error;
276 const char *name = job->f->name;
277 cl_int *t = 0;
278 cl_int *r = 0;
279 cl_float *s = 0;
280 cl_float *s2 = 0;
281
282 // start the map of the output arrays
283 cl_event e[VECTOR_SIZE_COUNT];
284 cl_int *out[VECTOR_SIZE_COUNT];
285 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
286 {
287 out[j] = (cl_int *)clEnqueueMapBuffer(
288 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
289 buffer_size, 0, NULL, e + j, &error);
290 if (error || NULL == out[j])
291 {
292 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
293 error);
294 return error;
295 }
296 }
297
298 // Get that moving
299 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
300
301 // Init input array
302 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
303 cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
304 cl_uint idx = 0;
305
306 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
307 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
308
309 if (job_id <= (cl_uint)lastSpecialJobIndex)
310 { // test edge cases
311 float *fp = (float *)p;
312 float *fp2 = (float *)p2;
313 uint32_t x, y;
314
315 x = (job_id * buffer_elements) % specialValuesCount;
316 y = (job_id * buffer_elements) / specialValuesCount;
317
318 for (; idx < buffer_elements; idx++)
319 {
320 fp[idx] = specialValues[x];
321 fp2[idx] = specialValues[y];
322 ++x;
323 if (x >= specialValuesCount)
324 {
325 x = 0;
326 y++;
327 if (y >= specialValuesCount) break;
328 }
329 }
330 }
331
332 // Init any remaining values.
333 for (; idx < buffer_elements; idx++)
334 {
335 p[idx] = genrand_int32(d);
336 p2[idx] = genrand_int32(d);
337 }
338
339 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
340 buffer_size, p, 0, NULL, NULL)))
341 {
342 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
343 goto exit;
344 }
345
346 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
347 buffer_size, p2, 0, NULL, NULL)))
348 {
349 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
350 goto exit;
351 }
352
353 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
354 {
355 // Wait for the map to finish
356 if ((error = clWaitForEvents(1, e + j)))
357 {
358 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
359 goto exit;
360 }
361 if ((error = clReleaseEvent(e[j])))
362 {
363 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
364 goto exit;
365 }
366
367 // Fill the result buffer with garbage, so that old results don't carry
368 // over
369 uint32_t pattern = 0xffffdead;
370 memset_pattern4(out[j], &pattern, buffer_size);
371 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
372 out[j], 0, NULL, NULL)))
373 {
374 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
375 error);
376 goto exit;
377 }
378
379 // run the kernel
380 size_t vectorCount =
381 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
382 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
383 // own copy of the cl_kernel
384 cl_program program = job->programs[j];
385
386 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
387 &tinfo->outBuf[j])))
388 {
389 LogBuildError(program);
390 return error;
391 }
392 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
393 &tinfo->inBuf)))
394 {
395 LogBuildError(program);
396 return error;
397 }
398 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
399 &tinfo->inBuf2)))
400 {
401 LogBuildError(program);
402 return error;
403 }
404
405 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
406 &vectorCount, NULL, 0, NULL, NULL)))
407 {
408 vlog_error("FAILED -- could not execute kernel\n");
409 goto exit;
410 }
411 }
412
413 // Get that moving
414 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
415
416 if (gSkipCorrectnessTesting) return CL_SUCCESS;
417
418 // Calculate the correctly rounded reference result
419 r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
420 s = (float *)gIn + thread_id * buffer_elements;
421 s2 = (float *)gIn2 + thread_id * buffer_elements;
422 for (size_t j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
423
424 // Read the data back -- no need to wait for the first N-1 buffers but wait
425 // for the last buffer. This is an in order queue.
426 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
427 {
428 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
429 out[j] = (cl_int *)clEnqueueMapBuffer(
430 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
431 buffer_size, 0, NULL, NULL, &error);
432 if (error || NULL == out[j])
433 {
434 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
435 error);
436 goto exit;
437 }
438 }
439
440 // Verify data
441 t = (cl_int *)r;
442 for (size_t j = 0; j < buffer_elements; j++)
443 {
444 cl_int *q = out[0];
445
446 if (gMinVectorSizeIndex == 0 && t[j] != q[j])
447 {
448 if (ftz || relaxedMode)
449 {
450 if (IsFloatSubnormal(s[j]))
451 {
452 if (IsFloatSubnormal(s2[j]))
453 {
454 int correct = func.i_ff(0.0f, 0.0f);
455 int correct2 = func.i_ff(0.0f, -0.0f);
456 int correct3 = func.i_ff(-0.0f, 0.0f);
457 int correct4 = func.i_ff(-0.0f, -0.0f);
458
459 if (correct == q[j] || correct2 == q[j]
460 || correct3 == q[j] || correct4 == q[j])
461 continue;
462 }
463 else
464 {
465 int correct = func.i_ff(0.0f, s2[j]);
466 int correct2 = func.i_ff(-0.0f, s2[j]);
467 if (correct == q[j] || correct2 == q[j]) continue;
468 }
469 }
470 else if (IsFloatSubnormal(s2[j]))
471 {
472 int correct = func.i_ff(s[j], 0.0f);
473 int correct2 = func.i_ff(s[j], -0.0f);
474 if (correct == q[j] || correct2 == q[j]) continue;
475 }
476 }
477
478 uint32_t err = t[j] - q[j];
479 if (q[j] > t[j]) err = q[j] - t[j];
480 vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
481 "0x%8.8x (index: %zu)\n",
482 name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
483 j);
484 error = -1;
485 goto exit;
486 }
487
488 for (auto k = std::max(1U, gMinVectorSizeIndex);
489 k < gMaxVectorSizeIndex; k++)
490 {
491 q = out[k];
492 // If we aren't getting the correctly rounded result
493 if (-t[j] != q[j])
494 {
495 if (ftz || relaxedMode)
496 {
497 if (IsFloatSubnormal(s[j]))
498 {
499 if (IsFloatSubnormal(s2[j]))
500 {
501 int correct = -func.i_ff(0.0f, 0.0f);
502 int correct2 = -func.i_ff(0.0f, -0.0f);
503 int correct3 = -func.i_ff(-0.0f, 0.0f);
504 int correct4 = -func.i_ff(-0.0f, -0.0f);
505
506 if (correct == q[j] || correct2 == q[j]
507 || correct3 == q[j] || correct4 == q[j])
508 continue;
509 }
510 else
511 {
512 int correct = -func.i_ff(0.0f, s2[j]);
513 int correct2 = -func.i_ff(-0.0f, s2[j]);
514 if (correct == q[j] || correct2 == q[j]) continue;
515 }
516 }
517 else if (IsFloatSubnormal(s2[j]))
518 {
519 int correct = -func.i_ff(s[j], 0.0f);
520 int correct2 = -func.i_ff(s[j], -0.0f);
521 if (correct == q[j] || correct2 == q[j]) continue;
522 }
523 }
524 cl_uint err = -t[j] - q[j];
525 if (q[j] > -t[j]) err = q[j] + t[j];
526 vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
527 "vs. 0x%8.8x (index: %zu)\n",
528 name, sizeNames[k], err, ((float *)s)[j],
529 ((float *)s2)[j], -t[j], q[j], j);
530 error = -1;
531 goto exit;
532 }
533 }
534 }
535
536 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
537 {
538 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
539 out[j], 0, NULL, NULL)))
540 {
541 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
542 j, error);
543 return error;
544 }
545 }
546
547 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
548
549
550 if (0 == (base & 0x0fffffff))
551 {
552 if (gVerboseBruteForce)
553 {
554 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
555 "ThreadCount:%2u\n",
556 base, job->step, job->scale, buffer_elements,
557 job->threadCount);
558 }
559 else
560 {
561 vlog(".");
562 }
563 fflush(stdout);
564 }
565
566 exit:
567 return error;
568 }
569
570 } // anonymous namespace
571
TestMacro_Int_Float_Float(const Func * f,MTdata d,bool relaxedMode)572 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
573 {
574 TestInfo test_info{};
575 cl_int error;
576
577 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
578
579 // Init test_info
580 test_info.threadCount = GetThreadCount();
581 test_info.subBufferSize = BUFFER_SIZE
582 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
583 test_info.scale = getTestScale(sizeof(cl_float));
584
585 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
586 if (test_info.step / test_info.subBufferSize != test_info.scale)
587 {
588 // there was overflow
589 test_info.jobCount = 1;
590 }
591 else
592 {
593 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
594 }
595
596 test_info.f = f;
597 test_info.ftz =
598 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
599 test_info.relaxedMode = relaxedMode;
600
601 // cl_kernels aren't thread safe, so we make one for each vector size for
602 // every thread
603 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
604 {
605 test_info.k[i].resize(test_info.threadCount, nullptr);
606 }
607
608 test_info.tinfo.resize(test_info.threadCount);
609 for (cl_uint i = 0; i < test_info.threadCount; i++)
610 {
611 cl_buffer_region region = {
612 i * test_info.subBufferSize * sizeof(cl_float),
613 test_info.subBufferSize * sizeof(cl_float)
614 };
615 test_info.tinfo[i].inBuf =
616 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
617 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
618 if (error || NULL == test_info.tinfo[i].inBuf)
619 {
620 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
621 "region {%zd, %zd}\n",
622 region.origin, region.size);
623 goto exit;
624 }
625 test_info.tinfo[i].inBuf2 =
626 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
627 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
628 if (error || NULL == test_info.tinfo[i].inBuf2)
629 {
630 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
631 "region {%zd, %zd}\n",
632 region.origin, region.size);
633 goto exit;
634 }
635
636 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
637 {
638 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
639 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
640 ®ion, &error);
641 if (error || NULL == test_info.tinfo[i].outBuf[j])
642 {
643 vlog_error("Error: Unable to create sub-buffer of "
644 "gOutBuffer[%d] for region {%zd, %zd}\n",
645 (int)j, region.origin, region.size);
646 goto exit;
647 }
648 }
649 test_info.tinfo[i].tQueue =
650 clCreateCommandQueue(gContext, gDevice, 0, &error);
651 if (NULL == test_info.tinfo[i].tQueue || error)
652 {
653 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
654 goto exit;
655 }
656
657 test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
658 }
659
660 // Init the kernels
661 {
662 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
663 test_info.programs, f->nameInCode,
664 relaxedMode };
665 if ((error = ThreadPool_Do(BuildKernelFn,
666 gMaxVectorSizeIndex - gMinVectorSizeIndex,
667 &build_info)))
668 goto exit;
669 }
670
671 // Run the kernels
672 if (!gSkipCorrectnessTesting)
673 {
674 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
675
676 if (error) goto exit;
677
678 if (gWimpyMode)
679 vlog("Wimp pass");
680 else
681 vlog("passed");
682 }
683
684 vlog("\n");
685
686 exit:
687 // Release
688 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
689 {
690 for (auto &kernel : test_info.k[i])
691 {
692 clReleaseKernel(kernel);
693 }
694 }
695
696 return error;
697 }
698