• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <climits>
23 #include <cstring>
24 
25 namespace {
26 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)27 int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
28                 cl_kernel *k, cl_program *p, bool relaxedMode)
29 {
30     const char *c[] = { "__kernel void math_kernel",
31                         sizeNames[vectorSize],
32                         "( __global float",
33                         sizeNames[vectorSize],
34                         "* out, __global float",
35                         sizeNames[vectorSize],
36                         "* in1, __global int",
37                         sizeNames[vectorSize],
38                         "* in2 )\n"
39                         "{\n"
40                         "   size_t i = get_global_id(0);\n"
41                         "   out[i] = ",
42                         name,
43                         "( in1[i], in2[i] );\n"
44                         "}\n" };
45 
46     const char *c3[] = {
47         "__kernel void math_kernel",
48         sizeNames[vectorSize],
49         "( __global float* out, __global float* in, __global int* in2)\n"
50         "{\n"
51         "   size_t i = get_global_id(0);\n"
52         "   if( i + 1 < get_global_size(0) )\n"
53         "   {\n"
54         "       float3 f0 = vload3( 0, in + 3 * i );\n"
55         "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
56         "       f0 = ",
57         name,
58         "( f0, i0 );\n"
59         "       vstore3( f0, 0, out + 3*i );\n"
60         "   }\n"
61         "   else\n"
62         "   {\n"
63         "       size_t parity = i & 1;   // Figure out how many elements are "
64         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
65         "buffer size \n"
66         "       float3 f0;\n"
67         "       int3 i0;\n"
68         "       switch( parity )\n"
69         "       {\n"
70         "           case 1:\n"
71         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
72         "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
73         "               break;\n"
74         "           case 0:\n"
75         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
76         "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
77         "               break;\n"
78         "       }\n"
79         "       f0 = ",
80         name,
81         "( f0, i0 );\n"
82         "       switch( parity )\n"
83         "       {\n"
84         "           case 0:\n"
85         "               out[3*i+1] = f0.y; \n"
86         "               // fall through\n"
87         "           case 1:\n"
88         "               out[3*i] = f0.x; \n"
89         "               break;\n"
90         "       }\n"
91         "   }\n"
92         "}\n"
93     };
94 
95     const char **kern = c;
96     size_t kernSize = sizeof(c) / sizeof(c[0]);
97 
98     if (sizeValues[vectorSize] == 3)
99     {
100         kern = c3;
101         kernSize = sizeof(c3) / sizeof(c3[0]);
102     }
103 
104     char testName[32];
105     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
106              sizeNames[vectorSize]);
107 
108     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
109                        relaxedMode);
110 }
111 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)112 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
113 {
114     BuildKernelInfo *info = (BuildKernelInfo *)p;
115     cl_uint vectorSize = gMinVectorSizeIndex + job_id;
116     return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
117                        info->kernels[vectorSize].data(),
118                        &(info->programs[vectorSize]), info->relaxedMode);
119 }
120 
121 // Thread specific data for a worker thread
122 struct ThreadInfo
123 {
124     // Input and output buffers for the thread
125     clMemWrapper inBuf;
126     clMemWrapper inBuf2;
127     Buffers outBuf;
128 
129     float maxError; // max error value. Init to 0.
130     double
131         maxErrorValue; // position of the max error value (param 1).  Init to 0.
132     cl_int maxErrorValue2; // position of the max error value (param 2).  Init
133                            // to 0.
134     MTdataHolder d;
135 
136     // Per thread command queue to improve performance
137     clCommandQueueWrapper tQueue;
138 };
139 
140 struct TestInfo
141 {
142     size_t subBufferSize; // Size of the sub-buffer in elements
143     const Func *f; // A pointer to the function info
144 
145     // Programs for various vector sizes.
146     Programs programs;
147 
148     // Thread-specific kernels for each vector size:
149     // k[vector_size][thread_id]
150     KernelMatrix k;
151 
152     // Array of thread specific information
153     std::vector<ThreadInfo> tinfo;
154 
155     cl_uint threadCount; // Number of worker threads
156     cl_uint jobCount; // Number of jobs
157     cl_uint step; // step between each chunk and the next.
158     cl_uint scale; // stride between individual test values
159     float ulps; // max_allowed ulps
160     int ftz; // non-zero if running in flush to zero mode
161     bool relaxedMode; // True if test is running in relaxed mode, false
162                       // otherwise.
163     // no special values
164 };
165 
166 // A table of more difficult cases to get right
167 const float specialValues[] = {
168     -NAN,
169     -INFINITY,
170     -FLT_MAX,
171     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
172     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
173     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
174     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
175     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
176     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
177     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
178     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
179     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
180     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
181     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
182     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
183     -1000.f,
184     -100.f,
185     -4.0f,
186     -3.5f,
187     -3.0f,
188     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
189     -2.5f,
190     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
191     -2.0f,
192     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
193     -1.5f,
194     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
195     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
196     -1.0f,
197     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
198     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
199     -0.5f,
200     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
201     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
202     -0.25f,
203     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
204     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
205     -FLT_MIN,
206     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
207     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
208     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
209     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
210     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
211     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
212     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
213     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
214     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
215     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
216     -0.0f,
217 
218     +NAN,
219     +INFINITY,
220     +FLT_MAX,
221     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
222     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
223     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
224     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
225     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
226     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
227     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
228     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
229     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
230     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
231     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
232     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
233     +1000.f,
234     +100.f,
235     +4.0f,
236     +3.5f,
237     +3.0f,
238     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
239     2.5f,
240     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
241     +2.0f,
242     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
243     1.5f,
244     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
245     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
246     +1.0f,
247     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
248     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
249     +0.5f,
250     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
251     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
252     +0.25f,
253     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
254     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
255     +FLT_MIN,
256     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
257     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
258     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
259     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
260     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
261     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
262     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
263     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
264     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
265     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
266     +0.0f,
267 };
268 
269 constexpr size_t specialValuesCount =
270     sizeof(specialValues) / sizeof(specialValues[0]);
271 
272 const int specialValuesInt[] = {
273     0,           1,           2,           3,          126,        127,
274     128,         0x02000001,  0x04000001,  1465264071, 1488522147, -1,
275     -2,          -3,          -126,        -127,       -128,       -0x02000001,
276     -0x04000001, -1465264071, -1488522147,
277 };
278 
279 constexpr size_t specialValuesIntCount =
280     sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
281 
Test(cl_uint job_id,cl_uint thread_id,void * data)282 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
283 {
284     TestInfo *job = (TestInfo *)data;
285     size_t buffer_elements = job->subBufferSize;
286     size_t buffer_size = buffer_elements * sizeof(cl_float);
287     cl_uint base = job_id * (cl_uint)job->step;
288     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
289     fptr func = job->f->func;
290     int ftz = job->ftz;
291     bool relaxedMode = job->relaxedMode;
292     float ulps = job->ulps;
293     MTdata d = tinfo->d;
294     cl_int error;
295     const char *name = job->f->name;
296     cl_uint *t = 0;
297     cl_float *r = 0;
298     cl_float *s = 0;
299     cl_int *s2 = 0;
300 
301     // start the map of the output arrays
302     cl_event e[VECTOR_SIZE_COUNT];
303     cl_uint *out[VECTOR_SIZE_COUNT];
304     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
305     {
306         out[j] = (cl_uint *)clEnqueueMapBuffer(
307             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
308             buffer_size, 0, NULL, e + j, &error);
309         if (error || NULL == out[j])
310         {
311             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
312                        error);
313             return error;
314         }
315     }
316 
317     // Get that moving
318     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
319 
320     // Init input array
321     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
322     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
323     size_t idx = 0;
324     int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
325     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
326 
327     if (job_id <= (cl_uint)lastSpecialJobIndex)
328     { // test edge cases
329         float *fp = (float *)p;
330         cl_int *ip2 = (cl_int *)p2;
331         uint32_t x, y;
332 
333         x = (job_id * buffer_elements) % specialValuesCount;
334         y = (job_id * buffer_elements) / specialValuesCount;
335 
336         for (; idx < buffer_elements; idx++)
337         {
338             fp[idx] = specialValues[x];
339             ip2[idx] = specialValuesInt[y];
340             ++x;
341             if (x >= specialValuesCount)
342             {
343                 x = 0;
344                 y++;
345                 if (y >= specialValuesIntCount) break;
346             }
347         }
348     }
349 
350     // Init any remaining values.
351     for (; idx < buffer_elements; idx++)
352     {
353         p[idx] = genrand_int32(d);
354         p2[idx] = genrand_int32(d);
355     }
356 
357     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
358                                       buffer_size, p, 0, NULL, NULL)))
359     {
360         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
361         goto exit;
362     }
363 
364     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
365                                       buffer_size, p2, 0, NULL, NULL)))
366     {
367         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
368         goto exit;
369     }
370 
371     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
372     {
373         // Wait for the map to finish
374         if ((error = clWaitForEvents(1, e + j)))
375         {
376             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
377             goto exit;
378         }
379         if ((error = clReleaseEvent(e[j])))
380         {
381             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
382             goto exit;
383         }
384 
385         // Fill the result buffer with garbage, so that old results don't carry
386         // over
387         uint32_t pattern = 0xffffdead;
388         memset_pattern4(out[j], &pattern, buffer_size);
389         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
390                                              out[j], 0, NULL, NULL)))
391         {
392             vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
393                        error);
394             goto exit;
395         }
396 
397         // run the kernel
398         size_t vectorCount =
399             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
400         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
401                                                  // own copy of the cl_kernel
402         cl_program program = job->programs[j];
403 
404         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
405                                     &tinfo->outBuf[j])))
406         {
407             LogBuildError(program);
408             return error;
409         }
410         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
411                                     &tinfo->inBuf)))
412         {
413             LogBuildError(program);
414             return error;
415         }
416         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
417                                     &tinfo->inBuf2)))
418         {
419             LogBuildError(program);
420             return error;
421         }
422 
423         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
424                                             &vectorCount, NULL, 0, NULL, NULL)))
425         {
426             vlog_error("FAILED -- could not execute kernel\n");
427             goto exit;
428         }
429     }
430 
431     // Get that moving
432     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
433 
434     if (gSkipCorrectnessTesting) return CL_SUCCESS;
435 
436     // Calculate the correctly rounded reference result
437     r = (float *)gOut_Ref + thread_id * buffer_elements;
438     s = (float *)gIn + thread_id * buffer_elements;
439     s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
440     for (size_t j = 0; j < buffer_elements; j++)
441         r[j] = (float)func.f_fi(s[j], s2[j]);
442 
443     // Read the data back -- no need to wait for the first N-1 buffers but wait
444     // for the last buffer. This is an in order queue.
445     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
446     {
447         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
448         out[j] = (cl_uint *)clEnqueueMapBuffer(
449             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
450             buffer_size, 0, NULL, NULL, &error);
451         if (error || NULL == out[j])
452         {
453             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
454                        error);
455             goto exit;
456         }
457     }
458 
459     // Verify data
460     t = (cl_uint *)r;
461     for (size_t j = 0; j < buffer_elements; j++)
462     {
463         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
464         {
465             cl_uint *q = out[k];
466 
467             // If we aren't getting the correctly rounded result
468             if (t[j] != q[j])
469             {
470                 float test = ((float *)q)[j];
471                 double correct = func.f_fi(s[j], s2[j]);
472                 float err = Ulp_Error(test, correct);
473                 int fail = !(fabsf(err) <= ulps);
474 
475                 if (fail && (ftz || relaxedMode))
476                 {
477                     // retry per section 6.5.3.2
478                     if (IsFloatResultSubnormal(correct, ulps))
479                     {
480                         fail = fail && (test != 0.0f);
481                         if (!fail) err = 0.0f;
482                     }
483 
484                     // retry per section 6.5.3.3
485                     if (IsFloatSubnormal(s[j]))
486                     {
487                         double correct2, correct3;
488                         float err2, err3;
489                         correct2 = func.f_fi(0.0, s2[j]);
490                         correct3 = func.f_fi(-0.0, s2[j]);
491                         err2 = Ulp_Error(test, correct2);
492                         err3 = Ulp_Error(test, correct3);
493                         fail = fail
494                             && ((!(fabsf(err2) <= ulps))
495                                 && (!(fabsf(err3) <= ulps)));
496                         if (fabsf(err2) < fabsf(err)) err = err2;
497                         if (fabsf(err3) < fabsf(err)) err = err3;
498 
499                         // retry per section 6.5.3.4
500                         if (IsFloatResultSubnormal(correct2, ulps)
501                             || IsFloatResultSubnormal(correct3, ulps))
502                         {
503                             fail = fail && (test != 0.0f);
504                             if (!fail) err = 0.0f;
505                         }
506                     }
507                 }
508 
509                 if (fabsf(err) > tinfo->maxError)
510                 {
511                     tinfo->maxError = fabsf(err);
512                     tinfo->maxErrorValue = s[j];
513                     tinfo->maxErrorValue2 = s2[j];
514                 }
515                 if (fail)
516                 {
517                     vlog_error(
518                         "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
519                         "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %zu\n",
520                         name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
521                         s2[j], r[j], ((uint32_t *)r)[j], test,
522                         ((cl_uint *)&test)[0], j);
523                     error = -1;
524                     goto exit;
525                 }
526             }
527         }
528     }
529 
530     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
531     {
532         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
533                                              out[j], 0, NULL, NULL)))
534         {
535             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
536                        j, error);
537             return error;
538         }
539     }
540 
541     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
542 
543 
544     if (0 == (base & 0x0fffffff))
545     {
546         if (gVerboseBruteForce)
547         {
548             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
549                  "ThreadCount:%2u\n",
550                  base, job->step, job->scale, buffer_elements, job->ulps,
551                  job->threadCount);
552         }
553         else
554         {
555             vlog(".");
556         }
557         fflush(stdout);
558     }
559 
560 exit:
561     return error;
562 }
563 
564 } // anonymous namespace
565 
TestFunc_Float_Float_Int(const Func * f,MTdata d,bool relaxedMode)566 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
567 {
568     TestInfo test_info{};
569     cl_int error;
570     float maxError = 0.0f;
571     double maxErrorVal = 0.0;
572     cl_int maxErrorVal2 = 0;
573 
574     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
575 
576     // Init test_info
577     test_info.threadCount = GetThreadCount();
578     test_info.subBufferSize = BUFFER_SIZE
579         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
580     test_info.scale = getTestScale(sizeof(cl_float));
581 
582     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
583     if (test_info.step / test_info.subBufferSize != test_info.scale)
584     {
585         // there was overflow
586         test_info.jobCount = 1;
587     }
588     else
589     {
590         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
591     }
592 
593     test_info.f = f;
594     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
595     test_info.ftz =
596         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
597     test_info.relaxedMode = relaxedMode;
598 
599     // cl_kernels aren't thread safe, so we make one for each vector size for
600     // every thread
601     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
602     {
603         test_info.k[i].resize(test_info.threadCount, nullptr);
604     }
605 
606     test_info.tinfo.resize(test_info.threadCount);
607     for (cl_uint i = 0; i < test_info.threadCount; i++)
608     {
609         cl_buffer_region region = {
610             i * test_info.subBufferSize * sizeof(cl_float),
611             test_info.subBufferSize * sizeof(cl_float)
612         };
613         test_info.tinfo[i].inBuf =
614             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
615                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
616         if (error || NULL == test_info.tinfo[i].inBuf)
617         {
618             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
619                        "region {%zd, %zd}\n",
620                        region.origin, region.size);
621             goto exit;
622         }
623         cl_buffer_region region2 = { i * test_info.subBufferSize
624                                          * sizeof(cl_int),
625                                      test_info.subBufferSize * sizeof(cl_int) };
626         test_info.tinfo[i].inBuf2 =
627             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
628                               CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
629         if (error || NULL == test_info.tinfo[i].inBuf2)
630         {
631             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
632                        "region {%zd, %zd}\n",
633                        region.origin, region.size);
634             goto exit;
635         }
636 
637         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
638         {
639             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
640                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
641                 &region, &error);
642             if (error || NULL == test_info.tinfo[i].outBuf[j])
643             {
644                 vlog_error("Error: Unable to create sub-buffer of "
645                            "gOutBuffer[%d] for region {%zd, %zd}\n",
646                            (int)j, region.origin, region.size);
647                 goto exit;
648             }
649         }
650         test_info.tinfo[i].tQueue =
651             clCreateCommandQueue(gContext, gDevice, 0, &error);
652         if (NULL == test_info.tinfo[i].tQueue || error)
653         {
654             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
655             goto exit;
656         }
657 
658         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
659     }
660 
661     // Init the kernels
662     {
663         BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
664                                     test_info.programs, f->nameInCode,
665                                     relaxedMode };
666         if ((error = ThreadPool_Do(BuildKernelFn,
667                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
668                                    &build_info)))
669             goto exit;
670     }
671 
672     // Run the kernels
673     if (!gSkipCorrectnessTesting)
674     {
675         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
676 
677         // Accumulate the arithmetic errors
678         for (cl_uint i = 0; i < test_info.threadCount; i++)
679         {
680             if (test_info.tinfo[i].maxError > maxError)
681             {
682                 maxError = test_info.tinfo[i].maxError;
683                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
684                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
685             }
686         }
687 
688         if (error) goto exit;
689 
690         if (gWimpyMode)
691             vlog("Wimp pass");
692         else
693             vlog("passed");
694 
695         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
696     }
697 
698     vlog("\n");
699 
700 exit:
701     // Release
702     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
703     {
704         for (auto &kernel : test_info.k[i])
705         {
706             clReleaseKernel(kernel);
707         }
708     }
709 
710     return error;
711 }
712