• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cinttypes>
23 #include <cstring>
24 
25 #define CORRECTLY_ROUNDED 0
26 #define FLUSHED 1
27 
28 namespace {
29 
BuildKernel(const char * name,int vectorSize,cl_kernel * k,cl_program * p,bool relaxedMode)30 int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
31                 bool relaxedMode)
32 {
33     auto kernel_name = GetKernelName(vectorSize);
34     auto source = GetTernaryKernel(kernel_name, name, ParameterType::Float,
35                                    ParameterType::Float, ParameterType::Float,
36                                    ParameterType::Float, vectorSize);
37     std::array<const char *, 1> sources{ source.c_str() };
38     return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p,
39                       relaxedMode);
40 }
41 
42 struct BuildKernelInfo2
43 {
44     cl_kernel *kernels;
45     Programs &programs;
46     const char *nameInCode;
47     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
48 };
49 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)50 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
51 {
52     BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
53     cl_uint vectorSize = gMinVectorSizeIndex + job_id;
54     return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
55                        &(info->programs[vectorSize]), info->relaxedMode);
56 }
57 
58 // A table of more difficult cases to get right
59 const float specialValues[] = {
60     -NAN,
61     -INFINITY,
62     -FLT_MAX,
63     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
64     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
65     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
66     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
67     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
68     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
69     -3.0f,
70     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
71     -2.5f,
72     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
73     -2.0f,
74     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
75     -1.75f,
76     -1.5f,
77     -1.25f,
78     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
79     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
80     MAKE_HEX_FLOAT(-0x1.003p0f, -0x1003000L, -24),
81     -MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
82     -1.0f,
83     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
84     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
85     -FLT_MIN,
86     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
87     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
88     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
89     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
90     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
91     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
92     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
93     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
94     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
95     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
96     -0.0f,
97 
98     +NAN,
99     +INFINITY,
100     +FLT_MAX,
101     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
102     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
103     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
104     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
105     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
106     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
107     +3.0f,
108     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
109     2.5f,
110     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
111     +2.0f,
112     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
113     1.75f,
114     1.5f,
115     1.25f,
116     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
117     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
118     MAKE_HEX_FLOAT(0x1.003p0f, 0x1003000L, -24),
119     +MAKE_HEX_FLOAT(0x1.001p0f, 0x1001000L, -24),
120     +1.0f,
121     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
122     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
123     +FLT_MIN,
124     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
125     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
126     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
127     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
128     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
129     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
130     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
131     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
132     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
133     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
134     +0.0f,
135 };
136 
137 constexpr size_t specialValuesCount =
138     sizeof(specialValues) / sizeof(specialValues[0]);
139 
140 } // anonymous namespace
141 
TestFunc_Float_Float_Float_Float(const Func * f,MTdata d,bool relaxedMode)142 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
143 {
144     int error;
145 
146     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
147 
148     Programs programs;
149     cl_kernel kernels[VECTOR_SIZE_COUNT];
150     float maxError = 0.0f;
151     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
152     float maxErrorVal = 0.0f;
153     float maxErrorVal2 = 0.0f;
154     float maxErrorVal3 = 0.0f;
155     uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
156 
157     cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
158 
159     float float_ulps;
160     if (gIsEmbedded)
161         float_ulps = f->float_embedded_ulps;
162     else
163         float_ulps = f->float_ulps;
164 
165     int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
166 
167     // Init the kernels
168     {
169         BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
170                                      relaxedMode };
171         if ((error = ThreadPool_Do(BuildKernelFn,
172                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
173                                    &build_info)))
174             return error;
175     }
176 
177     for (uint64_t i = 0; i < (1ULL << 32); i += step)
178     {
179         // Init input array
180         cl_uint *p = (cl_uint *)gIn;
181         cl_uint *p2 = (cl_uint *)gIn2;
182         cl_uint *p3 = (cl_uint *)gIn3;
183         size_t idx = 0;
184 
185         if (i == 0)
186         { // test edge cases
187             float *fp = (float *)gIn;
188             float *fp2 = (float *)gIn2;
189             float *fp3 = (float *)gIn3;
190             uint32_t x, y, z;
191             x = y = z = 0;
192             for (; idx < BUFFER_SIZE / sizeof(float); idx++)
193             {
194                 fp[idx] = specialValues[x];
195                 fp2[idx] = specialValues[y];
196                 fp3[idx] = specialValues[z];
197 
198                 if (++x >= specialValuesCount)
199                 {
200                     x = 0;
201                     if (++y >= specialValuesCount)
202                     {
203                         y = 0;
204                         if (++z >= specialValuesCount) break;
205                     }
206                 }
207             }
208             if (idx == BUFFER_SIZE / sizeof(float))
209                 vlog_error("Test Error: not all special cases tested!\n");
210         }
211 
212         for (; idx < BUFFER_SIZE / sizeof(float); idx++)
213         {
214             p[idx] = genrand_int32(d);
215             p2[idx] = genrand_int32(d);
216             p3[idx] = genrand_int32(d);
217         }
218 
219         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
220                                           BUFFER_SIZE, gIn, 0, NULL, NULL)))
221         {
222             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
223             return error;
224         }
225 
226         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
227                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
228         {
229             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
230             return error;
231         }
232 
233         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
234                                           BUFFER_SIZE, gIn3, 0, NULL, NULL)))
235         {
236             vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
237             return error;
238         }
239 
240         // write garbage into output arrays
241         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
242         {
243             uint32_t pattern = 0xffffdead;
244             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
245             if ((error =
246                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
247                                           BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
248             {
249                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
250                            error, j);
251                 goto exit;
252             }
253         }
254 
255         // Run the kernels
256         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
257         {
258             size_t vectorSize = sizeof(cl_float) * sizeValues[j];
259             size_t localCount = (BUFFER_SIZE + vectorSize - 1)
260                 / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
261             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
262                                         &gOutBuffer[j])))
263             {
264                 LogBuildError(programs[j]);
265                 goto exit;
266             }
267             if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
268                                         &gInBuffer)))
269             {
270                 LogBuildError(programs[j]);
271                 goto exit;
272             }
273             if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
274                                         &gInBuffer2)))
275             {
276                 LogBuildError(programs[j]);
277                 goto exit;
278             }
279             if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
280                                         &gInBuffer3)))
281             {
282                 LogBuildError(programs[j]);
283                 goto exit;
284             }
285 
286             if ((error =
287                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
288                                             &localCount, NULL, 0, NULL, NULL)))
289             {
290                 vlog_error("FAILED -- could not execute kernel\n");
291                 goto exit;
292             }
293         }
294 
295         // Get that moving
296         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
297 
298         // Calculate the correctly rounded reference result
299         float *r = (float *)gOut_Ref;
300         float *s = (float *)gIn;
301         float *s2 = (float *)gIn2;
302         float *s3 = (float *)gIn3;
303         if (skipNanInf)
304         {
305             for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
306             {
307                 feclearexcept(FE_OVERFLOW);
308                 r[j] =
309                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
310                 overflow[j] =
311                     FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
312             }
313         }
314         else
315         {
316             for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
317                 r[j] =
318                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
319         }
320 
321         // Read the data back
322         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
323         {
324             if ((error =
325                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
326                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
327             {
328                 vlog_error("ReadArray failed %d\n", error);
329                 goto exit;
330             }
331         }
332 
333         if (gSkipCorrectnessTesting) break;
334 
335         // Verify data
336         uint32_t *t = (uint32_t *)gOut_Ref;
337         for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
338         {
339             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
340             {
341                 uint32_t *q = (uint32_t *)(gOut[k]);
342 
343                 // If we aren't getting the correctly rounded result
344                 if (t[j] != q[j])
345                 {
346                     float err;
347                     int fail;
348                     float test = ((float *)q)[j];
349                     float correct =
350                         f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
351 
352                     // Per section 10 paragraph 6, accept any result if an input
353                     // or output is a infinity or NaN or overflow
354                     if (skipNanInf)
355                     {
356                         if (overflow[j] || IsFloatInfinity(correct)
357                             || IsFloatNaN(correct) || IsFloatInfinity(s[j])
358                             || IsFloatNaN(s[j]) || IsFloatInfinity(s2[j])
359                             || IsFloatNaN(s2[j]) || IsFloatInfinity(s3[j])
360                             || IsFloatNaN(s3[j]))
361                             continue;
362                     }
363 
364 
365                     err = Ulp_Error(test, correct);
366                     fail = !(fabsf(err) <= float_ulps);
367 
368                     if (fail && (ftz || relaxedMode))
369                     {
370                         float correct2, err2;
371 
372                         // retry per section 6.5.3.2  with flushing on
373                         if (0.0f == test
374                             && 0.0f
375                                 == f->func.f_fma(s[j], s2[j], s3[j], FLUSHED))
376                         {
377                             fail = 0;
378                             err = 0.0f;
379                         }
380 
381                         // retry per section 6.5.3.3
382                         if (fail && IsFloatSubnormal(s[j]))
383                         { // look at me,
384                             float err3, correct3;
385 
386                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
387 
388                             correct2 = f->func.f_fma(0.0f, s2[j], s3[j],
389                                                      CORRECTLY_ROUNDED);
390                             correct3 = f->func.f_fma(-0.0f, s2[j], s3[j],
391                                                      CORRECTLY_ROUNDED);
392 
393                             if (skipNanInf)
394                             {
395                                 if (fetestexcept(FE_OVERFLOW)) continue;
396 
397                                 // Note: no double rounding here.  Reference
398                                 // functions calculate in single precision.
399                                 if (IsFloatInfinity(correct2)
400                                     || IsFloatNaN(correct2)
401                                     || IsFloatInfinity(correct3)
402                                     || IsFloatNaN(correct3))
403                                     continue;
404                             }
405 
406                             err2 = Ulp_Error(test, correct2);
407                             err3 = Ulp_Error(test, correct3);
408                             fail = fail
409                                 && ((!(fabsf(err2) <= float_ulps))
410                                     && (!(fabsf(err3) <= float_ulps)));
411                             if (fabsf(err2) < fabsf(err)) err = err2;
412                             if (fabsf(err3) < fabsf(err)) err = err3;
413 
414                             // retry per section 6.5.3.4
415                             if (0.0f == test
416                                 && (0.0f
417                                         == f->func.f_fma(0.0f, s2[j], s3[j],
418                                                          FLUSHED)
419                                     || 0.0f
420                                         == f->func.f_fma(-0.0f, s2[j], s3[j],
421                                                          FLUSHED)))
422                             {
423                                 fail = 0;
424                                 err = 0.0f;
425                             }
426 
427                             // try with first two args as zero
428                             if (IsFloatSubnormal(s2[j]))
429                             { // its fun to have fun,
430                                 double correct4, correct5;
431                                 float err4, err5;
432 
433                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
434 
435                                 correct2 = f->func.f_fma(0.0f, 0.0f, s3[j],
436                                                          CORRECTLY_ROUNDED);
437                                 correct3 = f->func.f_fma(-0.0f, 0.0f, s3[j],
438                                                          CORRECTLY_ROUNDED);
439                                 correct4 = f->func.f_fma(0.0f, -0.0f, s3[j],
440                                                          CORRECTLY_ROUNDED);
441                                 correct5 = f->func.f_fma(-0.0f, -0.0f, s3[j],
442                                                          CORRECTLY_ROUNDED);
443 
444                                 // Per section 10 paragraph 6, accept any result
445                                 // if an input or output is a infinity or NaN or
446                                 // overflow
447                                 if (!gInfNanSupport)
448                                 {
449                                     if (fetestexcept(FE_OVERFLOW)) continue;
450 
451                                     // Note: no double rounding here.  Reference
452                                     // functions calculate in single precision.
453                                     if (IsFloatInfinity(correct2)
454                                         || IsFloatNaN(correct2)
455                                         || IsFloatInfinity(correct3)
456                                         || IsFloatNaN(correct3)
457                                         || IsFloatInfinity(correct4)
458                                         || IsFloatNaN(correct4)
459                                         || IsFloatInfinity(correct5)
460                                         || IsFloatNaN(correct5))
461                                         continue;
462                                 }
463 
464                                 err2 = Ulp_Error(test, correct2);
465                                 err3 = Ulp_Error(test, correct3);
466                                 err4 = Ulp_Error(test, correct4);
467                                 err5 = Ulp_Error(test, correct5);
468                                 fail = fail
469                                     && ((!(fabsf(err2) <= float_ulps))
470                                         && (!(fabsf(err3) <= float_ulps))
471                                         && (!(fabsf(err4) <= float_ulps))
472                                         && (!(fabsf(err5) <= float_ulps)));
473                                 if (fabsf(err2) < fabsf(err)) err = err2;
474                                 if (fabsf(err3) < fabsf(err)) err = err3;
475                                 if (fabsf(err4) < fabsf(err)) err = err4;
476                                 if (fabsf(err5) < fabsf(err)) err = err5;
477 
478                                 // retry per section 6.5.3.4
479                                 if (0.0f == test
480                                     && (0.0f
481                                             == f->func.f_fma(0.0f, 0.0f, s3[j],
482                                                              FLUSHED)
483                                         || 0.0f
484                                             == f->func.f_fma(-0.0f, 0.0f, s3[j],
485                                                              FLUSHED)
486                                         || 0.0f
487                                             == f->func.f_fma(0.0f, -0.0f, s3[j],
488                                                              FLUSHED)
489                                         || 0.0f
490                                             == f->func.f_fma(-0.0f, -0.0f,
491                                                              s3[j], FLUSHED)))
492                                 {
493                                     fail = 0;
494                                     err = 0.0f;
495                                 }
496 
497                                 if (IsFloatSubnormal(s3[j]))
498                                 {
499                                     if (test == 0.0f) // 0*0+0 is 0
500                                     {
501                                         fail = 0;
502                                         err = 0.0f;
503                                     }
504                                 }
505                             }
506                             else if (IsFloatSubnormal(s3[j]))
507                             {
508                                 double correct4, correct5;
509                                 float err4, err5;
510 
511                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
512 
513                                 correct2 = f->func.f_fma(0.0f, s2[j], 0.0f,
514                                                          CORRECTLY_ROUNDED);
515                                 correct3 = f->func.f_fma(-0.0f, s2[j], 0.0f,
516                                                          CORRECTLY_ROUNDED);
517                                 correct4 = f->func.f_fma(0.0f, s2[j], -0.0f,
518                                                          CORRECTLY_ROUNDED);
519                                 correct5 = f->func.f_fma(-0.0f, s2[j], -0.0f,
520                                                          CORRECTLY_ROUNDED);
521 
522                                 // Per section 10 paragraph 6, accept any result
523                                 // if an input or output is a infinity or NaN or
524                                 // overflow
525                                 if (!gInfNanSupport)
526                                 {
527                                     if (fetestexcept(FE_OVERFLOW)) continue;
528 
529                                     // Note: no double rounding here.  Reference
530                                     // functions calculate in single precision.
531                                     if (IsFloatInfinity(correct2)
532                                         || IsFloatNaN(correct2)
533                                         || IsFloatInfinity(correct3)
534                                         || IsFloatNaN(correct3)
535                                         || IsFloatInfinity(correct4)
536                                         || IsFloatNaN(correct4)
537                                         || IsFloatInfinity(correct5)
538                                         || IsFloatNaN(correct5))
539                                         continue;
540                                 }
541 
542                                 err2 = Ulp_Error(test, correct2);
543                                 err3 = Ulp_Error(test, correct3);
544                                 err4 = Ulp_Error(test, correct4);
545                                 err5 = Ulp_Error(test, correct5);
546                                 fail = fail
547                                     && ((!(fabsf(err2) <= float_ulps))
548                                         && (!(fabsf(err3) <= float_ulps))
549                                         && (!(fabsf(err4) <= float_ulps))
550                                         && (!(fabsf(err5) <= float_ulps)));
551                                 if (fabsf(err2) < fabsf(err)) err = err2;
552                                 if (fabsf(err3) < fabsf(err)) err = err3;
553                                 if (fabsf(err4) < fabsf(err)) err = err4;
554                                 if (fabsf(err5) < fabsf(err)) err = err5;
555 
556                                 // retry per section 6.5.3.4
557                                 if (0.0f == test
558                                     && (0.0f
559                                             == f->func.f_fma(0.0f, s2[j], 0.0f,
560                                                              FLUSHED)
561                                         || 0.0f
562                                             == f->func.f_fma(-0.0f, s2[j], 0.0f,
563                                                              FLUSHED)
564                                         || 0.0f
565                                             == f->func.f_fma(0.0f, s2[j], -0.0f,
566                                                              FLUSHED)
567                                         || 0.0f
568                                             == f->func.f_fma(-0.0f, s2[j],
569                                                              -0.0f, FLUSHED)))
570                                 {
571                                     fail = 0;
572                                     err = 0.0f;
573                                 }
574                             }
575                         }
576                         else if (fail && IsFloatSubnormal(s2[j]))
577                         {
578                             double correct2, correct3;
579                             float err2, err3;
580 
581                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
582 
583                             correct2 = f->func.f_fma(s[j], 0.0f, s3[j],
584                                                      CORRECTLY_ROUNDED);
585                             correct3 = f->func.f_fma(s[j], -0.0f, s3[j],
586                                                      CORRECTLY_ROUNDED);
587 
588                             if (skipNanInf)
589                             {
590                                 if (fetestexcept(FE_OVERFLOW)) continue;
591 
592                                 // Note: no double rounding here.  Reference
593                                 // functions calculate in single precision.
594                                 if (IsFloatInfinity(correct2)
595                                     || IsFloatNaN(correct2)
596                                     || IsFloatInfinity(correct3)
597                                     || IsFloatNaN(correct3))
598                                     continue;
599                             }
600 
601                             err2 = Ulp_Error(test, correct2);
602                             err3 = Ulp_Error(test, correct3);
603                             fail = fail
604                                 && ((!(fabsf(err2) <= float_ulps))
605                                     && (!(fabsf(err3) <= float_ulps)));
606                             if (fabsf(err2) < fabsf(err)) err = err2;
607                             if (fabsf(err3) < fabsf(err)) err = err3;
608 
609                             // retry per section 6.5.3.4
610                             if (0.0f == test
611                                 && (0.0f
612                                         == f->func.f_fma(s[j], 0.0f, s3[j],
613                                                          FLUSHED)
614                                     || 0.0f
615                                         == f->func.f_fma(s[j], -0.0f, s3[j],
616                                                          FLUSHED)))
617                             {
618                                 fail = 0;
619                                 err = 0.0f;
620                             }
621 
622                             // try with second two args as zero
623                             if (IsFloatSubnormal(s3[j]))
624                             {
625                                 double correct4, correct5;
626                                 float err4, err5;
627 
628                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
629 
630                                 correct2 = f->func.f_fma(s[j], 0.0f, 0.0f,
631                                                          CORRECTLY_ROUNDED);
632                                 correct3 = f->func.f_fma(s[j], -0.0f, 0.0f,
633                                                          CORRECTLY_ROUNDED);
634                                 correct4 = f->func.f_fma(s[j], 0.0f, -0.0f,
635                                                          CORRECTLY_ROUNDED);
636                                 correct5 = f->func.f_fma(s[j], -0.0f, -0.0f,
637                                                          CORRECTLY_ROUNDED);
638 
639                                 // Per section 10 paragraph 6, accept any result
640                                 // if an input or output is a infinity or NaN or
641                                 // overflow
642                                 if (!gInfNanSupport)
643                                 {
644                                     if (fetestexcept(FE_OVERFLOW)) continue;
645 
646                                     // Note: no double rounding here.  Reference
647                                     // functions calculate in single precision.
648                                     if (IsFloatInfinity(correct2)
649                                         || IsFloatNaN(correct2)
650                                         || IsFloatInfinity(correct3)
651                                         || IsFloatNaN(correct3)
652                                         || IsFloatInfinity(correct4)
653                                         || IsFloatNaN(correct4)
654                                         || IsFloatInfinity(correct5)
655                                         || IsFloatNaN(correct5))
656                                         continue;
657                                 }
658 
659                                 err2 = Ulp_Error(test, correct2);
660                                 err3 = Ulp_Error(test, correct3);
661                                 err4 = Ulp_Error(test, correct4);
662                                 err5 = Ulp_Error(test, correct5);
663                                 fail = fail
664                                     && ((!(fabsf(err2) <= float_ulps))
665                                         && (!(fabsf(err3) <= float_ulps))
666                                         && (!(fabsf(err4) <= float_ulps))
667                                         && (!(fabsf(err5) <= float_ulps)));
668                                 if (fabsf(err2) < fabsf(err)) err = err2;
669                                 if (fabsf(err3) < fabsf(err)) err = err3;
670                                 if (fabsf(err4) < fabsf(err)) err = err4;
671                                 if (fabsf(err5) < fabsf(err)) err = err5;
672 
673                                 // retry per section 6.5.3.4
674                                 if (0.0f == test
675                                     && (0.0f
676                                             == f->func.f_fma(s[j], 0.0f, 0.0f,
677                                                              FLUSHED)
678                                         || 0.0f
679                                             == f->func.f_fma(s[j], -0.0f, 0.0f,
680                                                              FLUSHED)
681                                         || 0.0f
682                                             == f->func.f_fma(s[j], 0.0f, -0.0f,
683                                                              FLUSHED)
684                                         || 0.0f
685                                             == f->func.f_fma(s[j], -0.0f, -0.0f,
686                                                              FLUSHED)))
687                                 {
688                                     fail = 0;
689                                     err = 0.0f;
690                                 }
691                             }
692                         }
693                         else if (fail && IsFloatSubnormal(s3[j]))
694                         {
695                             double correct2, correct3;
696                             float err2, err3;
697 
698                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
699 
700                             correct2 = f->func.f_fma(s[j], s2[j], 0.0f,
701                                                      CORRECTLY_ROUNDED);
702                             correct3 = f->func.f_fma(s[j], s2[j], -0.0f,
703                                                      CORRECTLY_ROUNDED);
704 
705                             if (skipNanInf)
706                             {
707                                 if (fetestexcept(FE_OVERFLOW)) continue;
708 
709                                 // Note: no double rounding here.  Reference
710                                 // functions calculate in single precision.
711                                 if (IsFloatInfinity(correct2)
712                                     || IsFloatNaN(correct2)
713                                     || IsFloatInfinity(correct3)
714                                     || IsFloatNaN(correct3))
715                                     continue;
716                             }
717 
718                             err2 = Ulp_Error(test, correct2);
719                             err3 = Ulp_Error(test, correct3);
720                             fail = fail
721                                 && ((!(fabsf(err2) <= float_ulps))
722                                     && (!(fabsf(err3) <= float_ulps)));
723                             if (fabsf(err2) < fabsf(err)) err = err2;
724                             if (fabsf(err3) < fabsf(err)) err = err3;
725 
726                             // retry per section 6.5.3.4
727                             if (0.0f == test
728                                 && (0.0f
729                                         == f->func.f_fma(s[j], s2[j], 0.0f,
730                                                          FLUSHED)
731                                     || 0.0f
732                                         == f->func.f_fma(s[j], s2[j], -0.0f,
733                                                          FLUSHED)))
734                             {
735                                 fail = 0;
736                                 err = 0.0f;
737                             }
738                         }
739                     }
740 
741                     if (fabsf(err) > maxError)
742                     {
743                         maxError = fabsf(err);
744                         maxErrorVal = s[j];
745                         maxErrorVal2 = s2[j];
746                         maxErrorVal3 = s3[j];
747                     }
748 
749                     if (fail)
750                     {
751                         vlog_error(
752                             "\nERROR: %s%s: %f ulp error at {%a, %a, %a} "
753                             "({0x%8.8x, 0x%8.8x, 0x%8.8x}): *%a vs. %a\n",
754                             f->name, sizeNames[k], err, s[j], s2[j], s3[j],
755                             ((cl_uint *)s)[j], ((cl_uint *)s2)[j],
756                             ((cl_uint *)s3)[j], ((float *)gOut_Ref)[j], test);
757                         error = -1;
758                         goto exit;
759                     }
760                 }
761             }
762         }
763 
764         if (0 == (i & 0x0fffffff))
765         {
766             if (gVerboseBruteForce)
767             {
768                 vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n",
769                      i, step, BUFFER_SIZE);
770             }
771             else
772             {
773                 vlog(".");
774             }
775             fflush(stdout);
776         }
777     }
778 
779     if (!gSkipCorrectnessTesting)
780     {
781         if (gWimpyMode)
782             vlog("Wimp pass");
783         else
784             vlog("passed");
785 
786         vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
787              maxErrorVal3);
788     }
789 
790     vlog("\n");
791 
792 exit:
793     // Release
794     for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
795     {
796         clReleaseKernel(kernels[k]);
797     }
798 
799     return error;
800 }
801