• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
BuildKernel(const char * operator_symbol,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)26 int BuildKernel(const char *operator_symbol, int vectorSize,
27                 cl_uint kernel_count, cl_kernel *k, cl_program *p,
28                 bool relaxedMode)
29 {
30     const char *c[] = { "__kernel void math_kernel",
31                         sizeNames[vectorSize],
32                         "( __global float",
33                         sizeNames[vectorSize],
34                         "* out, __global float",
35                         sizeNames[vectorSize],
36                         "* in1, __global float",
37                         sizeNames[vectorSize],
38                         "* in2 )\n"
39                         "{\n"
40                         "   size_t i = get_global_id(0);\n"
41                         "   out[i] = in1[i] ",
42                         operator_symbol,
43                         " in2[i];\n"
44                         "}\n" };
45 
46     const char *c3[] = {
47         "__kernel void math_kernel",
48         sizeNames[vectorSize],
49         "( __global float* out, __global float* in, __global float* in2)\n"
50         "{\n"
51         "   size_t i = get_global_id(0);\n"
52         "   if( i + 1 < get_global_size(0) )\n"
53         "   {\n"
54         "       float3 f0 = vload3( 0, in + 3 * i );\n"
55         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
56         "       f0 = f0 ",
57         operator_symbol,
58         " f1;\n"
59         "       vstore3( f0, 0, out + 3*i );\n"
60         "   }\n"
61         "   else\n"
62         "   {\n"
63         "       size_t parity = i & 1;   // Figure out how many elements are "
64         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
65         "buffer size \n"
66         "       float3 f0;\n"
67         "       float3 f1;\n"
68         "       switch( parity )\n"
69         "       {\n"
70         "           case 1:\n"
71         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
72         "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
73         "               break;\n"
74         "           case 0:\n"
75         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
76         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
77         "               break;\n"
78         "       }\n"
79         "       f0 = f0 ",
80         operator_symbol,
81         " f1;\n"
82         "       switch( parity )\n"
83         "       {\n"
84         "           case 0:\n"
85         "               out[3*i+1] = f0.y; \n"
86         "               // fall through\n"
87         "           case 1:\n"
88         "               out[3*i] = f0.x; \n"
89         "               break;\n"
90         "       }\n"
91         "   }\n"
92         "}\n"
93     };
94 
95     const char **kern = c;
96     size_t kernSize = sizeof(c) / sizeof(c[0]);
97 
98     if (sizeValues[vectorSize] == 3)
99     {
100         kern = c3;
101         kernSize = sizeof(c3) / sizeof(c3[0]);
102     }
103 
104     char testName[32];
105     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
106              sizeNames[vectorSize]);
107 
108     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
109                        relaxedMode);
110 }
111 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)112 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
113 {
114     BuildKernelInfo *info = (BuildKernelInfo *)p;
115     cl_uint vectorSize = gMinVectorSizeIndex + job_id;
116     return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
117                        info->kernels[vectorSize].data(),
118                        &(info->programs[vectorSize]), info->relaxedMode);
119 }
120 
121 // Thread specific data for a worker thread
122 struct ThreadInfo
123 {
124     // Input and output buffers for the thread
125     clMemWrapper inBuf;
126     clMemWrapper inBuf2;
127     Buffers outBuf;
128 
129     float maxError; // max error value. Init to 0.
130     double
131         maxErrorValue; // position of the max error value (param 1).  Init to 0.
132     double maxErrorValue2; // position of the max error value (param 2).  Init
133                            // to 0.
134     MTdataHolder d;
135 
136     // Per thread command queue to improve performance
137     clCommandQueueWrapper tQueue;
138 };
139 
140 struct TestInfo
141 {
142     size_t subBufferSize; // Size of the sub-buffer in elements
143     const Func *f; // A pointer to the function info
144 
145     // Programs for various vector sizes.
146     Programs programs;
147 
148     // Thread-specific kernels for each vector size:
149     // k[vector_size][thread_id]
150     KernelMatrix k;
151 
152     // Array of thread specific information
153     std::vector<ThreadInfo> tinfo;
154 
155     cl_uint threadCount; // Number of worker threads
156     cl_uint jobCount; // Number of jobs
157     cl_uint step; // step between each chunk and the next.
158     cl_uint scale; // stride between individual test values
159     float ulps; // max_allowed ulps
160     int ftz; // non-zero if running in flush to zero mode
161     bool relaxedMode; // True if the test is being run in relaxed mode, false
162                       // otherwise.
163 
164     // no special fields
165 };
166 
167 // A table of more difficult cases to get right
168 const float specialValues[] = {
169     -NAN,
170     -INFINITY,
171     -FLT_MAX,
172     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
173     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
174     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
175     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
176     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
177     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
178     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
179     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
180     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
181     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
182     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
183     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
184     -1000.f,
185     -100.f,
186     -4.0f,
187     -3.5f,
188     -3.0f,
189     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
190     -2.5f,
191     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
192     -2.0f,
193     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
194     -1.5f,
195     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
196     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
197     -1.0f,
198     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
199     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
200     -0.5f,
201     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
202     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
203     -0.25f,
204     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
205     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
206     -FLT_MIN,
207     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
208     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
209     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
210     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
211     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
212     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
213     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
214     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
215     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
216     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
217     -0.0f,
218 
219     +NAN,
220     +INFINITY,
221     +FLT_MAX,
222     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
223     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
224     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
225     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
226     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
227     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
228     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
229     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
230     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
231     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
232     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
233     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
234     +1000.f,
235     +100.f,
236     +4.0f,
237     +3.5f,
238     +3.0f,
239     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
240     2.5f,
241     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
242     +2.0f,
243     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
244     1.5f,
245     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
246     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
247     +1.0f,
248     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
249     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
250     +0.5f,
251     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
252     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
253     +0.25f,
254     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
255     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
256     +FLT_MIN,
257     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
258     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
259     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
260     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
261     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
262     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
263     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
264     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
265     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
266     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
267     +0.0f,
268 };
269 
270 constexpr size_t specialValuesCount =
271     sizeof(specialValues) / sizeof(specialValues[0]);
272 
Test(cl_uint job_id,cl_uint thread_id,void * data)273 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
274 {
275     TestInfo *job = (TestInfo *)data;
276     size_t buffer_elements = job->subBufferSize;
277     size_t buffer_size = buffer_elements * sizeof(cl_float);
278     cl_uint base = job_id * (cl_uint)job->step;
279     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
280     fptr func = job->f->func;
281     int ftz = job->ftz;
282     bool relaxedMode = job->relaxedMode;
283     float ulps = getAllowedUlpError(job->f, relaxedMode);
284     MTdata d = tinfo->d;
285     cl_int error;
286     std::vector<bool> overflow(buffer_elements, false);
287     const char *name = job->f->name;
288     cl_uint *t = 0;
289     cl_float *r = 0;
290     cl_float *s = 0;
291     cl_float *s2 = 0;
292     RoundingMode oldRoundMode;
293 
294     if (relaxedMode)
295     {
296         func = job->f->rfunc;
297     }
298 
299     // start the map of the output arrays
300     cl_event e[VECTOR_SIZE_COUNT];
301     cl_uint *out[VECTOR_SIZE_COUNT];
302     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
303     {
304         out[j] = (cl_uint *)clEnqueueMapBuffer(
305             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
306             buffer_size, 0, NULL, e + j, &error);
307         if (error || NULL == out[j])
308         {
309             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
310                        error);
311             return error;
312         }
313     }
314 
315     // Get that moving
316     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
317 
318     // Init input array
319     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
320     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
321     cl_uint idx = 0;
322     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
323     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
324 
325     if (job_id <= (cl_uint)lastSpecialJobIndex)
326     {
327         // Insert special values
328         uint32_t x, y;
329 
330         x = (job_id * buffer_elements) % specialValuesCount;
331         y = (job_id * buffer_elements) / specialValuesCount;
332 
333         for (; idx < buffer_elements; idx++)
334         {
335             p[idx] = ((cl_uint *)specialValues)[x];
336             p2[idx] = ((cl_uint *)specialValues)[y];
337             ++x;
338             if (x >= specialValuesCount)
339             {
340                 x = 0;
341                 y++;
342                 if (y >= specialValuesCount) break;
343             }
344             if (relaxedMode && strcmp(name, "divide") == 0)
345             {
346                 cl_uint pj = p[idx] & 0x7fffffff;
347                 cl_uint p2j = p2[idx] & 0x7fffffff;
348                 // Replace values outside [2^-62, 2^62] with QNaN
349                 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
350                 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
351             }
352         }
353     }
354 
355     // Init any remaining values.
356     for (; idx < buffer_elements; idx++)
357     {
358         p[idx] = genrand_int32(d);
359         p2[idx] = genrand_int32(d);
360 
361         if (relaxedMode && strcmp(name, "divide") == 0)
362         {
363             cl_uint pj = p[idx] & 0x7fffffff;
364             cl_uint p2j = p2[idx] & 0x7fffffff;
365             // Replace values outside [2^-62, 2^62] with QNaN
366             if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
367             if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
368         }
369     }
370 
371     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
372                                       buffer_size, p, 0, NULL, NULL)))
373     {
374         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
375         goto exit;
376     }
377 
378     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
379                                       buffer_size, p2, 0, NULL, NULL)))
380     {
381         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
382         goto exit;
383     }
384 
385     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
386     {
387         // Wait for the map to finish
388         if ((error = clWaitForEvents(1, e + j)))
389         {
390             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
391             goto exit;
392         }
393         if ((error = clReleaseEvent(e[j])))
394         {
395             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
396             goto exit;
397         }
398 
399         // Fill the result buffer with garbage, so that old results don't carry
400         // over
401         uint32_t pattern = 0xffffdead;
402         memset_pattern4(out[j], &pattern, buffer_size);
403         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
404                                              out[j], 0, NULL, NULL)))
405         {
406             vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
407                        error);
408             goto exit;
409         }
410 
411         // run the kernel
412         size_t vectorCount =
413             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
414         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
415                                                  // own copy of the cl_kernel
416         cl_program program = job->programs[j];
417 
418         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
419                                     &tinfo->outBuf[j])))
420         {
421             LogBuildError(program);
422             return error;
423         }
424         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
425                                     &tinfo->inBuf)))
426         {
427             LogBuildError(program);
428             return error;
429         }
430         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
431                                     &tinfo->inBuf2)))
432         {
433             LogBuildError(program);
434             return error;
435         }
436 
437         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
438                                             &vectorCount, NULL, 0, NULL, NULL)))
439         {
440             vlog_error("FAILED -- could not execute kernel\n");
441             goto exit;
442         }
443     }
444 
445     // Get that moving
446     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
447 
448     if (gSkipCorrectnessTesting)
449     {
450         return CL_SUCCESS;
451     }
452 
453     // Calculate the correctly rounded reference result
454     FPU_mode_type oldMode;
455     memset(&oldMode, 0, sizeof(oldMode));
456     if (ftz || relaxedMode) ForceFTZ(&oldMode);
457 
458     // Set the rounding mode to match the device
459     oldRoundMode = kRoundToNearestEven;
460     if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
461 
462     // Calculate the correctly rounded reference result
463     r = (float *)gOut_Ref + thread_id * buffer_elements;
464     s = (float *)gIn + thread_id * buffer_elements;
465     s2 = (float *)gIn2 + thread_id * buffer_elements;
466     if (gInfNanSupport)
467     {
468         for (size_t j = 0; j < buffer_elements; j++)
469             r[j] = (float)func.f_ff(s[j], s2[j]);
470     }
471     else
472     {
473         for (size_t j = 0; j < buffer_elements; j++)
474         {
475             feclearexcept(FE_OVERFLOW);
476             r[j] = (float)func.f_ff(s[j], s2[j]);
477             overflow[j] =
478                 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
479         }
480     }
481 
482     if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
483 
484     if (ftz || relaxedMode) RestoreFPState(&oldMode);
485 
486     // Read the data back -- no need to wait for the first N-1 buffers but wait
487     // for the last buffer. This is an in order queue.
488     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
489     {
490         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
491         out[j] = (cl_uint *)clEnqueueMapBuffer(
492             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
493             buffer_size, 0, NULL, NULL, &error);
494         if (error || NULL == out[j])
495         {
496             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
497                        error);
498             goto exit;
499         }
500     }
501 
502     // Verify data
503     t = (cl_uint *)r;
504     for (size_t j = 0; j < buffer_elements; j++)
505     {
506         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
507         {
508             cl_uint *q = out[k];
509 
510             // If we aren't getting the correctly rounded result
511             if (t[j] != q[j])
512             {
513                 float test = ((float *)q)[j];
514                 double correct = func.f_ff(s[j], s2[j]);
515 
516                 // Per section 10 paragraph 6, accept any result if an input or
517                 // output is a infinity or NaN or overflow
518                 if (!gInfNanSupport)
519                 {
520                     // Note: no double rounding here.  Reference functions
521                     // calculate in single precision.
522                     if (overflow[j] || IsFloatInfinity(correct)
523                         || IsFloatNaN(correct) || IsFloatInfinity(s2[j])
524                         || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j])
525                         || IsFloatNaN(s[j]))
526                         continue;
527                 }
528 
529                 // Per section 10 paragraph 6, accept embedded devices always
530                 // returning positive 0.0.
531                 if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000))
532                     continue;
533 
534                 float err = Ulp_Error(test, correct);
535                 float errB = Ulp_Error(test, (float)correct);
536 
537                 int fail =
538                     ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
539                 if (fabsf(errB) < fabsf(err)) err = errB;
540 
541                 if (fail && (ftz || relaxedMode))
542                 {
543                     // retry per section 6.5.3.2
544                     if (IsFloatResultSubnormal(correct, ulps))
545                     {
546                         fail = fail && (test != 0.0f);
547                         if (!fail) err = 0.0f;
548                     }
549 
550                     // retry per section 6.5.3.3
551                     if (IsFloatSubnormal(s[j]))
552                     {
553                         double correct2, correct3;
554                         float err2, err3;
555 
556                         if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
557 
558                         correct2 = func.f_ff(0.0, s2[j]);
559                         correct3 = func.f_ff(-0.0, s2[j]);
560 
561                         // Per section 10 paragraph 6, accept any result if an
562                         // input or output is a infinity or NaN or overflow
563                         if (!gInfNanSupport)
564                         {
565                             if (fetestexcept(FE_OVERFLOW)) continue;
566 
567                             // Note: no double rounding here.  Reference
568                             // functions calculate in single precision.
569                             if (IsFloatInfinity(correct2)
570                                 || IsFloatNaN(correct2)
571                                 || IsFloatInfinity(correct3)
572                                 || IsFloatNaN(correct3))
573                                 continue;
574                         }
575 
576                         err2 = Ulp_Error(test, correct2);
577                         err3 = Ulp_Error(test, correct3);
578                         fail = fail
579                             && ((!(fabsf(err2) <= ulps))
580                                 && (!(fabsf(err3) <= ulps)));
581                         if (fabsf(err2) < fabsf(err)) err = err2;
582                         if (fabsf(err3) < fabsf(err)) err = err3;
583 
584                         // retry per section 6.5.3.4
585                         if (IsFloatResultSubnormal(correct2, ulps)
586                             || IsFloatResultSubnormal(correct3, ulps))
587                         {
588                             fail = fail && (test != 0.0f);
589                             if (!fail) err = 0.0f;
590                         }
591 
592                         // try with both args as zero
593                         if (IsFloatSubnormal(s2[j]))
594                         {
595                             double correct4, correct5;
596                             float err4, err5;
597 
598                             if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
599 
600                             correct2 = func.f_ff(0.0, 0.0);
601                             correct3 = func.f_ff(-0.0, 0.0);
602                             correct4 = func.f_ff(0.0, -0.0);
603                             correct5 = func.f_ff(-0.0, -0.0);
604 
605                             // Per section 10 paragraph 6, accept any result if
606                             // an input or output is a infinity or NaN or
607                             // overflow
608                             if (!gInfNanSupport)
609                             {
610                                 if (fetestexcept(FE_OVERFLOW)) continue;
611 
612                                 // Note: no double rounding here.  Reference
613                                 // functions calculate in single precision.
614                                 if (IsFloatInfinity(correct2)
615                                     || IsFloatNaN(correct2)
616                                     || IsFloatInfinity(correct3)
617                                     || IsFloatNaN(correct3)
618                                     || IsFloatInfinity(correct4)
619                                     || IsFloatNaN(correct4)
620                                     || IsFloatInfinity(correct5)
621                                     || IsFloatNaN(correct5))
622                                     continue;
623                             }
624 
625                             err2 = Ulp_Error(test, correct2);
626                             err3 = Ulp_Error(test, correct3);
627                             err4 = Ulp_Error(test, correct4);
628                             err5 = Ulp_Error(test, correct5);
629                             fail = fail
630                                 && ((!(fabsf(err2) <= ulps))
631                                     && (!(fabsf(err3) <= ulps))
632                                     && (!(fabsf(err4) <= ulps))
633                                     && (!(fabsf(err5) <= ulps)));
634                             if (fabsf(err2) < fabsf(err)) err = err2;
635                             if (fabsf(err3) < fabsf(err)) err = err3;
636                             if (fabsf(err4) < fabsf(err)) err = err4;
637                             if (fabsf(err5) < fabsf(err)) err = err5;
638 
639                             // retry per section 6.5.3.4
640                             if (IsFloatResultSubnormal(correct2, ulps)
641                                 || IsFloatResultSubnormal(correct3, ulps)
642                                 || IsFloatResultSubnormal(correct4, ulps)
643                                 || IsFloatResultSubnormal(correct5, ulps))
644                             {
645                                 fail = fail && (test != 0.0f);
646                                 if (!fail) err = 0.0f;
647                             }
648                         }
649                     }
650                     else if (IsFloatSubnormal(s2[j]))
651                     {
652                         double correct2, correct3;
653                         float err2, err3;
654 
655                         if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
656 
657                         correct2 = func.f_ff(s[j], 0.0);
658                         correct3 = func.f_ff(s[j], -0.0);
659 
660                         // Per section 10 paragraph 6, accept any result if an
661                         // input or output is a infinity or NaN or overflow
662                         if (!gInfNanSupport)
663                         {
664                             // Note: no double rounding here.  Reference
665                             // functions calculate in single precision.
666                             if (overflow[j] || IsFloatInfinity(correct)
667                                 || IsFloatNaN(correct)
668                                 || IsFloatInfinity(correct2)
669                                 || IsFloatNaN(correct2))
670                                 continue;
671                         }
672 
673                         err2 = Ulp_Error(test, correct2);
674                         err3 = Ulp_Error(test, correct3);
675                         fail = fail
676                             && ((!(fabsf(err2) <= ulps))
677                                 && (!(fabsf(err3) <= ulps)));
678                         if (fabsf(err2) < fabsf(err)) err = err2;
679                         if (fabsf(err3) < fabsf(err)) err = err3;
680 
681                         // retry per section 6.5.3.4
682                         if (IsFloatResultSubnormal(correct2, ulps)
683                             || IsFloatResultSubnormal(correct3, ulps))
684                         {
685                             fail = fail && (test != 0.0f);
686                             if (!fail) err = 0.0f;
687                         }
688                     }
689                 }
690 
691 
692                 if (fabsf(err) > tinfo->maxError)
693                 {
694                     tinfo->maxError = fabsf(err);
695                     tinfo->maxErrorValue = s[j];
696                     tinfo->maxErrorValue2 = s2[j];
697                 }
698                 if (fail)
699                 {
700                     vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
701                                "vs. %a (0x%8.8x) at index: %zu\n",
702                                name, sizeNames[k], err, s[j], s2[j], r[j], test,
703                                ((cl_uint *)&test)[0], j);
704                     error = -1;
705                     goto exit;
706                 }
707             }
708         }
709     }
710 
711     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
712     {
713         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
714                                              out[j], 0, NULL, NULL)))
715         {
716             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
717                        j, error);
718             return error;
719         }
720     }
721 
722     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
723 
724 
725     if (0 == (base & 0x0fffffff))
726     {
727         if (gVerboseBruteForce)
728         {
729             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
730                  "ThreadCount:%2u\n",
731                  base, job->step, job->scale, buffer_elements, job->ulps,
732                  job->threadCount);
733         }
734         else
735         {
736             vlog(".");
737         }
738         fflush(stdout);
739     }
740 
741 exit:
742     return error;
743 }
744 
745 } // anonymous namespace
746 
TestFunc_Float_Float_Float_Operator(const Func * f,MTdata d,bool relaxedMode)747 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
748                                         bool relaxedMode)
749 {
750     TestInfo test_info{};
751     cl_int error;
752     float maxError = 0.0f;
753     double maxErrorVal = 0.0;
754     double maxErrorVal2 = 0.0;
755 
756     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
757 
758     // Init test_info
759     test_info.threadCount = GetThreadCount();
760     test_info.subBufferSize = BUFFER_SIZE
761         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
762     test_info.scale = getTestScale(sizeof(cl_float));
763 
764     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
765     if (test_info.step / test_info.subBufferSize != test_info.scale)
766     {
767         // there was overflow
768         test_info.jobCount = 1;
769     }
770     else
771     {
772         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
773     }
774 
775     test_info.f = f;
776     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
777     test_info.ftz =
778         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
779     test_info.relaxedMode = relaxedMode;
780 
781     // cl_kernels aren't thread safe, so we make one for each vector size for
782     // every thread
783     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
784     {
785         test_info.k[i].resize(test_info.threadCount, nullptr);
786     }
787 
788     test_info.tinfo.resize(test_info.threadCount);
789     for (cl_uint i = 0; i < test_info.threadCount; i++)
790     {
791         cl_buffer_region region = {
792             i * test_info.subBufferSize * sizeof(cl_float),
793             test_info.subBufferSize * sizeof(cl_float)
794         };
795         test_info.tinfo[i].inBuf =
796             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
797                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
798         if (error || NULL == test_info.tinfo[i].inBuf)
799         {
800             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
801                        "region {%zd, %zd}\n",
802                        region.origin, region.size);
803             goto exit;
804         }
805         test_info.tinfo[i].inBuf2 =
806             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
807                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
808         if (error || NULL == test_info.tinfo[i].inBuf2)
809         {
810             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
811                        "region {%zd, %zd}\n",
812                        region.origin, region.size);
813             goto exit;
814         }
815 
816         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
817         {
818             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
819                 gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
820                 &region, &error);
821             if (error || NULL == test_info.tinfo[i].outBuf[j])
822             {
823                 vlog_error("Error: Unable to create sub-buffer of "
824                            "gOutBuffer[%d] for region {%zd, %zd}\n",
825                            (int)j, region.origin, region.size);
826                 goto exit;
827             }
828         }
829         test_info.tinfo[i].tQueue =
830             clCreateCommandQueue(gContext, gDevice, 0, &error);
831         if (NULL == test_info.tinfo[i].tQueue || error)
832         {
833             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
834             goto exit;
835         }
836 
837         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
838     }
839 
840     // Init the kernels
841     {
842         BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
843                                     test_info.programs, f->nameInCode,
844                                     relaxedMode };
845         if ((error = ThreadPool_Do(BuildKernelFn,
846                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
847                                    &build_info)))
848             goto exit;
849     }
850 
851     // Run the kernels
852     if (!gSkipCorrectnessTesting)
853     {
854         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
855 
856         // Accumulate the arithmetic errors
857         for (cl_uint i = 0; i < test_info.threadCount; i++)
858         {
859             if (test_info.tinfo[i].maxError > maxError)
860             {
861                 maxError = test_info.tinfo[i].maxError;
862                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
863                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
864             }
865         }
866 
867         if (error) goto exit;
868 
869         if (gWimpyMode)
870             vlog("Wimp pass");
871         else
872             vlog("passed");
873 
874         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
875     }
876 
877     vlog("\n");
878 
879 exit:
880     // Release
881     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
882     {
883         for (auto &kernel : test_info.k[i])
884         {
885             clReleaseKernel(kernel);
886         }
887     }
888 
889     return error;
890 }
891