• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28     BuildKernelInfo &info = *(BuildKernelInfo *)p;
29     auto generator = [](const std::string &kernel_name, const char *builtin,
30                         cl_uint vector_size_index) {
31         return GetBinaryKernel(kernel_name, builtin, ParameterType::Float,
32                                ParameterType::Float, ParameterType::Float,
33                                vector_size_index);
34     };
35     return BuildKernels(info, job_id, generator);
36 }
37 
38 // Thread specific data for a worker thread
39 struct ThreadInfo
40 {
41     // Input and output buffers for the thread
42     clMemWrapper inBuf;
43     clMemWrapper inBuf2;
44     Buffers outBuf;
45 
46     float maxError; // max error value. Init to 0.
47     double
48         maxErrorValue; // position of the max error value (param 1).  Init to 0.
49     double maxErrorValue2; // position of the max error value (param 2).  Init
50                            // to 0.
51     MTdataHolder d;
52 
53     // Per thread command queue to improve performance
54     clCommandQueueWrapper tQueue;
55 };
56 
57 struct TestInfo
58 {
59     size_t subBufferSize; // Size of the sub-buffer in elements
60     const Func *f; // A pointer to the function info
61 
62     // Programs for various vector sizes.
63     Programs programs;
64 
65     // Thread-specific kernels for each vector size:
66     // k[vector_size][thread_id]
67     KernelMatrix k;
68 
69     // Array of thread specific information
70     std::vector<ThreadInfo> tinfo;
71 
72     cl_uint threadCount; // Number of worker threads
73     cl_uint jobCount; // Number of jobs
74     cl_uint step; // step between each chunk and the next.
75     cl_uint scale; // stride between individual test values
76     float ulps; // max_allowed ulps
77     int ftz; // non-zero if running in flush to zero mode
78     bool relaxedMode; // True if the test is being run in relaxed mode, false
79                       // otherwise.
80 
81     // no special fields
82 };
83 
84 // A table of more difficult cases to get right
85 const float specialValues[] = {
86     -NAN,
87     -INFINITY,
88     -FLT_MAX,
89     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
90     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
91     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
92     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
93     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
94     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
95     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
96     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
97     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
98     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
99     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
100     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
101     -1000.f,
102     -100.f,
103     -4.0f,
104     -3.5f,
105     -3.0f,
106     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
107     -2.5f,
108     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
109     -2.0f,
110     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
111     -1.5f,
112     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
113     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
114     -1.0f,
115     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
116     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
117     -0.5f,
118     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
119     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
120     -0.25f,
121     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
122     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
123     -FLT_MIN,
124     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
125     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
126     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
127     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
128     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
129     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
130     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
131     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
132     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
133     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
134     -0.0f,
135 
136     +NAN,
137     +INFINITY,
138     +FLT_MAX,
139     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
140     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
141     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
142     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
143     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
144     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
145     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
146     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
147     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
148     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
149     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
150     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
151     +1000.f,
152     +100.f,
153     +4.0f,
154     +3.5f,
155     +3.0f,
156     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
157     2.5f,
158     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
159     +2.0f,
160     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
161     1.5f,
162     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
163     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
164     +1.0f,
165     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
166     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
167     +0.5f,
168     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
169     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
170     +0.25f,
171     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
172     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
173     +FLT_MIN,
174     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
175     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
176     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
177     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
178     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
179     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
180     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
181     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
182     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
183     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
184     +0.0f,
185 };
186 
187 constexpr size_t specialValuesCount =
188     sizeof(specialValues) / sizeof(specialValues[0]);
189 
Test(cl_uint job_id,cl_uint thread_id,void * data)190 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
191 {
192     TestInfo *job = (TestInfo *)data;
193     size_t buffer_elements = job->subBufferSize;
194     size_t buffer_size = buffer_elements * sizeof(cl_float);
195     cl_uint base = job_id * (cl_uint)job->step;
196     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
197     fptr func = job->f->func;
198     int ftz = job->ftz;
199     bool relaxedMode = job->relaxedMode;
200     float ulps = getAllowedUlpError(job->f, relaxedMode);
201     MTdata d = tinfo->d;
202     cl_int error;
203     std::vector<bool> overflow(buffer_elements, false);
204     const char *name = job->f->name;
205     cl_uint *t = 0;
206     cl_float *r = 0;
207     cl_float *s = 0;
208     cl_float *s2 = 0;
209     RoundingMode oldRoundMode;
210 
211     if (relaxedMode)
212     {
213         func = job->f->rfunc;
214     }
215 
216     cl_event e[VECTOR_SIZE_COUNT];
217     cl_uint *out[VECTOR_SIZE_COUNT];
218     if (gHostFill)
219     {
220         // start the map of the output arrays
221         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
222         {
223             out[j] = (cl_uint *)clEnqueueMapBuffer(
224                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
225                 buffer_size, 0, NULL, e + j, &error);
226             if (error || NULL == out[j])
227             {
228                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
229                            error);
230                 return error;
231             }
232         }
233 
234         // Get that moving
235         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
236     }
237 
238     // Init input array
239     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
240     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
241     cl_uint idx = 0;
242     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
243     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
244 
245     if (job_id <= (cl_uint)lastSpecialJobIndex)
246     {
247         // Insert special values
248         uint32_t x, y;
249 
250         x = (job_id * buffer_elements) % specialValuesCount;
251         y = (job_id * buffer_elements) / specialValuesCount;
252 
253         for (; idx < buffer_elements; idx++)
254         {
255             p[idx] = ((cl_uint *)specialValues)[x];
256             p2[idx] = ((cl_uint *)specialValues)[y];
257             ++x;
258             if (x >= specialValuesCount)
259             {
260                 x = 0;
261                 y++;
262                 if (y >= specialValuesCount) break;
263             }
264             if (relaxedMode && strcmp(name, "divide") == 0)
265             {
266                 cl_uint pj = p[idx] & 0x7fffffff;
267                 cl_uint p2j = p2[idx] & 0x7fffffff;
268                 // Replace values outside [2^-62, 2^62] with QNaN
269                 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
270                 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
271             }
272         }
273     }
274 
275     // Init any remaining values
276     for (; idx < buffer_elements; idx++)
277     {
278         p[idx] = genrand_int32(d);
279         p2[idx] = genrand_int32(d);
280 
281         if (relaxedMode && strcmp(name, "divide") == 0)
282         {
283             cl_uint pj = p[idx] & 0x7fffffff;
284             cl_uint p2j = p2[idx] & 0x7fffffff;
285             // Replace values outside [2^-62, 2^62] with QNaN
286             if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
287             if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
288         }
289     }
290 
291     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
292                                       buffer_size, p, 0, NULL, NULL)))
293     {
294         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
295         return error;
296     }
297 
298     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
299                                       buffer_size, p2, 0, NULL, NULL)))
300     {
301         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
302         return error;
303     }
304 
305     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
306     {
307         if (gHostFill)
308         {
309             // Wait for the map to finish
310             if ((error = clWaitForEvents(1, e + j)))
311             {
312                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
313                 return error;
314             }
315             if ((error = clReleaseEvent(e[j])))
316             {
317                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
318                 return error;
319             }
320         }
321 
322         // Fill the result buffer with garbage, so that old results don't carry
323         // over
324         uint32_t pattern = 0xffffdead;
325         if (gHostFill)
326         {
327             memset_pattern4(out[j], &pattern, buffer_size);
328             if ((error = clEnqueueUnmapMemObject(
329                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
330             {
331                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
332                            error);
333                 return error;
334             }
335         }
336         else
337         {
338             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
339                                              &pattern, sizeof(pattern), 0,
340                                              buffer_size, 0, NULL, NULL)))
341             {
342                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
343                            error);
344                 return error;
345             }
346         }
347 
348         // Run the kernel
349         size_t vectorCount =
350             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
351         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
352                                                  // own copy of the cl_kernel
353         cl_program program = job->programs[j];
354 
355         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
356                                     &tinfo->outBuf[j])))
357         {
358             LogBuildError(program);
359             return error;
360         }
361         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
362                                     &tinfo->inBuf)))
363         {
364             LogBuildError(program);
365             return error;
366         }
367         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
368                                     &tinfo->inBuf2)))
369         {
370             LogBuildError(program);
371             return error;
372         }
373 
374         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
375                                             &vectorCount, NULL, 0, NULL, NULL)))
376         {
377             vlog_error("FAILED -- could not execute kernel\n");
378             return error;
379         }
380     }
381 
382     // Get that moving
383     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
384 
385     if (gSkipCorrectnessTesting)
386     {
387         return CL_SUCCESS;
388     }
389 
390     // Calculate the correctly rounded reference result
391     FPU_mode_type oldMode;
392     memset(&oldMode, 0, sizeof(oldMode));
393     if (ftz || relaxedMode) ForceFTZ(&oldMode);
394 
395     // Set the rounding mode to match the device
396     oldRoundMode = kRoundToNearestEven;
397     if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
398 
399     // Calculate the correctly rounded reference result
400     r = (float *)gOut_Ref + thread_id * buffer_elements;
401     s = (float *)gIn + thread_id * buffer_elements;
402     s2 = (float *)gIn2 + thread_id * buffer_elements;
403     if (gInfNanSupport)
404     {
405         for (size_t j = 0; j < buffer_elements; j++)
406             r[j] = (float)func.f_ff(s[j], s2[j]);
407     }
408     else
409     {
410         for (size_t j = 0; j < buffer_elements; j++)
411         {
412             feclearexcept(FE_OVERFLOW);
413             r[j] = (float)func.f_ff(s[j], s2[j]);
414             overflow[j] =
415                 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
416         }
417     }
418 
419     if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
420 
421     if (ftz || relaxedMode) RestoreFPState(&oldMode);
422 
423     // Read the data back -- no need to wait for the first N-1 buffers but wait
424     // for the last buffer. This is an in order queue.
425     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
426     {
427         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
428         out[j] = (cl_uint *)clEnqueueMapBuffer(
429             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
430             buffer_size, 0, NULL, NULL, &error);
431         if (error || NULL == out[j])
432         {
433             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
434                        error);
435             return error;
436         }
437     }
438 
439     // Verify data
440     t = (cl_uint *)r;
441     for (size_t j = 0; j < buffer_elements; j++)
442     {
443         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
444         {
445             cl_uint *q = out[k];
446 
447             // If we aren't getting the correctly rounded result
448             if (t[j] != q[j])
449             {
450                 float test = ((float *)q)[j];
451                 double correct = func.f_ff(s[j], s2[j]);
452 
453                 // Per section 10 paragraph 6, accept any result if an input or
454                 // output is a infinity or NaN or overflow
455                 if (!gInfNanSupport)
456                 {
457                     // Note: no double rounding here.  Reference functions
458                     // calculate in single precision.
459                     if (overflow[j] || IsFloatInfinity(correct)
460                         || IsFloatNaN(correct) || IsFloatInfinity(s2[j])
461                         || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j])
462                         || IsFloatNaN(s[j]))
463                         continue;
464                 }
465 
466                 // Per section 10 paragraph 6, accept embedded devices always
467                 // returning positive 0.0.
468                 if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000))
469                     continue;
470 
471                 float err = Ulp_Error(test, correct);
472                 float errB = Ulp_Error(test, (float)correct);
473 
474                 int fail =
475                     ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
476                 if (fabsf(errB) < fabsf(err)) err = errB;
477 
478                 if (fail && (ftz || relaxedMode))
479                 {
480                     // retry per section 6.5.3.2
481                     if (IsFloatResultSubnormal(correct, ulps))
482                     {
483                         fail = fail && (test != 0.0f);
484                         if (!fail) err = 0.0f;
485                     }
486 
487                     // retry per section 6.5.3.3
488                     if (IsFloatSubnormal(s[j]))
489                     {
490                         double correct2, correct3;
491                         float err2, err3;
492 
493                         if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
494 
495                         correct2 = func.f_ff(0.0, s2[j]);
496                         correct3 = func.f_ff(-0.0, s2[j]);
497 
498                         // Per section 10 paragraph 6, accept any result if an
499                         // input or output is a infinity or NaN or overflow
500                         if (!gInfNanSupport)
501                         {
502                             if (fetestexcept(FE_OVERFLOW)) continue;
503 
504                             // Note: no double rounding here.  Reference
505                             // functions calculate in single precision.
506                             if (IsFloatInfinity(correct2)
507                                 || IsFloatNaN(correct2)
508                                 || IsFloatInfinity(correct3)
509                                 || IsFloatNaN(correct3))
510                                 continue;
511                         }
512 
513                         err2 = Ulp_Error(test, correct2);
514                         err3 = Ulp_Error(test, correct3);
515                         fail = fail
516                             && ((!(fabsf(err2) <= ulps))
517                                 && (!(fabsf(err3) <= ulps)));
518                         if (fabsf(err2) < fabsf(err)) err = err2;
519                         if (fabsf(err3) < fabsf(err)) err = err3;
520 
521                         // retry per section 6.5.3.4
522                         if (IsFloatResultSubnormal(correct2, ulps)
523                             || IsFloatResultSubnormal(correct3, ulps))
524                         {
525                             fail = fail && (test != 0.0f);
526                             if (!fail) err = 0.0f;
527                         }
528 
529                         // try with both args as zero
530                         if (IsFloatSubnormal(s2[j]))
531                         {
532                             double correct4, correct5;
533                             float err4, err5;
534 
535                             if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
536 
537                             correct2 = func.f_ff(0.0, 0.0);
538                             correct3 = func.f_ff(-0.0, 0.0);
539                             correct4 = func.f_ff(0.0, -0.0);
540                             correct5 = func.f_ff(-0.0, -0.0);
541 
542                             // Per section 10 paragraph 6, accept any result if
543                             // an input or output is a infinity or NaN or
544                             // overflow
545                             if (!gInfNanSupport)
546                             {
547                                 if (fetestexcept(FE_OVERFLOW)) continue;
548 
549                                 // Note: no double rounding here.  Reference
550                                 // functions calculate in single precision.
551                                 if (IsFloatInfinity(correct2)
552                                     || IsFloatNaN(correct2)
553                                     || IsFloatInfinity(correct3)
554                                     || IsFloatNaN(correct3)
555                                     || IsFloatInfinity(correct4)
556                                     || IsFloatNaN(correct4)
557                                     || IsFloatInfinity(correct5)
558                                     || IsFloatNaN(correct5))
559                                     continue;
560                             }
561 
562                             err2 = Ulp_Error(test, correct2);
563                             err3 = Ulp_Error(test, correct3);
564                             err4 = Ulp_Error(test, correct4);
565                             err5 = Ulp_Error(test, correct5);
566                             fail = fail
567                                 && ((!(fabsf(err2) <= ulps))
568                                     && (!(fabsf(err3) <= ulps))
569                                     && (!(fabsf(err4) <= ulps))
570                                     && (!(fabsf(err5) <= ulps)));
571                             if (fabsf(err2) < fabsf(err)) err = err2;
572                             if (fabsf(err3) < fabsf(err)) err = err3;
573                             if (fabsf(err4) < fabsf(err)) err = err4;
574                             if (fabsf(err5) < fabsf(err)) err = err5;
575 
576                             // retry per section 6.5.3.4
577                             if (IsFloatResultSubnormal(correct2, ulps)
578                                 || IsFloatResultSubnormal(correct3, ulps)
579                                 || IsFloatResultSubnormal(correct4, ulps)
580                                 || IsFloatResultSubnormal(correct5, ulps))
581                             {
582                                 fail = fail && (test != 0.0f);
583                                 if (!fail) err = 0.0f;
584                             }
585                         }
586                     }
587                     else if (IsFloatSubnormal(s2[j]))
588                     {
589                         double correct2, correct3;
590                         float err2, err3;
591 
592                         if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
593 
594                         correct2 = func.f_ff(s[j], 0.0);
595                         correct3 = func.f_ff(s[j], -0.0);
596 
597                         // Per section 10 paragraph 6, accept any result if an
598                         // input or output is a infinity or NaN or overflow
599                         if (!gInfNanSupport)
600                         {
601                             // Note: no double rounding here.  Reference
602                             // functions calculate in single precision.
603                             if (overflow[j] || IsFloatInfinity(correct)
604                                 || IsFloatNaN(correct)
605                                 || IsFloatInfinity(correct2)
606                                 || IsFloatNaN(correct2))
607                                 continue;
608                         }
609 
610                         err2 = Ulp_Error(test, correct2);
611                         err3 = Ulp_Error(test, correct3);
612                         fail = fail
613                             && ((!(fabsf(err2) <= ulps))
614                                 && (!(fabsf(err3) <= ulps)));
615                         if (fabsf(err2) < fabsf(err)) err = err2;
616                         if (fabsf(err3) < fabsf(err)) err = err3;
617 
618                         // retry per section 6.5.3.4
619                         if (IsFloatResultSubnormal(correct2, ulps)
620                             || IsFloatResultSubnormal(correct3, ulps))
621                         {
622                             fail = fail && (test != 0.0f);
623                             if (!fail) err = 0.0f;
624                         }
625                     }
626                 }
627 
628 
629                 if (fabsf(err) > tinfo->maxError)
630                 {
631                     tinfo->maxError = fabsf(err);
632                     tinfo->maxErrorValue = s[j];
633                     tinfo->maxErrorValue2 = s2[j];
634                 }
635                 if (fail)
636                 {
637                     vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
638                                "vs. %a (0x%8.8x) at index: %zu\n",
639                                name, sizeNames[k], err, s[j], s2[j], r[j], test,
640                                ((cl_uint *)&test)[0], j);
641                     return -1;
642                 }
643             }
644         }
645     }
646 
647     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
648     {
649         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
650                                              out[j], 0, NULL, NULL)))
651         {
652             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
653                        j, error);
654             return error;
655         }
656     }
657 
658     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
659 
660 
661     if (0 == (base & 0x0fffffff))
662     {
663         if (gVerboseBruteForce)
664         {
665             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
666                  "ThreadCount:%2u\n",
667                  base, job->step, job->scale, buffer_elements, job->ulps,
668                  job->threadCount);
669         }
670         else
671         {
672             vlog(".");
673         }
674         fflush(stdout);
675     }
676 
677     return CL_SUCCESS;
678 }
679 
680 } // anonymous namespace
681 
TestFunc_Float_Float_Float_Operator(const Func * f,MTdata d,bool relaxedMode)682 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
683                                         bool relaxedMode)
684 {
685     TestInfo test_info{};
686     cl_int error;
687     float maxError = 0.0f;
688     double maxErrorVal = 0.0;
689     double maxErrorVal2 = 0.0;
690 
691     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
692 
693     // Init test_info
694     test_info.threadCount = GetThreadCount();
695     test_info.subBufferSize = BUFFER_SIZE
696         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
697     test_info.scale = getTestScale(sizeof(cl_float));
698 
699     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
700     if (test_info.step / test_info.subBufferSize != test_info.scale)
701     {
702         // there was overflow
703         test_info.jobCount = 1;
704     }
705     else
706     {
707         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
708     }
709 
710     test_info.f = f;
711     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
712     test_info.ftz =
713         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
714     test_info.relaxedMode = relaxedMode;
715 
716     test_info.tinfo.resize(test_info.threadCount);
717     for (cl_uint i = 0; i < test_info.threadCount; i++)
718     {
719         cl_buffer_region region = {
720             i * test_info.subBufferSize * sizeof(cl_float),
721             test_info.subBufferSize * sizeof(cl_float)
722         };
723         test_info.tinfo[i].inBuf =
724             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
725                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
726         if (error || NULL == test_info.tinfo[i].inBuf)
727         {
728             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
729                        "region {%zd, %zd}\n",
730                        region.origin, region.size);
731             return error;
732         }
733         test_info.tinfo[i].inBuf2 =
734             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
735                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
736         if (error || NULL == test_info.tinfo[i].inBuf2)
737         {
738             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
739                        "region {%zd, %zd}\n",
740                        region.origin, region.size);
741             return error;
742         }
743 
744         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
745         {
746             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
747                 gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
748                 &region, &error);
749             if (error || NULL == test_info.tinfo[i].outBuf[j])
750             {
751                 vlog_error("Error: Unable to create sub-buffer of "
752                            "gOutBuffer[%d] for region {%zd, %zd}\n",
753                            (int)j, region.origin, region.size);
754                 return error;
755             }
756         }
757         test_info.tinfo[i].tQueue =
758             clCreateCommandQueue(gContext, gDevice, 0, &error);
759         if (NULL == test_info.tinfo[i].tQueue || error)
760         {
761             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
762             return error;
763         }
764 
765         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
766     }
767 
768     // Init the kernels
769     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
770                                 test_info.programs, f->nameInCode,
771                                 relaxedMode };
772     if ((error = ThreadPool_Do(BuildKernelFn,
773                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
774                                &build_info)))
775         return error;
776 
777     // Run the kernels
778     if (!gSkipCorrectnessTesting)
779     {
780         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
781         if (error) return error;
782 
783         // Accumulate the arithmetic errors
784         for (cl_uint i = 0; i < test_info.threadCount; i++)
785         {
786             if (test_info.tinfo[i].maxError > maxError)
787             {
788                 maxError = test_info.tinfo[i].maxError;
789                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
790                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
791             }
792         }
793 
794         if (gWimpyMode)
795             vlog("Wimp pass");
796         else
797             vlog("passed");
798 
799         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
800     }
801 
802     vlog("\n");
803 
804     return CL_SUCCESS;
805 }
806