• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cinttypes>
23 #include <cstring>
24 
25 namespace {
26 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)27 int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
28                 cl_kernel *k, cl_program *p, bool relaxedMode)
29 {
30     const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
31                         "__kernel void math_kernel",
32                         sizeNames[vectorSize],
33                         "( __global long",
34                         sizeNames[vectorSize],
35                         "* out, __global double",
36                         sizeNames[vectorSize],
37                         "* in1, __global double",
38                         sizeNames[vectorSize],
39                         "* in2 )\n"
40                         "{\n"
41                         "   size_t i = get_global_id(0);\n"
42                         "   out[i] = ",
43                         name,
44                         "( in1[i], in2[i] );\n"
45                         "}\n" };
46 
47     const char *c3[] = {
48         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
49         "__kernel void math_kernel",
50         sizeNames[vectorSize],
51         "( __global long* out, __global double* in, __global double* in2)\n"
52         "{\n"
53         "   size_t i = get_global_id(0);\n"
54         "   if( i + 1 < get_global_size(0) )\n"
55         "   {\n"
56         "       double3 f0 = vload3( 0, in + 3 * i );\n"
57         "       double3 f1 = vload3( 0, in2 + 3 * i );\n"
58         "       long3 l0 = ",
59         name,
60         "( f0, f1 );\n"
61         "       vstore3( l0, 0, out + 3*i );\n"
62         "   }\n"
63         "   else\n"
64         "   {\n"
65         "       size_t parity = i & 1;   // Figure out how many elements are "
66         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
67         "buffer size \n"
68         "       double3 f0;\n"
69         "       double3 f1;\n"
70         "       switch( parity )\n"
71         "       {\n"
72         "           case 1:\n"
73         "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
74         "               f1 = (double3)( in2[3*i], NAN, NAN ); \n"
75         "               break;\n"
76         "           case 0:\n"
77         "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
78         "               f1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
79         "               break;\n"
80         "       }\n"
81         "       long3 l0 = ",
82         name,
83         "( f0, f1 );\n"
84         "       switch( parity )\n"
85         "       {\n"
86         "           case 0:\n"
87         "               out[3*i+1] = l0.y; \n"
88         "               // fall through\n"
89         "           case 1:\n"
90         "               out[3*i] = l0.x; \n"
91         "               break;\n"
92         "       }\n"
93         "   }\n"
94         "}\n"
95     };
96 
97     const char **kern = c;
98     size_t kernSize = sizeof(c) / sizeof(c[0]);
99 
100     if (sizeValues[vectorSize] == 3)
101     {
102         kern = c3;
103         kernSize = sizeof(c3) / sizeof(c3[0]);
104     }
105 
106     char testName[32];
107     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
108              sizeNames[vectorSize]);
109 
110     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
111                        relaxedMode);
112 }
113 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)114 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
115 {
116     BuildKernelInfo *info = (BuildKernelInfo *)p;
117     cl_uint vectorSize = gMinVectorSizeIndex + job_id;
118     return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
119                        info->kernels[vectorSize].data(),
120                        &(info->programs[vectorSize]), info->relaxedMode);
121 }
122 
123 // Thread specific data for a worker thread
124 struct ThreadInfo
125 {
126     // Input and output buffers for the thread
127     clMemWrapper inBuf;
128     clMemWrapper inBuf2;
129     Buffers outBuf;
130 
131     MTdataHolder d;
132 
133     // Per thread command queue to improve performance
134     clCommandQueueWrapper tQueue;
135 };
136 
137 struct TestInfo
138 {
139     size_t subBufferSize; // Size of the sub-buffer in elements
140     const Func *f; // A pointer to the function info
141 
142     // Programs for various vector sizes.
143     Programs programs;
144 
145     // Thread-specific kernels for each vector size:
146     // k[vector_size][thread_id]
147     KernelMatrix k;
148 
149     // Array of thread specific information
150     std::vector<ThreadInfo> tinfo;
151 
152     cl_uint threadCount; // Number of worker threads
153     cl_uint jobCount; // Number of jobs
154     cl_uint step; // step between each chunk and the next.
155     cl_uint scale; // stride between individual test values
156     int ftz; // non-zero if running in flush to zero mode
157     bool relaxedMode; // True if test is running in relaxed mode, false
158                       // otherwise.
159 };
160 
161 // A table of more difficult cases to get right
162 const double specialValues[] = {
163     -NAN,
164     -INFINITY,
165     -DBL_MAX,
166     MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
167     MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
168     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
169     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
170     MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
171     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
172     MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
173     MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
174     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
175     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
176     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
177     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
178     -1000.0,
179     -100.0,
180     -4.0,
181     -3.5,
182     -3.0,
183     MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
184     -2.5,
185     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
186     -2.0,
187     MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
188     -1.5,
189     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
190     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
191     -1.0,
192     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
193     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
194     -0.5,
195     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
196     MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
197     -0.25,
198     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
199     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
200     -DBL_MIN,
201     MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
202     MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
203     MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
204     MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
205     MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
206     MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
207     MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
208     MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
209     MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
210     MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
211     MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
212     MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
213     MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
214     MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
215     -0.0,
216 
217     +NAN,
218     +INFINITY,
219     +DBL_MAX,
220     MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
221     MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
222     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
223     MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
224     MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
225     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
226     MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
227     MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
228     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
229     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
230     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
231     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
232     +1000.0,
233     +100.0,
234     +4.0,
235     +3.5,
236     +3.0,
237     MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
238     +2.5,
239     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
240     +2.0,
241     MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
242     +1.5,
243     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
244     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
245     +1.0,
246     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
247     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
248     +0.5,
249     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
250     MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
251     +0.25,
252     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
253     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
254     +DBL_MIN,
255     MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
256     MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
257     MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
258     MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
259     MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
260     MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
261     MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
262     MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
263     MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
264     MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
265     MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
266     MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
267     MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
268     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
269     +0.0,
270 };
271 
272 constexpr size_t specialValuesCount =
273     sizeof(specialValues) / sizeof(specialValues[0]);
274 
Test(cl_uint job_id,cl_uint thread_id,void * data)275 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
276 {
277     TestInfo *job = (TestInfo *)data;
278     size_t buffer_elements = job->subBufferSize;
279     size_t buffer_size = buffer_elements * sizeof(cl_double);
280     cl_uint base = job_id * (cl_uint)job->step;
281     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
282     dptr dfunc = job->f->dfunc;
283     int ftz = job->ftz;
284     bool relaxedMode = job->relaxedMode;
285     MTdata d = tinfo->d;
286     cl_int error;
287     const char *name = job->f->name;
288     cl_long *t;
289     cl_long *r;
290     cl_double *s;
291     cl_double *s2;
292 
293     Force64BitFPUPrecision();
294 
295     // start the map of the output arrays
296     cl_event e[VECTOR_SIZE_COUNT];
297     cl_long *out[VECTOR_SIZE_COUNT];
298     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
299     {
300         out[j] = (cl_long *)clEnqueueMapBuffer(
301             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
302             buffer_size, 0, NULL, e + j, &error);
303         if (error || NULL == out[j])
304         {
305             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
306                        error);
307             return error;
308         }
309     }
310 
311     // Get that moving
312     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
313 
314     // Init input array
315     double *p = (double *)gIn + thread_id * buffer_elements;
316     double *p2 = (double *)gIn2 + thread_id * buffer_elements;
317     cl_uint idx = 0;
318     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
319     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
320 
321     if (job_id <= (cl_uint)lastSpecialJobIndex)
322     { // test edge cases
323         uint32_t x, y;
324 
325         x = (job_id * buffer_elements) % specialValuesCount;
326         y = (job_id * buffer_elements) / specialValuesCount;
327 
328         for (; idx < buffer_elements; idx++)
329         {
330             p[idx] = specialValues[x];
331             p2[idx] = specialValues[y];
332             if (++x >= specialValuesCount)
333             {
334                 x = 0;
335                 y++;
336                 if (y >= specialValuesCount) break;
337             }
338         }
339     }
340 
341     // Init any remaining values.
342     for (; idx < buffer_elements; idx++)
343     {
344         ((cl_ulong *)p)[idx] = genrand_int64(d);
345         ((cl_ulong *)p2)[idx] = genrand_int64(d);
346     }
347 
348     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
349                                       buffer_size, p, 0, NULL, NULL)))
350     {
351         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
352         goto exit;
353     }
354 
355     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
356                                       buffer_size, p2, 0, NULL, NULL)))
357     {
358         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
359         goto exit;
360     }
361 
362     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
363     {
364         // Wait for the map to finish
365         if ((error = clWaitForEvents(1, e + j)))
366         {
367             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
368             goto exit;
369         }
370         if ((error = clReleaseEvent(e[j])))
371         {
372             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
373             goto exit;
374         }
375 
376         // Fill the result buffer with garbage, so that old results don't carry
377         // over
378         uint32_t pattern = 0xffffdead;
379         memset_pattern4(out[j], &pattern, buffer_size);
380         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
381                                              out[j], 0, NULL, NULL)))
382         {
383             vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
384                        error);
385             goto exit;
386         }
387 
388         // run the kernel
389         size_t vectorCount =
390             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
391         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
392                                                  // own copy of the cl_kernel
393         cl_program program = job->programs[j];
394 
395         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
396                                     &tinfo->outBuf[j])))
397         {
398             LogBuildError(program);
399             return error;
400         }
401         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
402                                     &tinfo->inBuf)))
403         {
404             LogBuildError(program);
405             return error;
406         }
407         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
408                                     &tinfo->inBuf2)))
409         {
410             LogBuildError(program);
411             return error;
412         }
413 
414         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
415                                             &vectorCount, NULL, 0, NULL, NULL)))
416         {
417             vlog_error("FAILED -- could not execute kernel\n");
418             goto exit;
419         }
420     }
421 
422     // Get that moving
423     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
424 
425     if (gSkipCorrectnessTesting) return CL_SUCCESS;
426 
427     // Calculate the correctly rounded reference result
428     r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
429     s = (cl_double *)gIn + thread_id * buffer_elements;
430     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
431     for (size_t j = 0; j < buffer_elements; j++) r[j] = dfunc.i_ff(s[j], s2[j]);
432 
433     // Read the data back -- no need to wait for the first N-1 buffers but wait
434     // for the last buffer. This is an in order queue.
435     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
436     {
437         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
438         out[j] = (cl_long *)clEnqueueMapBuffer(
439             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
440             buffer_size, 0, NULL, NULL, &error);
441         if (error || NULL == out[j])
442         {
443             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
444                        error);
445             goto exit;
446         }
447     }
448 
449     // Verify data
450     t = (cl_long *)r;
451     for (size_t j = 0; j < buffer_elements; j++)
452     {
453         cl_long *q = out[0];
454 
455         // If we aren't getting the correctly rounded result
456         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
457         {
458             // If we aren't getting the correctly rounded result
459             if (ftz || relaxedMode)
460             {
461                 if (IsDoubleSubnormal(s[j]))
462                 {
463                     if (IsDoubleSubnormal(s2[j]))
464                     {
465                         int64_t correct = dfunc.i_ff(0.0f, 0.0f);
466                         int64_t correct2 = dfunc.i_ff(0.0f, -0.0f);
467                         int64_t correct3 = dfunc.i_ff(-0.0f, 0.0f);
468                         int64_t correct4 = dfunc.i_ff(-0.0f, -0.0f);
469 
470                         if (correct == q[j] || correct2 == q[j]
471                             || correct3 == q[j] || correct4 == q[j])
472                             continue;
473                     }
474                     else
475                     {
476                         int64_t correct = dfunc.i_ff(0.0f, s2[j]);
477                         int64_t correct2 = dfunc.i_ff(-0.0f, s2[j]);
478                         if (correct == q[j] || correct2 == q[j]) continue;
479                     }
480                 }
481                 else if (IsDoubleSubnormal(s2[j]))
482                 {
483                     int64_t correct = dfunc.i_ff(s[j], 0.0f);
484                     int64_t correct2 = dfunc.i_ff(s[j], -0.0f);
485                     if (correct == q[j] || correct2 == q[j]) continue;
486                 }
487             }
488 
489             cl_ulong err = t[j] - q[j];
490             if (q[j] > t[j]) err = q[j] - t[j];
491             vlog_error("\nERROR: %s: %" PRId64
492                        " ulp error at {%.13la, %.13la}: *%" PRId64 " "
493                        "vs. %" PRId64 "  (index: %zu)\n",
494                        name, err, ((double *)s)[j], ((double *)s2)[j], t[j],
495                        q[j], j);
496             error = -1;
497             goto exit;
498         }
499 
500 
501         for (auto k = std::max(1U, gMinVectorSizeIndex);
502              k < gMaxVectorSizeIndex; k++)
503         {
504             q = (cl_long *)out[k];
505             // If we aren't getting the correctly rounded result
506             if (-t[j] != q[j])
507             {
508                 if (ftz || relaxedMode)
509                 {
510                     if (IsDoubleSubnormal(s[j]))
511                     {
512                         if (IsDoubleSubnormal(s2[j]))
513                         {
514                             int64_t correct = -dfunc.i_ff(0.0f, 0.0f);
515                             int64_t correct2 = -dfunc.i_ff(0.0f, -0.0f);
516                             int64_t correct3 = -dfunc.i_ff(-0.0f, 0.0f);
517                             int64_t correct4 = -dfunc.i_ff(-0.0f, -0.0f);
518 
519                             if (correct == q[j] || correct2 == q[j]
520                                 || correct3 == q[j] || correct4 == q[j])
521                                 continue;
522                         }
523                         else
524                         {
525                             int64_t correct = -dfunc.i_ff(0.0f, s2[j]);
526                             int64_t correct2 = -dfunc.i_ff(-0.0f, s2[j]);
527                             if (correct == q[j] || correct2 == q[j]) continue;
528                         }
529                     }
530                     else if (IsDoubleSubnormal(s2[j]))
531                     {
532                         int64_t correct = -dfunc.i_ff(s[j], 0.0f);
533                         int64_t correct2 = -dfunc.i_ff(s[j], -0.0f);
534                         if (correct == q[j] || correct2 == q[j]) continue;
535                     }
536                 }
537 
538                 cl_ulong err = -t[j] - q[j];
539                 if (q[j] > -t[j]) err = q[j] + t[j];
540                 vlog_error("\nERROR: %sD%s: %" PRId64 " ulp error at {%.13la, "
541                            "%.13la}: *%" PRId64 " vs. %" PRId64
542                            "  (index: %zu)\n",
543                            name, sizeNames[k], err, ((double *)s)[j],
544                            ((double *)s2)[j], -t[j], q[j], j);
545                 error = -1;
546                 goto exit;
547             }
548         }
549     }
550 
551     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
552     {
553         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
554                                              out[j], 0, NULL, NULL)))
555         {
556             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
557                        j, error);
558             return error;
559         }
560     }
561 
562     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
563 
564 
565     if (0 == (base & 0x0fffffff))
566     {
567         if (gVerboseBruteForce)
568         {
569             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
570                  "ThreadCount:%2u\n",
571                  base, job->step, job->scale, buffer_elements,
572                  job->threadCount);
573         }
574         else
575         {
576             vlog(".");
577         }
578         fflush(stdout);
579     }
580 
581 exit:
582     return error;
583 }
584 
585 } // anonymous namespace
586 
TestMacro_Int_Double_Double(const Func * f,MTdata d,bool relaxedMode)587 int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
588 {
589     TestInfo test_info{};
590     cl_int error;
591 
592     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
593 
594     // Init test_info
595     test_info.threadCount = GetThreadCount();
596     test_info.subBufferSize = BUFFER_SIZE
597         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
598     test_info.scale = getTestScale(sizeof(cl_double));
599 
600     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
601     if (test_info.step / test_info.subBufferSize != test_info.scale)
602     {
603         // there was overflow
604         test_info.jobCount = 1;
605     }
606     else
607     {
608         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
609     }
610 
611     test_info.f = f;
612     test_info.ftz = f->ftz || gForceFTZ;
613     test_info.relaxedMode = relaxedMode;
614 
615     // cl_kernels aren't thread safe, so we make one for each vector size for
616     // every thread
617     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
618     {
619         test_info.k[i].resize(test_info.threadCount, nullptr);
620     }
621 
622     test_info.tinfo.resize(test_info.threadCount);
623     for (cl_uint i = 0; i < test_info.threadCount; i++)
624     {
625         cl_buffer_region region = {
626             i * test_info.subBufferSize * sizeof(cl_double),
627             test_info.subBufferSize * sizeof(cl_double)
628         };
629         test_info.tinfo[i].inBuf =
630             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
631                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
632         if (error || NULL == test_info.tinfo[i].inBuf)
633         {
634             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
635                        "region {%zd, %zd}\n",
636                        region.origin, region.size);
637             goto exit;
638         }
639         test_info.tinfo[i].inBuf2 =
640             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
641                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
642         if (error || NULL == test_info.tinfo[i].inBuf2)
643         {
644             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
645                        "region {%zd, %zd}\n",
646                        region.origin, region.size);
647             goto exit;
648         }
649 
650         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
651         {
652             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
653                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
654                 &region, &error);
655             if (error || NULL == test_info.tinfo[i].outBuf[j])
656             {
657                 vlog_error("Error: Unable to create sub-buffer of "
658                            "gOutBuffer[%d] for region {%zd, %zd}\n",
659                            (int)j, region.origin, region.size);
660                 goto exit;
661             }
662         }
663         test_info.tinfo[i].tQueue =
664             clCreateCommandQueue(gContext, gDevice, 0, &error);
665         if (NULL == test_info.tinfo[i].tQueue || error)
666         {
667             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
668             goto exit;
669         }
670 
671         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
672     }
673 
674     // Init the kernels
675     {
676         BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
677                                     test_info.programs, f->nameInCode,
678                                     relaxedMode };
679         if ((error = ThreadPool_Do(BuildKernelFn,
680                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
681                                    &build_info)))
682             goto exit;
683     }
684 
685     // Run the kernels
686     if (!gSkipCorrectnessTesting)
687     {
688         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
689 
690         if (error) goto exit;
691 
692         if (gWimpyMode)
693             vlog("Wimp pass");
694         else
695             vlog("passed");
696     }
697 
698     vlog("\n");
699 
700 exit:
701     // Release
702     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
703     {
704         for (auto &kernel : test_info.k[i])
705         {
706             clReleaseKernel(kernel);
707         }
708     }
709 
710     return error;
711 }
712