• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)26 int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
27                 cl_kernel *k, cl_program *p, bool relaxedMode)
28 {
29     const char *c[] = { "__kernel void math_kernel",
30                         sizeNames[vectorSize],
31                         "( __global int",
32                         sizeNames[vectorSize],
33                         "* out, __global float",
34                         sizeNames[vectorSize],
35                         "* in1, __global float",
36                         sizeNames[vectorSize],
37                         "* in2 )\n"
38                         "{\n"
39                         "   size_t i = get_global_id(0);\n"
40                         "   out[i] = ",
41                         name,
42                         "( in1[i], in2[i] );\n"
43                         "}\n" };
44 
45     const char *c3[] = {
46         "__kernel void math_kernel",
47         sizeNames[vectorSize],
48         "( __global int* out, __global float* in, __global float* in2)\n"
49         "{\n"
50         "   size_t i = get_global_id(0);\n"
51         "   if( i + 1 < get_global_size(0) )\n"
52         "   {\n"
53         "       float3 f0 = vload3( 0, in + 3 * i );\n"
54         "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
55         "       int3 i0 = ",
56         name,
57         "( f0, f1 );\n"
58         "       vstore3( i0, 0, out + 3*i );\n"
59         "   }\n"
60         "   else\n"
61         "   {\n"
62         "       size_t parity = i & 1;   // Figure out how many elements are "
63         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
64         "buffer size \n"
65         "       float3 f0;\n"
66         "       float3 f1;\n"
67         "       switch( parity )\n"
68         "       {\n"
69         "           case 1:\n"
70         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
71         "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
72         "               break;\n"
73         "           case 0:\n"
74         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
75         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
76         "               break;\n"
77         "       }\n"
78         "       int3 i0 = ",
79         name,
80         "( f0, f1 );\n"
81         "       switch( parity )\n"
82         "       {\n"
83         "           case 0:\n"
84         "               out[3*i+1] = i0.y; \n"
85         "               // fall through\n"
86         "           case 1:\n"
87         "               out[3*i] = i0.x; \n"
88         "               break;\n"
89         "       }\n"
90         "   }\n"
91         "}\n"
92     };
93 
94     const char **kern = c;
95     size_t kernSize = sizeof(c) / sizeof(c[0]);
96 
97     if (sizeValues[vectorSize] == 3)
98     {
99         kern = c3;
100         kernSize = sizeof(c3) / sizeof(c3[0]);
101     }
102 
103     char testName[32];
104     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
105              sizeNames[vectorSize]);
106 
107     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
108                        relaxedMode);
109 }
110 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)111 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
112 {
113     BuildKernelInfo *info = (BuildKernelInfo *)p;
114     cl_uint vectorSize = gMinVectorSizeIndex + job_id;
115     return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
116                        info->kernels[vectorSize].data(),
117                        &(info->programs[vectorSize]), info->relaxedMode);
118 }
119 
120 // Thread specific data for a worker thread
121 struct ThreadInfo
122 {
123     // Input and output buffers for the thread
124     clMemWrapper inBuf;
125     clMemWrapper inBuf2;
126     Buffers outBuf;
127 
128     MTdataHolder d;
129 
130     // Per thread command queue to improve performance
131     clCommandQueueWrapper tQueue;
132 };
133 
134 struct TestInfo
135 {
136     size_t subBufferSize; // Size of the sub-buffer in elements
137     const Func *f; // A pointer to the function info
138 
139     // Programs for various vector sizes.
140     Programs programs;
141 
142     // Thread-specific kernels for each vector size:
143     // k[vector_size][thread_id]
144     KernelMatrix k;
145 
146     // Array of thread specific information
147     std::vector<ThreadInfo> tinfo;
148 
149     cl_uint threadCount; // Number of worker threads
150     cl_uint jobCount; // Number of jobs
151     cl_uint step; // step between each chunk and the next.
152     cl_uint scale; // stride between individual test values
153     int ftz; // non-zero if running in flush to zero mode
154     bool relaxedMode; // True if test is running in relaxed mode, false
155                       // otherwise.
156 };
157 
158 // A table of more difficult cases to get right
159 const float specialValues[] = {
160     -NAN,
161     -INFINITY,
162     -FLT_MAX,
163     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
164     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
165     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
166     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
167     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
168     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
169     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
170     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
171     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
172     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
173     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
174     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
175     -1000.f,
176     -100.f,
177     -4.0f,
178     -3.5f,
179     -3.0f,
180     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
181     -2.5f,
182     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
183     -2.0f,
184     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
185     -1.5f,
186     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
187     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
188     -1.0f,
189     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
190     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
191     -0.5f,
192     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
193     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
194     -0.25f,
195     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
196     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
197     -FLT_MIN,
198     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
199     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
200     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
201     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
202     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
203     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
204     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
205     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
206     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
207     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
208     -0.0f,
209 
210     +NAN,
211     +INFINITY,
212     +FLT_MAX,
213     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
214     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
215     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
216     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
217     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
218     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
219     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
220     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
221     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
222     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
223     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
224     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
225     +1000.f,
226     +100.f,
227     +4.0f,
228     +3.5f,
229     +3.0f,
230     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
231     2.5f,
232     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
233     +2.0f,
234     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
235     1.5f,
236     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
237     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
238     +1.0f,
239     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
240     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
241     +0.5f,
242     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
243     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
244     +0.25f,
245     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
246     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
247     +FLT_MIN,
248     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
249     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
250     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
251     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
252     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
253     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
254     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
255     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
256     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
257     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
258     +0.0f,
259 };
260 
261 constexpr size_t specialValuesCount =
262     sizeof(specialValues) / sizeof(specialValues[0]);
263 
Test(cl_uint job_id,cl_uint thread_id,void * data)264 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
265 {
266     TestInfo *job = (TestInfo *)data;
267     size_t buffer_elements = job->subBufferSize;
268     size_t buffer_size = buffer_elements * sizeof(cl_float);
269     cl_uint base = job_id * (cl_uint)job->step;
270     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
271     fptr func = job->f->func;
272     int ftz = job->ftz;
273     bool relaxedMode = job->relaxedMode;
274     MTdata d = tinfo->d;
275     cl_int error;
276     const char *name = job->f->name;
277     cl_int *t = 0;
278     cl_int *r = 0;
279     cl_float *s = 0;
280     cl_float *s2 = 0;
281 
282     // start the map of the output arrays
283     cl_event e[VECTOR_SIZE_COUNT];
284     cl_int *out[VECTOR_SIZE_COUNT];
285     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
286     {
287         out[j] = (cl_int *)clEnqueueMapBuffer(
288             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
289             buffer_size, 0, NULL, e + j, &error);
290         if (error || NULL == out[j])
291         {
292             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
293                        error);
294             return error;
295         }
296     }
297 
298     // Get that moving
299     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
300 
301     // Init input array
302     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
303     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
304     cl_uint idx = 0;
305 
306     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
307     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
308 
309     if (job_id <= (cl_uint)lastSpecialJobIndex)
310     { // test edge cases
311         float *fp = (float *)p;
312         float *fp2 = (float *)p2;
313         uint32_t x, y;
314 
315         x = (job_id * buffer_elements) % specialValuesCount;
316         y = (job_id * buffer_elements) / specialValuesCount;
317 
318         for (; idx < buffer_elements; idx++)
319         {
320             fp[idx] = specialValues[x];
321             fp2[idx] = specialValues[y];
322             ++x;
323             if (x >= specialValuesCount)
324             {
325                 x = 0;
326                 y++;
327                 if (y >= specialValuesCount) break;
328             }
329         }
330     }
331 
332     // Init any remaining values.
333     for (; idx < buffer_elements; idx++)
334     {
335         p[idx] = genrand_int32(d);
336         p2[idx] = genrand_int32(d);
337     }
338 
339     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
340                                       buffer_size, p, 0, NULL, NULL)))
341     {
342         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
343         goto exit;
344     }
345 
346     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
347                                       buffer_size, p2, 0, NULL, NULL)))
348     {
349         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
350         goto exit;
351     }
352 
353     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
354     {
355         // Wait for the map to finish
356         if ((error = clWaitForEvents(1, e + j)))
357         {
358             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
359             goto exit;
360         }
361         if ((error = clReleaseEvent(e[j])))
362         {
363             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
364             goto exit;
365         }
366 
367         // Fill the result buffer with garbage, so that old results don't carry
368         // over
369         uint32_t pattern = 0xffffdead;
370         memset_pattern4(out[j], &pattern, buffer_size);
371         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
372                                              out[j], 0, NULL, NULL)))
373         {
374             vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
375                        error);
376             goto exit;
377         }
378 
379         // run the kernel
380         size_t vectorCount =
381             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
382         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
383                                                  // own copy of the cl_kernel
384         cl_program program = job->programs[j];
385 
386         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
387                                     &tinfo->outBuf[j])))
388         {
389             LogBuildError(program);
390             return error;
391         }
392         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
393                                     &tinfo->inBuf)))
394         {
395             LogBuildError(program);
396             return error;
397         }
398         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
399                                     &tinfo->inBuf2)))
400         {
401             LogBuildError(program);
402             return error;
403         }
404 
405         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
406                                             &vectorCount, NULL, 0, NULL, NULL)))
407         {
408             vlog_error("FAILED -- could not execute kernel\n");
409             goto exit;
410         }
411     }
412 
413     // Get that moving
414     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
415 
416     if (gSkipCorrectnessTesting) return CL_SUCCESS;
417 
418     // Calculate the correctly rounded reference result
419     r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
420     s = (float *)gIn + thread_id * buffer_elements;
421     s2 = (float *)gIn2 + thread_id * buffer_elements;
422     for (size_t j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
423 
424     // Read the data back -- no need to wait for the first N-1 buffers but wait
425     // for the last buffer. This is an in order queue.
426     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
427     {
428         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
429         out[j] = (cl_int *)clEnqueueMapBuffer(
430             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
431             buffer_size, 0, NULL, NULL, &error);
432         if (error || NULL == out[j])
433         {
434             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
435                        error);
436             goto exit;
437         }
438     }
439 
440     // Verify data
441     t = (cl_int *)r;
442     for (size_t j = 0; j < buffer_elements; j++)
443     {
444         cl_int *q = out[0];
445 
446         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
447         {
448             if (ftz || relaxedMode)
449             {
450                 if (IsFloatSubnormal(s[j]))
451                 {
452                     if (IsFloatSubnormal(s2[j]))
453                     {
454                         int correct = func.i_ff(0.0f, 0.0f);
455                         int correct2 = func.i_ff(0.0f, -0.0f);
456                         int correct3 = func.i_ff(-0.0f, 0.0f);
457                         int correct4 = func.i_ff(-0.0f, -0.0f);
458 
459                         if (correct == q[j] || correct2 == q[j]
460                             || correct3 == q[j] || correct4 == q[j])
461                             continue;
462                     }
463                     else
464                     {
465                         int correct = func.i_ff(0.0f, s2[j]);
466                         int correct2 = func.i_ff(-0.0f, s2[j]);
467                         if (correct == q[j] || correct2 == q[j]) continue;
468                     }
469                 }
470                 else if (IsFloatSubnormal(s2[j]))
471                 {
472                     int correct = func.i_ff(s[j], 0.0f);
473                     int correct2 = func.i_ff(s[j], -0.0f);
474                     if (correct == q[j] || correct2 == q[j]) continue;
475                 }
476             }
477 
478             uint32_t err = t[j] - q[j];
479             if (q[j] > t[j]) err = q[j] - t[j];
480             vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
481                        "0x%8.8x (index: %zu)\n",
482                        name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
483                        j);
484             error = -1;
485             goto exit;
486         }
487 
488         for (auto k = std::max(1U, gMinVectorSizeIndex);
489              k < gMaxVectorSizeIndex; k++)
490         {
491             q = out[k];
492             // If we aren't getting the correctly rounded result
493             if (-t[j] != q[j])
494             {
495                 if (ftz || relaxedMode)
496                 {
497                     if (IsFloatSubnormal(s[j]))
498                     {
499                         if (IsFloatSubnormal(s2[j]))
500                         {
501                             int correct = -func.i_ff(0.0f, 0.0f);
502                             int correct2 = -func.i_ff(0.0f, -0.0f);
503                             int correct3 = -func.i_ff(-0.0f, 0.0f);
504                             int correct4 = -func.i_ff(-0.0f, -0.0f);
505 
506                             if (correct == q[j] || correct2 == q[j]
507                                 || correct3 == q[j] || correct4 == q[j])
508                                 continue;
509                         }
510                         else
511                         {
512                             int correct = -func.i_ff(0.0f, s2[j]);
513                             int correct2 = -func.i_ff(-0.0f, s2[j]);
514                             if (correct == q[j] || correct2 == q[j]) continue;
515                         }
516                     }
517                     else if (IsFloatSubnormal(s2[j]))
518                     {
519                         int correct = -func.i_ff(s[j], 0.0f);
520                         int correct2 = -func.i_ff(s[j], -0.0f);
521                         if (correct == q[j] || correct2 == q[j]) continue;
522                     }
523                 }
524                 cl_uint err = -t[j] - q[j];
525                 if (q[j] > -t[j]) err = q[j] + t[j];
526                 vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
527                            "vs. 0x%8.8x (index: %zu)\n",
528                            name, sizeNames[k], err, ((float *)s)[j],
529                            ((float *)s2)[j], -t[j], q[j], j);
530                 error = -1;
531                 goto exit;
532             }
533         }
534     }
535 
536     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
537     {
538         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
539                                              out[j], 0, NULL, NULL)))
540         {
541             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
542                        j, error);
543             return error;
544         }
545     }
546 
547     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
548 
549 
550     if (0 == (base & 0x0fffffff))
551     {
552         if (gVerboseBruteForce)
553         {
554             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
555                  "ThreadCount:%2u\n",
556                  base, job->step, job->scale, buffer_elements,
557                  job->threadCount);
558         }
559         else
560         {
561             vlog(".");
562         }
563         fflush(stdout);
564     }
565 
566 exit:
567     return error;
568 }
569 
570 } // anonymous namespace
571 
TestMacro_Int_Float_Float(const Func * f,MTdata d,bool relaxedMode)572 int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
573 {
574     TestInfo test_info{};
575     cl_int error;
576 
577     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
578 
579     // Init test_info
580     test_info.threadCount = GetThreadCount();
581     test_info.subBufferSize = BUFFER_SIZE
582         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
583     test_info.scale = getTestScale(sizeof(cl_float));
584 
585     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
586     if (test_info.step / test_info.subBufferSize != test_info.scale)
587     {
588         // there was overflow
589         test_info.jobCount = 1;
590     }
591     else
592     {
593         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
594     }
595 
596     test_info.f = f;
597     test_info.ftz =
598         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
599     test_info.relaxedMode = relaxedMode;
600 
601     // cl_kernels aren't thread safe, so we make one for each vector size for
602     // every thread
603     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
604     {
605         test_info.k[i].resize(test_info.threadCount, nullptr);
606     }
607 
608     test_info.tinfo.resize(test_info.threadCount);
609     for (cl_uint i = 0; i < test_info.threadCount; i++)
610     {
611         cl_buffer_region region = {
612             i * test_info.subBufferSize * sizeof(cl_float),
613             test_info.subBufferSize * sizeof(cl_float)
614         };
615         test_info.tinfo[i].inBuf =
616             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
617                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
618         if (error || NULL == test_info.tinfo[i].inBuf)
619         {
620             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
621                        "region {%zd, %zd}\n",
622                        region.origin, region.size);
623             goto exit;
624         }
625         test_info.tinfo[i].inBuf2 =
626             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
627                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
628         if (error || NULL == test_info.tinfo[i].inBuf2)
629         {
630             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
631                        "region {%zd, %zd}\n",
632                        region.origin, region.size);
633             goto exit;
634         }
635 
636         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
637         {
638             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
639                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
640                 &region, &error);
641             if (error || NULL == test_info.tinfo[i].outBuf[j])
642             {
643                 vlog_error("Error: Unable to create sub-buffer of "
644                            "gOutBuffer[%d] for region {%zd, %zd}\n",
645                            (int)j, region.origin, region.size);
646                 goto exit;
647             }
648         }
649         test_info.tinfo[i].tQueue =
650             clCreateCommandQueue(gContext, gDevice, 0, &error);
651         if (NULL == test_info.tinfo[i].tQueue || error)
652         {
653             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
654             goto exit;
655         }
656 
657         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
658     }
659 
660     // Init the kernels
661     {
662         BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
663                                     test_info.programs, f->nameInCode,
664                                     relaxedMode };
665         if ((error = ThreadPool_Do(BuildKernelFn,
666                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
667                                    &build_info)))
668             goto exit;
669     }
670 
671     // Run the kernels
672     if (!gSkipCorrectnessTesting)
673     {
674         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
675 
676         if (error) goto exit;
677 
678         if (gWimpyMode)
679             vlog("Wimp pass");
680         else
681             vlog("passed");
682     }
683 
684     vlog("\n");
685 
686 exit:
687     // Release
688     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
689     {
690         for (auto &kernel : test_info.k[i])
691         {
692             clReleaseKernel(kernel);
693         }
694     }
695 
696     return error;
697 }
698