1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cinttypes>
23 #include <cstring>
24 
25 namespace {
26 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)27 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
28 {
29     BuildKernelInfo &info = *(BuildKernelInfo *)p;
30     auto generator = [](const std::string &kernel_name, const char *builtin,
31                         cl_uint vector_size_index) {
32         return GetUnaryKernel(kernel_name, builtin, ParameterType::Long,
33                               ParameterType::Double, vector_size_index);
34     };
35     return BuildKernels(info, job_id, generator);
36 }
37 
38 // Thread specific data for a worker thread
39 struct ThreadInfo
40 {
41     // Input and output buffers for the thread
42     clMemWrapper inBuf;
43     Buffers outBuf;
44 
45     // Per thread command queue to improve performance
46     clCommandQueueWrapper tQueue;
47 };
48 
49 struct TestInfo
50 {
51     size_t subBufferSize; // Size of the sub-buffer in elements
52     const Func *f; // A pointer to the function info
53 
54     // Programs for various vector sizes.
55     Programs programs;
56 
57     // Thread-specific kernels for each vector size:
58     // k[vector_size][thread_id]
59     KernelMatrix k;
60 
61     // Array of thread specific information
62     std::vector<ThreadInfo> tinfo;
63 
64     cl_uint threadCount; // Number of worker threads
65     cl_uint jobCount; // Number of jobs
66     cl_uint step; // step between each chunk and the next.
67     cl_uint scale; // stride between individual test values
68     int ftz; // non-zero if running in flush to zero mode
69     bool relaxedMode; // True if test is running in relaxed mode, false
70                       // otherwise.
71 };
72 
Test(cl_uint job_id,cl_uint thread_id,void * data)73 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
74 {
75     TestInfo *job = (TestInfo *)data;
76     size_t buffer_elements = job->subBufferSize;
77     size_t buffer_size = buffer_elements * sizeof(cl_double);
78     cl_uint scale = job->scale;
79     cl_uint base = job_id * (cl_uint)job->step;
80     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
81     dptr dfunc = job->f->dfunc;
82     int ftz = job->ftz;
83     bool relaxedMode = job->relaxedMode;
84     cl_int error;
85     const char *name = job->f->name;
86 
87     Force64BitFPUPrecision();
88 
89     cl_event e[VECTOR_SIZE_COUNT];
90     cl_long *out[VECTOR_SIZE_COUNT];
91     if (gHostFill)
92     {
93         // start the map of the output arrays
94         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
95         {
96             out[j] = (cl_long *)clEnqueueMapBuffer(
97                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
98                 buffer_size, 0, NULL, e + j, &error);
99             if (error || NULL == out[j])
100             {
101                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
102                            error);
103                 return error;
104             }
105         }
106 
107         // Get that moving
108         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
109     }
110 
111     // Write the new values to the input array
112     cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
113     for (size_t j = 0; j < buffer_elements; j++)
114         p[j] = DoubleFromUInt32(base + j * scale);
115 
116     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
117                                       buffer_size, p, 0, NULL, NULL)))
118     {
119         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
120         return error;
121     }
122 
123     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
124     {
125         if (gHostFill)
126         {
127             // Wait for the map to finish
128             if ((error = clWaitForEvents(1, e + j)))
129             {
130                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
131                 return error;
132             }
133             if ((error = clReleaseEvent(e[j])))
134             {
135                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
136                 return error;
137             }
138         }
139 
140         // Fill the result buffer with garbage, so that old results don't carry
141         // over
142         uint32_t pattern = 0xffffdead;
143         if (gHostFill)
144         {
145             memset_pattern4(out[j], &pattern, buffer_size);
146             if ((error = clEnqueueUnmapMemObject(
147                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
148             {
149                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
150                            error);
151                 return error;
152             }
153         }
154         else
155         {
156             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
157                                              &pattern, sizeof(pattern), 0,
158                                              buffer_size, 0, NULL, NULL)))
159             {
160                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
161                            error);
162                 return error;
163             }
164         }
165 
166         // Run the kernel
167         size_t vectorCount =
168             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
169         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
170                                                  // own copy of the cl_kernel
171         cl_program program = job->programs[j];
172 
173         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
174                                     &tinfo->outBuf[j])))
175         {
176             LogBuildError(program);
177             return error;
178         }
179         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
180                                     &tinfo->inBuf)))
181         {
182             LogBuildError(program);
183             return error;
184         }
185 
186         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
187                                             &vectorCount, NULL, 0, NULL, NULL)))
188         {
189             vlog_error("FAILED -- could not execute kernel\n");
190             return error;
191         }
192     }
193 
194     // Get that moving
195     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
196 
197     if (gSkipCorrectnessTesting) return CL_SUCCESS;
198 
199     // Calculate the correctly rounded reference result
200     cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
201     cl_double *s = (cl_double *)p;
202     for (size_t j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
203 
204     // Read the data back -- no need to wait for the first N-1 buffers but wait
205     // for the last buffer. This is an in order queue.
206     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
207     {
208         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
209         out[j] = (cl_long *)clEnqueueMapBuffer(
210             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
211             buffer_size, 0, NULL, NULL, &error);
212         if (error || NULL == out[j])
213         {
214             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
215                        error);
216             return error;
217         }
218     }
219 
220     // Verify data
221     cl_long *t = (cl_long *)r;
222     for (size_t j = 0; j < buffer_elements; j++)
223     {
224         cl_long *q = out[0];
225 
226         // If we aren't getting the correctly rounded result
227         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
228         {
229             // If we aren't getting the correctly rounded result
230             if (ftz || relaxedMode)
231             {
232                 if (IsDoubleSubnormal(s[j]))
233                 {
234                     cl_long correct = dfunc.i_f(+0.0f);
235                     cl_long correct2 = dfunc.i_f(-0.0f);
236                     if (correct == q[j] || correct2 == q[j]) continue;
237                 }
238             }
239 
240             cl_ulong err = t[j] - q[j];
241             if (q[j] > t[j]) err = q[j] - t[j];
242             vlog_error("\nERROR: %sD: %" PRId64
243                        " ulp error at %.13la: *%" PRId64 " vs. %" PRId64 "\n",
244                        name, err, ((double *)gIn)[j], t[j], q[j]);
245             return -1;
246         }
247 
248 
249         for (auto k = std::max(1U, gMinVectorSizeIndex);
250              k < gMaxVectorSizeIndex; k++)
251         {
252             q = out[k];
253             // If we aren't getting the correctly rounded result
254             if (-t[j] != q[j])
255             {
256                 if (ftz || relaxedMode)
257                 {
258                     if (IsDoubleSubnormal(s[j]))
259                     {
260                         int64_t correct = -dfunc.i_f(+0.0f);
261                         int64_t correct2 = -dfunc.i_f(-0.0f);
262                         if (correct == q[j] || correct2 == q[j]) continue;
263                     }
264                 }
265 
266                 cl_ulong err = -t[j] - q[j];
267                 if (q[j] > -t[j]) err = q[j] + t[j];
268                 vlog_error(
269                     "\nERROR: %sD%s: %" PRId64 " ulp error at %.13la: *%" PRId64
270                     " vs. %" PRId64 "\n",
271                     name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
272                 return -1;
273             }
274         }
275     }
276 
277     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
278     {
279         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
280                                              out[j], 0, NULL, NULL)))
281         {
282             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
283                        j, error);
284             return error;
285         }
286     }
287 
288     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
289 
290 
291     if (0 == (base & 0x0fffffff))
292     {
293         if (gVerboseBruteForce)
294         {
295             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
296                  "ThreadCount:%2u\n",
297                  base, job->step, job->scale, buffer_elements,
298                  job->threadCount);
299         }
300         else
301         {
302             vlog(".");
303         }
304         fflush(stdout);
305     }
306 
307     return CL_SUCCESS;
308 }
309 
310 } // anonymous namespace
311 
TestMacro_Int_Double(const Func * f,MTdata d,bool relaxedMode)312 int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
313 {
314     TestInfo test_info{};
315     cl_int error;
316 
317     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
318 
319     // Init test_info
320     test_info.threadCount = GetThreadCount();
321     test_info.subBufferSize = BUFFER_SIZE
322         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
323     test_info.scale = getTestScale(sizeof(cl_double));
324 
325     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
326     if (test_info.step / test_info.subBufferSize != test_info.scale)
327     {
328         // there was overflow
329         test_info.jobCount = 1;
330     }
331     else
332     {
333         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
334     }
335 
336     test_info.f = f;
337     test_info.ftz = f->ftz || gForceFTZ;
338     test_info.relaxedMode = relaxedMode;
339 
340     test_info.tinfo.resize(test_info.threadCount);
341     for (cl_uint i = 0; i < test_info.threadCount; i++)
342     {
343         cl_buffer_region region = {
344             i * test_info.subBufferSize * sizeof(cl_double),
345             test_info.subBufferSize * sizeof(cl_double)
346         };
347         test_info.tinfo[i].inBuf =
348             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
349                               CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
350         if (error || NULL == test_info.tinfo[i].inBuf)
351         {
352             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
353                        "region {%zd, %zd}\n",
354                        region.origin, region.size);
355             return error;
356         }
357 
358         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
359         {
360             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
361                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
362                 ®ion, &error);
363             if (error || NULL == test_info.tinfo[i].outBuf[j])
364             {
365                 vlog_error("Error: Unable to create sub-buffer of "
366                            "gOutBuffer[%d] for region {%zd, %zd}\n",
367                            (int)j, region.origin, region.size);
368                 return error;
369             }
370         }
371         test_info.tinfo[i].tQueue =
372             clCreateCommandQueue(gContext, gDevice, 0, &error);
373         if (NULL == test_info.tinfo[i].tQueue || error)
374         {
375             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
376             return error;
377         }
378     }
379 
380     // Init the kernels
381     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
382                                 test_info.programs, f->nameInCode,
383                                 relaxedMode };
384     if ((error = ThreadPool_Do(BuildKernelFn,
385                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
386                                &build_info)))
387         return error;
388 
389     // Run the kernels
390     if (!gSkipCorrectnessTesting)
391     {
392         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
393         if (error) return error;
394 
395         if (gWimpyMode)
396             vlog("Wimp pass");
397         else
398             vlog("passed");
399     }
400 
401     vlog("\n");
402 
403     return CL_SUCCESS;
404 }
405