1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
26 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
27
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)28 int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
29 cl_kernel *k, cl_program *p, bool relaxedMode)
30 {
31 const char *c[] = { "__kernel void math_kernel",
32 sizeNames[vectorSize],
33 "( __global float",
34 sizeNames[vectorSize],
35 "* out, __global float",
36 sizeNames[vectorSize],
37 "* in1, __global float",
38 sizeNames[vectorSize],
39 "* in2 )\n"
40 "{\n"
41 " size_t i = get_global_id(0);\n"
42 " out[i] = ",
43 name,
44 "( in1[i], in2[i] );\n"
45 "}\n" };
46
47 const char *c3[] = {
48 "__kernel void math_kernel",
49 sizeNames[vectorSize],
50 "( __global float* out, __global float* in, __global float* in2)\n"
51 "{\n"
52 " size_t i = get_global_id(0);\n"
53 " if( i + 1 < get_global_size(0) )\n"
54 " {\n"
55 " float3 f0 = vload3( 0, in + 3 * i );\n"
56 " float3 f1 = vload3( 0, in2 + 3 * i );\n"
57 " f0 = ",
58 name,
59 "( f0, f1 );\n"
60 " vstore3( f0, 0, out + 3*i );\n"
61 " }\n"
62 " else\n"
63 " {\n"
64 " size_t parity = i & 1; // Figure out how many elements are "
65 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
66 "buffer size \n"
67 " float3 f0;\n"
68 " float3 f1;\n"
69 " switch( parity )\n"
70 " {\n"
71 " case 1:\n"
72 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
73 " f1 = (float3)( in2[3*i], NAN, NAN ); \n"
74 " break;\n"
75 " case 0:\n"
76 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
77 " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
78 " break;\n"
79 " }\n"
80 " f0 = ",
81 name,
82 "( f0, f1 );\n"
83 " switch( parity )\n"
84 " {\n"
85 " case 0:\n"
86 " out[3*i+1] = f0.y; \n"
87 " // fall through\n"
88 " case 1:\n"
89 " out[3*i] = f0.x; \n"
90 " break;\n"
91 " }\n"
92 " }\n"
93 "}\n"
94 };
95
96 const char **kern = c;
97 size_t kernSize = sizeof(c) / sizeof(c[0]);
98
99 if (sizeValues[vectorSize] == 3)
100 {
101 kern = c3;
102 kernSize = sizeof(c3) / sizeof(c3[0]);
103 }
104
105 char testName[32];
106 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
107 sizeNames[vectorSize]);
108
109 return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
110 relaxedMode);
111 }
112
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)113 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
114 {
115 BuildKernelInfo *info = (BuildKernelInfo *)p;
116 cl_uint vectorSize = gMinVectorSizeIndex + job_id;
117 return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
118 info->kernels[vectorSize].data(),
119 &(info->programs[vectorSize]), info->relaxedMode);
120 }
121
122 // Thread specific data for a worker thread
123 struct ThreadInfo
124 {
125 // Input and output buffers for the thread
126 clMemWrapper inBuf;
127 clMemWrapper inBuf2;
128 Buffers outBuf;
129
130 float maxError; // max error value. Init to 0.
131 double
132 maxErrorValue; // position of the max error value (param 1). Init to 0.
133 double maxErrorValue2; // position of the max error value (param 2). Init
134 // to 0.
135 MTdataHolder d;
136
137 // Per thread command queue to improve performance
138 clCommandQueueWrapper tQueue;
139 };
140
141 struct TestInfo
142 {
143 size_t subBufferSize; // Size of the sub-buffer in elements
144 const Func *f; // A pointer to the function info
145
146 // Programs for various vector sizes.
147 Programs programs;
148
149 // Thread-specific kernels for each vector size:
150 // k[vector_size][thread_id]
151 KernelMatrix k;
152
153 // Array of thread specific information
154 std::vector<ThreadInfo> tinfo;
155
156 cl_uint threadCount; // Number of worker threads
157 cl_uint jobCount; // Number of jobs
158 cl_uint step; // step between each chunk and the next.
159 cl_uint scale; // stride between individual test values
160 float ulps; // max_allowed ulps
161 int ftz; // non-zero if running in flush to zero mode
162
163 int isFDim;
164 int skipNanInf;
165 int isNextafter;
166 bool relaxedMode; // True if test is running in relaxed mode, false
167 // otherwise.
168 };
169
170 // A table of more difficult cases to get right
171 const float specialValues[] = {
172 -NAN,
173 -INFINITY,
174 -FLT_MAX,
175 MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
176 MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
177 MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
178 MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
179 MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
180 MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
181 MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
182 MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
183 MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
184 MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
185 MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
186 MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
187 -1000.f,
188 -100.f,
189 -4.0f,
190 -3.5f,
191 -3.0f,
192 MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
193 -2.5f,
194 MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
195 -2.0f,
196 MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
197 -1.5f,
198 MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
199 MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
200 -1.0f,
201 MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
202 MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
203 -0.5f,
204 MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
205 MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
206 -0.25f,
207 MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
208 MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
209 -FLT_MIN,
210 MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
211 MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
212 MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
213 MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
214 MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
215 MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
216 MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
217 MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
218 MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
219 MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
220 -0.0f,
221
222 +NAN,
223 +INFINITY,
224 +FLT_MAX,
225 MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
226 MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
227 MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
228 MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
229 MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
230 MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
231 MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
232 MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
233 MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
234 MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
235 MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
236 MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
237 +1000.f,
238 +100.f,
239 +4.0f,
240 +3.5f,
241 +3.0f,
242 MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
243 2.5f,
244 MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
245 +2.0f,
246 MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
247 1.5f,
248 MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
249 MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
250 +1.0f,
251 MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
252 MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
253 +0.5f,
254 MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
255 MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
256 +0.25f,
257 MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
258 MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
259 +FLT_MIN,
260 MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
261 MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
262 MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
263 MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
264 MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
265 MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
266 MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
267 MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
268 MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
269 MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
270 +0.0f,
271 };
272
273 constexpr size_t specialValuesCount =
274 sizeof(specialValues) / sizeof(specialValues[0]);
275
Test(cl_uint job_id,cl_uint thread_id,void * data)276 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
277 {
278 TestInfo *job = (TestInfo *)data;
279 size_t buffer_elements = job->subBufferSize;
280 size_t buffer_size = buffer_elements * sizeof(cl_float);
281 cl_uint base = job_id * (cl_uint)job->step;
282 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
283 fptr func = job->f->func;
284 int ftz = job->ftz;
285 bool relaxedMode = job->relaxedMode;
286 float ulps = getAllowedUlpError(job->f, relaxedMode);
287 MTdata d = tinfo->d;
288 cl_int error;
289 std::vector<bool> overflow(buffer_elements, false);
290 const char *name = job->f->name;
291 int isFDim = job->isFDim;
292 int skipNanInf = job->skipNanInf;
293 int isNextafter = job->isNextafter;
294 cl_uint *t = 0;
295 cl_float *r = 0;
296 cl_float *s = 0;
297 cl_float *s2 = 0;
298 cl_int copysign_test = 0;
299 RoundingMode oldRoundMode;
300 int skipVerification = 0;
301
302 if (relaxedMode)
303 {
304 func = job->f->rfunc;
305 if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
306 {
307 ulps = INFINITY;
308 skipVerification = 1;
309 }
310 }
311
312 // start the map of the output arrays
313 cl_event e[VECTOR_SIZE_COUNT];
314 cl_uint *out[VECTOR_SIZE_COUNT];
315 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
316 {
317 out[j] = (cl_uint *)clEnqueueMapBuffer(
318 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
319 buffer_size, 0, NULL, e + j, &error);
320 if (error || NULL == out[j])
321 {
322 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
323 error);
324 return error;
325 }
326 }
327
328 // Get that moving
329 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
330
331 // Init input array
332 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
333 cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
334 cl_uint idx = 0;
335 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
336 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
337
338 if (job_id <= (cl_uint)lastSpecialJobIndex)
339 { // test edge cases
340 float *fp = (float *)p;
341 float *fp2 = (float *)p2;
342 uint32_t x, y;
343
344 x = (job_id * buffer_elements) % specialValuesCount;
345 y = (job_id * buffer_elements) / specialValuesCount;
346
347 for (; idx < buffer_elements; idx++)
348 {
349 fp[idx] = specialValues[x];
350 fp2[idx] = specialValues[y];
351 ++x;
352 if (x >= specialValuesCount)
353 {
354 x = 0;
355 y++;
356 if (y >= specialValuesCount) break;
357 }
358 }
359 }
360
361 // Init any remaining values.
362 for (; idx < buffer_elements; idx++)
363 {
364 p[idx] = genrand_int32(d);
365 p2[idx] = genrand_int32(d);
366 }
367
368 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
369 buffer_size, p, 0, NULL, NULL)))
370 {
371 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
372 goto exit;
373 }
374
375 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
376 buffer_size, p2, 0, NULL, NULL)))
377 {
378 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
379 goto exit;
380 }
381
382 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
383 {
384 // Wait for the map to finish
385 if ((error = clWaitForEvents(1, e + j)))
386 {
387 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
388 goto exit;
389 }
390 if ((error = clReleaseEvent(e[j])))
391 {
392 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
393 goto exit;
394 }
395
396 // Fill the result buffer with garbage, so that old results don't carry
397 // over
398 uint32_t pattern = 0xffffdead;
399 memset_pattern4(out[j], &pattern, buffer_size);
400 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
401 out[j], 0, NULL, NULL)))
402 {
403 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
404 error);
405 goto exit;
406 }
407
408 // run the kernel
409 size_t vectorCount =
410 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
411 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
412 // own copy of the cl_kernel
413 cl_program program = job->programs[j];
414
415 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
416 &tinfo->outBuf[j])))
417 {
418 LogBuildError(program);
419 return error;
420 }
421 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
422 &tinfo->inBuf)))
423 {
424 LogBuildError(program);
425 return error;
426 }
427 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
428 &tinfo->inBuf2)))
429 {
430 LogBuildError(program);
431 return error;
432 }
433
434 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
435 &vectorCount, NULL, 0, NULL, NULL)))
436 {
437 vlog_error("FAILED -- could not execute kernel\n");
438 goto exit;
439 }
440 }
441
442 // Get that moving
443 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
444
445 if (gSkipCorrectnessTesting)
446 {
447 if ((error = clFinish(tinfo->tQueue)))
448 {
449 vlog_error("Error: clFinish failed! err: %d\n", error);
450 goto exit;
451 }
452 return CL_SUCCESS;
453 }
454
455 FPU_mode_type oldMode;
456 oldRoundMode = kRoundToNearestEven;
457 if (isFDim)
458 {
459 // Calculate the correctly rounded reference result
460 memset(&oldMode, 0, sizeof(oldMode));
461 if (ftz || relaxedMode) ForceFTZ(&oldMode);
462
463 // Set the rounding mode to match the device
464 if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
465 }
466
467 if (!strcmp(name, "copysign")) copysign_test = 1;
468
469 #define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
470
471 // Calculate the correctly rounded reference result
472 r = (float *)gOut_Ref + thread_id * buffer_elements;
473 s = (float *)gIn + thread_id * buffer_elements;
474 s2 = (float *)gIn2 + thread_id * buffer_elements;
475 if (skipNanInf)
476 {
477 for (size_t j = 0; j < buffer_elements; j++)
478 {
479 feclearexcept(FE_OVERFLOW);
480 r[j] = (float)ref_func(s[j], s2[j]);
481 overflow[j] =
482 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
483 }
484 }
485 else
486 {
487 for (size_t j = 0; j < buffer_elements; j++)
488 r[j] = (float)ref_func(s[j], s2[j]);
489 }
490
491 if (isFDim && ftz) RestoreFPState(&oldMode);
492
493 // Read the data back -- no need to wait for the first N-1 buffers but wait
494 // for the last buffer. This is an in order queue.
495 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
496 {
497 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
498 out[j] = (cl_uint *)clEnqueueMapBuffer(
499 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
500 buffer_size, 0, NULL, NULL, &error);
501 if (error || NULL == out[j])
502 {
503 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
504 error);
505 goto exit;
506 }
507 }
508
509 if (!skipVerification)
510 {
511 // Verify data
512 t = (cl_uint *)r;
513 for (size_t j = 0; j < buffer_elements; j++)
514 {
515 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
516 {
517 cl_uint *q = out[k];
518
519 // If we aren't getting the correctly rounded result
520 if (t[j] != q[j])
521 {
522 float test = ((float *)q)[j];
523 double correct = ref_func(s[j], s2[j]);
524
525 // Per section 10 paragraph 6, accept any result if an input
526 // or output is a infinity or NaN or overflow As per
527 // OpenCL 2.0 spec, section 5.8.4.3, enabling
528 // fast-relaxed-math mode also enables -cl-finite-math-only
529 // optimization. This optimization allows to assume that
530 // arguments and results are not NaNs or +/-INFs. Hence,
531 // accept any result if inputs or results are NaNs or INFs.
532 if (relaxedMode || skipNanInf)
533 {
534 if (skipNanInf && overflow[j]) continue;
535 // Note: no double rounding here. Reference functions
536 // calculate in single precision.
537 if (IsFloatInfinity(correct) || IsFloatNaN(correct)
538 || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
539 || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
540 continue;
541 }
542
543 float err = Ulp_Error(test, correct);
544 int fail = !(fabsf(err) <= ulps);
545
546 if (fail && (ftz || relaxedMode))
547 {
548 // retry per section 6.5.3.2
549 if (IsFloatResultSubnormal(correct, ulps))
550 {
551 fail = fail && (test != 0.0f);
552 if (!fail) err = 0.0f;
553 }
554
555 // nextafter on FTZ platforms may return the smallest
556 // normal float (2^-126) given a denormal or a zero
557 // as the first argument. The rationale here is that
558 // nextafter flushes the argument to zero and then
559 // returns the next representable number in the
560 // direction of the second argument, and since
561 // denorms are considered as zero, the smallest
562 // normal number is the next representable number.
563 // In which case, it should have the same sign as the
564 // second argument.
565 if (isNextafter)
566 {
567 if (IsFloatSubnormal(s[j]) || s[j] == 0.0f)
568 {
569 float value = copysignf(twoToMinus126, s2[j]);
570 fail = fail && (test != value);
571 if (!fail) err = 0.0f;
572 }
573 }
574 else
575 {
576 // retry per section 6.5.3.3
577 if (IsFloatSubnormal(s[j]))
578 {
579 double correct2, correct3;
580 float err2, err3;
581
582 if (skipNanInf) feclearexcept(FE_OVERFLOW);
583
584 correct2 = ref_func(0.0, s2[j]);
585 correct3 = ref_func(-0.0, s2[j]);
586
587 // Per section 10 paragraph 6, accept any result
588 // if an input or output is a infinity or NaN or
589 // overflow As per OpenCL 2.0 spec,
590 // section 5.8.4.3, enabling fast-relaxed-math
591 // mode also enables -cl-finite-math-only
592 // optimization. This optimization allows to
593 // assume that arguments and results are not
594 // NaNs or +/-INFs. Hence, accept any result if
595 // inputs or results are NaNs or INFs.
596 if (relaxedMode || skipNanInf)
597 {
598 if (fetestexcept(FE_OVERFLOW) && skipNanInf)
599 continue;
600
601 // Note: no double rounding here. Reference
602 // functions calculate in single precision.
603 if (IsFloatInfinity(correct2)
604 || IsFloatNaN(correct2)
605 || IsFloatInfinity(correct3)
606 || IsFloatNaN(correct3))
607 continue;
608 }
609
610 err2 = Ulp_Error(test, correct2);
611 err3 = Ulp_Error(test, correct3);
612 fail = fail
613 && ((!(fabsf(err2) <= ulps))
614 && (!(fabsf(err3) <= ulps)));
615 if (fabsf(err2) < fabsf(err)) err = err2;
616 if (fabsf(err3) < fabsf(err)) err = err3;
617
618 // retry per section 6.5.3.4
619 if (IsFloatResultSubnormal(correct2, ulps)
620 || IsFloatResultSubnormal(correct3, ulps))
621 {
622 fail = fail && (test != 0.0f);
623 if (!fail) err = 0.0f;
624 }
625
626 // try with both args as zero
627 if (IsFloatSubnormal(s2[j]))
628 {
629 double correct4, correct5;
630 float err4, err5;
631
632 if (skipNanInf) feclearexcept(FE_OVERFLOW);
633
634 correct2 = ref_func(0.0, 0.0);
635 correct3 = ref_func(-0.0, 0.0);
636 correct4 = ref_func(0.0, -0.0);
637 correct5 = ref_func(-0.0, -0.0);
638
639 // Per section 10 paragraph 6, accept any
640 // result if an input or output is a
641 // infinity or NaN or overflow As per
642 // OpenCL 2.0 spec, section 5.8.4.3,
643 // enabling fast-relaxed-math mode also
644 // enables -cl-finite-math-only
645 // optimization. This optimization allows to
646 // assume that arguments and results are not
647 // NaNs or +/-INFs. Hence, accept any result
648 // if inputs or results are NaNs or INFs.
649 if (relaxedMode || skipNanInf)
650 {
651 if (fetestexcept(FE_OVERFLOW)
652 && skipNanInf)
653 continue;
654
655 // Note: no double rounding here.
656 // Reference functions calculate in
657 // single precision.
658 if (IsFloatInfinity(correct2)
659 || IsFloatNaN(correct2)
660 || IsFloatInfinity(correct3)
661 || IsFloatNaN(correct3)
662 || IsFloatInfinity(correct4)
663 || IsFloatNaN(correct4)
664 || IsFloatInfinity(correct5)
665 || IsFloatNaN(correct5))
666 continue;
667 }
668
669 err2 = Ulp_Error(test, correct2);
670 err3 = Ulp_Error(test, correct3);
671 err4 = Ulp_Error(test, correct4);
672 err5 = Ulp_Error(test, correct5);
673 fail = fail
674 && ((!(fabsf(err2) <= ulps))
675 && (!(fabsf(err3) <= ulps))
676 && (!(fabsf(err4) <= ulps))
677 && (!(fabsf(err5) <= ulps)));
678 if (fabsf(err2) < fabsf(err)) err = err2;
679 if (fabsf(err3) < fabsf(err)) err = err3;
680 if (fabsf(err4) < fabsf(err)) err = err4;
681 if (fabsf(err5) < fabsf(err)) err = err5;
682
683 // retry per section 6.5.3.4
684 if (IsFloatResultSubnormal(correct2, ulps)
685 || IsFloatResultSubnormal(correct3,
686 ulps)
687 || IsFloatResultSubnormal(correct4,
688 ulps)
689 || IsFloatResultSubnormal(correct5,
690 ulps))
691 {
692 fail = fail && (test != 0.0f);
693 if (!fail) err = 0.0f;
694 }
695 }
696 }
697 else if (IsFloatSubnormal(s2[j]))
698 {
699 double correct2, correct3;
700 float err2, err3;
701
702 if (skipNanInf) feclearexcept(FE_OVERFLOW);
703
704 correct2 = ref_func(s[j], 0.0);
705 correct3 = ref_func(s[j], -0.0);
706
707 // Per section 10 paragraph 6, accept any result
708 // if an input or output is a infinity or NaN or
709 // overflow As per OpenCL 2.0 spec,
710 // section 5.8.4.3, enabling fast-relaxed-math
711 // mode also enables -cl-finite-math-only
712 // optimization. This optimization allows to
713 // assume that arguments and results are not
714 // NaNs or +/-INFs. Hence, accept any result if
715 // inputs or results are NaNs or INFs.
716 if (relaxedMode || skipNanInf)
717 {
718 // Note: no double rounding here. Reference
719 // functions calculate in single precision.
720 if (overflow[j] && skipNanInf) continue;
721
722 if (IsFloatInfinity(correct2)
723 || IsFloatNaN(correct2)
724 || IsFloatInfinity(correct3)
725 || IsFloatNaN(correct3))
726 continue;
727 }
728
729 err2 = Ulp_Error(test, correct2);
730 err3 = Ulp_Error(test, correct3);
731 fail = fail
732 && ((!(fabsf(err2) <= ulps))
733 && (!(fabsf(err3) <= ulps)));
734 if (fabsf(err2) < fabsf(err)) err = err2;
735 if (fabsf(err3) < fabsf(err)) err = err3;
736
737 // retry per section 6.5.3.4
738 if (IsFloatResultSubnormal(correct2, ulps)
739 || IsFloatResultSubnormal(correct3, ulps))
740 {
741 fail = fail && (test != 0.0f);
742 if (!fail) err = 0.0f;
743 }
744 }
745 }
746 }
747
748 if (fabsf(err) > tinfo->maxError)
749 {
750 tinfo->maxError = fabsf(err);
751 tinfo->maxErrorValue = s[j];
752 tinfo->maxErrorValue2 = s2[j];
753 }
754 if (fail)
755 {
756 vlog_error(
757 "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a "
758 "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %zu\n",
759 name, sizeNames[k], err, s[j], ((cl_uint *)s)[j],
760 s2[j], ((cl_uint *)s2)[j], r[j], test,
761 ((cl_uint *)&test)[0], j);
762 error = -1;
763 goto exit;
764 }
765 }
766 }
767 }
768 }
769
770 if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
771
772 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
773 {
774 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
775 out[j], 0, NULL, NULL)))
776 {
777 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
778 j, error);
779 return error;
780 }
781 }
782
783 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
784
785
786 if (0 == (base & 0x0fffffff))
787 {
788 if (gVerboseBruteForce)
789 {
790 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
791 "ThreadCount:%2u\n",
792 base, job->step, job->scale, buffer_elements, job->ulps,
793 job->threadCount);
794 }
795 else
796 {
797 vlog(".");
798 }
799 fflush(stdout);
800 }
801
802 exit:
803 return error;
804 }
805
806 } // anonymous namespace
807
TestFunc_Float_Float_Float(const Func * f,MTdata d,bool relaxedMode)808 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
809 {
810 TestInfo test_info{};
811 cl_int error;
812 float maxError = 0.0f;
813 double maxErrorVal = 0.0;
814 double maxErrorVal2 = 0.0;
815
816 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
817
818 // Init test_info
819 test_info.threadCount = GetThreadCount();
820 test_info.subBufferSize = BUFFER_SIZE
821 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
822 test_info.scale = getTestScale(sizeof(cl_float));
823
824 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
825 if (test_info.step / test_info.subBufferSize != test_info.scale)
826 {
827 // there was overflow
828 test_info.jobCount = 1;
829 }
830 else
831 {
832 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
833 }
834
835 test_info.f = f;
836 test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
837 test_info.ftz =
838 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
839 test_info.relaxedMode = relaxedMode;
840 test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
841 test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
842 test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
843
844 // cl_kernels aren't thread safe, so we make one for each vector size for
845 // every thread
846 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
847 {
848 test_info.k[i].resize(test_info.threadCount, nullptr);
849 }
850
851 test_info.tinfo.resize(test_info.threadCount);
852 for (cl_uint i = 0; i < test_info.threadCount; i++)
853 {
854 cl_buffer_region region = {
855 i * test_info.subBufferSize * sizeof(cl_float),
856 test_info.subBufferSize * sizeof(cl_float)
857 };
858 test_info.tinfo[i].inBuf =
859 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
860 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
861 if (error || NULL == test_info.tinfo[i].inBuf)
862 {
863 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
864 "region {%zd, %zd}\n",
865 region.origin, region.size);
866 goto exit;
867 }
868 test_info.tinfo[i].inBuf2 =
869 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
870 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
871 if (error || NULL == test_info.tinfo[i].inBuf2)
872 {
873 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
874 "region {%zd, %zd}\n",
875 region.origin, region.size);
876 goto exit;
877 }
878
879 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
880 {
881 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
882 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
883 ®ion, &error);
884 if (error || NULL == test_info.tinfo[i].outBuf[j])
885 {
886 vlog_error("Error: Unable to create sub-buffer of "
887 "gOutBuffer[%d] for region {%zd, %zd}\n",
888 (int)j, region.origin, region.size);
889 goto exit;
890 }
891 }
892 test_info.tinfo[i].tQueue =
893 clCreateCommandQueue(gContext, gDevice, 0, &error);
894 if (NULL == test_info.tinfo[i].tQueue || error)
895 {
896 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
897 goto exit;
898 }
899
900 test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
901 }
902
903 // Init the kernels
904 {
905 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
906 test_info.programs, f->nameInCode,
907 relaxedMode };
908 if ((error = ThreadPool_Do(BuildKernelFn,
909 gMaxVectorSizeIndex - gMinVectorSizeIndex,
910 &build_info)))
911 goto exit;
912 }
913
914 // Run the kernels
915 if (!gSkipCorrectnessTesting)
916 {
917 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
918
919 // Accumulate the arithmetic errors
920 for (cl_uint i = 0; i < test_info.threadCount; i++)
921 {
922 if (test_info.tinfo[i].maxError > maxError)
923 {
924 maxError = test_info.tinfo[i].maxError;
925 maxErrorVal = test_info.tinfo[i].maxErrorValue;
926 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
927 }
928 }
929
930 if (error) goto exit;
931
932 if (gWimpyMode)
933 vlog("Wimp pass");
934 else
935 vlog("passed");
936
937 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
938 }
939
940 vlog("\n");
941
942 exit:
943 // Release
944 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
945 {
946 for (auto &kernel : test_info.k[i])
947 {
948 clReleaseKernel(kernel);
949 }
950 }
951
952 return error;
953 }
954