1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cinttypes>
23 #include <climits>
24 #include <cstring>
25
26 namespace {
27
BuildKernel(const char * name,int vectorSize,cl_kernel * k,cl_program * p,bool relaxedMode)28 int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p,
29 bool relaxedMode)
30 {
31 const char *c[] = { "__kernel void math_kernel",
32 sizeNames[vectorSize],
33 "( __global float",
34 sizeNames[vectorSize],
35 "* out, __global int",
36 sizeNames[vectorSize],
37 "* out2, __global float",
38 sizeNames[vectorSize],
39 "* in1, __global float",
40 sizeNames[vectorSize],
41 "* in2 )\n"
42 "{\n"
43 " size_t i = get_global_id(0);\n"
44 " out[i] = ",
45 name,
46 "( in1[i], in2[i], out2 + i );\n"
47 "}\n" };
48
49 const char *c3[] = {
50 "__kernel void math_kernel",
51 sizeNames[vectorSize],
52 "( __global float* out, __global int* out2, __global float* in, "
53 "__global float* in2)\n"
54 "{\n"
55 " size_t i = get_global_id(0);\n"
56 " if( i + 1 < get_global_size(0) )\n"
57 " {\n"
58 " float3 f0 = vload3( 0, in + 3 * i );\n"
59 " float3 f1 = vload3( 0, in2 + 3 * i );\n"
60 " int3 i0 = 0xdeaddead;\n"
61 " f0 = ",
62 name,
63 "( f0, f1, &i0 );\n"
64 " vstore3( f0, 0, out + 3*i );\n"
65 " vstore3( i0, 0, out2 + 3*i );\n"
66 " }\n"
67 " else\n"
68 " {\n"
69 " size_t parity = i & 1; // Figure out how many elements are "
70 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
71 "buffer size \n"
72 " float3 f0;\n"
73 " float3 f1;\n"
74 " int3 i0 = 0xdeaddead;\n"
75 " switch( parity )\n"
76 " {\n"
77 " case 1:\n"
78 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
79 " f1 = (float3)( in2[3*i], NAN, NAN ); \n"
80 " break;\n"
81 " case 0:\n"
82 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
83 " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
84 " break;\n"
85 " }\n"
86 " f0 = ",
87 name,
88 "( f0, f1, &i0 );\n"
89 " switch( parity )\n"
90 " {\n"
91 " case 0:\n"
92 " out[3*i+1] = f0.y; \n"
93 " out2[3*i+1] = i0.y; \n"
94 " // fall through\n"
95 " case 1:\n"
96 " out[3*i] = f0.x; \n"
97 " out2[3*i] = i0.x; \n"
98 " break;\n"
99 " }\n"
100 " }\n"
101 "}\n"
102 };
103
104 const char **kern = c;
105 size_t kernSize = sizeof(c) / sizeof(c[0]);
106
107 if (sizeValues[vectorSize] == 3)
108 {
109 kern = c3;
110 kernSize = sizeof(c3) / sizeof(c3[0]);
111 }
112
113 char testName[32];
114 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
115 sizeNames[vectorSize]);
116
117 return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
118 }
119
120 struct BuildKernelInfo2
121 {
122 cl_kernel *kernels;
123 Programs &programs;
124 const char *nameInCode;
125 bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
126 };
127
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)128 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
129 {
130 BuildKernelInfo2 *info = (BuildKernelInfo2 *)p;
131 cl_uint vectorSize = gMinVectorSizeIndex + job_id;
132 return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize,
133 &(info->programs[vectorSize]), info->relaxedMode);
134 }
135
136 struct ComputeReferenceInfoF
137 {
138 const float *x;
139 const float *y;
140 float *r;
141 int *i;
142 double (*f_ffpI)(double, double, int *);
143 cl_uint lim;
144 cl_uint count;
145 };
146
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)147 cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
148 {
149 ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
150 cl_uint lim = cri->lim;
151 cl_uint count = cri->count;
152 cl_uint off = jid * count;
153 const float *x = cri->x + off;
154 const float *y = cri->y + off;
155 float *r = cri->r + off;
156 int *i = cri->i + off;
157 double (*f)(double, double, int *) = cri->f_ffpI;
158
159 if (off + count > lim) count = lim - off;
160
161 for (cl_uint j = 0; j < count; ++j)
162 r[j] = (float)f((double)x[j], (double)y[j], i + j);
163
164 return CL_SUCCESS;
165 }
166
167 } // anonymous namespace
168
TestFunc_FloatI_Float_Float(const Func * f,MTdata d,bool relaxedMode)169 int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
170 {
171 int error;
172
173 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
174
175 Programs programs;
176 cl_kernel kernels[VECTOR_SIZE_COUNT];
177 float maxError = 0.0f;
178 int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
179 int64_t maxError2 = 0;
180 float maxErrorVal = 0.0f;
181 float maxErrorVal2 = 0.0f;
182 uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
183
184 cl_uint threadCount = GetThreadCount();
185
186 float float_ulps;
187 if (gIsEmbedded)
188 float_ulps = f->float_embedded_ulps;
189 else
190 float_ulps = f->float_ulps;
191
192 int testingRemquo = !strcmp(f->name, "remquo");
193
194 // Init the kernels
195 {
196 BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode,
197 relaxedMode };
198 if ((error = ThreadPool_Do(BuildKernelFn,
199 gMaxVectorSizeIndex - gMinVectorSizeIndex,
200 &build_info)))
201 return error;
202 }
203
204 for (uint64_t i = 0; i < (1ULL << 32); i += step)
205 {
206 // Init input array
207 cl_uint *p = (cl_uint *)gIn;
208 cl_uint *p2 = (cl_uint *)gIn2;
209 for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
210 {
211 p[j] = genrand_int32(d);
212 p2[j] = genrand_int32(d);
213 }
214
215 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
216 BUFFER_SIZE, gIn, 0, NULL, NULL)))
217 {
218 vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
219 return error;
220 }
221
222 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
223 BUFFER_SIZE, gIn2, 0, NULL, NULL)))
224 {
225 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
226 return error;
227 }
228
229 // write garbage into output arrays
230 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
231 {
232 uint32_t pattern = 0xffffdead;
233 memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
234 if ((error =
235 clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
236 BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
237 {
238 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
239 error, j);
240 goto exit;
241 }
242
243 memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
244 if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE,
245 0, BUFFER_SIZE, gOut2[j], 0, NULL,
246 NULL)))
247 {
248 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
249 error, j);
250 goto exit;
251 }
252 }
253
254 // Run the kernels
255 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
256 {
257 size_t vectorSize = sizeof(cl_float) * sizeValues[j];
258 size_t localCount = (BUFFER_SIZE + vectorSize - 1)
259 / vectorSize; // BUFFER_SIZE / vectorSize rounded up
260 if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
261 &gOutBuffer[j])))
262 {
263 LogBuildError(programs[j]);
264 goto exit;
265 }
266 if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
267 &gOutBuffer2[j])))
268 {
269 LogBuildError(programs[j]);
270 goto exit;
271 }
272 if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
273 &gInBuffer)))
274 {
275 LogBuildError(programs[j]);
276 goto exit;
277 }
278 if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
279 &gInBuffer2)))
280 {
281 LogBuildError(programs[j]);
282 goto exit;
283 }
284
285 if ((error =
286 clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
287 &localCount, NULL, 0, NULL, NULL)))
288 {
289 vlog_error("FAILED -- could not execute kernel\n");
290 goto exit;
291 }
292 }
293
294 // Get that moving
295 if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
296
297 // Calculate the correctly rounded reference result
298 float *s = (float *)gIn;
299 float *s2 = (float *)gIn2;
300
301 if (threadCount > 1)
302 {
303 ComputeReferenceInfoF cri;
304 cri.x = s;
305 cri.y = s2;
306 cri.r = (float *)gOut_Ref;
307 cri.i = (int *)gOut_Ref2;
308 cri.f_ffpI = f->func.f_ffpI;
309 cri.lim = BUFFER_SIZE / sizeof(float);
310 cri.count = (cri.lim + threadCount - 1) / threadCount;
311 ThreadPool_Do(ReferenceF, threadCount, &cri);
312 }
313 else
314 {
315 float *r = (float *)gOut_Ref;
316 int *r2 = (int *)gOut_Ref2;
317 for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
318 r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
319 }
320
321 // Read the data back
322 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
323 {
324 if ((error =
325 clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
326 BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
327 {
328 vlog_error("ReadArray failed %d\n", error);
329 goto exit;
330 }
331 if ((error =
332 clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
333 BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
334 {
335 vlog_error("ReadArray2 failed %d\n", error);
336 goto exit;
337 }
338 }
339
340 if (gSkipCorrectnessTesting) break;
341
342 // Verify data
343 uint32_t *t = (uint32_t *)gOut_Ref;
344 int32_t *t2 = (int32_t *)gOut_Ref2;
345 for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
346 {
347 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
348 {
349 uint32_t *q = (uint32_t *)(gOut[k]);
350 int32_t *q2 = (int32_t *)gOut2[k];
351
352 // Check for exact match to correctly rounded result
353 if (t[j] == q[j] && t2[j] == q2[j]) continue;
354
355 // Check for paired NaNs
356 if ((t[j] & 0x7fffffff) > 0x7f800000
357 && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
358 continue;
359
360 float test = ((float *)q)[j];
361 int correct2 = INT_MIN;
362 double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
363 float err = Ulp_Error(test, correct);
364 int64_t iErr;
365
366 // in case of remquo, we only care about the sign and last
367 // seven bits of integer as per the spec.
368 if (testingRemquo)
369 iErr = (long long)(q2[j] & 0x0000007f)
370 - (long long)(correct2 & 0x0000007f);
371 else
372 iErr = (long long)q2[j] - (long long)correct2;
373
374 // For remquo, if y = 0, x is infinite, or either is NaN
375 // then the standard either neglects to say what is returned
376 // in iptr or leaves it undefined or implementation defined.
377 int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
378 || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j])
379 || isnan(((float *)gIn)[j]);
380 if (iptrUndefined) iErr = 0;
381
382 int fail = !(fabsf(err) <= float_ulps && iErr == 0);
383 if ((ftz || relaxedMode) && fail)
384 {
385 // retry per section 6.5.3.2
386 if (IsFloatResultSubnormal(correct, float_ulps))
387 {
388 fail = fail && !(test == 0.0f && iErr == 0);
389 if (!fail) err = 0.0f;
390 }
391
392 // retry per section 6.5.3.3
393 if (IsFloatSubnormal(s[j]))
394 {
395 int correct3i, correct4i;
396 double correct3 =
397 f->func.f_ffpI(0.0, s2[j], &correct3i);
398 double correct4 =
399 f->func.f_ffpI(-0.0, s2[j], &correct4i);
400 float err2 = Ulp_Error(test, correct3);
401 float err3 = Ulp_Error(test, correct4);
402 int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
403 int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
404 fail = fail
405 && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
406 && (!(fabsf(err3) <= float_ulps
407 && iErr4 == 0)));
408 if (fabsf(err2) < fabsf(err)) err = err2;
409 if (fabsf(err3) < fabsf(err)) err = err3;
410 if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
411 if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
412
413 // retry per section 6.5.3.4
414 if (IsFloatResultSubnormal(correct2, float_ulps)
415 || IsFloatResultSubnormal(correct3, float_ulps))
416 {
417 fail = fail
418 && !(test == 0.0f
419 && (iErr3 == 0 || iErr4 == 0));
420 if (!fail) err = 0.0f;
421 }
422
423 // try with both args as zero
424 if (IsFloatSubnormal(s2[j]))
425 {
426 int correct7i, correct8i;
427 correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
428 correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
429 double correct7 =
430 f->func.f_ffpI(0.0, -0.0, &correct7i);
431 double correct8 =
432 f->func.f_ffpI(-0.0, -0.0, &correct8i);
433 err2 = Ulp_Error(test, correct3);
434 err3 = Ulp_Error(test, correct4);
435 float err4 = Ulp_Error(test, correct7);
436 float err5 = Ulp_Error(test, correct8);
437 iErr3 = (long long)q2[j] - (long long)correct3i;
438 iErr4 = (long long)q2[j] - (long long)correct4i;
439 int64_t iErr7 =
440 (long long)q2[j] - (long long)correct7i;
441 int64_t iErr8 =
442 (long long)q2[j] - (long long)correct8i;
443 fail = fail
444 && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
445 && (!(fabsf(err3) <= float_ulps
446 && iErr4 == 0))
447 && (!(fabsf(err4) <= float_ulps
448 && iErr7 == 0))
449 && (!(fabsf(err5) <= float_ulps
450 && iErr8 == 0)));
451 if (fabsf(err2) < fabsf(err)) err = err2;
452 if (fabsf(err3) < fabsf(err)) err = err3;
453 if (fabsf(err4) < fabsf(err)) err = err4;
454 if (fabsf(err5) < fabsf(err)) err = err5;
455 if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
456 if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
457 if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
458 if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
459
460 // retry per section 6.5.3.4
461 if (IsFloatResultSubnormal(correct3, float_ulps)
462 || IsFloatResultSubnormal(correct4, float_ulps)
463 || IsFloatResultSubnormal(correct7, float_ulps)
464 || IsFloatResultSubnormal(correct8, float_ulps))
465 {
466 fail = fail
467 && !(test == 0.0f
468 && (iErr3 == 0 || iErr4 == 0
469 || iErr7 == 0 || iErr8 == 0));
470 if (!fail) err = 0.0f;
471 }
472 }
473 }
474 else if (IsFloatSubnormal(s2[j]))
475 {
476 int correct3i, correct4i;
477 double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i);
478 double correct4 =
479 f->func.f_ffpI(s[j], -0.0, &correct4i);
480 float err2 = Ulp_Error(test, correct3);
481 float err3 = Ulp_Error(test, correct4);
482 int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
483 int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
484 fail = fail
485 && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
486 && (!(fabsf(err3) <= float_ulps
487 && iErr4 == 0)));
488 if (fabsf(err2) < fabsf(err)) err = err2;
489 if (fabsf(err3) < fabsf(err)) err = err3;
490 if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
491 if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
492
493 // retry per section 6.5.3.4
494 if (IsFloatResultSubnormal(correct2, float_ulps)
495 || IsFloatResultSubnormal(correct3, float_ulps))
496 {
497 fail = fail
498 && !(test == 0.0f
499 && (iErr3 == 0 || iErr4 == 0));
500 if (!fail) err = 0.0f;
501 }
502 }
503 }
504 if (fabsf(err) > maxError)
505 {
506 maxError = fabsf(err);
507 maxErrorVal = s[j];
508 }
509 if (llabs(iErr) > maxError2)
510 {
511 maxError2 = llabs(iErr);
512 maxErrorVal2 = s[j];
513 }
514
515 if (fail)
516 {
517 vlog_error("\nERROR: %s%s: {%f, %" PRId64
518 "} ulp error at {%a, %a} "
519 "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
520 "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
521 f->name, sizeNames[k], err, iErr,
522 ((float *)gIn)[j], ((float *)gIn2)[j],
523 ((cl_uint *)gIn)[j], ((cl_uint *)gIn2)[j],
524 ((float *)gOut_Ref)[j], ((int *)gOut_Ref2)[j],
525 ((cl_uint *)gOut_Ref)[j],
526 ((cl_uint *)gOut_Ref2)[j], test, q2[j],
527 ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
528 error = -1;
529 goto exit;
530 }
531 }
532 }
533
534 if (0 == (i & 0x0fffffff))
535 {
536 if (gVerboseBruteForce)
537 {
538 vlog("base:%14" PRIu64 " step:%10" PRIu64
539 " bufferSize:%10d \n",
540 i, step, BUFFER_SIZE);
541 }
542 else
543 {
544 vlog(".");
545 }
546 fflush(stdout);
547 }
548 }
549
550 if (!gSkipCorrectnessTesting)
551 {
552 if (gWimpyMode)
553 vlog("Wimp pass");
554 else
555 vlog("passed");
556
557 vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
558 maxErrorVal, maxErrorVal2);
559 }
560
561 vlog("\n");
562
563 exit:
564 // Release
565 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
566 {
567 clReleaseKernel(kernels[k]);
568 }
569
570 return error;
571 }
572