1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <climits>
23 #include <cstring>
24
25 namespace {
26
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)27 int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
28 cl_kernel *k, cl_program *p, bool relaxedMode)
29 {
30 const char *c[] = { "__kernel void math_kernel",
31 sizeNames[vectorSize],
32 "( __global float",
33 sizeNames[vectorSize],
34 "* out, __global float",
35 sizeNames[vectorSize],
36 "* in1, __global int",
37 sizeNames[vectorSize],
38 "* in2 )\n"
39 "{\n"
40 " size_t i = get_global_id(0);\n"
41 " out[i] = ",
42 name,
43 "( in1[i], in2[i] );\n"
44 "}\n" };
45
46 const char *c3[] = {
47 "__kernel void math_kernel",
48 sizeNames[vectorSize],
49 "( __global float* out, __global float* in, __global int* in2)\n"
50 "{\n"
51 " size_t i = get_global_id(0);\n"
52 " if( i + 1 < get_global_size(0) )\n"
53 " {\n"
54 " float3 f0 = vload3( 0, in + 3 * i );\n"
55 " int3 i0 = vload3( 0, in2 + 3 * i );\n"
56 " f0 = ",
57 name,
58 "( f0, i0 );\n"
59 " vstore3( f0, 0, out + 3*i );\n"
60 " }\n"
61 " else\n"
62 " {\n"
63 " size_t parity = i & 1; // Figure out how many elements are "
64 "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
65 "buffer size \n"
66 " float3 f0;\n"
67 " int3 i0;\n"
68 " switch( parity )\n"
69 " {\n"
70 " case 1:\n"
71 " f0 = (float3)( in[3*i], NAN, NAN ); \n"
72 " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
73 " break;\n"
74 " case 0:\n"
75 " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
76 " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
77 " break;\n"
78 " }\n"
79 " f0 = ",
80 name,
81 "( f0, i0 );\n"
82 " switch( parity )\n"
83 " {\n"
84 " case 0:\n"
85 " out[3*i+1] = f0.y; \n"
86 " // fall through\n"
87 " case 1:\n"
88 " out[3*i] = f0.x; \n"
89 " break;\n"
90 " }\n"
91 " }\n"
92 "}\n"
93 };
94
95 const char **kern = c;
96 size_t kernSize = sizeof(c) / sizeof(c[0]);
97
98 if (sizeValues[vectorSize] == 3)
99 {
100 kern = c3;
101 kernSize = sizeof(c3) / sizeof(c3[0]);
102 }
103
104 char testName[32];
105 snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
106 sizeNames[vectorSize]);
107
108 return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
109 relaxedMode);
110 }
111
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)112 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
113 {
114 BuildKernelInfo *info = (BuildKernelInfo *)p;
115 cl_uint vectorSize = gMinVectorSizeIndex + job_id;
116 return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
117 info->kernels[vectorSize].data(),
118 &(info->programs[vectorSize]), info->relaxedMode);
119 }
120
121 // Thread specific data for a worker thread
122 struct ThreadInfo
123 {
124 // Input and output buffers for the thread
125 clMemWrapper inBuf;
126 clMemWrapper inBuf2;
127 Buffers outBuf;
128
129 float maxError; // max error value. Init to 0.
130 double
131 maxErrorValue; // position of the max error value (param 1). Init to 0.
132 cl_int maxErrorValue2; // position of the max error value (param 2). Init
133 // to 0.
134 MTdataHolder d;
135
136 // Per thread command queue to improve performance
137 clCommandQueueWrapper tQueue;
138 };
139
140 struct TestInfo
141 {
142 size_t subBufferSize; // Size of the sub-buffer in elements
143 const Func *f; // A pointer to the function info
144
145 // Programs for various vector sizes.
146 Programs programs;
147
148 // Thread-specific kernels for each vector size:
149 // k[vector_size][thread_id]
150 KernelMatrix k;
151
152 // Array of thread specific information
153 std::vector<ThreadInfo> tinfo;
154
155 cl_uint threadCount; // Number of worker threads
156 cl_uint jobCount; // Number of jobs
157 cl_uint step; // step between each chunk and the next.
158 cl_uint scale; // stride between individual test values
159 float ulps; // max_allowed ulps
160 int ftz; // non-zero if running in flush to zero mode
161 bool relaxedMode; // True if test is running in relaxed mode, false
162 // otherwise.
163 // no special values
164 };
165
166 // A table of more difficult cases to get right
167 const float specialValues[] = {
168 -NAN,
169 -INFINITY,
170 -FLT_MAX,
171 MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
172 MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
173 MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
174 MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
175 MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
176 MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
177 MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
178 MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
179 MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
180 MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
181 MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
182 MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
183 -1000.f,
184 -100.f,
185 -4.0f,
186 -3.5f,
187 -3.0f,
188 MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
189 -2.5f,
190 MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
191 -2.0f,
192 MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
193 -1.5f,
194 MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
195 MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
196 -1.0f,
197 MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
198 MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
199 -0.5f,
200 MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
201 MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
202 -0.25f,
203 MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
204 MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
205 -FLT_MIN,
206 MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
207 MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
208 MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
209 MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
210 MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
211 MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
212 MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
213 MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
214 MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
215 MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
216 -0.0f,
217
218 +NAN,
219 +INFINITY,
220 +FLT_MAX,
221 MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
222 MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
223 MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
224 MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
225 MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
226 MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
227 MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
228 MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
229 MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
230 MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
231 MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
232 MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
233 +1000.f,
234 +100.f,
235 +4.0f,
236 +3.5f,
237 +3.0f,
238 MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
239 2.5f,
240 MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
241 +2.0f,
242 MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
243 1.5f,
244 MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
245 MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
246 +1.0f,
247 MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
248 MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
249 +0.5f,
250 MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
251 MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
252 +0.25f,
253 MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
254 MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
255 +FLT_MIN,
256 MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
257 MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
258 MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
259 MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
260 MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
261 MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
262 MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
263 MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
264 MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
265 MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
266 +0.0f,
267 };
268
269 constexpr size_t specialValuesCount =
270 sizeof(specialValues) / sizeof(specialValues[0]);
271
272 const int specialValuesInt[] = {
273 0, 1, 2, 3, 126, 127,
274 128, 0x02000001, 0x04000001, 1465264071, 1488522147, -1,
275 -2, -3, -126, -127, -128, -0x02000001,
276 -0x04000001, -1465264071, -1488522147,
277 };
278
279 constexpr size_t specialValuesIntCount =
280 sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
281
Test(cl_uint job_id,cl_uint thread_id,void * data)282 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
283 {
284 TestInfo *job = (TestInfo *)data;
285 size_t buffer_elements = job->subBufferSize;
286 size_t buffer_size = buffer_elements * sizeof(cl_float);
287 cl_uint base = job_id * (cl_uint)job->step;
288 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
289 fptr func = job->f->func;
290 int ftz = job->ftz;
291 bool relaxedMode = job->relaxedMode;
292 float ulps = job->ulps;
293 MTdata d = tinfo->d;
294 cl_int error;
295 const char *name = job->f->name;
296 cl_uint *t = 0;
297 cl_float *r = 0;
298 cl_float *s = 0;
299 cl_int *s2 = 0;
300
301 // start the map of the output arrays
302 cl_event e[VECTOR_SIZE_COUNT];
303 cl_uint *out[VECTOR_SIZE_COUNT];
304 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
305 {
306 out[j] = (cl_uint *)clEnqueueMapBuffer(
307 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
308 buffer_size, 0, NULL, e + j, &error);
309 if (error || NULL == out[j])
310 {
311 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
312 error);
313 return error;
314 }
315 }
316
317 // Get that moving
318 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
319
320 // Init input array
321 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
322 cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
323 size_t idx = 0;
324 int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
325 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
326
327 if (job_id <= (cl_uint)lastSpecialJobIndex)
328 { // test edge cases
329 float *fp = (float *)p;
330 cl_int *ip2 = (cl_int *)p2;
331 uint32_t x, y;
332
333 x = (job_id * buffer_elements) % specialValuesCount;
334 y = (job_id * buffer_elements) / specialValuesCount;
335
336 for (; idx < buffer_elements; idx++)
337 {
338 fp[idx] = specialValues[x];
339 ip2[idx] = specialValuesInt[y];
340 ++x;
341 if (x >= specialValuesCount)
342 {
343 x = 0;
344 y++;
345 if (y >= specialValuesIntCount) break;
346 }
347 }
348 }
349
350 // Init any remaining values.
351 for (; idx < buffer_elements; idx++)
352 {
353 p[idx] = genrand_int32(d);
354 p2[idx] = genrand_int32(d);
355 }
356
357 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
358 buffer_size, p, 0, NULL, NULL)))
359 {
360 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
361 goto exit;
362 }
363
364 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
365 buffer_size, p2, 0, NULL, NULL)))
366 {
367 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
368 goto exit;
369 }
370
371 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
372 {
373 // Wait for the map to finish
374 if ((error = clWaitForEvents(1, e + j)))
375 {
376 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
377 goto exit;
378 }
379 if ((error = clReleaseEvent(e[j])))
380 {
381 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
382 goto exit;
383 }
384
385 // Fill the result buffer with garbage, so that old results don't carry
386 // over
387 uint32_t pattern = 0xffffdead;
388 memset_pattern4(out[j], &pattern, buffer_size);
389 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
390 out[j], 0, NULL, NULL)))
391 {
392 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
393 error);
394 goto exit;
395 }
396
397 // run the kernel
398 size_t vectorCount =
399 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
400 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
401 // own copy of the cl_kernel
402 cl_program program = job->programs[j];
403
404 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
405 &tinfo->outBuf[j])))
406 {
407 LogBuildError(program);
408 return error;
409 }
410 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
411 &tinfo->inBuf)))
412 {
413 LogBuildError(program);
414 return error;
415 }
416 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
417 &tinfo->inBuf2)))
418 {
419 LogBuildError(program);
420 return error;
421 }
422
423 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
424 &vectorCount, NULL, 0, NULL, NULL)))
425 {
426 vlog_error("FAILED -- could not execute kernel\n");
427 goto exit;
428 }
429 }
430
431 // Get that moving
432 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
433
434 if (gSkipCorrectnessTesting) return CL_SUCCESS;
435
436 // Calculate the correctly rounded reference result
437 r = (float *)gOut_Ref + thread_id * buffer_elements;
438 s = (float *)gIn + thread_id * buffer_elements;
439 s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
440 for (size_t j = 0; j < buffer_elements; j++)
441 r[j] = (float)func.f_fi(s[j], s2[j]);
442
443 // Read the data back -- no need to wait for the first N-1 buffers but wait
444 // for the last buffer. This is an in order queue.
445 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
446 {
447 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
448 out[j] = (cl_uint *)clEnqueueMapBuffer(
449 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
450 buffer_size, 0, NULL, NULL, &error);
451 if (error || NULL == out[j])
452 {
453 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
454 error);
455 goto exit;
456 }
457 }
458
459 // Verify data
460 t = (cl_uint *)r;
461 for (size_t j = 0; j < buffer_elements; j++)
462 {
463 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
464 {
465 cl_uint *q = out[k];
466
467 // If we aren't getting the correctly rounded result
468 if (t[j] != q[j])
469 {
470 float test = ((float *)q)[j];
471 double correct = func.f_fi(s[j], s2[j]);
472 float err = Ulp_Error(test, correct);
473 int fail = !(fabsf(err) <= ulps);
474
475 if (fail && (ftz || relaxedMode))
476 {
477 // retry per section 6.5.3.2
478 if (IsFloatResultSubnormal(correct, ulps))
479 {
480 fail = fail && (test != 0.0f);
481 if (!fail) err = 0.0f;
482 }
483
484 // retry per section 6.5.3.3
485 if (IsFloatSubnormal(s[j]))
486 {
487 double correct2, correct3;
488 float err2, err3;
489 correct2 = func.f_fi(0.0, s2[j]);
490 correct3 = func.f_fi(-0.0, s2[j]);
491 err2 = Ulp_Error(test, correct2);
492 err3 = Ulp_Error(test, correct3);
493 fail = fail
494 && ((!(fabsf(err2) <= ulps))
495 && (!(fabsf(err3) <= ulps)));
496 if (fabsf(err2) < fabsf(err)) err = err2;
497 if (fabsf(err3) < fabsf(err)) err = err3;
498
499 // retry per section 6.5.3.4
500 if (IsFloatResultSubnormal(correct2, ulps)
501 || IsFloatResultSubnormal(correct3, ulps))
502 {
503 fail = fail && (test != 0.0f);
504 if (!fail) err = 0.0f;
505 }
506 }
507 }
508
509 if (fabsf(err) > tinfo->maxError)
510 {
511 tinfo->maxError = fabsf(err);
512 tinfo->maxErrorValue = s[j];
513 tinfo->maxErrorValue2 = s2[j];
514 }
515 if (fail)
516 {
517 vlog_error(
518 "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
519 "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %zu\n",
520 name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
521 s2[j], r[j], ((uint32_t *)r)[j], test,
522 ((cl_uint *)&test)[0], j);
523 error = -1;
524 goto exit;
525 }
526 }
527 }
528 }
529
530 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
531 {
532 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
533 out[j], 0, NULL, NULL)))
534 {
535 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
536 j, error);
537 return error;
538 }
539 }
540
541 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
542
543
544 if (0 == (base & 0x0fffffff))
545 {
546 if (gVerboseBruteForce)
547 {
548 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
549 "ThreadCount:%2u\n",
550 base, job->step, job->scale, buffer_elements, job->ulps,
551 job->threadCount);
552 }
553 else
554 {
555 vlog(".");
556 }
557 fflush(stdout);
558 }
559
560 exit:
561 return error;
562 }
563
564 } // anonymous namespace
565
TestFunc_Float_Float_Int(const Func * f,MTdata d,bool relaxedMode)566 int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
567 {
568 TestInfo test_info{};
569 cl_int error;
570 float maxError = 0.0f;
571 double maxErrorVal = 0.0;
572 cl_int maxErrorVal2 = 0;
573
574 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
575
576 // Init test_info
577 test_info.threadCount = GetThreadCount();
578 test_info.subBufferSize = BUFFER_SIZE
579 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
580 test_info.scale = getTestScale(sizeof(cl_float));
581
582 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
583 if (test_info.step / test_info.subBufferSize != test_info.scale)
584 {
585 // there was overflow
586 test_info.jobCount = 1;
587 }
588 else
589 {
590 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
591 }
592
593 test_info.f = f;
594 test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
595 test_info.ftz =
596 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
597 test_info.relaxedMode = relaxedMode;
598
599 // cl_kernels aren't thread safe, so we make one for each vector size for
600 // every thread
601 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
602 {
603 test_info.k[i].resize(test_info.threadCount, nullptr);
604 }
605
606 test_info.tinfo.resize(test_info.threadCount);
607 for (cl_uint i = 0; i < test_info.threadCount; i++)
608 {
609 cl_buffer_region region = {
610 i * test_info.subBufferSize * sizeof(cl_float),
611 test_info.subBufferSize * sizeof(cl_float)
612 };
613 test_info.tinfo[i].inBuf =
614 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
615 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
616 if (error || NULL == test_info.tinfo[i].inBuf)
617 {
618 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
619 "region {%zd, %zd}\n",
620 region.origin, region.size);
621 goto exit;
622 }
623 cl_buffer_region region2 = { i * test_info.subBufferSize
624 * sizeof(cl_int),
625 test_info.subBufferSize * sizeof(cl_int) };
626 test_info.tinfo[i].inBuf2 =
627 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
628 CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error);
629 if (error || NULL == test_info.tinfo[i].inBuf2)
630 {
631 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
632 "region {%zd, %zd}\n",
633 region.origin, region.size);
634 goto exit;
635 }
636
637 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
638 {
639 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
640 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
641 ®ion, &error);
642 if (error || NULL == test_info.tinfo[i].outBuf[j])
643 {
644 vlog_error("Error: Unable to create sub-buffer of "
645 "gOutBuffer[%d] for region {%zd, %zd}\n",
646 (int)j, region.origin, region.size);
647 goto exit;
648 }
649 }
650 test_info.tinfo[i].tQueue =
651 clCreateCommandQueue(gContext, gDevice, 0, &error);
652 if (NULL == test_info.tinfo[i].tQueue || error)
653 {
654 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
655 goto exit;
656 }
657
658 test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
659 }
660
661 // Init the kernels
662 {
663 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
664 test_info.programs, f->nameInCode,
665 relaxedMode };
666 if ((error = ThreadPool_Do(BuildKernelFn,
667 gMaxVectorSizeIndex - gMinVectorSizeIndex,
668 &build_info)))
669 goto exit;
670 }
671
672 // Run the kernels
673 if (!gSkipCorrectnessTesting)
674 {
675 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
676
677 // Accumulate the arithmetic errors
678 for (cl_uint i = 0; i < test_info.threadCount; i++)
679 {
680 if (test_info.tinfo[i].maxError > maxError)
681 {
682 maxError = test_info.tinfo[i].maxError;
683 maxErrorVal = test_info.tinfo[i].maxErrorValue;
684 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
685 }
686 }
687
688 if (error) goto exit;
689
690 if (gWimpyMode)
691 vlog("Wimp pass");
692 else
693 vlog("passed");
694
695 vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
696 }
697
698 vlog("\n");
699
700 exit:
701 // Release
702 for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
703 {
704 for (auto &kernel : test_info.k[i])
705 {
706 clReleaseKernel(kernel);
707 }
708 }
709
710 return error;
711 }
712