• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
BuildKernel(const char * name,int vectorSize,cl_uint kernel_count,cl_kernel * k,cl_program * p,bool relaxedMode)26 int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
27                 cl_kernel *k, cl_program *p, bool relaxedMode)
28 {
29     const char *c[] = { "__kernel void math_kernel",
30                         sizeNames[vectorSize],
31                         "( __global float",
32                         sizeNames[vectorSize],
33                         "* out, __global float",
34                         sizeNames[vectorSize],
35                         "* in )\n"
36                         "{\n"
37                         "   size_t i = get_global_id(0);\n"
38                         "   out[i] = ",
39                         name,
40                         "( in[i] );\n"
41                         "}\n" };
42 
43     const char *c3[] = {
44         "__kernel void math_kernel",
45         sizeNames[vectorSize],
46         "( __global float* out, __global float* in)\n"
47         "{\n"
48         "   size_t i = get_global_id(0);\n"
49         "   if( i + 1 < get_global_size(0) )\n"
50         "   {\n"
51         "       float3 f0 = vload3( 0, in + 3 * i );\n"
52         "       f0 = ",
53         name,
54         "( f0 );\n"
55         "       vstore3( f0, 0, out + 3*i );\n"
56         "   }\n"
57         "   else\n"
58         "   {\n"
59         "       size_t parity = i & 1;   // Figure out how many elements are "
60         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
61         "buffer size \n"
62         "       float3 f0;\n"
63         "       switch( parity )\n"
64         "       {\n"
65         "           case 1:\n"
66         "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
67         "               break;\n"
68         "           case 0:\n"
69         "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
70         "               break;\n"
71         "       }\n"
72         "       f0 = ",
73         name,
74         "( f0 );\n"
75         "       switch( parity )\n"
76         "       {\n"
77         "           case 0:\n"
78         "               out[3*i+1] = f0.y; \n"
79         "               // fall through\n"
80         "           case 1:\n"
81         "               out[3*i] = f0.x; \n"
82         "               break;\n"
83         "       }\n"
84         "   }\n"
85         "}\n"
86     };
87 
88     const char **kern = c;
89     size_t kernSize = sizeof(c) / sizeof(c[0]);
90 
91     if (sizeValues[vectorSize] == 3)
92     {
93         kern = c3;
94         kernSize = sizeof(c3) / sizeof(c3[0]);
95     }
96 
97     char testName[32];
98     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
99              sizeNames[vectorSize]);
100 
101     return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
102                        relaxedMode);
103 }
104 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)105 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
106 {
107     BuildKernelInfo *info = (BuildKernelInfo *)p;
108     cl_uint vectorSize = gMinVectorSizeIndex + job_id;
109     return BuildKernel(info->nameInCode, vectorSize, info->threadCount,
110                        info->kernels[vectorSize].data(),
111                        &(info->programs[vectorSize]), info->relaxedMode);
112 }
113 
114 // Thread specific data for a worker thread
115 struct ThreadInfo
116 {
117     // Input and output buffers for the thread
118     clMemWrapper inBuf;
119     Buffers outBuf;
120 
121     float maxError; // max error value. Init to 0.
122     double maxErrorValue; // position of the max error value.  Init to 0.
123 
124     // Per thread command queue to improve performance
125     clCommandQueueWrapper tQueue;
126 };
127 
128 struct TestInfo
129 {
130     size_t subBufferSize; // Size of the sub-buffer in elements
131     const Func *f; // A pointer to the function info
132 
133     // Programs for various vector sizes.
134     Programs programs;
135 
136     // Thread-specific kernels for each vector size:
137     // k[vector_size][thread_id]
138     KernelMatrix k;
139 
140     // Array of thread specific information
141     std::vector<ThreadInfo> tinfo;
142 
143     cl_uint threadCount; // Number of worker threads
144     cl_uint jobCount; // Number of jobs
145     cl_uint step; // step between each chunk and the next.
146     cl_uint scale; // stride between individual test values
147     float ulps; // max_allowed ulps
148     int ftz; // non-zero if running in flush to zero mode
149 
150     int isRangeLimited; // 1 if the function is only to be evaluated over a
151                         // range
152     float half_sin_cos_tan_limit;
153     bool relaxedMode; // True if test is running in relaxed mode, false
154                       // otherwise.
155 };
156 
Test(cl_uint job_id,cl_uint thread_id,void * data)157 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
158 {
159     TestInfo *job = (TestInfo *)data;
160     size_t buffer_elements = job->subBufferSize;
161     size_t buffer_size = buffer_elements * sizeof(cl_float);
162     cl_uint scale = job->scale;
163     cl_uint base = job_id * (cl_uint)job->step;
164     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
165     fptr func = job->f->func;
166     const char *fname = job->f->name;
167     bool relaxedMode = job->relaxedMode;
168     float ulps = getAllowedUlpError(job->f, relaxedMode);
169     if (relaxedMode)
170     {
171         func = job->f->rfunc;
172     }
173 
174     cl_int error;
175 
176     int isRangeLimited = job->isRangeLimited;
177     float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit;
178     int ftz = job->ftz;
179 
180     // start the map of the output arrays
181     cl_event e[VECTOR_SIZE_COUNT];
182     cl_uint *out[VECTOR_SIZE_COUNT];
183     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
184     {
185         out[j] = (cl_uint *)clEnqueueMapBuffer(
186             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
187             buffer_size, 0, NULL, e + j, &error);
188         if (error || NULL == out[j])
189         {
190             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
191                        error);
192             return error;
193         }
194     }
195 
196     // Get that moving
197     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
198 
199     // Write the new values to the input array
200     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
201     for (size_t j = 0; j < buffer_elements; j++)
202     {
203         p[j] = base + j * scale;
204         if (relaxedMode)
205         {
206             float p_j = *(float *)&p[j];
207             if (strcmp(fname, "sin") == 0
208                 || strcmp(fname, "cos")
209                     == 0) // the domain of the function is [-pi,pi]
210             {
211                 if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN;
212             }
213 
214             if (strcmp(fname, "reciprocal") == 0)
215             {
216                 const float l_limit = HEX_FLT(+, 1, 0, -, 126);
217                 const float u_limit = HEX_FLT(+, 1, 0, +, 126);
218 
219                 if (fabs(p_j) < l_limit
220                     || fabs(p_j) > u_limit) // the domain of the function is
221                                             // [2^-126,2^126]
222                     ((float *)p)[j] = NAN;
223             }
224         }
225     }
226 
227     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
228                                       buffer_size, p, 0, NULL, NULL)))
229     {
230         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
231         return error;
232     }
233 
234     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
235     {
236         // Wait for the map to finish
237         if ((error = clWaitForEvents(1, e + j)))
238         {
239             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
240             return error;
241         }
242         if ((error = clReleaseEvent(e[j])))
243         {
244             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
245             return error;
246         }
247 
248         // Fill the result buffer with garbage, so that old results don't carry
249         // over
250         uint32_t pattern = 0xffffdead;
251         memset_pattern4(out[j], &pattern, buffer_size);
252         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
253                                              out[j], 0, NULL, NULL)))
254         {
255             vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
256                        error);
257             return error;
258         }
259 
260         // run the kernel
261         size_t vectorCount =
262             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
263         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
264                                                  // own copy of the cl_kernel
265         cl_program program = job->programs[j];
266 
267         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
268                                     &tinfo->outBuf[j])))
269         {
270             LogBuildError(program);
271             return error;
272         }
273         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
274                                     &tinfo->inBuf)))
275         {
276             LogBuildError(program);
277             return error;
278         }
279 
280         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
281                                             &vectorCount, NULL, 0, NULL, NULL)))
282         {
283             vlog_error("FAILED -- could not execute kernel\n");
284             return error;
285         }
286     }
287 
288     // Get that moving
289     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
290 
291     if (gSkipCorrectnessTesting) return CL_SUCCESS;
292 
293     // Calculate the correctly rounded reference result
294     float *r = (float *)gOut_Ref + thread_id * buffer_elements;
295     float *s = (float *)p;
296     for (size_t j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);
297 
298     // Read the data back -- no need to wait for the first N-1 buffers but wait
299     // for the last buffer. This is an in order queue.
300     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
301     {
302         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
303         out[j] = (cl_uint *)clEnqueueMapBuffer(
304             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
305             buffer_size, 0, NULL, NULL, &error);
306         if (error || NULL == out[j])
307         {
308             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
309                        error);
310             return error;
311         }
312     }
313 
314     // Verify data
315     uint32_t *t = (uint32_t *)r;
316     for (size_t j = 0; j < buffer_elements; j++)
317     {
318         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
319         {
320             uint32_t *q = out[k];
321 
322             // If we aren't getting the correctly rounded result
323             if (t[j] != q[j])
324             {
325                 float test = ((float *)q)[j];
326                 double correct = func.f_f(s[j]);
327                 float err = Ulp_Error(test, correct);
328                 float abs_error = Abs_Error(test, correct);
329                 int fail = 0;
330                 int use_abs_error = 0;
331 
332                 // it is possible for the output to not match the reference
333                 // result but for Ulp_Error to be zero, for example -1.#QNAN
334                 // vs. 1.#QNAN. In such cases there is no failure
335                 if (err == 0.0f)
336                 {
337                     fail = 0;
338                 }
339                 else if (relaxedMode)
340                 {
341                     if (strcmp(fname, "sin") == 0 || strcmp(fname, "cos") == 0)
342                     {
343                         fail = !(fabsf(abs_error) <= ulps);
344                         use_abs_error = 1;
345                     }
346                     if (strcmp(fname, "sinpi") == 0
347                         || strcmp(fname, "cospi") == 0)
348                     {
349                         if (s[j] >= -1.0 && s[j] <= 1.0)
350                         {
351                             fail = !(fabsf(abs_error) <= ulps);
352                             use_abs_error = 1;
353                         }
354                     }
355 
356                     if (strcmp(fname, "reciprocal") == 0)
357                     {
358                         fail = !(fabsf(err) <= ulps);
359                     }
360 
361                     if (strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0)
362                     {
363                         float exp_error = ulps;
364 
365                         if (!gIsEmbedded)
366                         {
367                             exp_error += floor(fabs(2 * s[j]));
368                         }
369 
370                         fail = !(fabsf(err) <= exp_error);
371                         ulps = exp_error;
372                     }
373                     if (strcmp(fname, "tan") == 0)
374                     {
375 
376                         if (!gFastRelaxedDerived)
377                         {
378                             fail = !(fabsf(err) <= ulps);
379                         }
380                         // Else fast math derived implementation does not
381                         // require ULP verification
382                     }
383                     if (strcmp(fname, "exp10") == 0)
384                     {
385                         if (!gFastRelaxedDerived)
386                         {
387                             fail = !(fabsf(err) <= ulps);
388                         }
389                         // Else fast math derived implementation does not
390                         // require ULP verification
391                     }
392                     if (strcmp(fname, "log") == 0 || strcmp(fname, "log2") == 0
393                         || strcmp(fname, "log10") == 0)
394                     {
395                         if (s[j] >= 0.5 && s[j] <= 2)
396                         {
397                             fail = !(fabsf(abs_error) <= ulps);
398                         }
399                         else
400                         {
401                             ulps = gIsEmbedded ? job->f->float_embedded_ulps
402                                                : job->f->float_ulps;
403                             fail = !(fabsf(err) <= ulps);
404                         }
405                     }
406 
407 
408                     // fast-relaxed implies finite-only
409                     if (IsFloatInfinity(correct) || IsFloatNaN(correct)
410                         || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
411                     {
412                         fail = 0;
413                         err = 0;
414                     }
415                 }
416                 else
417                 {
418                     fail = !(fabsf(err) <= ulps);
419                 }
420 
421                 // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
422                 if (isRangeLimited
423                     && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
424                     && fabsf(s[j]) < INFINITY)
425                 {
426                     if (fabsf(test) <= half_sin_cos_tan_limit)
427                     {
428                         err = 0;
429                         fail = 0;
430                     }
431                 }
432 
433                 if (fail)
434                 {
435                     if (ftz || relaxedMode)
436                     {
437                         typedef int (*CheckForSubnormal)(
438                             double, float); // If we are in fast relaxed math,
439                                             // we have a different calculation
440                                             // for the subnormal threshold.
441                         CheckForSubnormal isFloatResultSubnormalPtr;
442 
443                         if (relaxedMode)
444                         {
445                             isFloatResultSubnormalPtr =
446                                 &IsFloatResultSubnormalAbsError;
447                         }
448                         else
449                         {
450                             isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
451                         }
452                         // retry per section 6.5.3.2
453                         if ((*isFloatResultSubnormalPtr)(correct, ulps))
454                         {
455                             fail = fail && (test != 0.0f);
456                             if (!fail) err = 0.0f;
457                         }
458 
459                         // retry per section 6.5.3.3
460                         if (IsFloatSubnormal(s[j]))
461                         {
462                             double correct2 = func.f_f(0.0);
463                             double correct3 = func.f_f(-0.0);
464                             float err2;
465                             float err3;
466                             if (use_abs_error)
467                             {
468                                 err2 = Abs_Error(test, correct2);
469                                 err3 = Abs_Error(test, correct3);
470                             }
471                             else
472                             {
473                                 err2 = Ulp_Error(test, correct2);
474                                 err3 = Ulp_Error(test, correct3);
475                             }
476                             fail = fail
477                                 && ((!(fabsf(err2) <= ulps))
478                                     && (!(fabsf(err3) <= ulps)));
479                             if (fabsf(err2) < fabsf(err)) err = err2;
480                             if (fabsf(err3) < fabsf(err)) err = err3;
481 
482                             // retry per section 6.5.3.4
483                             if ((*isFloatResultSubnormalPtr)(correct2, ulps)
484                                 || (*isFloatResultSubnormalPtr)(correct3, ulps))
485                             {
486                                 fail = fail && (test != 0.0f);
487                                 if (!fail) err = 0.0f;
488                             }
489                         }
490                     }
491                 }
492                 if (fabsf(err) > tinfo->maxError)
493                 {
494                     tinfo->maxError = fabsf(err);
495                     tinfo->maxErrorValue = s[j];
496                 }
497                 if (fail)
498                 {
499                     vlog_error("\nERROR: %s%s: %f ulp error at %a (0x%8.8x): "
500                                "*%a vs. %a\n",
501                                job->f->name, sizeNames[k], err, ((float *)s)[j],
502                                ((uint32_t *)s)[j], ((float *)t)[j], test);
503                     return -1;
504                 }
505             }
506         }
507     }
508 
509     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
510     {
511         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
512                                              out[j], 0, NULL, NULL)))
513         {
514             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
515                        j, error);
516             return error;
517         }
518     }
519 
520     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
521 
522 
523     if (0 == (base & 0x0fffffff))
524     {
525         if (gVerboseBruteForce)
526         {
527             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
528                  "ThreadCount:%2u\n",
529                  base, job->step, job->scale, buffer_elements, job->ulps,
530                  job->threadCount);
531         }
532         else
533         {
534             vlog(".");
535         }
536         fflush(stdout);
537     }
538 
539     return CL_SUCCESS;
540 }
541 
542 } // anonymous namespace
543 
TestFunc_Float_Float(const Func * f,MTdata d,bool relaxedMode)544 int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
545 {
546     TestInfo test_info{};
547     cl_int error;
548     float maxError = 0.0f;
549     double maxErrorVal = 0.0;
550     int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
551 
552     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
553 
554     // Init test_info
555     test_info.threadCount = GetThreadCount();
556     test_info.subBufferSize = BUFFER_SIZE
557         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
558     test_info.scale = getTestScale(sizeof(cl_float));
559 
560     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
561     if (test_info.step / test_info.subBufferSize != test_info.scale)
562     {
563         // there was overflow
564         test_info.jobCount = 1;
565     }
566     else
567     {
568         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
569     }
570 
571     test_info.f = f;
572     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
573     test_info.ftz =
574         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
575     test_info.relaxedMode = relaxedMode;
576     // cl_kernels aren't thread safe, so we make one for each vector size for
577     // every thread
578     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
579     {
580         test_info.k[i].resize(test_info.threadCount, nullptr);
581     }
582 
583     test_info.tinfo.resize(test_info.threadCount);
584     for (cl_uint i = 0; i < test_info.threadCount; i++)
585     {
586         cl_buffer_region region = {
587             i * test_info.subBufferSize * sizeof(cl_float),
588             test_info.subBufferSize * sizeof(cl_float)
589         };
590         test_info.tinfo[i].inBuf =
591             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
592                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
593         if (error || NULL == test_info.tinfo[i].inBuf)
594         {
595             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
596                        "region {%zd, %zd}\n",
597                        region.origin, region.size);
598             goto exit;
599         }
600 
601         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
602         {
603             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
604                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
605                 &region, &error);
606             if (error || NULL == test_info.tinfo[i].outBuf[j])
607             {
608                 vlog_error("Error: Unable to create sub-buffer of "
609                            "gOutBuffer[%d] for region {%zd, %zd}\n",
610                            (int)j, region.origin, region.size);
611                 goto exit;
612             }
613         }
614         test_info.tinfo[i].tQueue =
615             clCreateCommandQueue(gContext, gDevice, 0, &error);
616         if (NULL == test_info.tinfo[i].tQueue || error)
617         {
618             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
619             goto exit;
620         }
621     }
622 
623     // Check for special cases for unary float
624     test_info.isRangeLimited = 0;
625     test_info.half_sin_cos_tan_limit = 0;
626     if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
627     {
628         test_info.isRangeLimited = 1;
629         test_info.half_sin_cos_tan_limit = 1.0f
630             + test_info.ulps
631                 * (FLT_EPSILON / 2.0f); // out of range results from finite
632                                         // inputs must be in [-1,1]
633     }
634     else if (0 == strcmp(f->name, "half_tan"))
635     {
636         test_info.isRangeLimited = 1;
637         test_info.half_sin_cos_tan_limit =
638             INFINITY; // out of range resut from finite inputs must be numeric
639     }
640 
641     // Init the kernels
642     {
643         BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
644                                     test_info.programs, f->nameInCode,
645                                     relaxedMode };
646         if ((error = ThreadPool_Do(BuildKernelFn,
647                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
648                                    &build_info)))
649             goto exit;
650     }
651 
652     // Run the kernels
653     if (!gSkipCorrectnessTesting || skipTestingRelaxed)
654     {
655         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
656 
657         // Accumulate the arithmetic errors
658         for (cl_uint i = 0; i < test_info.threadCount; i++)
659         {
660             if (test_info.tinfo[i].maxError > maxError)
661             {
662                 maxError = test_info.tinfo[i].maxError;
663                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
664             }
665         }
666 
667         if (error) goto exit;
668 
669         if (gWimpyMode)
670             vlog("Wimp pass");
671         else
672             vlog("passed");
673 
674         if (skipTestingRelaxed)
675         {
676             vlog(" (rlx skip correctness testing)\n");
677             goto exit;
678         }
679 
680         vlog("\t%8.2f @ %a", maxError, maxErrorVal);
681     }
682 
683     vlog("\n");
684 
685 exit:
686     // Release
687     for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
688     {
689         for (auto &kernel : test_info.k[i])
690         {
691             clReleaseKernel(kernel);
692         }
693     }
694 
695     return error;
696 }
697