• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017,2021 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "imageHelpers.h"
17 #include <limits.h>
18 #include <assert.h>
19 #if defined(__APPLE__)
20 #include <sys/mman.h>
21 #endif
22 #if !defined(_WIN32) && !defined(__APPLE__)
23 #include <malloc.h>
24 #endif
25 #include <algorithm>
26 #include <cinttypes>
27 #include <iterator>
28 #if !defined(_WIN32)
29 #include <cmath>
30 #endif
31 
32 RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;
33 
34 cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
35 bool gTestRounding = false;
sRGBmap(float fc)36 double sRGBmap(float fc)
37 {
38     double c = (double)fc;
39 
40 #if !defined(_WIN32)
41     if (std::isnan(c)) c = 0.0;
42 #else
43     if (_isnan(c)) c = 0.0;
44 #endif
45 
46     if (c > 1.0)
47         c = 1.0;
48     else if (c < 0.0)
49         c = 0.0;
50     else if (c < 0.0031308)
51         c = 12.92 * c;
52     else
53         c = (1055.0 / 1000.0) * pow(c, 5.0 / 12.0) - (55.0 / 1000.0);
54 
55     return c * 255.0;
56 }
57 
sRGBunmap(float fc)58 double sRGBunmap(float fc)
59 {
60     double c = (double)fc;
61     double result;
62 
63     if (c <= 0.04045)
64         result = c / 12.92;
65     else
66         result = pow((c + 0.055) / 1.055, 2.4);
67 
68     return result;
69 }
70 
71 
get_format_type_size(const cl_image_format * format)72 uint32_t get_format_type_size(const cl_image_format *format)
73 {
74     return get_channel_data_type_size(format->image_channel_data_type);
75 }
76 
get_channel_data_type_size(cl_channel_type channelType)77 uint32_t get_channel_data_type_size(cl_channel_type channelType)
78 {
79     switch (channelType)
80     {
81         case CL_SNORM_INT8:
82         case CL_UNORM_INT8:
83         case CL_SIGNED_INT8:
84         case CL_UNSIGNED_INT8: return 1;
85 
86         case CL_SNORM_INT16:
87         case CL_UNORM_INT16:
88         case CL_SIGNED_INT16:
89         case CL_UNSIGNED_INT16:
90         case CL_HALF_FLOAT:
91 #ifdef CL_SFIXED14_APPLE
92         case CL_SFIXED14_APPLE:
93 #endif
94             return sizeof(cl_short);
95 
96         case CL_SIGNED_INT32:
97         case CL_UNSIGNED_INT32: return sizeof(cl_int);
98 
99         case CL_UNORM_SHORT_565:
100         case CL_UNORM_SHORT_555:
101 #ifdef OBSOLETE_FORAMT
102         case CL_UNORM_SHORT_565_REV:
103         case CL_UNORM_SHORT_555_REV:
104 #endif
105             return 2;
106 
107 #ifdef OBSOLETE_FORAMT
108         case CL_UNORM_INT_8888:
109         case CL_UNORM_INT_8888_REV: return 4;
110 #endif
111 
112         case CL_UNORM_INT_101010:
113 #ifdef OBSOLETE_FORAMT
114         case CL_UNORM_INT_101010_REV:
115 #endif
116             return 4;
117 
118         case CL_FLOAT: return sizeof(cl_float);
119 
120         default: return 0;
121     }
122 }
123 
get_format_channel_count(const cl_image_format * format)124 uint32_t get_format_channel_count(const cl_image_format *format)
125 {
126     return get_channel_order_channel_count(format->image_channel_order);
127 }
128 
get_channel_order_channel_count(cl_channel_order order)129 uint32_t get_channel_order_channel_count(cl_channel_order order)
130 {
131     switch (order)
132     {
133         case CL_R:
134         case CL_A:
135         case CL_Rx:
136         case CL_INTENSITY:
137         case CL_LUMINANCE:
138         case CL_DEPTH:
139         case CL_DEPTH_STENCIL: return 1;
140 
141         case CL_RG:
142         case CL_RA:
143         case CL_RGx: return 2;
144 
145         case CL_RGB:
146         case CL_RGBx:
147         case CL_sRGB:
148         case CL_sRGBx: return 3;
149 
150         case CL_RGBA:
151         case CL_ARGB:
152         case CL_BGRA:
153         case CL_sRGBA:
154         case CL_sBGRA:
155         case CL_ABGR:
156 #ifdef CL_1RGB_APPLE
157         case CL_1RGB_APPLE:
158 #endif
159 #ifdef CL_BGR1_APPLE
160         case CL_BGR1_APPLE:
161 #endif
162 #ifdef CL_ABGR_APPLE
163         case CL_ABGR_APPLE:
164 #endif
165             return 4;
166 
167         default:
168             log_error("%s does not support 0x%x\n", __FUNCTION__, order);
169             return 0;
170     }
171 }
172 
get_channel_type_from_name(const char * name)173 cl_channel_type get_channel_type_from_name(const char *name)
174 {
175     struct
176     {
177         cl_channel_type type;
178         const char *name;
179     } typeNames[] = { { CL_SNORM_INT8, "CL_SNORM_INT8" },
180                       { CL_SNORM_INT16, "CL_SNORM_INT16" },
181                       { CL_UNORM_INT8, "CL_UNORM_INT8" },
182                       { CL_UNORM_INT16, "CL_UNORM_INT16" },
183                       { CL_UNORM_INT24, "CL_UNORM_INT24" },
184                       { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" },
185                       { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" },
186                       { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" },
187                       { CL_SIGNED_INT8, "CL_SIGNED_INT8" },
188                       { CL_SIGNED_INT16, "CL_SIGNED_INT16" },
189                       { CL_SIGNED_INT32, "CL_SIGNED_INT32" },
190                       { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" },
191                       { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" },
192                       { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" },
193                       { CL_HALF_FLOAT, "CL_HALF_FLOAT" },
194                       { CL_FLOAT, "CL_FLOAT" },
195 #ifdef CL_SFIXED14_APPLE
196                       { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" }
197 #endif
198     };
199     for (size_t i = 0; i < sizeof(typeNames) / sizeof(typeNames[0]); i++)
200     {
201         if (strcmp(typeNames[i].name, name) == 0
202             || strcmp(typeNames[i].name + 3, name) == 0)
203             return typeNames[i].type;
204     }
205     return (cl_channel_type)-1;
206 }
207 
get_channel_order_from_name(const char * name)208 cl_channel_order get_channel_order_from_name(const char *name)
209 {
210     const struct
211     {
212         cl_channel_order order;
213         const char *name;
214     } orderNames[] = {
215         { CL_R, "CL_R" },
216         { CL_A, "CL_A" },
217         { CL_Rx, "CL_Rx" },
218         { CL_RG, "CL_RG" },
219         { CL_RA, "CL_RA" },
220         { CL_RGx, "CL_RGx" },
221         { CL_RGB, "CL_RGB" },
222         { CL_RGBx, "CL_RGBx" },
223         { CL_RGBA, "CL_RGBA" },
224         { CL_BGRA, "CL_BGRA" },
225         { CL_ARGB, "CL_ARGB" },
226         { CL_INTENSITY, "CL_INTENSITY" },
227         { CL_LUMINANCE, "CL_LUMINANCE" },
228         { CL_DEPTH, "CL_DEPTH" },
229         { CL_DEPTH_STENCIL, "CL_DEPTH_STENCIL" },
230         { CL_sRGB, "CL_sRGB" },
231         { CL_sRGBx, "CL_sRGBx" },
232         { CL_sRGBA, "CL_sRGBA" },
233         { CL_sBGRA, "CL_sBGRA" },
234         { CL_ABGR, "CL_ABGR" },
235 #ifdef CL_1RGB_APPLE
236         { CL_1RGB_APPLE, "CL_1RGB_APPLE" },
237 #endif
238 #ifdef CL_BGR1_APPLE
239         { CL_BGR1_APPLE, "CL_BGR1_APPLE" },
240 #endif
241     };
242 
243     for (size_t i = 0; i < sizeof(orderNames) / sizeof(orderNames[0]); i++)
244     {
245         if (strcmp(orderNames[i].name, name) == 0
246             || strcmp(orderNames[i].name + 3, name) == 0)
247             return orderNames[i].order;
248     }
249     return (cl_channel_order)-1;
250 }
251 
252 
is_format_signed(const cl_image_format * format)253 int is_format_signed(const cl_image_format *format)
254 {
255     switch (format->image_channel_data_type)
256     {
257         case CL_SNORM_INT8:
258         case CL_SIGNED_INT8:
259         case CL_SNORM_INT16:
260         case CL_SIGNED_INT16:
261         case CL_SIGNED_INT32:
262         case CL_HALF_FLOAT:
263         case CL_FLOAT:
264 #ifdef CL_SFIXED14_APPLE
265         case CL_SFIXED14_APPLE:
266 #endif
267             return 1;
268 
269         default: return 0;
270     }
271 }
272 
get_pixel_size(const cl_image_format * format)273 uint32_t get_pixel_size(const cl_image_format *format)
274 {
275     switch (format->image_channel_data_type)
276     {
277         case CL_SNORM_INT8:
278         case CL_UNORM_INT8:
279         case CL_SIGNED_INT8:
280         case CL_UNSIGNED_INT8: return get_format_channel_count(format);
281 
282         case CL_SNORM_INT16:
283         case CL_UNORM_INT16:
284         case CL_SIGNED_INT16:
285         case CL_UNSIGNED_INT16:
286         case CL_HALF_FLOAT:
287 #ifdef CL_SFIXED14_APPLE
288         case CL_SFIXED14_APPLE:
289 #endif
290             return get_format_channel_count(format) * sizeof(cl_ushort);
291 
292         case CL_SIGNED_INT32:
293         case CL_UNSIGNED_INT32:
294             return get_format_channel_count(format) * sizeof(cl_int);
295 
296         case CL_UNORM_SHORT_565:
297         case CL_UNORM_SHORT_555:
298 #ifdef OBSOLETE_FORAMT
299         case CL_UNORM_SHORT_565_REV:
300         case CL_UNORM_SHORT_555_REV:
301 #endif
302             return 2;
303 
304 #ifdef OBSOLETE_FORAMT
305         case CL_UNORM_INT_8888:
306         case CL_UNORM_INT_8888_REV: return 4;
307 #endif
308 
309         case CL_UNORM_INT_101010:
310 #ifdef OBSOLETE_FORAMT
311         case CL_UNORM_INT_101010_REV:
312 #endif
313             return 4;
314 
315         case CL_FLOAT:
316             return get_format_channel_count(format) * sizeof(cl_float);
317 
318         default: return 0;
319     }
320 }
321 
next_power_of_two(uint32_t v)322 uint32_t next_power_of_two(uint32_t v)
323 {
324     v--;
325     v |= v >> 1;
326     v |= v >> 2;
327     v |= v >> 4;
328     v |= v >> 8;
329     v |= v >> 16;
330     v++;
331     return v;
332 }
333 
get_pixel_alignment(const cl_image_format * format)334 uint32_t get_pixel_alignment(const cl_image_format *format)
335 {
336     return next_power_of_two(get_pixel_size(format));
337 }
338 
get_8_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)339 int get_8_bit_image_format(cl_context context, cl_mem_object_type objType,
340                            cl_mem_flags flags, size_t channelCount,
341                            cl_image_format *outFormat)
342 {
343     cl_image_format formatList[128];
344     unsigned int outFormatCount, i;
345     int error;
346 
347 
348     /* Make sure each image format is supported */
349     if ((error = clGetSupportedImageFormats(context, flags, objType, 128,
350                                             formatList, &outFormatCount)))
351         return error;
352 
353 
354     /* Look for one that is an 8-bit format */
355     for (i = 0; i < outFormatCount; i++)
356     {
357         if (formatList[i].image_channel_data_type == CL_SNORM_INT8
358             || formatList[i].image_channel_data_type == CL_UNORM_INT8
359             || formatList[i].image_channel_data_type == CL_SIGNED_INT8
360             || formatList[i].image_channel_data_type == CL_UNSIGNED_INT8)
361         {
362             if (!channelCount
363                 || (channelCount
364                     && (get_format_channel_count(&formatList[i])
365                         == channelCount)))
366             {
367                 *outFormat = formatList[i];
368                 return 0;
369             }
370         }
371     }
372 
373     return -1;
374 }
375 
get_32_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)376 int get_32_bit_image_format(cl_context context, cl_mem_object_type objType,
377                             cl_mem_flags flags, size_t channelCount,
378                             cl_image_format *outFormat)
379 {
380     cl_image_format formatList[128];
381     unsigned int outFormatCount, i;
382     int error;
383 
384 
385     /* Make sure each image format is supported */
386     if ((error = clGetSupportedImageFormats(context, flags, objType, 128,
387                                             formatList, &outFormatCount)))
388         return error;
389 
390     /* Look for one that is an 8-bit format */
391     for (i = 0; i < outFormatCount; i++)
392     {
393         if (formatList[i].image_channel_data_type == CL_UNORM_INT_101010
394             || formatList[i].image_channel_data_type == CL_FLOAT
395             || formatList[i].image_channel_data_type == CL_SIGNED_INT32
396             || formatList[i].image_channel_data_type == CL_UNSIGNED_INT32)
397         {
398             if (!channelCount
399                 || (channelCount
400                     && (get_format_channel_count(&formatList[i])
401                         == channelCount)))
402             {
403                 *outFormat = formatList[i];
404                 return 0;
405             }
406         }
407     }
408 
409     return -1;
410 }
411 
print_first_pixel_difference_error(size_t where,const char * sourcePixel,const char * destPixel,image_descriptor * imageInfo,size_t y,size_t thirdDim)412 void print_first_pixel_difference_error(size_t where, const char *sourcePixel,
413                                         const char *destPixel,
414                                         image_descriptor *imageInfo, size_t y,
415                                         size_t thirdDim)
416 {
417     size_t pixel_size = get_pixel_size(imageInfo->format);
418 
419     log_error("ERROR: Scanline %d did not verify for image size %d,%d,%d "
420               "pitch %d (extra %d bytes)\n",
421               (int)y, (int)imageInfo->width, (int)imageInfo->height,
422               (int)thirdDim, (int)imageInfo->rowPitch,
423               (int)imageInfo->rowPitch
424                   - (int)imageInfo->width * (int)pixel_size);
425     log_error("Failed at column: %zu   ", where);
426 
427     switch (pixel_size)
428     {
429         case 1:
430             log_error("*0x%2.2x vs. 0x%2.2x\n", ((cl_uchar *)sourcePixel)[0],
431                       ((cl_uchar *)destPixel)[0]);
432             break;
433         case 2:
434             log_error("*0x%4.4x vs. 0x%4.4x\n", ((cl_ushort *)sourcePixel)[0],
435                       ((cl_ushort *)destPixel)[0]);
436             break;
437         case 3:
438             log_error("*{0x%2.2x, 0x%2.2x, 0x%2.2x} vs. "
439                       "{0x%2.2x, 0x%2.2x, 0x%2.2x}\n",
440                       ((cl_uchar *)sourcePixel)[0],
441                       ((cl_uchar *)sourcePixel)[1],
442                       ((cl_uchar *)sourcePixel)[2], ((cl_uchar *)destPixel)[0],
443                       ((cl_uchar *)destPixel)[1], ((cl_uchar *)destPixel)[2]);
444             break;
445         case 4:
446             log_error("*0x%8.8x vs. 0x%8.8x\n", ((cl_uint *)sourcePixel)[0],
447                       ((cl_uint *)destPixel)[0]);
448             break;
449         case 6:
450             log_error(
451                 "*{0x%4.4x, 0x%4.4x, 0x%4.4x} vs. "
452                 "{0x%4.4x, 0x%4.4x, 0x%4.4x}\n",
453                 ((cl_ushort *)sourcePixel)[0], ((cl_ushort *)sourcePixel)[1],
454                 ((cl_ushort *)sourcePixel)[2], ((cl_ushort *)destPixel)[0],
455                 ((cl_ushort *)destPixel)[1], ((cl_ushort *)destPixel)[2]);
456             break;
457         case 8:
458             log_error("*0x%16.16" PRIx64 " vs. 0x%16.16" PRIx64 "\n",
459                       ((cl_ulong *)sourcePixel)[0], ((cl_ulong *)destPixel)[0]);
460             break;
461         case 12:
462             log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
463                       "{0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
464                       ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
465                       ((cl_uint *)sourcePixel)[2], ((cl_uint *)destPixel)[0],
466                       ((cl_uint *)destPixel)[1], ((cl_uint *)destPixel)[2]);
467             break;
468         case 16:
469             log_error("*{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x} vs. "
470                       "{0x%8.8x, 0x%8.8x, 0x%8.8x, 0x%8.8x}\n",
471                       ((cl_uint *)sourcePixel)[0], ((cl_uint *)sourcePixel)[1],
472                       ((cl_uint *)sourcePixel)[2], ((cl_uint *)sourcePixel)[3],
473                       ((cl_uint *)destPixel)[0], ((cl_uint *)destPixel)[1],
474                       ((cl_uint *)destPixel)[2], ((cl_uint *)destPixel)[3]);
475             break;
476         default:
477             log_error("Don't know how to print pixel size of %zu\n",
478                       pixel_size);
479             break;
480     }
481 }
482 
compare_scanlines(const image_descriptor * imageInfo,const char * aPtr,const char * bPtr)483 size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr,
484                          const char *bPtr)
485 {
486     size_t pixel_size = get_pixel_size(imageInfo->format);
487     size_t column;
488 
489     for (column = 0; column < imageInfo->width; column++)
490     {
491         switch (imageInfo->format->image_channel_data_type)
492         {
493             // If the data type is 101010, then ignore bits 31 and 32 when
494             // comparing the row
495             case CL_UNORM_INT_101010: {
496                 cl_uint aPixel = *(cl_uint *)aPtr;
497                 cl_uint bPixel = *(cl_uint *)bPtr;
498                 if ((aPixel & 0x3fffffff) != (bPixel & 0x3fffffff))
499                     return column;
500             }
501             break;
502 
503             // If the data type is 555, ignore bit 15 when comparing the row
504             case CL_UNORM_SHORT_555: {
505                 cl_ushort aPixel = *(cl_ushort *)aPtr;
506                 cl_ushort bPixel = *(cl_ushort *)bPtr;
507                 if ((aPixel & 0x7fff) != (bPixel & 0x7fff)) return column;
508             }
509             break;
510 
511             default:
512                 if (memcmp(aPtr, bPtr, pixel_size) != 0) return column;
513                 break;
514         }
515 
516         aPtr += pixel_size;
517         bPtr += pixel_size;
518     }
519 
520     // If we didn't find a difference, return the width of the image
521     return column;
522 }
523 
random_log_in_range(int minV,int maxV,MTdata d)524 int random_log_in_range(int minV, int maxV, MTdata d)
525 {
526     double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1);
527     int iv = (int)((float)(maxV - minV) * v);
528     return iv + minV;
529 }
530 
531 
532 // Define the addressing functions
533 typedef int (*AddressFn)(int value, size_t maxValue);
534 
NoAddressFn(int value,size_t maxValue)535 int NoAddressFn(int value, size_t maxValue) { return value; }
RepeatAddressFn(int value,size_t maxValue)536 int RepeatAddressFn(int value, size_t maxValue)
537 {
538     if (value < 0)
539         value += (int)maxValue;
540     else if (value >= (int)maxValue)
541         value -= (int)maxValue;
542     return value;
543 }
MirroredRepeatAddressFn(int value,size_t maxValue)544 int MirroredRepeatAddressFn(int value, size_t maxValue)
545 {
546     if (value < 0)
547         value = 0;
548     else if ((size_t)value >= maxValue)
549         value = (int)(maxValue - 1);
550     return value;
551 }
ClampAddressFn(int value,size_t maxValue)552 int ClampAddressFn(int value, size_t maxValue)
553 {
554     return (value < -1) ? -1
555                         : ((value > (cl_long)maxValue) ? (int)maxValue : value);
556 }
ClampToEdgeNearestFn(int value,size_t maxValue)557 int ClampToEdgeNearestFn(int value, size_t maxValue)
558 {
559     return (value < 0)
560         ? 0
561         : (((size_t)value > maxValue - 1) ? (int)maxValue - 1 : value);
562 }
563 AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn;
564 
565 // Note: normalized coords get repeated in normalized space, not unnormalized
566 // space! hence the special case here
567 volatile float gFloatHome;
RepeatNormalizedAddressFn(float fValue,size_t maxValue)568 float RepeatNormalizedAddressFn(float fValue, size_t maxValue)
569 {
570 #ifndef _MSC_VER // Use original if not the VS compiler.
571     // General computation for repeat
572     return (fValue - floorf(fValue)) * (float)maxValue; // Reduce to [0, 1.f]
573 #else // Otherwise, use this instead:
574     // Home the subtraction to a float to break up the sequence of x87
575     // instructions emitted by the VS compiler.
576     gFloatHome = fValue - floorf(fValue);
577     return gFloatHome * (float)maxValue;
578 #endif
579 }
580 
MirroredRepeatNormalizedAddressFn(float fValue,size_t maxValue)581 float MirroredRepeatNormalizedAddressFn(float fValue, size_t maxValue)
582 {
583     // Round to nearest multiple of two.
584     // Note halfway values flip flop here due to rte, but they both end up
585     // pointing the same place at the end of the day.
586     float s_prime = 2.0f * rintf(fValue * 0.5f);
587 
588     // Reduce to [-1, 1], Apply mirroring -> [0, 1]
589     s_prime = fabsf(fValue - s_prime);
590 
591     // un-normalize
592     return s_prime * (float)maxValue;
593 }
594 
595 struct AddressingTable
596 {
AddressingTableAddressingTable597     AddressingTable()
598     {
599         static_assert(CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6, "");
600         static_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2, "");
601 
602         mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE]
603               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = NoAddressFn;
604         mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE]
605               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = NoAddressFn;
606         mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE]
607               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = RepeatAddressFn;
608         mTable[CL_ADDRESS_REPEAT - CL_ADDRESS_NONE]
609               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = RepeatAddressFn;
610         mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE]
611               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampToEdgeNearestFn;
612         mTable[CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE]
613               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampToEdgeLinearFn;
614         mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE]
615               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = ClampAddressFn;
616         mTable[CL_ADDRESS_CLAMP - CL_ADDRESS_NONE]
617               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = ClampAddressFn;
618         mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE]
619               [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = MirroredRepeatAddressFn;
620         mTable[CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE]
621               [CL_FILTER_LINEAR - CL_FILTER_NEAREST] = MirroredRepeatAddressFn;
622     }
623 
operator []AddressingTable624     AddressFn operator[](image_sampler_data *sampler)
625     {
626         return mTable[(int)sampler->addressing_mode - CL_ADDRESS_NONE]
627                      [(int)sampler->filter_mode - CL_FILTER_NEAREST];
628     }
629 
630     AddressFn mTable[6][2];
631 };
632 
633 static AddressingTable sAddressingTable;
634 
is_sRGBA_order(cl_channel_order image_channel_order)635 bool is_sRGBA_order(cl_channel_order image_channel_order)
636 {
637     switch (image_channel_order)
638     {
639         case CL_sRGB:
640         case CL_sRGBx:
641         case CL_sRGBA:
642         case CL_sBGRA: return true;
643         default: return false;
644     }
645 }
646 
647 // Format helpers
648 
has_alpha(const cl_image_format * format)649 int has_alpha(const cl_image_format *format)
650 {
651     switch (format->image_channel_order)
652     {
653         case CL_R: return 0;
654         case CL_A: return 1;
655         case CL_Rx: return 0;
656         case CL_RG: return 0;
657         case CL_RA: return 1;
658         case CL_RGx: return 0;
659         case CL_RGB:
660         case CL_sRGB: return 0;
661         case CL_RGBx:
662         case CL_sRGBx: return 0;
663         case CL_RGBA: return 1;
664         case CL_BGRA: return 1;
665         case CL_ARGB: return 1;
666         case CL_ABGR: return 1;
667         case CL_INTENSITY: return 1;
668         case CL_LUMINANCE: return 0;
669 #ifdef CL_BGR1_APPLE
670         case CL_BGR1_APPLE: return 1;
671 #endif
672 #ifdef CL_1RGB_APPLE
673         case CL_1RGB_APPLE: return 1;
674 #endif
675         case CL_sRGBA:
676         case CL_sBGRA: return 1;
677         case CL_DEPTH: return 0;
678         default:
679             log_error("Invalid image channel order: %d\n",
680                       format->image_channel_order);
681             return 0;
682     }
683 }
684 
685 #define PRINT_MAX_SIZE_LOGIC 0
686 
687 #define SWAP(_a, _b)                                                           \
688     do                                                                         \
689     {                                                                          \
690         _a ^= _b;                                                              \
691         _b ^= _a;                                                              \
692         _a ^= _b;                                                              \
693     } while (0)
694 
get_max_sizes(size_t * numberOfSizes,const int maxNumberOfSizes,size_t sizes[][3],size_t maxWidth,size_t maxHeight,size_t maxDepth,size_t maxArraySize,const cl_ulong maxIndividualAllocSize,const cl_ulong maxTotalAllocSize,cl_mem_object_type image_type,const cl_image_format * format,int usingMaxPixelSizeBuffer)695 void get_max_sizes(
696     size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3],
697     size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize,
698     const cl_ulong maxIndividualAllocSize, // CL_DEVICE_MAX_MEM_ALLOC_SIZE
699     const cl_ulong maxTotalAllocSize, // CL_DEVICE_GLOBAL_MEM_SIZE
700     cl_mem_object_type image_type, const cl_image_format *format,
701     int usingMaxPixelSizeBuffer)
702 {
703 
704     bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D);
705     bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY
706                     || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY);
707 
708     // Validate we have a reasonable max depth for 3D
709     if (is3D && maxDepth < 2)
710     {
711         log_error("ERROR: Requesting max image sizes for 3D images when max "
712                   "depth is < 2.\n");
713         *numberOfSizes = 0;
714         return;
715     }
716     // Validate we have a reasonable max array size for 1D & 2D image arrays
717     if (isArray && maxArraySize < 2)
718     {
719         log_error("ERROR: Requesting max image sizes for an image array when "
720                   "max array size is < 1.\n");
721         *numberOfSizes = 0;
722         return;
723     }
724 
725     // Reduce the maximum because we are trying to test the max image
726     // dimensions, not the memory allocation
727     cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4;
728     cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4;
729     log_info("Note: max individual allocation adjusted down from %gMB to %gMB "
730              "and max total allocation adjusted down from %gMB to %gMB.\n",
731              maxIndividualAllocSize / (1024.0 * 1024.0),
732              adjustedMaxIndividualAllocSize / (1024.0 * 1024.0),
733              maxTotalAllocSize / (1024.0 * 1024.0),
734              adjustedMaxTotalAllocSize / (1024.0 * 1024.0));
735 
736     // Cap our max allocation to 1.0GB.
737     // FIXME -- why?  In the interest of not taking a long time?  We should
738     // still test this stuff...
739     if (adjustedMaxTotalAllocSize > (cl_ulong)1024 * 1024 * 1024)
740     {
741         adjustedMaxTotalAllocSize = (cl_ulong)1024 * 1024 * 1024;
742         log_info("Limiting max total allocation size to %gMB (down from %gMB) "
743                  "for test.\n",
744                  adjustedMaxTotalAllocSize / (1024.0 * 1024.0),
745                  maxTotalAllocSize / (1024.0 * 1024.0));
746     }
747 
748     cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize;
749     if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize * 2)
750         maxAllocSize = adjustedMaxTotalAllocSize / 2;
751 
752     size_t raw_pixel_size = get_pixel_size(format);
753     // If the test will be creating input (src) buffer of type int4 or float4,
754     // number of pixels will be governed by sizeof(int4 or float4) and not
755     // sizeof(dest fomat) Also if pixel size is 12 bytes i.e. RGB or RGBx, we
756     // adjust it to 16 bytes as GPUs has no concept of 3 channel images. GPUs
757     // expand these to four channel RGBA.
758     if (usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16;
759     size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;
760 
761     log_info("Maximums: [%zu x %zu x %zu], raw pixel size %zu bytes, "
762              "per-allocation limit %gMB.\n",
763              maxWidth, maxHeight, isArray ? maxArraySize : maxDepth,
764              raw_pixel_size, (maxAllocSize / (1024.0 * 1024.0)));
765 
766     // Keep track of the maximum sizes for each dimension
767     size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth };
768 
769     switch (image_type)
770     {
771         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
772             maximum_sizes[1] = maxArraySize;
773             maximum_sizes[2] = 1;
774             break;
775         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
776             maximum_sizes[2] = maxArraySize;
777             break;
778     }
779 
780 
781         // Given one fixed sized dimension, this code finds one or two other
782         // dimensions, both with very small size, such that the size does not
783         // exceed the maximum passed to this function
784 
785 #if defined(__x86_64) || defined(__arm64__) || defined(__ppc64__)
786     size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 10, 11, 13, 15 };
787 #else
788     size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 11, 13 };
789 #endif
790 
791     static size_t other_size = 0;
792     enum
793     {
794         num_other_sizes = sizeof(other_sizes) / sizeof(size_t)
795     };
796 
797     (*numberOfSizes) = 0;
798 
799     if (image_type == CL_MEM_OBJECT_IMAGE1D)
800     {
801 
802         size_t M = maximum_sizes[0];
803 
804         // Store the size
805         sizes[(*numberOfSizes)][0] = M;
806         sizes[(*numberOfSizes)][1] = 1;
807         sizes[(*numberOfSizes)][2] = 1;
808         ++(*numberOfSizes);
809     }
810 
811     else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY
812              || image_type == CL_MEM_OBJECT_IMAGE2D)
813     {
814 
815         for (int fixed_dim = 0; fixed_dim < 2; ++fixed_dim)
816         {
817 
818             // Determine the size of the fixed dimension
819             size_t M = maximum_sizes[fixed_dim];
820             size_t A = max_pixels;
821 
822             int x0_dim = !fixed_dim;
823             size_t x0 = static_cast<size_t>(
824                 fmin(fmin(other_sizes[(other_size++) % num_other_sizes], A / M),
825                      maximum_sizes[x0_dim]));
826 
827             // Store the size
828             sizes[(*numberOfSizes)][fixed_dim] = M;
829             sizes[(*numberOfSizes)][x0_dim] = x0;
830             sizes[(*numberOfSizes)][2] = 1;
831             ++(*numberOfSizes);
832         }
833     }
834 
835     else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY
836              || image_type == CL_MEM_OBJECT_IMAGE3D)
837     {
838 
839         // Iterate over dimensions, finding sizes for the non-fixed dimension
840         for (int fixed_dim = 0; fixed_dim < 3; ++fixed_dim)
841         {
842 
843             // Determine the size of the fixed dimension
844             size_t M = maximum_sizes[fixed_dim];
845             size_t A = max_pixels;
846 
847             // Find two other dimensions, x0 and x1
848             int x0_dim = (fixed_dim == 0) ? 1 : 0;
849             int x1_dim = (fixed_dim == 2) ? 1 : 2;
850 
851             // Choose two other sizes for these dimensions
852             size_t x0 = static_cast<size_t>(
853                 fmin(fmin(A / M, maximum_sizes[x0_dim]),
854                      other_sizes[(other_size++) % num_other_sizes]));
855             // GPUs have certain restrictions on minimum width (row alignment)
856             // of images which has given us issues testing small widths in this
857             // test (say we set width to 3 for testing, and compute size based
858             // on this width and decide it fits within vram ... but GPU driver
859             // decides that, due to row alignment requirements, it has to use
860             // width of 16 which doesnt fit in vram). For this purpose we are
861             // not testing width < 16 for this test.
862             if (x0_dim == 0 && x0 < 16) x0 = 16;
863             size_t x1 = static_cast<size_t>(
864                 fmin(fmin(A / M / x0, maximum_sizes[x1_dim]),
865                      other_sizes[(other_size++) % num_other_sizes]));
866 
867             // Valid image sizes cannot be below 1. Due to the workaround for
868             // the xo_dim where x0 is overidden to 16 there might not be enough
869             // space left for x1 dimension. This could be a fractional 0.x size
870             // that when cast to integer would result in a value 0. In these
871             // cases we clamp the size to a minimum of 1.
872             if (x1 < 1) x1 = 1;
873 
874             // M and x0 cannot be '0' as they derive from clDeviceInfo calls
875             assert(x0 > 0 && M > 0);
876 
877             // Store the size
878             sizes[(*numberOfSizes)][fixed_dim] = M;
879             sizes[(*numberOfSizes)][x0_dim] = x0;
880             sizes[(*numberOfSizes)][x1_dim] = x1;
881             ++(*numberOfSizes);
882         }
883     }
884 
885     // Log the results
886     for (int j = 0; j < (int)(*numberOfSizes); j++)
887     {
888         switch (image_type)
889         {
890             case CL_MEM_OBJECT_IMAGE1D:
891                 log_info(" size[%d] = [%zu] (%g MB image)\n", j, sizes[j][0],
892                          raw_pixel_size * sizes[j][0] * sizes[j][1]
893                              * sizes[j][2] / (1024.0 * 1024.0));
894                 break;
895             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
896             case CL_MEM_OBJECT_IMAGE2D:
897                 log_info(" size[%d] = [%zu %zu] (%g MB image)\n", j,
898                          sizes[j][0], sizes[j][1],
899                          raw_pixel_size * sizes[j][0] * sizes[j][1]
900                              * sizes[j][2] / (1024.0 * 1024.0));
901                 break;
902             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
903             case CL_MEM_OBJECT_IMAGE3D:
904                 log_info(" size[%d] = [%zu %zu %zu] (%g MB image)\n", j,
905                          sizes[j][0], sizes[j][1], sizes[j][2],
906                          raw_pixel_size * sizes[j][0] * sizes[j][1]
907                              * sizes[j][2] / (1024.0 * 1024.0));
908                 break;
909         }
910     }
911 }
912 
get_max_absolute_error(const cl_image_format * format,image_sampler_data * sampler)913 float get_max_absolute_error(const cl_image_format *format,
914                              image_sampler_data *sampler)
915 {
916     if (sampler->filter_mode == CL_FILTER_NEAREST) return 0.0f;
917 
918     switch (format->image_channel_data_type)
919     {
920         case CL_SNORM_INT8: return 1.0f / 127.0f;
921         case CL_UNORM_INT8: return 1.0f / 255.0f;
922         case CL_UNORM_INT16: return 1.0f / 65535.0f;
923         case CL_SNORM_INT16: return 1.0f / 32767.0f;
924         case CL_FLOAT: return CL_FLT_MIN;
925 #ifdef CL_SFIXED14_APPLE
926         case CL_SFIXED14_APPLE: return 0x1.0p-14f;
927 #endif
928         case CL_UNORM_SHORT_555:
929         case CL_UNORM_SHORT_565: return 1.0f / 31.0f;
930         default: return 0.0f;
931     }
932 }
933 
get_max_relative_error(const cl_image_format * format,image_sampler_data * sampler,int is3D,int isLinearFilter)934 float get_max_relative_error(const cl_image_format *format,
935                              image_sampler_data *sampler, int is3D,
936                              int isLinearFilter)
937 {
938     float maxError = 0.0f;
939     float sampleCount = 1.0f;
940     if (isLinearFilter) sampleCount = is3D ? 8.0f : 4.0f;
941 
942     // Note that the ULP is defined here as the unit in the last place of the
943     // maximum magnitude sample used for filtering.
944 
945     // Section 8.3
946     switch (format->image_channel_data_type)
947     {
948         // The spec allows 2 ulps of error for normalized formats
949         case CL_SNORM_INT8:
950         case CL_UNORM_INT8:
951         case CL_SNORM_INT16:
952         case CL_UNORM_INT16:
953         case CL_UNORM_SHORT_565:
954         case CL_UNORM_SHORT_555:
955         case CL_UNORM_INT_101010:
956             // Maximum sampling error for round to zero normalization based on
957             // multiplication by reciprocal (using reciprocal generated in
958             // round to +inf mode, so that 1.0 matches spec)
959             maxError = 2 * FLT_EPSILON * sampleCount;
960             break;
961 
962             // If the implementation supports these formats then it will have to
963             // allow rounding error here too, because not all 32-bit ints are
964             // exactly representable in float
965         case CL_SIGNED_INT32:
966         case CL_UNSIGNED_INT32: maxError = 1 * FLT_EPSILON; break;
967     }
968 
969 
970     // Section 8.2
971     if (sampler->addressing_mode == CL_ADDRESS_REPEAT
972         || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT
973         || sampler->filter_mode != CL_FILTER_NEAREST
974         || sampler->normalized_coords)
975 #if defined(__APPLE__)
976     {
977         if (sampler->filter_mode != CL_FILTER_NEAREST)
978         {
979             // The maximum
980             if (gDeviceType == CL_DEVICE_TYPE_GPU)
981                 // Some GPUs ain't so accurate
982                 maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4);
983             else
984                 // The standard method of 2d linear filtering delivers 4.0 ulps
985                 // of error in round to nearest (8 in rtz).
986                 maxError += 4.0f * FLT_EPSILON;
987         }
988         else
989             // normalized coordinates will introduce some error into the
990             // fractional part of the address, affecting results
991             maxError += 4.0f * FLT_EPSILON;
992     }
993 #else
994     {
995 #if !defined(_WIN32)
996 #warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec
997 #endif
998         // The spec allows linear filters to return any result most of the time.
999         // That's fine for implementations but a problem for testing. After all
1000         // users aren't going to like garbage images.  We have "picked a number"
1001         // here that we are going to attempt to conform to. Implementations are
1002         // free to pick another number, like infinity, if they like.
1003         // We picked a number for you, to provide /some/ sanity
1004         maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7);
1005         // ...but this is what the spec allows:
1006         // maxError = INFINITY;
1007         // Please feel free to pick any positive number. (NaN wont work.)
1008     }
1009 #endif
1010 
1011     // The error calculation itself can introduce error
1012     maxError += FLT_EPSILON * 2;
1013 
1014     return maxError;
1015 }
1016 
get_format_max_int(const cl_image_format * format)1017 size_t get_format_max_int(const cl_image_format *format)
1018 {
1019     switch (format->image_channel_data_type)
1020     {
1021         case CL_SNORM_INT8:
1022         case CL_SIGNED_INT8: return 127;
1023         case CL_UNORM_INT8:
1024         case CL_UNSIGNED_INT8: return 255;
1025 
1026         case CL_SNORM_INT16:
1027         case CL_SIGNED_INT16: return 32767;
1028 
1029         case CL_UNORM_INT16:
1030         case CL_UNSIGNED_INT16: return 65535;
1031 
1032         case CL_SIGNED_INT32: return 2147483647L;
1033 
1034         case CL_UNSIGNED_INT32: return 4294967295LL;
1035 
1036         case CL_UNORM_SHORT_565:
1037         case CL_UNORM_SHORT_555: return 31;
1038 
1039         case CL_UNORM_INT_101010: return 1023;
1040 
1041         case CL_HALF_FLOAT: return 1 << 10;
1042 
1043 #ifdef CL_SFIXED14_APPLE
1044         case CL_SFIXED14_APPLE: return 16384;
1045 #endif
1046         default: return 0;
1047     }
1048 }
1049 
get_format_min_int(const cl_image_format * format)1050 int get_format_min_int(const cl_image_format *format)
1051 {
1052     switch (format->image_channel_data_type)
1053     {
1054         case CL_SNORM_INT8:
1055         case CL_SIGNED_INT8: return -128;
1056         case CL_UNORM_INT8:
1057         case CL_UNSIGNED_INT8: return 0;
1058 
1059         case CL_SNORM_INT16:
1060         case CL_SIGNED_INT16: return -32768;
1061 
1062         case CL_UNORM_INT16:
1063         case CL_UNSIGNED_INT16: return 0;
1064 
1065         case CL_SIGNED_INT32: return -2147483648LL;
1066 
1067         case CL_UNSIGNED_INT32: return 0;
1068 
1069         case CL_UNORM_SHORT_565:
1070         case CL_UNORM_SHORT_555:
1071         case CL_UNORM_INT_101010: return 0;
1072 
1073         case CL_HALF_FLOAT: return -(1 << 10);
1074 
1075 #ifdef CL_SFIXED14_APPLE
1076         case CL_SFIXED14_APPLE: return -16384;
1077 #endif
1078 
1079         default: return 0;
1080     }
1081 }
1082 
convert_float_to_half(float f)1083 cl_half convert_float_to_half(float f)
1084 {
1085     switch (gFloatToHalfRoundingMode)
1086     {
1087         case kRoundToNearestEven: return cl_half_from_float(f, CL_HALF_RTE);
1088         case kRoundTowardZero: return cl_half_from_float(f, CL_HALF_RTZ);
1089         default:
1090             log_error("ERROR: Test internal error -- unhandled or unknown "
1091                       "float->half rounding mode.\n");
1092             exit(-1);
1093             return 0xffff;
1094     }
1095 }
1096 
get_image_size(image_descriptor const * imageInfo)1097 cl_ulong get_image_size(image_descriptor const *imageInfo)
1098 {
1099     cl_ulong imageSize;
1100 
1101     // Assumes rowPitch and slicePitch are always correctly defined
1102     if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1)
1103     {
1104         imageSize = (size_t)compute_mipmapped_image_size(*imageInfo);
1105     }
1106     else
1107     {
1108         switch (imageInfo->type)
1109         {
1110             case CL_MEM_OBJECT_IMAGE1D: imageSize = imageInfo->rowPitch; break;
1111             case CL_MEM_OBJECT_IMAGE2D:
1112                 imageSize = imageInfo->height * imageInfo->rowPitch;
1113                 break;
1114             case CL_MEM_OBJECT_IMAGE3D:
1115                 imageSize = imageInfo->depth * imageInfo->slicePitch;
1116                 break;
1117             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1118                 imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1119                 break;
1120             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1121                 imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1122                 break;
1123             default:
1124                 log_error("ERROR: Cannot identify image type %x\n",
1125                           imageInfo->type);
1126                 abort();
1127         }
1128     }
1129     return imageSize;
1130 }
1131 
1132 // Calculate image size in megabytes (strictly, mebibytes). Result is rounded
1133 // up.
get_image_size_mb(image_descriptor const * imageInfo)1134 cl_ulong get_image_size_mb(image_descriptor const *imageInfo)
1135 {
1136     cl_ulong imageSize = get_image_size(imageInfo);
1137     cl_ulong mb = imageSize / (1024 * 1024);
1138     if (imageSize % (1024 * 1024) > 0)
1139     {
1140         mb += 1;
1141     }
1142     return mb;
1143 }
1144 
1145 
1146 uint64_t gRoundingStartValue = 0;
1147 
1148 
escape_inf_nan_values(char * data,size_t allocSize)1149 void escape_inf_nan_values(char *data, size_t allocSize)
1150 {
1151     // filter values with 8 not-quite-highest bits
1152     unsigned int *intPtr = (unsigned int *)data;
1153     for (size_t i = 0; i<allocSize>> 2; i++)
1154     {
1155         if ((intPtr[i] & 0x7F800000) == 0x7F800000) intPtr[i] ^= 0x40000000;
1156     }
1157 
1158     // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits
1159     // = 0x7C00 are special)
1160     unsigned short *shortPtr = (unsigned short *)data;
1161     for (size_t i = 0; i<allocSize>> 1; i++)
1162     {
1163         if ((shortPtr[i] & 0x7C00) == 0x7C00) shortPtr[i] ^= 0x4000;
1164     }
1165 }
1166 
generate_random_image_data(image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d)1167 char *generate_random_image_data(image_descriptor *imageInfo,
1168                                  BufferOwningPtr<char> &P, MTdata d)
1169 {
1170     size_t allocSize = static_cast<size_t>(get_image_size(imageInfo));
1171     size_t pixelRowBytes = imageInfo->width * get_pixel_size(imageInfo->format);
1172     size_t i;
1173 
1174     if (imageInfo->num_mip_levels > 1)
1175         allocSize =
1176             static_cast<size_t>(compute_mipmapped_image_size(*imageInfo));
1177 
1178 #if defined(__APPLE__)
1179     char *data = NULL;
1180     if (gDeviceType == CL_DEVICE_TYPE_CPU)
1181     {
1182         size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192;
1183 
1184         void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE,
1185                          MAP_ANON | MAP_PRIVATE, 0, 0);
1186         intptr_t data_end = (intptr_t)map + mapSize - 4096;
1187         data = (char *)(data_end - (intptr_t)allocSize);
1188 
1189         mprotect(map, 4096, PROT_NONE);
1190         mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
1191         P.reset(data, map, mapSize, allocSize);
1192     }
1193     else
1194     {
1195         data = (char *)malloc(allocSize);
1196         P.reset(data, NULL, 0, allocSize);
1197     }
1198 #else
1199     P.reset(NULL); // Free already allocated memory first, then try to allocate
1200                    // new block.
1201     char *data =
1202         (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format));
1203     P.reset(data, NULL, 0, allocSize, true);
1204 #endif
1205 
1206     if (data == NULL)
1207     {
1208         log_error("ERROR: Unable to malloc %zu bytes for "
1209                   "generate_random_image_data\n",
1210                   allocSize);
1211         return 0;
1212     }
1213 
1214     if (gTestRounding)
1215     {
1216         // Special case: fill with a ramp from 0 to the size of the type
1217         size_t typeSize = get_format_type_size(imageInfo->format);
1218         switch (typeSize)
1219         {
1220             case 1: {
1221                 char *ptr = data;
1222                 for (i = 0; i < allocSize; i++)
1223                     ptr[i] = (cl_char)(i + gRoundingStartValue);
1224             }
1225             break;
1226             case 2: {
1227                 cl_short *ptr = (cl_short *)data;
1228                 for (i = 0; i < allocSize / 2; i++)
1229                     ptr[i] = (cl_short)(i + gRoundingStartValue);
1230             }
1231             break;
1232             case 4: {
1233                 cl_int *ptr = (cl_int *)data;
1234                 for (i = 0; i < allocSize / 4; i++)
1235                     ptr[i] = (cl_int)(i + gRoundingStartValue);
1236             }
1237             break;
1238         }
1239 
1240         // Note: inf or nan float values would cause problems, although we don't
1241         // know this will actually be a float, so we just know what to look for
1242         escape_inf_nan_values(data, allocSize);
1243         return data;
1244     }
1245 
1246     // Otherwise, we should be able to just fill with random bits no matter what
1247     cl_uint *p = (cl_uint *)data;
1248     for (i = 0; i + 4 <= allocSize; i += 4) p[i / 4] = genrand_int32(d);
1249 
1250     for (; i < allocSize; i++) data[i] = genrand_int32(d);
1251 
1252     // Note: inf or nan float values would cause problems, although we don't
1253     // know this will actually be a float, so we just know what to look for
1254     escape_inf_nan_values(data, allocSize);
1255 
1256     if (/*!gTestMipmaps*/ imageInfo->num_mip_levels < 2)
1257     {
1258         // Fill unused edges with -1, NaN for float
1259         if (imageInfo->rowPitch > pixelRowBytes)
1260         {
1261             size_t height = 0;
1262 
1263             switch (imageInfo->type)
1264             {
1265                 case CL_MEM_OBJECT_IMAGE2D:
1266                 case CL_MEM_OBJECT_IMAGE3D:
1267                 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1268                     height = imageInfo->height;
1269                     break;
1270                 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1271                     height = imageInfo->arraySize;
1272                     break;
1273             }
1274 
1275             // Fill in the row padding regions
1276             for (i = 0; i < height; i++)
1277             {
1278                 size_t offset = i * imageInfo->rowPitch + pixelRowBytes;
1279                 size_t length = imageInfo->rowPitch - pixelRowBytes;
1280                 memset(data + offset, 0xff, length);
1281             }
1282         }
1283 
1284         // Fill in the slice padding regions, if necessary:
1285 
1286         size_t slice_dimension = imageInfo->height;
1287         if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
1288         {
1289             slice_dimension = imageInfo->arraySize;
1290         }
1291 
1292         if (imageInfo->slicePitch > slice_dimension * imageInfo->rowPitch)
1293         {
1294             size_t depth = 0;
1295             switch (imageInfo->type)
1296             {
1297                 case CL_MEM_OBJECT_IMAGE2D:
1298                 case CL_MEM_OBJECT_IMAGE3D: depth = imageInfo->depth; break;
1299                 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1300                 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1301                     depth = imageInfo->arraySize;
1302                     break;
1303             }
1304 
1305             for (i = 0; i < depth; i++)
1306             {
1307                 size_t offset = i * imageInfo->slicePitch
1308                     + slice_dimension * imageInfo->rowPitch;
1309                 size_t length = imageInfo->slicePitch
1310                     - slice_dimension * imageInfo->rowPitch;
1311                 memset(data + offset, 0xff, length);
1312             }
1313         }
1314     }
1315 
1316     return data;
1317 }
1318 
1319 #define CLAMP_FLOAT(v) (fmaxf(fminf(v, 1.f), -1.f))
1320 
1321 
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData,int lod)1322 void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
1323                             int y, int z, float *outData, int lod)
1324 {
1325     size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
1326            depth_lod = imageInfo->depth;
1327     size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1328 
1329     if (imageInfo->num_mip_levels > 1)
1330     {
1331         switch (imageInfo->type)
1332         {
1333             case CL_MEM_OBJECT_IMAGE3D:
1334                 depth_lod =
1335                     (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
1336             case CL_MEM_OBJECT_IMAGE2D:
1337             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1338                 height_lod =
1339                     (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
1340             default:
1341                 width_lod =
1342                     (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
1343         }
1344         row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1345         if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
1346             slice_pitch_lod = row_pitch_lod;
1347         else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
1348                  || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1349             slice_pitch_lod = row_pitch_lod * height_lod;
1350     }
1351     else
1352     {
1353         row_pitch_lod = imageInfo->rowPitch;
1354         slice_pitch_lod = imageInfo->slicePitch;
1355     }
1356     if (x < 0 || y < 0 || z < 0 || x >= (int)width_lod
1357         || (height_lod != 0 && y >= (int)height_lod)
1358         || (depth_lod != 0 && z >= (int)depth_lod)
1359         || (imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize))
1360     {
1361         outData[0] = outData[1] = outData[2] = outData[3] = 0;
1362         if (!has_alpha(imageInfo->format)) outData[3] = 1;
1363         return;
1364     }
1365 
1366     const cl_image_format *format = imageInfo->format;
1367 
1368     unsigned int i;
1369     float tempData[4];
1370 
1371     // Advance to the right spot
1372     char *ptr = (char *)imageData;
1373     size_t pixelSize = get_pixel_size(format);
1374 
1375     ptr += z * slice_pitch_lod + y * row_pitch_lod + x * pixelSize;
1376 
1377     // OpenCL only supports reading floats from certain formats
1378     size_t channelCount = get_format_channel_count(format);
1379     switch (format->image_channel_data_type)
1380     {
1381         case CL_SNORM_INT8: {
1382             cl_char *dPtr = (cl_char *)ptr;
1383             for (i = 0; i < channelCount; i++)
1384                 tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 127.0f);
1385             break;
1386         }
1387 
1388         case CL_UNORM_INT8: {
1389             unsigned char *dPtr = (unsigned char *)ptr;
1390             for (i = 0; i < channelCount; i++)
1391             {
1392                 if ((is_sRGBA_order(imageInfo->format->image_channel_order))
1393                     && i < 3) // only RGB need to be converted for sRGBA
1394                     tempData[i] = (float)sRGBunmap((float)dPtr[i] / 255.0f);
1395                 else
1396                     tempData[i] = (float)dPtr[i] / 255.0f;
1397             }
1398             break;
1399         }
1400 
1401         case CL_SIGNED_INT8: {
1402             cl_char *dPtr = (cl_char *)ptr;
1403             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1404             break;
1405         }
1406 
1407         case CL_UNSIGNED_INT8: {
1408             cl_uchar *dPtr = (cl_uchar *)ptr;
1409             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1410             break;
1411         }
1412 
1413         case CL_SNORM_INT16: {
1414             cl_short *dPtr = (cl_short *)ptr;
1415             for (i = 0; i < channelCount; i++)
1416                 tempData[i] = CLAMP_FLOAT((float)dPtr[i] / 32767.0f);
1417             break;
1418         }
1419 
1420         case CL_UNORM_INT16: {
1421             cl_ushort *dPtr = (cl_ushort *)ptr;
1422             for (i = 0; i < channelCount; i++)
1423                 tempData[i] = (float)dPtr[i] / 65535.0f;
1424             break;
1425         }
1426 
1427         case CL_SIGNED_INT16: {
1428             cl_short *dPtr = (cl_short *)ptr;
1429             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1430             break;
1431         }
1432 
1433         case CL_UNSIGNED_INT16: {
1434             cl_ushort *dPtr = (cl_ushort *)ptr;
1435             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1436             break;
1437         }
1438 
1439         case CL_HALF_FLOAT: {
1440             cl_half *dPtr = (cl_half *)ptr;
1441             for (i = 0; i < channelCount; i++)
1442                 tempData[i] = cl_half_to_float(dPtr[i]);
1443             break;
1444         }
1445 
1446         case CL_SIGNED_INT32: {
1447             cl_int *dPtr = (cl_int *)ptr;
1448             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1449             break;
1450         }
1451 
1452         case CL_UNSIGNED_INT32: {
1453             cl_uint *dPtr = (cl_uint *)ptr;
1454             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1455             break;
1456         }
1457 
1458         case CL_UNORM_SHORT_565: {
1459             cl_ushort *dPtr = (cl_ushort *)ptr;
1460             tempData[0] = (float)(dPtr[0] >> 11) / (float)31;
1461             tempData[1] = (float)((dPtr[0] >> 5) & 63) / (float)63;
1462             tempData[2] = (float)(dPtr[0] & 31) / (float)31;
1463             break;
1464         }
1465 
1466         case CL_UNORM_SHORT_555: {
1467             cl_ushort *dPtr = (cl_ushort *)ptr;
1468             tempData[0] = (float)((dPtr[0] >> 10) & 31) / (float)31;
1469             tempData[1] = (float)((dPtr[0] >> 5) & 31) / (float)31;
1470             tempData[2] = (float)(dPtr[0] & 31) / (float)31;
1471             break;
1472         }
1473 
1474         case CL_UNORM_INT_101010: {
1475             cl_uint *dPtr = (cl_uint *)ptr;
1476             tempData[0] = (float)((dPtr[0] >> 20) & 0x3ff) / (float)1023;
1477             tempData[1] = (float)((dPtr[0] >> 10) & 0x3ff) / (float)1023;
1478             tempData[2] = (float)(dPtr[0] & 0x3ff) / (float)1023;
1479             break;
1480         }
1481 
1482         case CL_FLOAT: {
1483             float *dPtr = (float *)ptr;
1484             for (i = 0; i < channelCount; i++) tempData[i] = (float)dPtr[i];
1485             break;
1486         }
1487 #ifdef CL_SFIXED14_APPLE
1488         case CL_SFIXED14_APPLE: {
1489             cl_ushort *dPtr = (cl_ushort *)ptr;
1490             for (i = 0; i < channelCount; i++)
1491                 tempData[i] = ((int)dPtr[i] - 16384) * 0x1.0p-14f;
1492             break;
1493         }
1494 #endif
1495     }
1496 
1497 
1498     outData[0] = outData[1] = outData[2] = 0;
1499     outData[3] = 1;
1500 
1501     switch (format->image_channel_order)
1502     {
1503         case CL_A: outData[3] = tempData[0]; break;
1504         case CL_R:
1505         case CL_Rx: outData[0] = tempData[0]; break;
1506         case CL_RA:
1507             outData[0] = tempData[0];
1508             outData[3] = tempData[1];
1509             break;
1510         case CL_RG:
1511         case CL_RGx:
1512             outData[0] = tempData[0];
1513             outData[1] = tempData[1];
1514             break;
1515         case CL_RGB:
1516         case CL_RGBx:
1517         case CL_sRGB:
1518         case CL_sRGBx:
1519             outData[0] = tempData[0];
1520             outData[1] = tempData[1];
1521             outData[2] = tempData[2];
1522             break;
1523         case CL_RGBA:
1524             outData[0] = tempData[0];
1525             outData[1] = tempData[1];
1526             outData[2] = tempData[2];
1527             outData[3] = tempData[3];
1528             break;
1529         case CL_ARGB:
1530             outData[0] = tempData[1];
1531             outData[1] = tempData[2];
1532             outData[2] = tempData[3];
1533             outData[3] = tempData[0];
1534             break;
1535         case CL_ABGR:
1536             outData[0] = tempData[3];
1537             outData[1] = tempData[2];
1538             outData[2] = tempData[1];
1539             outData[3] = tempData[0];
1540             break;
1541         case CL_BGRA:
1542         case CL_sBGRA:
1543             outData[0] = tempData[2];
1544             outData[1] = tempData[1];
1545             outData[2] = tempData[0];
1546             outData[3] = tempData[3];
1547             break;
1548         case CL_INTENSITY:
1549             outData[0] = tempData[0];
1550             outData[1] = tempData[0];
1551             outData[2] = tempData[0];
1552             outData[3] = tempData[0];
1553             break;
1554         case CL_LUMINANCE:
1555             outData[0] = tempData[0];
1556             outData[1] = tempData[0];
1557             outData[2] = tempData[0];
1558             break;
1559 #ifdef CL_1RGB_APPLE
1560         case CL_1RGB_APPLE:
1561             outData[0] = tempData[1];
1562             outData[1] = tempData[2];
1563             outData[2] = tempData[3];
1564             outData[3] = 1.0f;
1565             break;
1566 #endif
1567 #ifdef CL_BGR1_APPLE
1568         case CL_BGR1_APPLE:
1569             outData[0] = tempData[2];
1570             outData[1] = tempData[1];
1571             outData[2] = tempData[0];
1572             outData[3] = 1.0f;
1573             break;
1574 #endif
1575         case CL_sRGBA:
1576             outData[0] = tempData[0];
1577             outData[1] = tempData[1];
1578             outData[2] = tempData[2];
1579             outData[3] = tempData[3];
1580             break;
1581         case CL_DEPTH: outData[0] = tempData[0]; break;
1582         default:
1583             log_error("Invalid format:");
1584             print_header(format, true);
1585             break;
1586     }
1587 }
1588 
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData)1589 void read_image_pixel_float(void *imageData, image_descriptor *imageInfo, int x,
1590                             int y, int z, float *outData)
1591 {
1592     read_image_pixel_float(imageData, imageInfo, x, y, z, outData, 0);
1593 }
1594 
get_integer_coords(float x,float y,float z,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1595 bool get_integer_coords(float x, float y, float z, size_t width, size_t height,
1596                         size_t depth, image_sampler_data *imageSampler,
1597                         image_descriptor *imageInfo, int &outX, int &outY,
1598                         int &outZ)
1599 {
1600     return get_integer_coords_offset(x, y, z, 0.0f, 0.0f, 0.0f, width, height,
1601                                      depth, imageSampler, imageInfo, outX, outY,
1602                                      outZ);
1603 }
1604 
get_integer_coords_offset(float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1605 bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset,
1606                                float yAddressOffset, float zAddressOffset,
1607                                size_t width, size_t height, size_t depth,
1608                                image_sampler_data *imageSampler,
1609                                image_descriptor *imageInfo, int &outX,
1610                                int &outY, int &outZ)
1611 {
1612     AddressFn adFn = sAddressingTable[imageSampler];
1613 
1614     float refX = floorf(x), refY = floorf(y), refZ = floorf(z);
1615 
1616     // Handle sampler-directed coordinate normalization + clamping.  Note that
1617     // the array coordinate for image array types is expected to be
1618     // unnormalized, and is clamped to 0..arraySize-1.
1619     if (imageSampler->normalized_coords)
1620     {
1621         switch (imageSampler->addressing_mode)
1622         {
1623             case CL_ADDRESS_REPEAT:
1624                 x = RepeatNormalizedAddressFn(x, width);
1625                 if (height != 0)
1626                 {
1627                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1628                         y = RepeatNormalizedAddressFn(y, height);
1629                 }
1630                 if (depth != 0)
1631                 {
1632                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1633                         z = RepeatNormalizedAddressFn(z, depth);
1634                 }
1635 
1636                 if (xAddressOffset != 0.0)
1637                 {
1638                     // Add in the offset
1639                     x += xAddressOffset;
1640                     // Handle wrapping
1641                     if (x > width) x -= (float)width;
1642                     if (x < 0) x += (float)width;
1643                 }
1644                 if ((yAddressOffset != 0.0)
1645                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY))
1646                 {
1647                     // Add in the offset
1648                     y += yAddressOffset;
1649                     // Handle wrapping
1650                     if (y > height) y -= (float)height;
1651                     if (y < 0) y += (float)height;
1652                 }
1653                 if ((zAddressOffset != 0.0)
1654                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY))
1655                 {
1656                     // Add in the offset
1657                     z += zAddressOffset;
1658                     // Handle wrapping
1659                     if (z > depth) z -= (float)depth;
1660                     if (z < 0) z += (float)depth;
1661                 }
1662                 break;
1663 
1664             case CL_ADDRESS_MIRRORED_REPEAT:
1665                 x = MirroredRepeatNormalizedAddressFn(x, width);
1666                 if (height != 0)
1667                 {
1668                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1669                         y = MirroredRepeatNormalizedAddressFn(y, height);
1670                 }
1671                 if (depth != 0)
1672                 {
1673                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1674                         z = MirroredRepeatNormalizedAddressFn(z, depth);
1675                 }
1676 
1677                 if (xAddressOffset != 0.0)
1678                 {
1679                     float temp = x + xAddressOffset;
1680                     if (temp > (float)width)
1681                         temp = (float)width - (temp - (float)width);
1682                     x = fabsf(temp);
1683                 }
1684                 if ((yAddressOffset != 0.0)
1685                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY))
1686                 {
1687                     float temp = y + yAddressOffset;
1688                     if (temp > (float)height)
1689                         temp = (float)height - (temp - (float)height);
1690                     y = fabsf(temp);
1691                 }
1692                 if ((zAddressOffset != 0.0)
1693                     && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY))
1694                 {
1695                     float temp = z + zAddressOffset;
1696                     if (temp > (float)depth)
1697                         temp = (float)depth - (temp - (float)depth);
1698                     z = fabsf(temp);
1699                 }
1700                 break;
1701 
1702             default:
1703                 // Also, remultiply to the original coords. This simulates any
1704                 // truncation in the pass to OpenCL
1705                 x *= (float)width;
1706                 x += xAddressOffset;
1707 
1708                 if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1709                 {
1710                     y *= (float)height;
1711                     y += yAddressOffset;
1712                 }
1713 
1714                 if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1715                 {
1716                     z *= (float)depth;
1717                     z += zAddressOffset;
1718                 }
1719                 break;
1720         }
1721     }
1722 
1723     // At this point, we're dealing with non-normalized coordinates.
1724 
1725     outX = adFn(static_cast<int>(floorf(x)), width);
1726 
1727     // 1D and 2D arrays require special care for the index coordinate:
1728 
1729     switch (imageInfo->type)
1730     {
1731         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1732             outY = static_cast<int>(
1733                 calculate_array_index(y, (float)imageInfo->arraySize - 1.0f));
1734             outZ = 0; /* don't care! */
1735             break;
1736         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1737             outY = adFn(static_cast<int>(floorf(y)), height);
1738             outZ = static_cast<int>(
1739                 calculate_array_index(z, (float)imageInfo->arraySize - 1.0f));
1740             break;
1741         default:
1742             // legacy path:
1743             if (height != 0) outY = adFn(static_cast<int>(floorf(y)), height);
1744             if (depth != 0) outZ = adFn(static_cast<int>(floorf(z)), depth);
1745     }
1746 
1747     return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ);
1748 }
1749 
frac(float a)1750 static float frac(float a) { return a - floorf(a); }
1751 
1752 static inline void pixelMax(const float a[4], const float b[4], float *results);
pixelMax(const float a[4],const float b[4],float * results)1753 static inline void pixelMax(const float a[4], const float b[4], float *results)
1754 {
1755     for (int i = 0; i < 4; i++) results[i] = errMax(fabsf(a[i]), fabsf(b[i]));
1756 }
1757 
1758 // If containsDenorms is NULL, flush denorms to zero
1759 // if containsDenorms is not NULL, record whether there are any denorms
1760 static inline void check_for_denorms(float a[4], int *containsDenorms);
check_for_denorms(float a[4],int * containsDenorms)1761 static inline void check_for_denorms(float a[4], int *containsDenorms)
1762 {
1763     if (NULL == containsDenorms)
1764     {
1765         for (int i = 0; i < 4; i++)
1766         {
1767             if (IsFloatSubnormal(a[i])) a[i] = copysignf(0.0f, a[i]);
1768         }
1769     }
1770     else
1771     {
1772         for (int i = 0; i < 4; i++)
1773         {
1774             if (IsFloatSubnormal(a[i]))
1775             {
1776                 *containsDenorms = 1;
1777                 break;
1778             }
1779         }
1780     }
1781 }
1782 
calculate_array_index(float coord,float extent)1783 inline float calculate_array_index(float coord, float extent)
1784 {
1785     // from Section 8.4 of the 1.2 Spec 'Selecting an Image from an Image Array'
1786     //
1787     // given coordinate 'w' that represents an index:
1788     // layer_index = clamp( rint(w), 0, image_array_size - 1)
1789 
1790     float ret = rintf(coord);
1791     ret = ret > extent ? extent : ret;
1792     ret = ret < 0.0f ? 0.0f : ret;
1793 
1794     return ret;
1795 }
1796 
1797 /*
1798  * Utility function to unnormalized a coordinate given a particular sampler.
1799  *
1800  * name     - the name of the coordinate, used for verbose debugging only
1801  * coord    - the coordinate requiring unnormalization
1802  * offset   - an addressing offset to be added to the coordinate
1803  * extent   - the max value for this coordinate (e.g. width for x)
1804  */
unnormalize_coordinate(const char * name,float coord,float offset,float extent,cl_addressing_mode addressing_mode,int verbose)1805 static float unnormalize_coordinate(const char *name, float coord, float offset,
1806                                     float extent,
1807                                     cl_addressing_mode addressing_mode,
1808                                     int verbose)
1809 {
1810     float ret = 0.0f;
1811 
1812     switch (addressing_mode)
1813     {
1814         case CL_ADDRESS_REPEAT:
1815             ret = RepeatNormalizedAddressFn(coord, static_cast<size_t>(extent));
1816 
1817             if (verbose)
1818             {
1819                 log_info("\tRepeat filter denormalizes %s (%f) to %f\n", name,
1820                          coord, ret);
1821             }
1822 
1823             if (offset != 0.0)
1824             {
1825                 // Add in the offset, and handle wrapping.
1826                 ret += offset;
1827                 if (ret > extent) ret -= extent;
1828                 if (ret < 0.0) ret += extent;
1829             }
1830 
1831             if (verbose && offset != 0.0f)
1832             {
1833                 log_info("\tAddress offset of %f added to get %f\n", offset,
1834                          ret);
1835             }
1836             break;
1837 
1838         case CL_ADDRESS_MIRRORED_REPEAT:
1839             ret = MirroredRepeatNormalizedAddressFn(
1840                 coord, static_cast<size_t>(extent));
1841 
1842             if (verbose)
1843             {
1844                 log_info(
1845                     "\tMirrored repeat filter denormalizes %s (%f) to %f\n",
1846                     name, coord, ret);
1847             }
1848 
1849             if (offset != 0.0)
1850             {
1851                 float temp = ret + offset;
1852                 if (temp > extent) temp = extent - (temp - extent);
1853                 ret = fabsf(temp);
1854             }
1855 
1856             if (verbose && offset != 0.0f)
1857             {
1858                 log_info("\tAddress offset of %f added to get %f\n", offset,
1859                          ret);
1860             }
1861             break;
1862 
1863         default:
1864 
1865             ret = coord * extent;
1866 
1867             if (verbose)
1868             {
1869                 log_info("\tFilter denormalizes %s to %f (%f * %f)\n", name,
1870                          ret, coord, extent);
1871             }
1872 
1873             ret += offset;
1874 
1875             if (verbose && offset != 0.0f)
1876             {
1877                 log_info("\tAddress offset of %f added to get %f\n", offset,
1878                          ret);
1879             }
1880     }
1881 
1882     return ret;
1883 }
1884 
1885 FloatPixel
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)1886 sample_image_pixel_float(void *imageData, image_descriptor *imageInfo, float x,
1887                          float y, float z, image_sampler_data *imageSampler,
1888                          float *outData, int verbose, int *containsDenorms)
1889 {
1890     return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f,
1891                                            0.0f, 0.0f, imageSampler, outData,
1892                                            verbose, containsDenorms);
1893 }
1894 
1895 // returns max pixel value of the pixels touched
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1896 FloatPixel sample_image_pixel_float(void *imageData,
1897                                     image_descriptor *imageInfo, float x,
1898                                     float y, float z,
1899                                     image_sampler_data *imageSampler,
1900                                     float *outData, int verbose,
1901                                     int *containsDenorms, int lod)
1902 {
1903     return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f,
1904                                            0.0f, 0.0f, imageSampler, outData,
1905                                            verbose, containsDenorms, lod);
1906 }
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1907 FloatPixel sample_image_pixel_float_offset(
1908     void *imageData, image_descriptor *imageInfo, float x, float y, float z,
1909     float xAddressOffset, float yAddressOffset, float zAddressOffset,
1910     image_sampler_data *imageSampler, float *outData, int verbose,
1911     int *containsDenorms, int lod)
1912 {
1913     AddressFn adFn = sAddressingTable[imageSampler];
1914     FloatPixel returnVal;
1915     size_t width_lod = imageInfo->width, height_lod = imageInfo->height,
1916            depth_lod = imageInfo->depth;
1917     size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1918 
1919     if (imageInfo->num_mip_levels > 1)
1920     {
1921         switch (imageInfo->type)
1922         {
1923             case CL_MEM_OBJECT_IMAGE3D:
1924                 depth_lod =
1925                     (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
1926             case CL_MEM_OBJECT_IMAGE2D:
1927             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1928                 height_lod =
1929                     (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
1930             default:
1931                 width_lod =
1932                     (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
1933         }
1934         row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1935         if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
1936             slice_pitch_lod = row_pitch_lod;
1937         else if (imageInfo->type == CL_MEM_OBJECT_IMAGE3D
1938                  || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1939             slice_pitch_lod = row_pitch_lod * height_lod;
1940     }
1941     else
1942     {
1943         slice_pitch_lod = imageInfo->slicePitch;
1944         row_pitch_lod = imageInfo->rowPitch;
1945     }
1946 
1947     if (containsDenorms) *containsDenorms = 0;
1948 
1949     if (imageSampler->normalized_coords)
1950     {
1951 
1952         // We need to unnormalize our coordinates differently depending on
1953         // the image type, but 'x' is always processed the same way.
1954 
1955         x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod,
1956                                    imageSampler->addressing_mode, verbose);
1957 
1958         switch (imageInfo->type)
1959         {
1960 
1961                 // The image array types require special care:
1962 
1963             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1964                 z = 0; // don't care -- unused for 1D arrays
1965                 break;
1966 
1967             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1968                 y = unnormalize_coordinate(
1969                     "y", y, yAddressOffset, (float)height_lod,
1970                     imageSampler->addressing_mode, verbose);
1971                 break;
1972 
1973                 // Everybody else:
1974 
1975             default:
1976                 y = unnormalize_coordinate(
1977                     "y", y, yAddressOffset, (float)height_lod,
1978                     imageSampler->addressing_mode, verbose);
1979                 z = unnormalize_coordinate(
1980                     "z", z, zAddressOffset, (float)depth_lod,
1981                     imageSampler->addressing_mode, verbose);
1982         }
1983     }
1984     else if (verbose)
1985     {
1986 
1987         switch (imageInfo->type)
1988         {
1989             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1990                 log_info("Starting coordinate: %f, array index %f\n", x, y);
1991                 break;
1992             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1993                 log_info("Starting coordinate: %f, %f, array index %f\n", x, y,
1994                          z);
1995                 break;
1996             case CL_MEM_OBJECT_IMAGE1D:
1997             case CL_MEM_OBJECT_IMAGE1D_BUFFER:
1998                 log_info("Starting coordinate: %f\n", x);
1999                 break;
2000             case CL_MEM_OBJECT_IMAGE2D:
2001                 log_info("Starting coordinate: %f, %f\n", x, y);
2002                 break;
2003             case CL_MEM_OBJECT_IMAGE3D:
2004             default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z);
2005         }
2006     }
2007 
2008     // At this point, we have unnormalized coordinates.
2009 
2010     if (imageSampler->filter_mode == CL_FILTER_NEAREST)
2011     {
2012         int ix, iy, iz;
2013 
2014         // We apply the addressing function to the now-unnormalized
2015         // coordinates.  Note that the array cases again require special
2016         // care, per section 8.4 in the OpenCL 1.2 Specification.
2017 
2018         ix = adFn(static_cast<int>(floorf(x)), width_lod);
2019 
2020         switch (imageInfo->type)
2021         {
2022             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2023                 iy = static_cast<int>(calculate_array_index(
2024                     y, (float)(imageInfo->arraySize - 1)));
2025                 iz = 0;
2026                 if (verbose)
2027                 {
2028                     log_info("\tArray index %f evaluates to %d\n", y, iy);
2029                 }
2030                 break;
2031             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2032                 iy = adFn(static_cast<int>(floorf(y)), height_lod);
2033                 iz = static_cast<int>(calculate_array_index(
2034                     z, (float)(imageInfo->arraySize - 1)));
2035                 if (verbose)
2036                 {
2037                     log_info("\tArray index %f evaluates to %d\n", z, iz);
2038                 }
2039                 break;
2040             default:
2041                 iy = adFn(static_cast<int>(floorf(y)), height_lod);
2042                 if (depth_lod != 0)
2043                     iz = adFn(static_cast<int>(floorf(z)), depth_lod);
2044                 else
2045                     iz = 0;
2046         }
2047 
2048         if (verbose)
2049         {
2050             if (iz)
2051                 log_info(
2052                     "\tReference integer coords calculated: { %d, %d, %d }\n",
2053                     ix, iy, iz);
2054             else
2055                 log_info("\tReference integer coords calculated: { %d, %d }\n",
2056                          ix, iy);
2057         }
2058 
2059         read_image_pixel_float(imageData, imageInfo, ix, iy, iz, outData, lod);
2060         check_for_denorms(outData, containsDenorms);
2061         for (int i = 0; i < 4; i++) returnVal.p[i] = fabsf(outData[i]);
2062         return returnVal;
2063     }
2064     else
2065     {
2066         // Linear filtering cases.
2067 
2068         size_t width = width_lod, height = height_lod, depth = depth_lod;
2069 
2070         // Image arrays can use 2D filtering, but require us to walk into the
2071         // image a certain number of slices before reading.
2072 
2073         if (depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY
2074             || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
2075         {
2076             float array_index = 0;
2077 
2078             size_t layer_offset = 0;
2079 
2080             if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
2081             {
2082                 array_index =
2083                     calculate_array_index(z, (float)(imageInfo->arraySize - 1));
2084                 layer_offset = slice_pitch_lod * (size_t)array_index;
2085             }
2086             else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
2087             {
2088                 array_index =
2089                     calculate_array_index(y, (float)(imageInfo->arraySize - 1));
2090                 layer_offset = slice_pitch_lod * (size_t)array_index;
2091 
2092                 // Set up y and height so that the filtering below is correct
2093                 // 1D filtering on a single slice.
2094                 height = 1;
2095             }
2096 
2097             int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width);
2098             int y1 = 0;
2099             int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width);
2100             int y2 = 0;
2101             if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D)
2102                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
2103                 && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER))
2104             {
2105                 y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height);
2106                 y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height);
2107             }
2108             else
2109             {
2110                 y = 0.5f;
2111             }
2112 
2113             if (verbose)
2114             {
2115                 log_info("\tActual integer coords used (i = floor(x-.5)): i0:{ "
2116                          "%d, %d } and i1:{ %d, %d }\n",
2117                          x1, y1, x2, y2);
2118                 log_info("\tArray coordinate is %f\n", array_index);
2119             }
2120 
2121             // Walk to beginning of the 'correct' slice, if needed.
2122             char *imgPtr = ((char *)imageData) + layer_offset;
2123 
2124             float upLeft[4], upRight[4], lowLeft[4], lowRight[4];
2125             float maxUp[4], maxLow[4];
2126             read_image_pixel_float(imgPtr, imageInfo, x1, y1, 0, upLeft, lod);
2127             read_image_pixel_float(imgPtr, imageInfo, x2, y1, 0, upRight, lod);
2128             check_for_denorms(upLeft, containsDenorms);
2129             check_for_denorms(upRight, containsDenorms);
2130             pixelMax(upLeft, upRight, maxUp);
2131             read_image_pixel_float(imgPtr, imageInfo, x1, y2, 0, lowLeft, lod);
2132             read_image_pixel_float(imgPtr, imageInfo, x2, y2, 0, lowRight, lod);
2133             check_for_denorms(lowLeft, containsDenorms);
2134             check_for_denorms(lowRight, containsDenorms);
2135             pixelMax(lowLeft, lowRight, maxLow);
2136             pixelMax(maxUp, maxLow, returnVal.p);
2137 
2138             if (verbose)
2139             {
2140                 if (NULL == containsDenorms)
2141                     log_info("\tSampled pixels (rgba order, denorms flushed to "
2142                              "zero):\n");
2143                 else
2144                     log_info("\tSampled pixels (rgba order):\n");
2145                 log_info("\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1],
2146                          upLeft[2], upLeft[3]);
2147                 log_info("\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1],
2148                          upRight[2], upRight[3]);
2149                 log_info("\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1],
2150                          lowLeft[2], lowLeft[3]);
2151                 log_info("\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1],
2152                          lowRight[2], lowRight[3]);
2153             }
2154 
2155             bool printMe = false;
2156             if (x1 <= 0 || x2 <= 0 || x1 >= (int)width - 1
2157                 || x2 >= (int)width - 1)
2158                 printMe = true;
2159             if (y1 <= 0 || y2 <= 0 || y1 >= (int)height - 1
2160                 || y2 >= (int)height - 1)
2161                 printMe = true;
2162 
2163             double weights[2][2];
2164 
2165             weights[0][0] = weights[0][1] = 1.0 - frac(x - 0.5f);
2166             weights[1][0] = weights[1][1] = frac(x - 0.5f);
2167             weights[0][0] *= 1.0 - frac(y - 0.5f);
2168             weights[1][0] *= 1.0 - frac(y - 0.5f);
2169             weights[0][1] *= frac(y - 0.5f);
2170             weights[1][1] *= frac(y - 0.5f);
2171 
2172             if (verbose)
2173                 log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f\n",
2174                          frac(x - 0.5f), frac(y - 0.5f));
2175 
2176             for (int i = 0; i < 3; i++)
2177             {
2178                 outData[i] = (float)((upLeft[i] * weights[0][0])
2179                                      + (upRight[i] * weights[1][0])
2180                                      + (lowLeft[i] * weights[0][1])
2181                                      + (lowRight[i] * weights[1][1]));
2182                 // flush subnormal results to zero if necessary
2183                 if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN)
2184                     outData[i] = copysignf(0.0f, outData[i]);
2185             }
2186             outData[3] = (float)((upLeft[3] * weights[0][0])
2187                                  + (upRight[3] * weights[1][0])
2188                                  + (lowLeft[3] * weights[0][1])
2189                                  + (lowRight[3] * weights[1][1]));
2190             // flush subnormal results to zero if necessary
2191             if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN)
2192                 outData[3] = copysignf(0.0f, outData[3]);
2193         }
2194         else
2195         {
2196             // 3D linear filtering
2197             int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width_lod);
2198             int y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height_lod);
2199             int z1 = adFn(static_cast<int>(floorf(z - 0.5f)), depth_lod);
2200             int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width_lod);
2201             int y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height_lod);
2202             int z2 = adFn(static_cast<int>(floorf(z - 0.5f) + 1), depth_lod);
2203 
2204             if (verbose)
2205                 log_info("\tActual integer coords used (i = floor(x-.5)): "
2206                          "i0:{%d, %d, %d} and i1:{%d, %d, %d}\n",
2207                          x1, y1, z1, x2, y2, z2);
2208 
2209             float upLeftA[4], upRightA[4], lowLeftA[4], lowRightA[4];
2210             float upLeftB[4], upRightB[4], lowLeftB[4], lowRightB[4];
2211             float pixelMaxA[4], pixelMaxB[4];
2212             read_image_pixel_float(imageData, imageInfo, x1, y1, z1, upLeftA,
2213                                    lod);
2214             read_image_pixel_float(imageData, imageInfo, x2, y1, z1, upRightA,
2215                                    lod);
2216             check_for_denorms(upLeftA, containsDenorms);
2217             check_for_denorms(upRightA, containsDenorms);
2218             pixelMax(upLeftA, upRightA, pixelMaxA);
2219             read_image_pixel_float(imageData, imageInfo, x1, y2, z1, lowLeftA,
2220                                    lod);
2221             read_image_pixel_float(imageData, imageInfo, x2, y2, z1, lowRightA,
2222                                    lod);
2223             check_for_denorms(lowLeftA, containsDenorms);
2224             check_for_denorms(lowRightA, containsDenorms);
2225             pixelMax(lowLeftA, lowRightA, pixelMaxB);
2226             pixelMax(pixelMaxA, pixelMaxB, returnVal.p);
2227             read_image_pixel_float(imageData, imageInfo, x1, y1, z2, upLeftB,
2228                                    lod);
2229             read_image_pixel_float(imageData, imageInfo, x2, y1, z2, upRightB,
2230                                    lod);
2231             check_for_denorms(upLeftB, containsDenorms);
2232             check_for_denorms(upRightB, containsDenorms);
2233             pixelMax(upLeftB, upRightB, pixelMaxA);
2234             read_image_pixel_float(imageData, imageInfo, x1, y2, z2, lowLeftB,
2235                                    lod);
2236             read_image_pixel_float(imageData, imageInfo, x2, y2, z2, lowRightB,
2237                                    lod);
2238             check_for_denorms(lowLeftB, containsDenorms);
2239             check_for_denorms(lowRightB, containsDenorms);
2240             pixelMax(lowLeftB, lowRightB, pixelMaxB);
2241             pixelMax(pixelMaxA, pixelMaxB, pixelMaxA);
2242             pixelMax(pixelMaxA, returnVal.p, returnVal.p);
2243 
2244             if (verbose)
2245             {
2246                 if (NULL == containsDenorms)
2247                     log_info("\tSampled pixels (rgba order, denorms flushed to "
2248                              "zero):\n");
2249                 else
2250                     log_info("\tSampled pixels (rgba order):\n");
2251                 log_info("\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1],
2252                          upLeftA[2], upLeftA[3]);
2253                 log_info("\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1],
2254                          upRightA[2], upRightA[3]);
2255                 log_info("\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1],
2256                          lowLeftA[2], lowLeftA[3]);
2257                 log_info("\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0],
2258                          lowRightA[1], lowRightA[2], lowRightA[3]);
2259                 log_info("\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1],
2260                          upLeftB[2], upLeftB[3]);
2261                 log_info("\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1],
2262                          upRightB[2], upRightB[3]);
2263                 log_info("\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1],
2264                          lowLeftB[2], lowLeftB[3]);
2265                 log_info("\t\tp111: %f, %f, %f, %f\n", lowRightB[0],
2266                          lowRightB[1], lowRightB[2], lowRightB[3]);
2267             }
2268 
2269             double weights[2][2][2];
2270 
2271             float a = frac(x - 0.5f), b = frac(y - 0.5f), c = frac(z - 0.5f);
2272             weights[0][0][0] = weights[0][1][0] = weights[0][0][1] =
2273                 weights[0][1][1] = 1.f - a;
2274             weights[1][0][0] = weights[1][1][0] = weights[1][0][1] =
2275                 weights[1][1][1] = a;
2276             weights[0][0][0] *= 1.f - b;
2277             weights[1][0][0] *= 1.f - b;
2278             weights[0][0][1] *= 1.f - b;
2279             weights[1][0][1] *= 1.f - b;
2280             weights[0][1][0] *= b;
2281             weights[1][1][0] *= b;
2282             weights[0][1][1] *= b;
2283             weights[1][1][1] *= b;
2284             weights[0][0][0] *= 1.f - c;
2285             weights[0][1][0] *= 1.f - c;
2286             weights[1][0][0] *= 1.f - c;
2287             weights[1][1][0] *= 1.f - c;
2288             weights[0][0][1] *= c;
2289             weights[0][1][1] *= c;
2290             weights[1][0][1] *= c;
2291             weights[1][1][1] *= c;
2292 
2293             if (verbose)
2294                 log_info("\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f, "
2295                          "frac( z - 0.5f ) = %f\n",
2296                          frac(x - 0.5f), frac(y - 0.5f), frac(z - 0.5f));
2297 
2298             for (int i = 0; i < 3; i++)
2299             {
2300                 outData[i] = (float)((upLeftA[i] * weights[0][0][0])
2301                                      + (upRightA[i] * weights[1][0][0])
2302                                      + (lowLeftA[i] * weights[0][1][0])
2303                                      + (lowRightA[i] * weights[1][1][0])
2304                                      + (upLeftB[i] * weights[0][0][1])
2305                                      + (upRightB[i] * weights[1][0][1])
2306                                      + (lowLeftB[i] * weights[0][1][1])
2307                                      + (lowRightB[i] * weights[1][1][1]));
2308                 // flush subnormal results to zero if necessary
2309                 if (NULL == containsDenorms && fabs(outData[i]) < FLT_MIN)
2310                     outData[i] = copysignf(0.0f, outData[i]);
2311             }
2312             outData[3] = (float)((upLeftA[3] * weights[0][0][0])
2313                                  + (upRightA[3] * weights[1][0][0])
2314                                  + (lowLeftA[3] * weights[0][1][0])
2315                                  + (lowRightA[3] * weights[1][1][0])
2316                                  + (upLeftB[3] * weights[0][0][1])
2317                                  + (upRightB[3] * weights[1][0][1])
2318                                  + (lowLeftB[3] * weights[0][1][1])
2319                                  + (lowRightB[3] * weights[1][1][1]));
2320             // flush subnormal results to zero if necessary
2321             if (NULL == containsDenorms && fabs(outData[3]) < FLT_MIN)
2322                 outData[3] = copysignf(0.0f, outData[3]);
2323         }
2324 
2325         return returnVal;
2326     }
2327 }
2328 
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)2329 FloatPixel sample_image_pixel_float_offset(
2330     void *imageData, image_descriptor *imageInfo, float x, float y, float z,
2331     float xAddressOffset, float yAddressOffset, float zAddressOffset,
2332     image_sampler_data *imageSampler, float *outData, int verbose,
2333     int *containsDenorms)
2334 {
2335     return sample_image_pixel_float_offset(
2336         imageData, imageInfo, x, y, z, xAddressOffset, yAddressOffset,
2337         zAddressOffset, imageSampler, outData, verbose, containsDenorms, 0);
2338 }
2339 
2340 
debug_find_vector_in_image(void * imagePtr,image_descriptor * imageInfo,void * vectorToFind,size_t vectorSize,int * outX,int * outY,int * outZ,size_t lod)2341 int debug_find_vector_in_image(void *imagePtr, image_descriptor *imageInfo,
2342                                void *vectorToFind, size_t vectorSize, int *outX,
2343                                int *outY, int *outZ, size_t lod)
2344 {
2345     int foundCount = 0;
2346     char *iPtr = (char *)imagePtr;
2347     size_t width;
2348     size_t depth;
2349     size_t height;
2350     size_t row_pitch;
2351     size_t slice_pitch;
2352 
2353     switch (imageInfo->type)
2354     {
2355         case CL_MEM_OBJECT_IMAGE1D:
2356             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2357             height = 1;
2358             depth = 1;
2359             break;
2360         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2361             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2362             height = 1;
2363             depth = imageInfo->arraySize;
2364             break;
2365         case CL_MEM_OBJECT_IMAGE2D:
2366             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2367             height =
2368                 (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2369             depth = 1;
2370             break;
2371         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2372             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2373             height =
2374                 (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2375             depth = imageInfo->arraySize;
2376             break;
2377         case CL_MEM_OBJECT_IMAGE3D:
2378             width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2379             height =
2380                 (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2381             depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
2382             break;
2383     }
2384 
2385     row_pitch = width * get_pixel_size(imageInfo->format);
2386     slice_pitch = row_pitch * height;
2387 
2388     for (size_t z = 0; z < depth; z++)
2389     {
2390         for (size_t y = 0; y < height; y++)
2391         {
2392             for (size_t x = 0; x < width; x++)
2393             {
2394                 if (memcmp(iPtr, vectorToFind, vectorSize) == 0)
2395                 {
2396                     if (foundCount == 0)
2397                     {
2398                         *outX = (int)x;
2399                         if (outY != NULL) *outY = (int)y;
2400                         if (outZ != NULL) *outZ = (int)z;
2401                     }
2402                     foundCount++;
2403                 }
2404                 iPtr += vectorSize;
2405             }
2406             iPtr += row_pitch - (width * vectorSize);
2407         }
2408         iPtr += slice_pitch - (height * row_pitch);
2409     }
2410     return foundCount;
2411 }
2412 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,unsigned int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2413 int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
2414                               unsigned int *valuesToFind, int *outX, int *outY,
2415                               int *outZ, int lod)
2416 {
2417     char vectorToFind[4 * 4];
2418     size_t vectorSize = get_format_channel_count(imageInfo->format);
2419 
2420 
2421     if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT8)
2422     {
2423         unsigned char *p = (unsigned char *)vectorToFind;
2424         for (unsigned int i = 0; i < vectorSize; i++)
2425             p[i] = (unsigned char)valuesToFind[i];
2426     }
2427     else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT16)
2428     {
2429         unsigned short *p = (unsigned short *)vectorToFind;
2430         for (unsigned int i = 0; i < vectorSize; i++)
2431             p[i] = (unsigned short)valuesToFind[i];
2432         vectorSize *= 2;
2433     }
2434     else if (imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT32)
2435     {
2436         unsigned int *p = (unsigned int *)vectorToFind;
2437         for (unsigned int i = 0; i < vectorSize; i++)
2438             p[i] = (unsigned int)valuesToFind[i];
2439         vectorSize *= 4;
2440     }
2441     else
2442     {
2443         log_info("WARNING: Unable to search for debug pixel: invalid image "
2444                  "format\n");
2445         return false;
2446     }
2447     return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
2448                                       vectorSize, outX, outY, outZ, lod);
2449 }
2450 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2451 int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
2452                               int *valuesToFind, int *outX, int *outY,
2453                               int *outZ, int lod)
2454 {
2455     char vectorToFind[4 * 4];
2456     size_t vectorSize = get_format_channel_count(imageInfo->format);
2457 
2458     if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT8)
2459     {
2460         char *p = (char *)vectorToFind;
2461         for (unsigned int i = 0; i < vectorSize; i++)
2462             p[i] = (char)valuesToFind[i];
2463     }
2464     else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT16)
2465     {
2466         short *p = (short *)vectorToFind;
2467         for (unsigned int i = 0; i < vectorSize; i++)
2468             p[i] = (short)valuesToFind[i];
2469         vectorSize *= 2;
2470     }
2471     else if (imageInfo->format->image_channel_data_type == CL_SIGNED_INT32)
2472     {
2473         int *p = (int *)vectorToFind;
2474         for (unsigned int i = 0; i < vectorSize; i++)
2475             p[i] = (int)valuesToFind[i];
2476         vectorSize *= 4;
2477     }
2478     else
2479     {
2480         log_info("WARNING: Unable to search for debug pixel: invalid image "
2481                  "format\n");
2482         return false;
2483     }
2484     return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
2485                                       vectorSize, outX, outY, outZ, lod);
2486 }
2487 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,float * valuesToFind,int * outX,int * outY,int * outZ,int lod)2488 int debug_find_pixel_in_image(void *imagePtr, image_descriptor *imageInfo,
2489                               float *valuesToFind, int *outX, int *outY,
2490                               int *outZ, int lod)
2491 {
2492     char vectorToFind[4 * 4];
2493     float swizzled[4];
2494     memcpy(swizzled, valuesToFind, sizeof(swizzled));
2495     size_t vectorSize = get_pixel_size(imageInfo->format);
2496     pack_image_pixel(swizzled, imageInfo->format, vectorToFind);
2497     return debug_find_vector_in_image(imagePtr, imageInfo, vectorToFind,
2498                                       vectorSize, outX, outY, outZ, lod);
2499 }
2500 
2501 template <class T>
swizzle_vector_for_image(T * srcVector,const cl_image_format * imageFormat)2502 void swizzle_vector_for_image(T *srcVector, const cl_image_format *imageFormat)
2503 {
2504     T temp;
2505     switch (imageFormat->image_channel_order)
2506     {
2507         case CL_A: srcVector[0] = srcVector[3]; break;
2508         case CL_R:
2509         case CL_Rx:
2510         case CL_RG:
2511         case CL_RGx:
2512         case CL_RGB:
2513         case CL_RGBx:
2514         case CL_RGBA:
2515         case CL_sRGB:
2516         case CL_sRGBx:
2517         case CL_sRGBA: break;
2518         case CL_RA: srcVector[1] = srcVector[3]; break;
2519         case CL_ARGB:
2520             temp = srcVector[3];
2521             srcVector[3] = srcVector[2];
2522             srcVector[2] = srcVector[1];
2523             srcVector[1] = srcVector[0];
2524             srcVector[0] = temp;
2525             break;
2526         case CL_ABGR:
2527             temp = srcVector[3];
2528             srcVector[3] = srcVector[0];
2529             srcVector[0] = temp;
2530             temp = srcVector[2];
2531             srcVector[2] = srcVector[1];
2532             srcVector[1] = temp;
2533             break;
2534         case CL_BGRA:
2535         case CL_sBGRA:
2536             temp = srcVector[0];
2537             srcVector[0] = srcVector[2];
2538             srcVector[2] = temp;
2539             break;
2540         case CL_INTENSITY:
2541             srcVector[3] = srcVector[0];
2542             srcVector[2] = srcVector[0];
2543             srcVector[1] = srcVector[0];
2544             break;
2545         case CL_LUMINANCE:
2546             srcVector[2] = srcVector[0];
2547             srcVector[1] = srcVector[0];
2548             break;
2549 #ifdef CL_1RGB_APPLE
2550         case CL_1RGB_APPLE:
2551             temp = srcVector[3];
2552             srcVector[3] = srcVector[2];
2553             srcVector[2] = srcVector[1];
2554             srcVector[1] = srcVector[0];
2555             srcVector[0] = temp;
2556             break;
2557 #endif
2558 #ifdef CL_BGR1_APPLE
2559         case CL_BGR1_APPLE:
2560             temp = srcVector[0];
2561             srcVector[0] = srcVector[2];
2562             srcVector[2] = temp;
2563             break;
2564 #endif
2565     }
2566 }
2567 
2568 #define SATURATE(v, min, max) (v < min ? min : (v > max ? max : v))
2569 
pack_image_pixel(unsigned int * srcVector,const cl_image_format * imageFormat,void * outData)2570 void pack_image_pixel(unsigned int *srcVector,
2571                       const cl_image_format *imageFormat, void *outData)
2572 {
2573     swizzle_vector_for_image<unsigned int>(srcVector, imageFormat);
2574     size_t channelCount = get_format_channel_count(imageFormat);
2575 
2576     switch (imageFormat->image_channel_data_type)
2577     {
2578         case CL_UNSIGNED_INT8: {
2579             unsigned char *ptr = (unsigned char *)outData;
2580             for (unsigned int i = 0; i < channelCount; i++)
2581                 ptr[i] = (unsigned char)SATURATE(srcVector[i], 0, 255);
2582             break;
2583         }
2584         case CL_UNSIGNED_INT16: {
2585             unsigned short *ptr = (unsigned short *)outData;
2586             for (unsigned int i = 0; i < channelCount; i++)
2587                 ptr[i] = (unsigned short)SATURATE(srcVector[i], 0, 65535);
2588             break;
2589         }
2590         case CL_UNSIGNED_INT32: {
2591             unsigned int *ptr = (unsigned int *)outData;
2592             for (unsigned int i = 0; i < channelCount; i++)
2593                 ptr[i] = (unsigned int)srcVector[i];
2594             break;
2595         }
2596         default: break;
2597     }
2598 }
2599 
pack_image_pixel(int * srcVector,const cl_image_format * imageFormat,void * outData)2600 void pack_image_pixel(int *srcVector, const cl_image_format *imageFormat,
2601                       void *outData)
2602 {
2603     swizzle_vector_for_image<int>(srcVector, imageFormat);
2604     size_t chanelCount = get_format_channel_count(imageFormat);
2605 
2606     switch (imageFormat->image_channel_data_type)
2607     {
2608         case CL_SIGNED_INT8: {
2609             char *ptr = (char *)outData;
2610             for (unsigned int i = 0; i < chanelCount; i++)
2611                 ptr[i] = (char)SATURATE(srcVector[i], -128, 127);
2612             break;
2613         }
2614         case CL_SIGNED_INT16: {
2615             short *ptr = (short *)outData;
2616             for (unsigned int i = 0; i < chanelCount; i++)
2617                 ptr[i] = (short)SATURATE(srcVector[i], -32768, 32767);
2618             break;
2619         }
2620         case CL_SIGNED_INT32: {
2621             int *ptr = (int *)outData;
2622             for (unsigned int i = 0; i < chanelCount; i++)
2623                 ptr[i] = (int)srcVector[i];
2624             break;
2625         }
2626         default: break;
2627     }
2628 }
2629 
round_to_even(float v)2630 cl_int round_to_even(float v)
2631 {
2632     // clamp overflow
2633     if (v >= -(float)CL_INT_MIN) return CL_INT_MAX;
2634     if (v <= (float)CL_INT_MIN) return CL_INT_MIN;
2635 
2636     // round fractional values to integer value
2637     if (fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23))
2638     {
2639         static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23),
2640                                         MAKE_HEX_FLOAT(-0x1.0p23f, -0x1L, 23) };
2641         float magicVal = magic[v < 0.0f];
2642         v += magicVal;
2643         v -= magicVal;
2644     }
2645 
2646     return (cl_int)v;
2647 }
2648 
pack_image_pixel(float * srcVector,const cl_image_format * imageFormat,void * outData)2649 void pack_image_pixel(float *srcVector, const cl_image_format *imageFormat,
2650                       void *outData)
2651 {
2652     swizzle_vector_for_image<float>(srcVector, imageFormat);
2653     size_t channelCount = get_format_channel_count(imageFormat);
2654     switch (imageFormat->image_channel_data_type)
2655     {
2656         case CL_HALF_FLOAT: {
2657             cl_half *ptr = (cl_half *)outData;
2658 
2659             switch (gFloatToHalfRoundingMode)
2660             {
2661                 case kRoundToNearestEven:
2662                     for (unsigned int i = 0; i < channelCount; i++)
2663                         ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTE);
2664                     break;
2665                 case kRoundTowardZero:
2666                     for (unsigned int i = 0; i < channelCount; i++)
2667                         ptr[i] = cl_half_from_float(srcVector[i], CL_HALF_RTZ);
2668                     break;
2669                 default:
2670                     log_error("ERROR: Test internal error -- unhandled or "
2671                               "unknown float->half rounding mode.\n");
2672                     exit(-1);
2673                     break;
2674             }
2675             break;
2676         }
2677 
2678         case CL_FLOAT: {
2679             cl_float *ptr = (cl_float *)outData;
2680             for (unsigned int i = 0; i < channelCount; i++)
2681                 ptr[i] = srcVector[i];
2682             break;
2683         }
2684 
2685         case CL_SNORM_INT8: {
2686             cl_char *ptr = (cl_char *)outData;
2687             for (unsigned int i = 0; i < channelCount; i++)
2688                 ptr[i] =
2689                     (cl_char)NORMALIZE_SIGNED(srcVector[i], -127.0f, 127.f);
2690             break;
2691         }
2692         case CL_SNORM_INT16: {
2693             cl_short *ptr = (cl_short *)outData;
2694             for (unsigned int i = 0; i < channelCount; i++)
2695                 ptr[i] =
2696                     (short)NORMALIZE_SIGNED(srcVector[i], -32767.f, 32767.f);
2697             break;
2698         }
2699         case CL_UNORM_INT8: {
2700             cl_uchar *ptr = (cl_uchar *)outData;
2701             if (is_sRGBA_order(imageFormat->image_channel_order))
2702             {
2703                 ptr[0] = (unsigned char)(sRGBmap(srcVector[0]) + 0.5);
2704                 ptr[1] = (unsigned char)(sRGBmap(srcVector[1]) + 0.5);
2705                 ptr[2] = (unsigned char)(sRGBmap(srcVector[2]) + 0.5);
2706                 if (channelCount == 4)
2707                     ptr[3] = (unsigned char)NORMALIZE(srcVector[3], 255.f);
2708             }
2709             else
2710             {
2711                 for (unsigned int i = 0; i < channelCount; i++)
2712                     ptr[i] = (unsigned char)NORMALIZE(srcVector[i], 255.f);
2713             }
2714 #ifdef CL_1RGB_APPLE
2715             if (imageFormat->image_channel_order == CL_1RGB_APPLE)
2716                 ptr[0] = 255.0f;
2717 #endif
2718 #ifdef CL_BGR1_APPLE
2719             if (imageFormat->image_channel_order == CL_BGR1_APPLE)
2720                 ptr[3] = 255.0f;
2721 #endif
2722             break;
2723         }
2724         case CL_UNORM_INT16: {
2725             cl_ushort *ptr = (cl_ushort *)outData;
2726             for (unsigned int i = 0; i < channelCount; i++)
2727                 ptr[i] = (unsigned short)NORMALIZE(srcVector[i], 65535.f);
2728             break;
2729         }
2730         case CL_UNORM_SHORT_555: {
2731             cl_ushort *ptr = (cl_ushort *)outData;
2732             ptr[0] =
2733                 (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 10)
2734                 | (((unsigned short)NORMALIZE(srcVector[1], 31.f) & 31) << 5)
2735                 | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0);
2736             break;
2737         }
2738         case CL_UNORM_SHORT_565: {
2739             cl_ushort *ptr = (cl_ushort *)outData;
2740             ptr[0] =
2741                 (((unsigned short)NORMALIZE(srcVector[0], 31.f) & 31) << 11)
2742                 | (((unsigned short)NORMALIZE(srcVector[1], 63.f) & 63) << 5)
2743                 | (((unsigned short)NORMALIZE(srcVector[2], 31.f) & 31) << 0);
2744             break;
2745         }
2746         case CL_UNORM_INT_101010: {
2747             cl_uint *ptr = (cl_uint *)outData;
2748             ptr[0] =
2749                 (((unsigned int)NORMALIZE(srcVector[0], 1023.f) & 1023) << 20)
2750                 | (((unsigned int)NORMALIZE(srcVector[1], 1023.f) & 1023) << 10)
2751                 | (((unsigned int)NORMALIZE(srcVector[2], 1023.f) & 1023) << 0);
2752             break;
2753         }
2754         case CL_SIGNED_INT8: {
2755             cl_char *ptr = (cl_char *)outData;
2756             for (unsigned int i = 0; i < channelCount; i++)
2757                 ptr[i] =
2758                     (cl_char)CONVERT_INT(srcVector[i], -127.0f, 127.f, 127);
2759             break;
2760         }
2761         case CL_SIGNED_INT16: {
2762             cl_short *ptr = (cl_short *)outData;
2763             for (unsigned int i = 0; i < channelCount; i++)
2764                 ptr[i] =
2765                     (short)CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767);
2766             break;
2767         }
2768         case CL_SIGNED_INT32: {
2769             cl_int *ptr = (cl_int *)outData;
2770             for (unsigned int i = 0; i < channelCount; i++)
2771                 ptr[i] = round_to_even(srcVector[i]);
2772             break;
2773         }
2774         case CL_UNSIGNED_INT8: {
2775             cl_uchar *ptr = (cl_uchar *)outData;
2776             for (unsigned int i = 0; i < channelCount; i++)
2777                 ptr[i] =
2778                     (cl_uchar)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX);
2779             break;
2780         }
2781         case CL_UNSIGNED_INT16: {
2782             cl_ushort *ptr = (cl_ushort *)outData;
2783             for (unsigned int i = 0; i < channelCount; i++)
2784                 ptr[i] = (cl_ushort)CONVERT_UINT(srcVector[i], 32767.f,
2785                                                  CL_USHRT_MAX);
2786             break;
2787         }
2788         case CL_UNSIGNED_INT32: {
2789             cl_uint *ptr = (cl_uint *)outData;
2790             for (unsigned int i = 0; i < channelCount; i++)
2791                 ptr[i] = (cl_uint)CONVERT_UINT(
2792                     srcVector[i],
2793                     MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23),
2794                     CL_UINT_MAX);
2795             break;
2796         }
2797 #ifdef CL_SFIXED14_APPLE
2798         case CL_SFIXED14_APPLE: {
2799             cl_ushort *ptr = (cl_ushort *)outData;
2800             for (unsigned int i = 0; i < channelCount; i++)
2801             {
2802                 cl_float f = fmaxf(srcVector[i], -1.0f);
2803                 f = fminf(f, 3.0f);
2804                 cl_int d = rintf(f * 0x1.0p14f);
2805                 d += 16384;
2806                 if (d > CL_USHRT_MAX) d = CL_USHRT_MAX;
2807                 ptr[i] = d;
2808             }
2809             break;
2810         }
2811 #endif
2812         default:
2813             log_error("INTERNAL ERROR: unknown format (%d)\n",
2814                       imageFormat->image_channel_data_type);
2815             exit(-1);
2816             break;
2817     }
2818 }
2819 
pack_image_pixel_error(const float * srcVector,const cl_image_format * imageFormat,const void * results,float * errors)2820 void pack_image_pixel_error(const float *srcVector,
2821                             const cl_image_format *imageFormat,
2822                             const void *results, float *errors)
2823 {
2824     size_t channelCount = get_format_channel_count(imageFormat);
2825     switch (imageFormat->image_channel_data_type)
2826     {
2827         case CL_HALF_FLOAT: {
2828             const cl_half *ptr = (const cl_half *)results;
2829 
2830             for (unsigned int i = 0; i < channelCount; i++)
2831                 errors[i] = Ulp_Error_Half(ptr[i], srcVector[i]);
2832 
2833             break;
2834         }
2835 
2836         case CL_FLOAT: {
2837             const cl_ushort *ptr = (const cl_ushort *)results;
2838 
2839             for (unsigned int i = 0; i < channelCount; i++)
2840                 errors[i] = Ulp_Error(ptr[i], srcVector[i]);
2841 
2842             break;
2843         }
2844 
2845         case CL_SNORM_INT8: {
2846             const cl_char *ptr = (const cl_char *)results;
2847 
2848             for (unsigned int i = 0; i < channelCount; i++)
2849                 errors[i] = ptr[i]
2850                     - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -127.0f, 127.f);
2851 
2852             break;
2853         }
2854         case CL_SNORM_INT16: {
2855             const cl_short *ptr = (const cl_short *)results;
2856 
2857             for (unsigned int i = 0; i < channelCount; i++)
2858                 errors[i] = ptr[i]
2859                     - NORMALIZE_SIGNED_UNROUNDED(srcVector[i], -32767.f,
2860                                                  32767.f);
2861 
2862             break;
2863         }
2864         case CL_UNORM_INT8: {
2865             const cl_uchar *ptr = (const cl_uchar *)results;
2866 
2867             for (unsigned int i = 0; i < channelCount; i++)
2868                 errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 255.f);
2869 
2870             break;
2871         }
2872         case CL_UNORM_INT16: {
2873             const cl_ushort *ptr = (const cl_ushort *)results;
2874 
2875             for (unsigned int i = 0; i < channelCount; i++)
2876                 errors[i] = ptr[i] - NORMALIZE_UNROUNDED(srcVector[i], 65535.f);
2877 
2878             break;
2879         }
2880         case CL_UNORM_SHORT_555: {
2881             const cl_ushort *ptr = (const cl_ushort *)results;
2882 
2883             errors[0] =
2884                 ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f);
2885             errors[1] =
2886                 ((ptr[0] >> 5) & 31) - NORMALIZE_UNROUNDED(srcVector[1], 31.f);
2887             errors[2] =
2888                 ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f);
2889 
2890             break;
2891         }
2892         case CL_UNORM_SHORT_565: {
2893             const cl_ushort *ptr = (const cl_ushort *)results;
2894 
2895             errors[0] =
2896                 ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED(srcVector[0], 31.f);
2897             errors[1] =
2898                 ((ptr[0] >> 5) & 63) - NORMALIZE_UNROUNDED(srcVector[1], 63.f);
2899             errors[2] =
2900                 ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED(srcVector[2], 31.f);
2901 
2902             break;
2903         }
2904         case CL_UNORM_INT_101010: {
2905             const cl_uint *ptr = (const cl_uint *)results;
2906 
2907             errors[0] = ((ptr[0] >> 20) & 1023)
2908                 - NORMALIZE_UNROUNDED(srcVector[0], 1023.f);
2909             errors[1] = ((ptr[0] >> 10) & 1023)
2910                 - NORMALIZE_UNROUNDED(srcVector[1], 1023.f);
2911             errors[2] = ((ptr[0] >> 0) & 1023)
2912                 - NORMALIZE_UNROUNDED(srcVector[2], 1023.f);
2913 
2914             break;
2915         }
2916         case CL_SIGNED_INT8: {
2917             const cl_char *ptr = (const cl_char *)results;
2918 
2919             for (unsigned int i = 0; i < channelCount; i++)
2920                 errors[i] =
2921                     ptr[i] - CONVERT_INT(srcVector[i], -127.0f, 127.f, 127);
2922 
2923             break;
2924         }
2925         case CL_SIGNED_INT16: {
2926             const cl_short *ptr = (const cl_short *)results;
2927             for (unsigned int i = 0; i < channelCount; i++)
2928                 errors[i] = ptr[i]
2929                     - CONVERT_INT(srcVector[i], -32767.f, 32767.f, 32767);
2930             break;
2931         }
2932         case CL_SIGNED_INT32: {
2933             const cl_int *ptr = (const cl_int *)results;
2934             for (unsigned int i = 0; i < channelCount; i++)
2935                 errors[i] = (cl_float)((cl_long)ptr[i]
2936                                        - (cl_long)round_to_even(srcVector[i]));
2937             break;
2938         }
2939         case CL_UNSIGNED_INT8: {
2940             const cl_uchar *ptr = (const cl_uchar *)results;
2941             for (unsigned int i = 0; i < channelCount; i++)
2942                 errors[i] = static_cast<float>(
2943                     (cl_int)ptr[i]
2944                     - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX));
2945             break;
2946         }
2947         case CL_UNSIGNED_INT16: {
2948             const cl_ushort *ptr = (const cl_ushort *)results;
2949             for (unsigned int i = 0; i < channelCount; i++)
2950                 errors[i] = static_cast<float>(
2951                     (cl_int)ptr[i]
2952                     - (cl_int)CONVERT_UINT(srcVector[i], 32767.f,
2953                                            CL_USHRT_MAX));
2954             break;
2955         }
2956         case CL_UNSIGNED_INT32: {
2957             const cl_uint *ptr = (const cl_uint *)results;
2958             for (unsigned int i = 0; i < channelCount; i++)
2959                 errors[i] = (cl_float)(
2960                     (cl_long)ptr[i]
2961                     - (cl_long)CONVERT_UINT(
2962                         srcVector[i],
2963                         MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffe, 31 - 23),
2964                         CL_UINT_MAX));
2965             break;
2966         }
2967 #ifdef CL_SFIXED14_APPLE
2968         case CL_SFIXED14_APPLE: {
2969             const cl_ushort *ptr = (const cl_ushort *)results;
2970 
2971             for (unsigned int i = 0; i < channelCount; i++)
2972                 errors[i] = ptr[i]
2973                     - NORMALIZE_SIGNED_UNROUNDED(((int)srcVector[i] - 16384),
2974                                                  -16384.f, 49151.f);
2975 
2976             break;
2977         }
2978 #endif
2979         default:
2980             log_error("INTERNAL ERROR: unknown format (%d)\n",
2981                       imageFormat->image_channel_data_type);
2982             exit(-1);
2983             break;
2984     }
2985 }
2986 
2987 
2988 //
2989 //  Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT
2990 //  This should be called lazily before attempting to verify image writes,
2991 //  otherwise an error will occur.
2992 //
DetectFloatToHalfRoundingMode(cl_command_queue q)2993 int DetectFloatToHalfRoundingMode(
2994     cl_command_queue q) // Returns CL_SUCCESS on success
2995 {
2996     cl_int err = CL_SUCCESS;
2997 
2998     if (gFloatToHalfRoundingMode == kDefaultRoundingMode)
2999     {
3000         // Some numbers near 0.5f, that we look at to see how the values are
3001         // rounded.
3002         static const cl_uint inData[4 * 4] = {
3003             0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U,
3004             0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U,
3005             0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U,
3006             0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U
3007         };
3008         static const size_t count = sizeof(inData) / (4 * sizeof(inData[0]));
3009         const float *inp = (const float *)inData;
3010         cl_context context = NULL;
3011 
3012         // Create an input buffer
3013         err = clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(context),
3014                                     &context, NULL);
3015         if (err)
3016         {
3017             log_error("Error:  could not get context from command queue in "
3018                       "DetectFloatToHalfRoundingMode  (%d)",
3019                       err);
3020             return err;
3021         }
3022 
3023         cl_mem inBuf = clCreateBuffer(context,
3024                                       CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR
3025                                           | CL_MEM_ALLOC_HOST_PTR,
3026                                       sizeof(inData), (void *)inData, &err);
3027         if (NULL == inBuf || err)
3028         {
3029             log_error("Error:  could not create input buffer in "
3030                       "DetectFloatToHalfRoundingMode  (err: %d)",
3031                       err);
3032             return err;
3033         }
3034 
3035         // Create a small output image
3036         cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT };
3037         cl_mem outImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &fmt,
3038                                           count, 1, 0, NULL, &err);
3039         if (NULL == outImage || err)
3040         {
3041             log_error("Error:  could not create half float out image in "
3042                       "DetectFloatToHalfRoundingMode  (err: %d)",
3043                       err);
3044             clReleaseMemObject(inBuf);
3045             return err;
3046         }
3047 
3048         // Create our program, and a kernel
3049         const char *kernelSource[1] = {
3050             "kernel void detect_round( global float4 *in, write_only image2d_t "
3051             "out )\n"
3052             "{\n"
3053             "   write_imagef( out, (int2)(get_global_id(0),0), "
3054             "in[get_global_id(0)] );\n"
3055             "}\n"
3056         };
3057 
3058         clProgramWrapper program;
3059         clKernelWrapper kernel;
3060         err = create_single_kernel_helper(context, &program, &kernel, 1,
3061                                           kernelSource, "detect_round");
3062 
3063         if (NULL == program || err)
3064         {
3065             log_error("Error:  could not create program in "
3066                       "DetectFloatToHalfRoundingMode (err: %d)",
3067                       err);
3068             clReleaseMemObject(inBuf);
3069             clReleaseMemObject(outImage);
3070             return err;
3071         }
3072 
3073         cl_device_id device = NULL;
3074         err = clGetCommandQueueInfo(q, CL_QUEUE_DEVICE, sizeof(device), &device,
3075                                     NULL);
3076         if (err)
3077         {
3078             log_error("Error:  could not get device from command queue in "
3079                       "DetectFloatToHalfRoundingMode  (%d)",
3080                       err);
3081             clReleaseMemObject(inBuf);
3082             clReleaseMemObject(outImage);
3083             return err;
3084         }
3085 
3086         err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inBuf);
3087         if (err)
3088         {
3089             log_error("Error: could not set argument 0 of kernel in "
3090                       "DetectFloatToHalfRoundingMode (%d)",
3091                       err);
3092             clReleaseMemObject(inBuf);
3093             clReleaseMemObject(outImage);
3094             return err;
3095         }
3096 
3097         err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outImage);
3098         if (err)
3099         {
3100             log_error("Error: could not set argument 1 of kernel in "
3101                       "DetectFloatToHalfRoundingMode (%d)",
3102                       err);
3103             clReleaseMemObject(inBuf);
3104             clReleaseMemObject(outImage);
3105             return err;
3106         }
3107 
3108         // Run the kernel
3109         size_t global_work_size = count;
3110         err = clEnqueueNDRangeKernel(q, kernel, 1, NULL, &global_work_size,
3111                                      NULL, 0, NULL, NULL);
3112         if (err)
3113         {
3114             log_error("Error: could not enqueue kernel in "
3115                       "DetectFloatToHalfRoundingMode (%d)",
3116                       err);
3117             clReleaseMemObject(inBuf);
3118             clReleaseMemObject(outImage);
3119             return err;
3120         }
3121 
3122         // read the results
3123         cl_half outBuf[count * 4];
3124         memset(outBuf, -1, sizeof(outBuf));
3125         size_t origin[3] = { 0, 0, 0 };
3126         size_t region[3] = { count, 1, 1 };
3127         err = clEnqueueReadImage(q, outImage, CL_TRUE, origin, region, 0, 0,
3128                                  outBuf, 0, NULL, NULL);
3129         if (err)
3130         {
3131             log_error("Error: could not read output image in "
3132                       "DetectFloatToHalfRoundingMode (%d)",
3133                       err);
3134             clReleaseMemObject(inBuf);
3135             clReleaseMemObject(outImage);
3136             return err;
3137         }
3138 
3139         // Generate our list of reference results
3140         cl_half rte_ref[count * 4];
3141         cl_half rtz_ref[count * 4];
3142         for (size_t i = 0; i < 4 * count; i++)
3143         {
3144             rte_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTE);
3145             rtz_ref[i] = cl_half_from_float(inp[i], CL_HALF_RTZ);
3146         }
3147 
3148         // Verify that we got something in either rtz or rte mode
3149         if (0 == memcmp(rte_ref, outBuf, sizeof(rte_ref)))
3150         {
3151             log_info("Autodetected float->half rounding mode to be rte\n");
3152             gFloatToHalfRoundingMode = kRoundToNearestEven;
3153         }
3154         else if (0 == memcmp(rtz_ref, outBuf, sizeof(rtz_ref)))
3155         {
3156             log_info("Autodetected float->half rounding mode to be rtz\n");
3157             gFloatToHalfRoundingMode = kRoundTowardZero;
3158         }
3159         else
3160         {
3161             log_error("ERROR: float to half conversions proceed with invalid "
3162                       "rounding mode!\n");
3163             log_info("\nfor:");
3164             for (size_t i = 0; i < count; i++)
3165                 log_info(" {%a, %a, %a, %a},", inp[4 * i], inp[4 * i + 1],
3166                          inp[4 * i + 2], inp[4 * i + 3]);
3167             log_info("\ngot:");
3168             for (size_t i = 0; i < count; i++)
3169                 log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
3170                          outBuf[4 * i], outBuf[4 * i + 1], outBuf[4 * i + 2],
3171                          outBuf[4 * i + 3]);
3172             log_info("\nrte:");
3173             for (size_t i = 0; i < count; i++)
3174                 log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
3175                          rte_ref[4 * i], rte_ref[4 * i + 1], rte_ref[4 * i + 2],
3176                          rte_ref[4 * i + 3]);
3177             log_info("\nrtz:");
3178             for (size_t i = 0; i < count; i++)
3179                 log_info(" {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},",
3180                          rtz_ref[4 * i], rtz_ref[4 * i + 1], rtz_ref[4 * i + 2],
3181                          rtz_ref[4 * i + 3]);
3182             log_info("\n");
3183             err = -1;
3184             gFloatToHalfRoundingMode = kRoundingModeCount; // illegal value
3185         }
3186 
3187         // clean up
3188         clReleaseMemObject(inBuf);
3189         clReleaseMemObject(outImage);
3190         return err;
3191     }
3192 
3193     // Make sure that the rounding mode was successfully detected, if we checked
3194     // earlier
3195     if (gFloatToHalfRoundingMode != kRoundToNearestEven
3196         && gFloatToHalfRoundingMode != kRoundTowardZero)
3197         return -2;
3198 
3199     return err;
3200 }
3201 
create_random_image_data(ExplicitType dataType,image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d,bool image2DFromBuffer)3202 char *create_random_image_data(ExplicitType dataType,
3203                                image_descriptor *imageInfo,
3204                                BufferOwningPtr<char> &P, MTdata d,
3205                                bool image2DFromBuffer)
3206 {
3207     size_t allocSize, numPixels;
3208     if (/*gTestMipmaps*/ imageInfo->num_mip_levels > 1)
3209     {
3210         allocSize = (size_t)(compute_mipmapped_image_size(*imageInfo) * 4
3211                              * get_explicit_type_size(dataType))
3212             / get_pixel_size(imageInfo->format);
3213         numPixels = allocSize / (get_explicit_type_size(dataType) * 4);
3214     }
3215     else
3216     {
3217         numPixels = (image2DFromBuffer ? imageInfo->rowPitch : imageInfo->width)
3218             * imageInfo->height * (imageInfo->depth ? imageInfo->depth : 1)
3219             * (imageInfo->arraySize ? imageInfo->arraySize : 1);
3220         allocSize = numPixels * 4 * get_explicit_type_size(dataType);
3221     }
3222 
3223 #if 0 // DEBUG
3224     {
3225       fprintf(stderr,"--- create_random_image_data:\n");
3226       fprintf(stderr,"allocSize = %zu\n",allocSize);
3227       fprintf(stderr,"numPixels = %zu\n",numPixels);
3228       fprintf(stderr,"width = %zu\n",imageInfo->width);
3229       fprintf(stderr,"height = %zu\n",imageInfo->height);
3230       fprintf(stderr,"depth = %zu\n",imageInfo->depth);
3231       fprintf(stderr,"rowPitch = %zu\n",imageInfo->rowPitch);
3232       fprintf(stderr,"slicePitch = %zu\n",imageInfo->slicePitch);
3233       fprintf(stderr,"arraySize = %zu\n",imageInfo->arraySize);
3234       fprintf(stderr,"explicit_type_size = %zu\n",get_explicit_type_size(dataType));
3235     }
3236 #endif
3237 
3238 #if defined(__APPLE__)
3239     char *data = NULL;
3240     if (gDeviceType == CL_DEVICE_TYPE_CPU)
3241     {
3242         size_t mapSize =
3243             ((allocSize + 4095L) & -4096L) + 8192; // alloc two extra pages.
3244 
3245         void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE,
3246                          MAP_ANON | MAP_PRIVATE, 0, 0);
3247         if (map == MAP_FAILED)
3248         {
3249             perror("create_random_image_data: mmap");
3250             log_error("%s:%d: mmap failed, mapSize = %zu\n", __FILE__, __LINE__,
3251                       mapSize);
3252         }
3253         intptr_t data_end = (intptr_t)map + mapSize - 4096;
3254         data = (char *)(data_end - (intptr_t)allocSize);
3255 
3256         mprotect(map, 4096, PROT_NONE);
3257         mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
3258         P.reset(data, map, mapSize);
3259     }
3260     else
3261     {
3262         data = (char *)malloc(allocSize);
3263         P.reset(data);
3264     }
3265 #else
3266     char *data =
3267         (char *)align_malloc(allocSize, get_pixel_alignment(imageInfo->format));
3268     P.reset(data, NULL, 0, allocSize, true);
3269 #endif
3270 
3271     if (data == NULL)
3272     {
3273         log_error(
3274             "ERROR: Unable to malloc %zu bytes for create_random_image_data\n",
3275             allocSize);
3276         return NULL;
3277     }
3278 
3279     switch (dataType)
3280     {
3281         case kFloat: {
3282             float *inputValues = (float *)data;
3283             switch (imageInfo->format->image_channel_data_type)
3284             {
3285                 case CL_HALF_FLOAT: {
3286                     // Generate data that is (mostly) inside the range of a half
3287                     // float const float HALF_MIN = 5.96046448e-08f;
3288                     const float HALF_MAX = 65504.0f;
3289 
3290                     size_t i = 0;
3291                     inputValues[i++] = 0.f;
3292                     inputValues[i++] = 1.f;
3293                     inputValues[i++] = -1.f;
3294                     inputValues[i++] = 2.f;
3295                     for (; i < numPixels * 4; i++)
3296                         inputValues[i] = get_random_float(-HALF_MAX - 2.f,
3297                                                           HALF_MAX + 2.f, d);
3298                 }
3299                 break;
3300 #ifdef CL_SFIXED14_APPLE
3301                 case CL_SFIXED14_APPLE: {
3302                     size_t i = 0;
3303                     if (numPixels * 4 >= 8)
3304                     {
3305                         inputValues[i++] = INFINITY;
3306                         inputValues[i++] = 0x1.0p14f;
3307                         inputValues[i++] = 0x1.0p31f;
3308                         inputValues[i++] = 0x1.0p32f;
3309                         inputValues[i++] = -INFINITY;
3310                         inputValues[i++] = -0x1.0p14f;
3311                         inputValues[i++] = -0x1.0p31f;
3312                         inputValues[i++] = -0x1.1p31f;
3313                     }
3314                     for (; i < numPixels * 4; i++)
3315                         inputValues[i] = get_random_float(-1.1f, 3.1f, d);
3316                 }
3317                 break;
3318 #endif
3319                 case CL_FLOAT: {
3320                     size_t i = 0;
3321                     inputValues[i++] = INFINITY;
3322                     inputValues[i++] = -INFINITY;
3323                     inputValues[i++] = 0.0f;
3324                     inputValues[i++] = 0.0f;
3325                     cl_uint *p = (cl_uint *)data;
3326                     for (; i < numPixels * 4; i++) p[i] = genrand_int32(d);
3327                 }
3328                 break;
3329 
3330                 default:
3331                     size_t i = 0;
3332                     if (numPixels * 4 >= 36)
3333                     {
3334                         inputValues[i++] = 0.0f;
3335                         inputValues[i++] = 0.5f;
3336                         inputValues[i++] = 31.5f;
3337                         inputValues[i++] = 32.0f;
3338                         inputValues[i++] = 127.5f;
3339                         inputValues[i++] = 128.0f;
3340                         inputValues[i++] = 255.5f;
3341                         inputValues[i++] = 256.0f;
3342                         inputValues[i++] = 1023.5f;
3343                         inputValues[i++] = 1024.0f;
3344                         inputValues[i++] = 32767.5f;
3345                         inputValues[i++] = 32768.0f;
3346                         inputValues[i++] = 65535.5f;
3347                         inputValues[i++] = 65536.0f;
3348                         inputValues[i++] = 2147483648.0f;
3349                         inputValues[i++] = 4294967296.0f;
3350                         inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p63f, 1, 63);
3351                         inputValues[i++] = MAKE_HEX_FLOAT(0x1.0p64f, 1, 64);
3352                         inputValues[i++] = -0.0f;
3353                         inputValues[i++] = -0.5f;
3354                         inputValues[i++] = -31.5f;
3355                         inputValues[i++] = -32.0f;
3356                         inputValues[i++] = -127.5f;
3357                         inputValues[i++] = -128.0f;
3358                         inputValues[i++] = -255.5f;
3359                         inputValues[i++] = -256.0f;
3360                         inputValues[i++] = -1023.5f;
3361                         inputValues[i++] = -1024.0f;
3362                         inputValues[i++] = -32767.5f;
3363                         inputValues[i++] = -32768.0f;
3364                         inputValues[i++] = -65535.5f;
3365                         inputValues[i++] = -65536.0f;
3366                         inputValues[i++] = -2147483648.0f;
3367                         inputValues[i++] = -4294967296.0f;
3368                         inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p63f, 1, 63);
3369                         inputValues[i++] = -MAKE_HEX_FLOAT(0x1.0p64f, 1, 64);
3370                     }
3371                     if (is_format_signed(imageInfo->format))
3372                     {
3373                         for (; i < numPixels * 4; i++)
3374                             inputValues[i] = get_random_float(-1.1f, 1.1f, d);
3375                     }
3376                     else
3377                     {
3378                         for (; i < numPixels * 4; i++)
3379                             inputValues[i] = get_random_float(-0.1f, 1.1f, d);
3380                     }
3381                     break;
3382             }
3383             break;
3384         }
3385 
3386         case kInt: {
3387             int *imageData = (int *)data;
3388 
3389             // We want to generate ints (mostly) in range of the target format
3390             int formatMin = get_format_min_int(imageInfo->format);
3391             size_t formatMax = get_format_max_int(imageInfo->format);
3392             if (formatMin == 0)
3393             {
3394                 // Unsigned values, but we are only an int, so cap the actual
3395                 // max at the max of signed ints
3396                 if (formatMax > 2147483647L) formatMax = 2147483647L;
3397             }
3398             // If the final format is small enough, give us a bit of room for
3399             // out-of-range values to test
3400             if (formatMax < 2147483647L) formatMax += 2;
3401             if (formatMin > -2147483648LL) formatMin -= 2;
3402 
3403             // Now gen
3404             for (size_t i = 0; i < numPixels * 4; i++)
3405             {
3406                 imageData[i] = random_in_range(formatMin, (int)formatMax, d);
3407             }
3408             break;
3409         }
3410 
3411         case kUInt:
3412         case kUnsignedInt: {
3413             unsigned int *imageData = (unsigned int *)data;
3414 
3415             // We want to generate ints (mostly) in range of the target format
3416             int formatMin = get_format_min_int(imageInfo->format);
3417             size_t formatMax = get_format_max_int(imageInfo->format);
3418             if (formatMin < 0) formatMin = 0;
3419             // If the final format is small enough, give us a bit of room for
3420             // out-of-range values to test
3421             if (formatMax < 4294967295LL) formatMax += 2;
3422 
3423             // Now gen
3424             for (size_t i = 0; i < numPixels * 4; i++)
3425             {
3426                 imageData[i] = random_in_range(formatMin, (int)formatMax, d);
3427             }
3428             break;
3429         }
3430         default:
3431             // Unsupported source format
3432             delete[] data;
3433             return NULL;
3434     }
3435 
3436     return data;
3437 }
3438 
3439 /*
3440     deprecated
3441 bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t
3442 max, int &outValue )
3443 {
3444     int v = (int)value;
3445 
3446     switch(imageSampler->addressing_mode)
3447     {
3448         case CL_ADDRESS_REPEAT:
3449             outValue = v;
3450             while( v < 0 )
3451                 v += (int)max;
3452             while( v >= (int)max )
3453                 v -= (int)max;
3454             if( v != outValue )
3455             {
3456                 outValue = v;
3457                 return true;
3458             }
3459             return false;
3460 
3461         case CL_ADDRESS_MIRRORED_REPEAT:
3462             log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do
3463 we ever use this? exit(-1);
3464 
3465         default:
3466             if( v < 0 )
3467             {
3468                 outValue = 0;
3469                 return true;
3470             }
3471             if( v >= (int)max )
3472             {
3473                 outValue = (int)max - 1;
3474                 return true;
3475             }
3476             outValue = v;
3477             return false;
3478     }
3479 
3480 }
3481 */
3482 
get_sampler_kernel_code(image_sampler_data * imageSampler,char * outLine)3483 void get_sampler_kernel_code(image_sampler_data *imageSampler, char *outLine)
3484 {
3485     const char *normalized;
3486     const char *addressMode;
3487     const char *filterMode;
3488 
3489     if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP)
3490         addressMode = "CLK_ADDRESS_CLAMP";
3491     else if (imageSampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE)
3492         addressMode = "CLK_ADDRESS_CLAMP_TO_EDGE";
3493     else if (imageSampler->addressing_mode == CL_ADDRESS_REPEAT)
3494         addressMode = "CLK_ADDRESS_REPEAT";
3495     else if (imageSampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)
3496         addressMode = "CLK_ADDRESS_MIRRORED_REPEAT";
3497     else if (imageSampler->addressing_mode == CL_ADDRESS_NONE)
3498         addressMode = "CLK_ADDRESS_NONE";
3499     else
3500     {
3501         log_error("**Error: Unknown addressing mode! Aborting...\n");
3502         abort();
3503     }
3504 
3505     if (imageSampler->normalized_coords)
3506         normalized = "CLK_NORMALIZED_COORDS_TRUE";
3507     else
3508         normalized = "CLK_NORMALIZED_COORDS_FALSE";
3509 
3510     if (imageSampler->filter_mode == CL_FILTER_LINEAR)
3511         filterMode = "CLK_FILTER_LINEAR";
3512     else
3513         filterMode = "CLK_FILTER_NEAREST";
3514 
3515     sprintf(outLine, "    const sampler_t imageSampler = %s | %s | %s;\n",
3516             addressMode, filterMode, normalized);
3517 }
3518 
copy_image_data(image_descriptor * srcImageInfo,image_descriptor * dstImageInfo,void * imageValues,void * destImageValues,const size_t sourcePos[],const size_t destPos[],const size_t regionSize[])3519 void copy_image_data(image_descriptor *srcImageInfo,
3520                      image_descriptor *dstImageInfo, void *imageValues,
3521                      void *destImageValues, const size_t sourcePos[],
3522                      const size_t destPos[], const size_t regionSize[])
3523 {
3524     //  assert( srcImageInfo->format == dstImageInfo->format );
3525 
3526     size_t src_mip_level_offset = 0, dst_mip_level_offset = 0;
3527     size_t sourcePos_lod[3], destPos_lod[3], src_lod, dst_lod;
3528     size_t src_row_pitch_lod, src_slice_pitch_lod;
3529     size_t dst_row_pitch_lod, dst_slice_pitch_lod;
3530 
3531     size_t pixelSize = get_pixel_size(srcImageInfo->format);
3532 
3533     sourcePos_lod[0] = sourcePos[0];
3534     sourcePos_lod[1] = sourcePos[1];
3535     sourcePos_lod[2] = sourcePos[2];
3536     destPos_lod[0] = destPos[0];
3537     destPos_lod[1] = destPos[1];
3538     destPos_lod[2] = destPos[2];
3539     src_row_pitch_lod = srcImageInfo->rowPitch;
3540     dst_row_pitch_lod = dstImageInfo->rowPitch;
3541     src_slice_pitch_lod = srcImageInfo->slicePitch;
3542     dst_slice_pitch_lod = dstImageInfo->slicePitch;
3543 
3544     if (srcImageInfo->num_mip_levels > 1)
3545     {
3546         size_t src_width_lod = 1 /*srcImageInfo->width*/;
3547         size_t src_height_lod = 1 /*srcImageInfo->height*/;
3548         size_t src_depth_lod = 1 /*srcImageInfo->depth*/;
3549 
3550         switch (srcImageInfo->type)
3551         {
3552             case CL_MEM_OBJECT_IMAGE1D:
3553                 src_lod = sourcePos[1];
3554                 sourcePos_lod[1] = sourcePos_lod[2] = 0;
3555                 src_width_lod = (srcImageInfo->width >> src_lod)
3556                     ? (srcImageInfo->width >> src_lod)
3557                     : 1;
3558                 break;
3559             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3560             case CL_MEM_OBJECT_IMAGE2D:
3561                 src_lod = sourcePos[2];
3562                 sourcePos_lod[1] = sourcePos[1];
3563                 sourcePos_lod[2] = 0;
3564                 src_width_lod = (srcImageInfo->width >> src_lod)
3565                     ? (srcImageInfo->width >> src_lod)
3566                     : 1;
3567                 if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE2D)
3568                     src_height_lod = (srcImageInfo->height >> src_lod)
3569                         ? (srcImageInfo->height >> src_lod)
3570                         : 1;
3571                 break;
3572             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3573             case CL_MEM_OBJECT_IMAGE3D:
3574                 src_lod = sourcePos[3];
3575                 sourcePos_lod[1] = sourcePos[1];
3576                 sourcePos_lod[2] = sourcePos[2];
3577                 src_width_lod = (srcImageInfo->width >> src_lod)
3578                     ? (srcImageInfo->width >> src_lod)
3579                     : 1;
3580                 src_height_lod = (srcImageInfo->height >> src_lod)
3581                     ? (srcImageInfo->height >> src_lod)
3582                     : 1;
3583                 if (srcImageInfo->type == CL_MEM_OBJECT_IMAGE3D)
3584                     src_depth_lod = (srcImageInfo->depth >> src_lod)
3585                         ? (srcImageInfo->depth >> src_lod)
3586                         : 1;
3587                 break;
3588         }
3589         src_mip_level_offset = compute_mip_level_offset(srcImageInfo, src_lod);
3590         src_row_pitch_lod =
3591             src_width_lod * get_pixel_size(srcImageInfo->format);
3592         src_slice_pitch_lod = src_row_pitch_lod * src_height_lod;
3593     }
3594 
3595     if (dstImageInfo->num_mip_levels > 1)
3596     {
3597         size_t dst_width_lod = 1 /*dstImageInfo->width*/;
3598         size_t dst_height_lod = 1 /*dstImageInfo->height*/;
3599         size_t dst_depth_lod = 1 /*dstImageInfo->depth*/;
3600         switch (dstImageInfo->type)
3601         {
3602             case CL_MEM_OBJECT_IMAGE1D:
3603                 dst_lod = destPos[1];
3604                 destPos_lod[1] = destPos_lod[2] = 0;
3605                 dst_width_lod = (dstImageInfo->width >> dst_lod)
3606                     ? (dstImageInfo->width >> dst_lod)
3607                     : 1;
3608                 break;
3609             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3610             case CL_MEM_OBJECT_IMAGE2D:
3611                 dst_lod = destPos[2];
3612                 destPos_lod[1] = destPos[1];
3613                 destPos_lod[2] = 0;
3614                 dst_width_lod = (dstImageInfo->width >> dst_lod)
3615                     ? (dstImageInfo->width >> dst_lod)
3616                     : 1;
3617                 if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D)
3618                     dst_height_lod = (dstImageInfo->height >> dst_lod)
3619                         ? (dstImageInfo->height >> dst_lod)
3620                         : 1;
3621                 break;
3622             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3623             case CL_MEM_OBJECT_IMAGE3D:
3624                 dst_lod = destPos[3];
3625                 destPos_lod[1] = destPos[1];
3626                 destPos_lod[2] = destPos[2];
3627                 dst_width_lod = (dstImageInfo->width >> dst_lod)
3628                     ? (dstImageInfo->width >> dst_lod)
3629                     : 1;
3630                 dst_height_lod = (dstImageInfo->height >> dst_lod)
3631                     ? (dstImageInfo->height >> dst_lod)
3632                     : 1;
3633                 if (dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D)
3634                     dst_depth_lod = (dstImageInfo->depth >> dst_lod)
3635                         ? (dstImageInfo->depth >> dst_lod)
3636                         : 1;
3637                 break;
3638         }
3639         dst_mip_level_offset = compute_mip_level_offset(dstImageInfo, dst_lod);
3640         dst_row_pitch_lod =
3641             dst_width_lod * get_pixel_size(dstImageInfo->format);
3642         dst_slice_pitch_lod = dst_row_pitch_lod * dst_height_lod;
3643     }
3644 
3645     // Get initial pointers
3646     char *sourcePtr = (char *)imageValues
3647         + sourcePos_lod[2] * src_slice_pitch_lod
3648         + sourcePos_lod[1] * src_row_pitch_lod + pixelSize * sourcePos_lod[0]
3649         + src_mip_level_offset;
3650     char *destPtr = (char *)destImageValues
3651         + destPos_lod[2] * dst_slice_pitch_lod
3652         + destPos_lod[1] * dst_row_pitch_lod + pixelSize * destPos_lod[0]
3653         + dst_mip_level_offset;
3654 
3655     for (size_t z = 0; z < (regionSize[2] > 0 ? regionSize[2] : 1); z++)
3656     {
3657         char *rowSourcePtr = sourcePtr;
3658         char *rowDestPtr = destPtr;
3659         for (size_t y = 0; y < regionSize[1]; y++)
3660         {
3661             memcpy(rowDestPtr, rowSourcePtr, pixelSize * regionSize[0]);
3662             rowSourcePtr += src_row_pitch_lod;
3663             rowDestPtr += dst_row_pitch_lod;
3664         }
3665 
3666         sourcePtr += src_slice_pitch_lod;
3667         destPtr += dst_slice_pitch_lod;
3668     }
3669 }
3670 
random_float(float low,float high,MTdata d)3671 float random_float(float low, float high, MTdata d)
3672 {
3673     float t = (float)genrand_real1(d);
3674     return (1.0f - t) * low + t * high;
3675 }
3676 
CoordWalker(void * coords,bool useFloats,size_t vecSize)3677 CoordWalker::CoordWalker(void *coords, bool useFloats, size_t vecSize)
3678 {
3679     if (useFloats)
3680     {
3681         mFloatCoords = (cl_float *)coords;
3682         mIntCoords = NULL;
3683     }
3684     else
3685     {
3686         mFloatCoords = NULL;
3687         mIntCoords = (cl_int *)coords;
3688     }
3689     mVecSize = vecSize;
3690 }
3691 
~CoordWalker()3692 CoordWalker::~CoordWalker() {}
3693 
Get(size_t idx,size_t el)3694 cl_float CoordWalker::Get(size_t idx, size_t el)
3695 {
3696     if (mIntCoords != NULL)
3697         return (cl_float)mIntCoords[idx * mVecSize + el];
3698     else
3699         return mFloatCoords[idx * mVecSize + el];
3700 }
3701 
3702 
print_read_header(const cl_image_format * format,image_sampler_data * sampler,bool err,int t)3703 void print_read_header(const cl_image_format *format,
3704                        image_sampler_data *sampler, bool err, int t)
3705 {
3706     const char *addressMode = NULL;
3707     const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" };
3708 
3709     if (sampler->addressing_mode == CL_ADDRESS_CLAMP)
3710         addressMode = "CL_ADDRESS_CLAMP";
3711     else if (sampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE)
3712         addressMode = "CL_ADDRESS_CLAMP_TO_EDGE";
3713     else if (sampler->addressing_mode == CL_ADDRESS_REPEAT)
3714         addressMode = "CL_ADDRESS_REPEAT";
3715     else if (sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)
3716         addressMode = "CL_ADDRESS_MIRRORED_REPEAT";
3717     else
3718         addressMode = "CL_ADDRESS_NONE";
3719 
3720     if (t)
3721     {
3722         if (err)
3723             log_error("[%-7s %-24s %d] - %s - %s - %s - %s\n",
3724                       GetChannelOrderName(format->image_channel_order),
3725                       GetChannelTypeName(format->image_channel_data_type),
3726                       (int)get_format_channel_count(format),
3727                       sampler->filter_mode == CL_FILTER_NEAREST
3728                           ? "CL_FILTER_NEAREST"
3729                           : "CL_FILTER_LINEAR",
3730                       addressMode,
3731                       normalizedNames[sampler->normalized_coords ? 1 : 0],
3732                       t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED");
3733         else
3734             log_info("[%-7s %-24s %d] - %s - %s - %s - %s\n",
3735                      GetChannelOrderName(format->image_channel_order),
3736                      GetChannelTypeName(format->image_channel_data_type),
3737                      (int)get_format_channel_count(format),
3738                      sampler->filter_mode == CL_FILTER_NEAREST
3739                          ? "CL_FILTER_NEAREST"
3740                          : "CL_FILTER_LINEAR",
3741                      addressMode,
3742                      normalizedNames[sampler->normalized_coords ? 1 : 0],
3743                      t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED");
3744     }
3745     else
3746     {
3747         if (err)
3748             log_error("[%-7s %-24s %d] - %s - %s - %s\n",
3749                       GetChannelOrderName(format->image_channel_order),
3750                       GetChannelTypeName(format->image_channel_data_type),
3751                       (int)get_format_channel_count(format),
3752                       sampler->filter_mode == CL_FILTER_NEAREST
3753                           ? "CL_FILTER_NEAREST"
3754                           : "CL_FILTER_LINEAR",
3755                       addressMode,
3756                       normalizedNames[sampler->normalized_coords ? 1 : 0]);
3757         else
3758             log_info("[%-7s %-24s %d] - %s - %s - %s\n",
3759                      GetChannelOrderName(format->image_channel_order),
3760                      GetChannelTypeName(format->image_channel_data_type),
3761                      (int)get_format_channel_count(format),
3762                      sampler->filter_mode == CL_FILTER_NEAREST
3763                          ? "CL_FILTER_NEAREST"
3764                          : "CL_FILTER_LINEAR",
3765                      addressMode,
3766                      normalizedNames[sampler->normalized_coords ? 1 : 0]);
3767     }
3768 }
3769 
print_write_header(const cl_image_format * format,bool err=false)3770 void print_write_header(const cl_image_format *format, bool err = false)
3771 {
3772     if (err)
3773         log_error("[%-7s %-24s %d]\n",
3774                   GetChannelOrderName(format->image_channel_order),
3775                   GetChannelTypeName(format->image_channel_data_type),
3776                   (int)get_format_channel_count(format));
3777     else
3778         log_info("[%-7s %-24s %d]\n",
3779                  GetChannelOrderName(format->image_channel_order),
3780                  GetChannelTypeName(format->image_channel_data_type),
3781                  (int)get_format_channel_count(format));
3782 }
3783 
3784 
print_header(const cl_image_format * format,bool err=false)3785 void print_header(const cl_image_format *format, bool err = false)
3786 {
3787     if (err)
3788     {
3789         log_error("[%-7s %-24s %d]\n",
3790                   GetChannelOrderName(format->image_channel_order),
3791                   GetChannelTypeName(format->image_channel_data_type),
3792                   (int)get_format_channel_count(format));
3793     }
3794     else
3795     {
3796         log_info("[%-7s %-24s %d]\n",
3797                  GetChannelOrderName(format->image_channel_order),
3798                  GetChannelTypeName(format->image_channel_data_type),
3799                  (int)get_format_channel_count(format));
3800     }
3801 }
3802 
find_format(cl_image_format * formatList,unsigned int numFormats,cl_image_format * formatToFind)3803 bool find_format(cl_image_format *formatList, unsigned int numFormats,
3804                  cl_image_format *formatToFind)
3805 {
3806     for (unsigned int i = 0; i < numFormats; i++)
3807     {
3808         if (formatList[i].image_channel_order
3809                 == formatToFind->image_channel_order
3810             && formatList[i].image_channel_data_type
3811                 == formatToFind->image_channel_data_type)
3812             return true;
3813     }
3814     return false;
3815 }
3816 
build_required_image_formats(cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device,std::vector<cl_image_format> & formatsToSupport)3817 void build_required_image_formats(
3818     cl_mem_flags flags, cl_mem_object_type image_type, cl_device_id device,
3819     std::vector<cl_image_format> &formatsToSupport)
3820 {
3821     formatsToSupport.clear();
3822 
3823     // Minimum list of supported image formats for reading or writing (embedded
3824     // profile)
3825     static std::vector<cl_image_format> embeddedProfile_readOrWrite{
3826         // clang-format off
3827         { CL_RGBA, CL_UNORM_INT8 },
3828         { CL_RGBA, CL_UNORM_INT16 },
3829         { CL_RGBA, CL_SIGNED_INT8 },
3830         { CL_RGBA, CL_SIGNED_INT16 },
3831         { CL_RGBA, CL_SIGNED_INT32 },
3832         { CL_RGBA, CL_UNSIGNED_INT8 },
3833         { CL_RGBA, CL_UNSIGNED_INT16 },
3834         { CL_RGBA, CL_UNSIGNED_INT32 },
3835         { CL_RGBA, CL_HALF_FLOAT },
3836         { CL_RGBA, CL_FLOAT },
3837         // clang-format on
3838     };
3839 
3840     // Minimum list of required image formats for reading or writing
3841     // num_channels, for all image types.
3842     static std::vector<cl_image_format> fullProfile_readOrWrite{
3843         // clang-format off
3844         { CL_RGBA, CL_UNORM_INT8 },
3845         { CL_RGBA, CL_UNORM_INT16 },
3846         { CL_RGBA, CL_SIGNED_INT8 },
3847         { CL_RGBA, CL_SIGNED_INT16 },
3848         { CL_RGBA, CL_SIGNED_INT32 },
3849         { CL_RGBA, CL_UNSIGNED_INT8 },
3850         { CL_RGBA, CL_UNSIGNED_INT16 },
3851         { CL_RGBA, CL_UNSIGNED_INT32 },
3852         { CL_RGBA, CL_HALF_FLOAT },
3853         { CL_RGBA, CL_FLOAT },
3854         { CL_BGRA, CL_UNORM_INT8 },
3855         // clang-format on
3856     };
3857 
3858     // Minimum list of supported image formats for reading or writing
3859     // (OpenCL 2.0, 2.1, or 2.2), for all image types.
3860     static std::vector<cl_image_format> fullProfile_2x_readOrWrite{
3861         // clang-format off
3862         { CL_R, CL_UNORM_INT8 },
3863         { CL_R, CL_UNORM_INT16 },
3864         { CL_R, CL_SNORM_INT8 },
3865         { CL_R, CL_SNORM_INT16 },
3866         { CL_R, CL_SIGNED_INT8 },
3867         { CL_R, CL_SIGNED_INT16 },
3868         { CL_R, CL_SIGNED_INT32 },
3869         { CL_R, CL_UNSIGNED_INT8 },
3870         { CL_R, CL_UNSIGNED_INT16 },
3871         { CL_R, CL_UNSIGNED_INT32 },
3872         { CL_R, CL_HALF_FLOAT },
3873         { CL_R, CL_FLOAT },
3874         { CL_RG, CL_UNORM_INT8 },
3875         { CL_RG, CL_UNORM_INT16 },
3876         { CL_RG, CL_SNORM_INT8 },
3877         { CL_RG, CL_SNORM_INT16 },
3878         { CL_RG, CL_SIGNED_INT8 },
3879         { CL_RG, CL_SIGNED_INT16 },
3880         { CL_RG, CL_SIGNED_INT32 },
3881         { CL_RG, CL_UNSIGNED_INT8 },
3882         { CL_RG, CL_UNSIGNED_INT16 },
3883         { CL_RG, CL_UNSIGNED_INT32 },
3884         { CL_RG, CL_HALF_FLOAT },
3885         { CL_RG, CL_FLOAT },
3886         { CL_RGBA, CL_UNORM_INT8 },
3887         { CL_RGBA, CL_UNORM_INT16 },
3888         { CL_RGBA, CL_SNORM_INT8 },
3889         { CL_RGBA, CL_SNORM_INT16 },
3890         { CL_RGBA, CL_SIGNED_INT8 },
3891         { CL_RGBA, CL_SIGNED_INT16 },
3892         { CL_RGBA, CL_SIGNED_INT32 },
3893         { CL_RGBA, CL_UNSIGNED_INT8 },
3894         { CL_RGBA, CL_UNSIGNED_INT16 },
3895         { CL_RGBA, CL_UNSIGNED_INT32 },
3896         { CL_RGBA, CL_HALF_FLOAT },
3897         { CL_RGBA, CL_FLOAT },
3898         { CL_BGRA, CL_UNORM_INT8 },
3899         // clang-format on
3900     };
3901 
3902     // Conditional addition to the 2x readOrWrite table:
3903     // Support for the CL_DEPTH image channel order is required only for 2D
3904     // images and 2D image arrays.
3905     static std::vector<cl_image_format> fullProfile_2x_readOrWrite_Depth{
3906         // clang-format off
3907         { CL_DEPTH, CL_UNORM_INT16 },
3908         { CL_DEPTH, CL_FLOAT },
3909         // clang-format on
3910     };
3911 
3912     // Conditional addition to the 2x readOrWrite table:
3913     // Support for reading from the CL_sRGBA image channel order is optional for
3914     // 1D image buffers. Support for writing to the CL_sRGBA image channel order
3915     // is optional for all image types.
3916     static std::vector<cl_image_format> fullProfile_2x_readOrWrite_srgb{
3917         { CL_sRGBA, CL_UNORM_INT8 },
3918     };
3919 
3920     // Minimum list of required image formats for reading and writing.
3921     static std::vector<cl_image_format> fullProfile_readAndWrite{
3922         // clang-format off
3923         { CL_R, CL_UNORM_INT8 },
3924         { CL_R, CL_SIGNED_INT8 },
3925         { CL_R, CL_SIGNED_INT16 },
3926         { CL_R, CL_SIGNED_INT32 },
3927         { CL_R, CL_UNSIGNED_INT8 },
3928         { CL_R, CL_UNSIGNED_INT16 },
3929         { CL_R, CL_UNSIGNED_INT32 },
3930         { CL_R, CL_HALF_FLOAT },
3931         { CL_R, CL_FLOAT },
3932         { CL_RGBA, CL_UNORM_INT8 },
3933         { CL_RGBA, CL_SIGNED_INT8 },
3934         { CL_RGBA, CL_SIGNED_INT16 },
3935         { CL_RGBA, CL_SIGNED_INT32 },
3936         { CL_RGBA, CL_UNSIGNED_INT8 },
3937         { CL_RGBA, CL_UNSIGNED_INT16 },
3938         { CL_RGBA, CL_UNSIGNED_INT32 },
3939         { CL_RGBA, CL_HALF_FLOAT },
3940         { CL_RGBA, CL_FLOAT },
3941         // clang-format on
3942     };
3943 
3944     // Embedded profile
3945     if (gIsEmbedded)
3946     {
3947         copy(embeddedProfile_readOrWrite.begin(),
3948              embeddedProfile_readOrWrite.end(),
3949              back_inserter(formatsToSupport));
3950     }
3951     // Full profile
3952     else
3953     {
3954         Version version = get_device_cl_version(device);
3955         if (version < Version(2, 0) || version >= Version(3, 0))
3956         {
3957             // Full profile, OpenCL 1.2 or 3.0.
3958             if (flags & CL_MEM_KERNEL_READ_AND_WRITE)
3959             {
3960                 // Note: assumes that read-write images are supported!
3961                 copy(fullProfile_readAndWrite.begin(),
3962                      fullProfile_readAndWrite.end(),
3963                      back_inserter(formatsToSupport));
3964             }
3965             else
3966             {
3967                 copy(fullProfile_readOrWrite.begin(),
3968                      fullProfile_readOrWrite.end(),
3969                      back_inserter(formatsToSupport));
3970             }
3971         }
3972         else
3973         {
3974             // Full profile, OpenCL 2.0, 2.1, 2.2.
3975             if (flags & CL_MEM_KERNEL_READ_AND_WRITE)
3976             {
3977                 copy(fullProfile_readAndWrite.begin(),
3978                      fullProfile_readAndWrite.end(),
3979                      back_inserter(formatsToSupport));
3980             }
3981             else
3982             {
3983                 copy(fullProfile_2x_readOrWrite.begin(),
3984                      fullProfile_2x_readOrWrite.end(),
3985                      back_inserter(formatsToSupport));
3986 
3987                 // Support for the CL_DEPTH image channel order is required only
3988                 // for 2D images and 2D image arrays.
3989                 if (image_type == CL_MEM_OBJECT_IMAGE2D
3990                     || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
3991                 {
3992                     copy(fullProfile_2x_readOrWrite_Depth.begin(),
3993                          fullProfile_2x_readOrWrite_Depth.end(),
3994                          back_inserter(formatsToSupport));
3995                 }
3996 
3997                 // Support for reading from the CL_sRGBA image channel order is
3998                 // optional for 1D image buffers. Support for writing to the
3999                 // CL_sRGBA image channel order is optional for all image types.
4000                 if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER
4001                     && flags == CL_MEM_READ_ONLY)
4002                 {
4003                     copy(fullProfile_2x_readOrWrite_srgb.begin(),
4004                          fullProfile_2x_readOrWrite_srgb.end(),
4005                          back_inserter(formatsToSupport));
4006                 }
4007             }
4008         }
4009     }
4010 }
4011 
is_image_format_required(cl_image_format format,cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device)4012 bool is_image_format_required(cl_image_format format, cl_mem_flags flags,
4013                               cl_mem_object_type image_type,
4014                               cl_device_id device)
4015 {
4016     std::vector<cl_image_format> formatsToSupport;
4017     build_required_image_formats(flags, image_type, device, formatsToSupport);
4018 
4019     for (auto &formatItr : formatsToSupport)
4020     {
4021         if (formatItr.image_channel_order == format.image_channel_order
4022             && formatItr.image_channel_data_type
4023                 == format.image_channel_data_type)
4024         {
4025             return true;
4026         }
4027     }
4028 
4029     return false;
4030 }
4031 
compute_max_mip_levels(size_t width,size_t height,size_t depth)4032 cl_uint compute_max_mip_levels(size_t width, size_t height, size_t depth)
4033 {
4034     cl_uint retMaxMipLevels = 0;
4035     size_t max_dim = 0;
4036 
4037     max_dim = width;
4038     max_dim = height > max_dim ? height : max_dim;
4039     max_dim = depth > max_dim ? depth : max_dim;
4040 
4041     while (max_dim)
4042     {
4043         retMaxMipLevels++;
4044         max_dim >>= 1;
4045     }
4046     return retMaxMipLevels;
4047 }
4048 
compute_mipmapped_image_size(image_descriptor imageInfo)4049 cl_ulong compute_mipmapped_image_size(image_descriptor imageInfo)
4050 {
4051     cl_ulong retSize = 0;
4052     size_t curr_width, curr_height, curr_depth, curr_array_size;
4053     curr_width = imageInfo.width;
4054     curr_height = imageInfo.height;
4055     curr_depth = imageInfo.depth;
4056     curr_array_size = imageInfo.arraySize;
4057 
4058     for (int i = 0; i < (int)imageInfo.num_mip_levels; i++)
4059     {
4060         switch (imageInfo.type)
4061         {
4062             case CL_MEM_OBJECT_IMAGE3D:
4063                 retSize += (cl_ulong)curr_width * curr_height * curr_depth
4064                     * get_pixel_size(imageInfo.format);
4065                 break;
4066             case CL_MEM_OBJECT_IMAGE2D:
4067                 retSize += (cl_ulong)curr_width * curr_height
4068                     * get_pixel_size(imageInfo.format);
4069                 break;
4070             case CL_MEM_OBJECT_IMAGE1D:
4071                 retSize +=
4072                     (cl_ulong)curr_width * get_pixel_size(imageInfo.format);
4073                 break;
4074             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4075                 retSize += (cl_ulong)curr_width * curr_array_size
4076                     * get_pixel_size(imageInfo.format);
4077                 break;
4078             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4079                 retSize += (cl_ulong)curr_width * curr_height * curr_array_size
4080                     * get_pixel_size(imageInfo.format);
4081                 break;
4082         }
4083 
4084         switch (imageInfo.type)
4085         {
4086             case CL_MEM_OBJECT_IMAGE3D:
4087                 curr_depth = curr_depth >> 1 ? curr_depth >> 1 : 1;
4088             case CL_MEM_OBJECT_IMAGE2D:
4089             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4090                 curr_height = curr_height >> 1 ? curr_height >> 1 : 1;
4091             case CL_MEM_OBJECT_IMAGE1D:
4092             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4093                 curr_width = curr_width >> 1 ? curr_width >> 1 : 1;
4094         }
4095     }
4096 
4097     return retSize;
4098 }
4099 
compute_mip_level_offset(image_descriptor * imageInfo,size_t lod)4100 size_t compute_mip_level_offset(image_descriptor *imageInfo, size_t lod)
4101 {
4102     size_t retOffset = 0;
4103     size_t width, height, depth;
4104     width = imageInfo->width;
4105     height = imageInfo->height;
4106     depth = imageInfo->depth;
4107 
4108     for (size_t i = 0; i < lod; i++)
4109     {
4110         switch (imageInfo->type)
4111         {
4112             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4113                 retOffset += (size_t)width * height * imageInfo->arraySize
4114                     * get_pixel_size(imageInfo->format);
4115                 break;
4116             case CL_MEM_OBJECT_IMAGE3D:
4117                 retOffset += (size_t)width * height * depth
4118                     * get_pixel_size(imageInfo->format);
4119                 break;
4120             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4121                 retOffset += (size_t)width * imageInfo->arraySize
4122                     * get_pixel_size(imageInfo->format);
4123                 break;
4124             case CL_MEM_OBJECT_IMAGE2D:
4125                 retOffset +=
4126                     (size_t)width * height * get_pixel_size(imageInfo->format);
4127                 break;
4128             case CL_MEM_OBJECT_IMAGE1D:
4129                 retOffset += (size_t)width * get_pixel_size(imageInfo->format);
4130                 break;
4131         }
4132 
4133         // Compute next lod dimensions
4134         switch (imageInfo->type)
4135         {
4136             case CL_MEM_OBJECT_IMAGE3D: depth = (depth >> 1) ? (depth >> 1) : 1;
4137             case CL_MEM_OBJECT_IMAGE2D:
4138             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
4139                 height = (height >> 1) ? (height >> 1) : 1;
4140             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
4141             case CL_MEM_OBJECT_IMAGE1D: width = (width >> 1) ? (width >> 1) : 1;
4142         }
4143     }
4144     return retOffset;
4145 }
4146 
convert_image_type_to_string(cl_mem_object_type image_type)4147 const char *convert_image_type_to_string(cl_mem_object_type image_type)
4148 {
4149     switch (image_type)
4150     {
4151         case CL_MEM_OBJECT_IMAGE1D: return "1D";
4152         case CL_MEM_OBJECT_IMAGE2D: return "2D";
4153         case CL_MEM_OBJECT_IMAGE3D: return "3D";
4154         case CL_MEM_OBJECT_IMAGE1D_ARRAY: return "1D array";
4155         case CL_MEM_OBJECT_IMAGE2D_ARRAY: return "2D array";
4156         case CL_MEM_OBJECT_IMAGE1D_BUFFER: return "1D image buffer";
4157         default: return "unrecognized object type";
4158     }
4159 }
4160