• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "imageHelpers.h"
17 #include <limits.h>
18 #include <assert.h>
19 #if defined( __APPLE__ )
20 #include <sys/mman.h>
21 #endif
22 #if !defined (_WIN32) && !defined(__APPLE__)
23 #include <malloc.h>
24 #endif
25 #include <algorithm>
26 #include <iterator>
27 #if !defined (_WIN32)
28 #include <cmath>
29 #endif
30 
31 RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;
32 
33 static cl_ushort float2half_rte( float f );
34 static cl_ushort float2half_rtz( float f );
35 
36 cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
37 bool gTestRounding = false;
38 double
sRGBmap(float fc)39 sRGBmap(float fc)
40 {
41     double c = (double)fc;
42 
43 #if !defined (_WIN32)
44     if (std::isnan(c))
45         c = 0.0;
46 #else
47     if (_isnan(c))
48         c = 0.0;
49 #endif
50 
51     if (c > 1.0)
52         c = 1.0;
53     else if (c < 0.0)
54         c = 0.0;
55     else if (c < 0.0031308)
56         c = 12.92 * c;
57     else
58         c = (1055.0/1000.0) * pow(c, 5.0/12.0) - (55.0/1000.0);
59 
60     return c * 255.0;
61 }
62 
63 double
sRGBunmap(float fc)64 sRGBunmap(float fc)
65 {
66     double c = (double)fc;
67     double result;
68 
69     if (c <= 0.04045)
70         result = c / 12.92;
71     else
72         result = pow((c + 0.055) / 1.055, 2.4);
73 
74     return result;
75 }
76 
77 
get_format_type_size(const cl_image_format * format)78 size_t get_format_type_size( const cl_image_format *format )
79 {
80     return get_channel_data_type_size( format->image_channel_data_type );
81 }
82 
get_channel_data_type_size(cl_channel_type channelType)83 size_t get_channel_data_type_size( cl_channel_type channelType )
84 {
85     switch( channelType )
86     {
87         case CL_SNORM_INT8:
88         case CL_UNORM_INT8:
89         case CL_SIGNED_INT8:
90         case CL_UNSIGNED_INT8:
91             return 1;
92 
93         case CL_SNORM_INT16:
94         case CL_UNORM_INT16:
95         case CL_SIGNED_INT16:
96         case CL_UNSIGNED_INT16:
97         case CL_HALF_FLOAT:
98 #ifdef CL_SFIXED14_APPLE
99         case CL_SFIXED14_APPLE:
100 #endif
101             return sizeof( cl_short );
102 
103         case CL_SIGNED_INT32:
104         case CL_UNSIGNED_INT32:
105             return sizeof( cl_int );
106 
107         case CL_UNORM_SHORT_565:
108         case CL_UNORM_SHORT_555:
109 #ifdef OBSOLETE_FORAMT
110         case CL_UNORM_SHORT_565_REV:
111         case CL_UNORM_SHORT_555_REV:
112 #endif
113             return 2;
114 
115 #ifdef OBSOLETE_FORAMT
116         case CL_UNORM_INT_8888:
117         case CL_UNORM_INT_8888_REV:
118             return 4;
119 #endif
120 
121         case CL_UNORM_INT_101010:
122 #ifdef OBSOLETE_FORAMT
123         case CL_UNORM_INT_101010_REV:
124 #endif
125             return 4;
126 
127         case CL_FLOAT:
128             return sizeof( cl_float );
129 
130         default:
131             return 0;
132     }
133 }
134 
get_format_channel_count(const cl_image_format * format)135 size_t get_format_channel_count( const cl_image_format *format )
136 {
137     return get_channel_order_channel_count( format->image_channel_order );
138 }
139 
get_channel_order_channel_count(cl_channel_order order)140 size_t get_channel_order_channel_count( cl_channel_order order )
141 {
142     switch( order )
143     {
144         case CL_R:
145         case CL_A:
146         case CL_Rx:
147         case CL_INTENSITY:
148         case CL_LUMINANCE:
149         case CL_DEPTH:
150         case CL_DEPTH_STENCIL:
151             return 1;
152 
153         case CL_RG:
154         case CL_RA:
155         case CL_RGx:
156             return 2;
157 
158         case CL_RGB:
159         case CL_RGBx:
160         case CL_sRGB:
161         case CL_sRGBx:
162             return 3;
163 
164         case CL_RGBA:
165         case CL_ARGB:
166         case CL_BGRA:
167         case CL_sRGBA:
168         case CL_sBGRA:
169         case CL_ABGR:
170 #ifdef CL_1RGB_APPLE
171         case CL_1RGB_APPLE:
172 #endif
173 #ifdef CL_BGR1_APPLE
174         case CL_BGR1_APPLE:
175 #endif
176 #ifdef CL_ABGR_APPLE
177         case CL_ABGR_APPLE:
178 #endif
179           return 4;
180 
181         default:
182           log_error("%s does not support 0x%x\n",__FUNCTION__,order);
183           return 0;
184     }
185 }
186 
get_channel_type_from_name(const char * name)187 cl_channel_type  get_channel_type_from_name( const char *name )
188 {
189     struct {
190         cl_channel_type type;
191         const char *name;
192     } typeNames[] = {
193         { CL_SNORM_INT8, "CL_SNORM_INT8" },
194         { CL_SNORM_INT16, "CL_SNORM_INT16" },
195         { CL_UNORM_INT8, "CL_UNORM_INT8" },
196         { CL_UNORM_INT16, "CL_UNORM_INT16" },
197         { CL_UNORM_INT24, "CL_UNORM_INT24" },
198         { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" },
199         { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" },
200         { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" },
201         { CL_SIGNED_INT8, "CL_SIGNED_INT8" },
202         { CL_SIGNED_INT16, "CL_SIGNED_INT16" },
203         { CL_SIGNED_INT32, "CL_SIGNED_INT32" },
204         { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" },
205         { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" },
206         { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" },
207         { CL_HALF_FLOAT, "CL_HALF_FLOAT" },
208         { CL_FLOAT, "CL_FLOAT" },
209 #ifdef CL_SFIXED14_APPLE
210         { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" }
211 #endif
212     };
213     for( size_t i = 0; i < sizeof( typeNames ) / sizeof( typeNames[ 0 ] ); i++ )
214     {
215         if( strcmp( typeNames[ i ].name, name ) == 0 || strcmp( typeNames[ i ].name + 3, name ) == 0 )
216             return typeNames[ i ].type;
217     }
218     return (cl_channel_type)-1;
219 }
220 
get_channel_order_from_name(const char * name)221 cl_channel_order  get_channel_order_from_name( const char *name )
222 {
223     const struct
224     {
225         cl_channel_order    order;
226         const char          *name;
227     }orderNames[] =
228     {
229         { CL_R, "CL_R" },
230         { CL_A, "CL_A" },
231         { CL_Rx, "CL_Rx" },
232         { CL_RG, "CL_RG" },
233         { CL_RA, "CL_RA" },
234         { CL_RGx, "CL_RGx" },
235         { CL_RGB, "CL_RGB" },
236         { CL_RGBx, "CL_RGBx" },
237         { CL_RGBA, "CL_RGBA" },
238         { CL_BGRA, "CL_BGRA" },
239         { CL_ARGB, "CL_ARGB" },
240         { CL_INTENSITY, "CL_INTENSITY"},
241         { CL_LUMINANCE, "CL_LUMINANCE"},
242         { CL_DEPTH, "CL_DEPTH" },
243         { CL_DEPTH_STENCIL, "CL_DEPTH_STENCIL" },
244         { CL_sRGB, "CL_sRGB" },
245         { CL_sRGBx, "CL_sRGBx" },
246         { CL_sRGBA, "CL_sRGBA" },
247         { CL_sBGRA, "CL_sBGRA" },
248         { CL_ABGR, "CL_ABGR" },
249 #ifdef CL_1RGB_APPLE
250         { CL_1RGB_APPLE, "CL_1RGB_APPLE" },
251 #endif
252 #ifdef CL_BGR1_APPLE
253         { CL_BGR1_APPLE, "CL_BGR1_APPLE" },
254 #endif
255     };
256 
257     for( size_t i = 0; i < sizeof( orderNames ) / sizeof( orderNames[ 0 ] ); i++ )
258     {
259         if( strcmp( orderNames[ i ].name, name ) == 0 || strcmp( orderNames[ i ].name + 3, name ) == 0 )
260             return orderNames[ i ].order;
261     }
262     return (cl_channel_order)-1;
263 }
264 
265 
is_format_signed(const cl_image_format * format)266 int is_format_signed( const cl_image_format *format )
267 {
268     switch( format->image_channel_data_type )
269     {
270         case CL_SNORM_INT8:
271         case CL_SIGNED_INT8:
272         case CL_SNORM_INT16:
273         case CL_SIGNED_INT16:
274         case CL_SIGNED_INT32:
275         case CL_HALF_FLOAT:
276         case CL_FLOAT:
277 #ifdef CL_SFIXED14_APPLE
278         case CL_SFIXED14_APPLE:
279 #endif
280             return 1;
281 
282         default:
283             return 0;
284     }
285 }
286 
get_pixel_size(cl_image_format * format)287 size_t get_pixel_size( cl_image_format *format )
288 {
289   switch( format->image_channel_data_type )
290   {
291     case CL_SNORM_INT8:
292     case CL_UNORM_INT8:
293     case CL_SIGNED_INT8:
294     case CL_UNSIGNED_INT8:
295       return get_format_channel_count( format );
296 
297     case CL_SNORM_INT16:
298     case CL_UNORM_INT16:
299     case CL_SIGNED_INT16:
300     case CL_UNSIGNED_INT16:
301     case CL_HALF_FLOAT:
302 #ifdef  CL_SFIXED14_APPLE
303         case CL_SFIXED14_APPLE:
304 #endif
305       return get_format_channel_count( format ) * sizeof( cl_ushort );
306 
307     case CL_SIGNED_INT32:
308     case CL_UNSIGNED_INT32:
309       return get_format_channel_count( format ) * sizeof( cl_int );
310 
311     case CL_UNORM_SHORT_565:
312     case CL_UNORM_SHORT_555:
313 #ifdef OBSOLETE_FORAMT
314     case CL_UNORM_SHORT_565_REV:
315     case CL_UNORM_SHORT_555_REV:
316 #endif
317       return 2;
318 
319 #ifdef OBSOLETE_FORAMT
320     case CL_UNORM_INT_8888:
321     case CL_UNORM_INT_8888_REV:
322       return 4;
323 #endif
324 
325     case CL_UNORM_INT_101010:
326 #ifdef OBSOLETE_FORAMT
327     case CL_UNORM_INT_101010_REV:
328 #endif
329       return 4;
330 
331     case CL_FLOAT:
332       return get_format_channel_count( format ) * sizeof( cl_float );
333 
334     default:
335       return 0;
336   }
337 }
338 
get_8_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)339 int get_8_bit_image_format( cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat )
340 {
341     cl_image_format formatList[ 128 ];
342     unsigned int outFormatCount, i;
343     int error;
344 
345 
346     /* Make sure each image format is supported */
347     if ((error = clGetSupportedImageFormats( context, flags, objType, 128, formatList, &outFormatCount )))
348     return error;
349 
350 
351     /* Look for one that is an 8-bit format */
352     for( i = 0; i < outFormatCount; i++ )
353     {
354         if( formatList[ i ].image_channel_data_type == CL_SNORM_INT8 ||
355        formatList[ i ].image_channel_data_type == CL_UNORM_INT8 ||
356            formatList[ i ].image_channel_data_type == CL_SIGNED_INT8 ||
357            formatList[ i ].image_channel_data_type == CL_UNSIGNED_INT8 )
358         {
359       if ( !channelCount || ( channelCount && ( get_format_channel_count( &formatList[ i ] ) == channelCount ) ) )
360       {
361         *outFormat = formatList[ i ];
362         return 0;
363       }
364         }
365     }
366 
367     return -1;
368 }
369 
get_32_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)370 int get_32_bit_image_format( cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat )
371 {
372     cl_image_format formatList[ 128 ];
373     unsigned int outFormatCount, i;
374     int error;
375 
376 
377   /* Make sure each image format is supported */
378   if ((error = clGetSupportedImageFormats( context, flags, objType, 128, formatList, &outFormatCount )))
379     return error;
380 
381   /* Look for one that is an 8-bit format */
382   for( i = 0; i < outFormatCount; i++ )
383   {
384         if( formatList[ i ].image_channel_data_type == CL_UNORM_INT_101010 ||
385             formatList[ i ].image_channel_data_type == CL_FLOAT ||
386             formatList[ i ].image_channel_data_type == CL_SIGNED_INT32 ||
387             formatList[ i ].image_channel_data_type == CL_UNSIGNED_INT32 )
388     {
389       if ( !channelCount || ( channelCount && ( get_format_channel_count( &formatList[ i ] ) == channelCount ) ) )
390       {
391         *outFormat = formatList[ i ];
392         return 0;
393       }
394     }
395     }
396 
397     return -1;
398 }
399 
random_log_in_range(int minV,int maxV,MTdata d)400 int random_log_in_range( int minV, int maxV, MTdata d  )
401 {
402     double v = log2( ( (double)genrand_int32(d) / (double)0xffffffff ) + 1 );
403     int iv = (int)( (float)( maxV - minV ) * v );
404     return iv + minV;
405 }
406 
407 
408 // Define the addressing functions
409 typedef int (*AddressFn)( int value, size_t maxValue );
410 
NoAddressFn(int value,size_t maxValue)411 int         NoAddressFn( int value, size_t maxValue )               { return value; }
RepeatAddressFn(int value,size_t maxValue)412 int         RepeatAddressFn( int value, size_t maxValue )
413 {
414     if( value < 0 )
415         value += (int)maxValue;
416     else if( value >= (int)maxValue )
417         value -= (int)maxValue;
418     return value;
419 }
MirroredRepeatAddressFn(int value,size_t maxValue)420 int         MirroredRepeatAddressFn( int value, size_t maxValue )
421 {
422     if( value < 0 )
423         value  = 0;
424     else if( (size_t) value >= maxValue )
425         value = (int) (maxValue - 1);
426     return value;
427 }
ClampAddressFn(int value,size_t maxValue)428 int         ClampAddressFn( int value, size_t maxValue )            { return ( value < -1 ) ? -1 : ( ( value > (cl_long) maxValue ) ? (int)maxValue : value ); }
ClampToEdgeNearestFn(int value,size_t maxValue)429 int         ClampToEdgeNearestFn( int value, size_t maxValue )  { return ( value < 0 ) ? 0 : ( ( (size_t)value > maxValue - 1 ) ? (int)maxValue - 1 : value ); }
430 AddressFn   ClampToEdgeLinearFn                                                 = ClampToEdgeNearestFn;
431 
432 // Note: normalized coords get repeated in normalized space, not unnormalized space! hence the special case here
433 volatile float gFloatHome;
RepeatNormalizedAddressFn(float fValue,size_t maxValue)434 float           RepeatNormalizedAddressFn( float fValue, size_t maxValue )
435 {
436 #ifndef _MSC_VER // Use original if not the VS compiler.
437     // General computation for repeat
438     return (fValue - floorf( fValue )) * (float) maxValue; // Reduce to [0, 1.f]
439 #else // Otherwise, use this instead:
440     // Home the subtraction to a float to break up the sequence of x87
441     // instructions emitted by the VS compiler.
442     gFloatHome = fValue - floorf(fValue);
443     return gFloatHome * (float)maxValue;
444 #endif
445 }
446 
MirroredRepeatNormalizedAddressFn(float fValue,size_t maxValue)447 float           MirroredRepeatNormalizedAddressFn( float fValue, size_t maxValue )
448 {
449     // Round to nearest multiple of two
450     float s_prime = 2.0f * rintf( fValue * 0.5f );        // Note halfway values flip flop here due to rte, but they both end up pointing the same place at the end of the day
451 
452     // Reduce to [-1, 1], Apply mirroring -> [0, 1]
453     s_prime = fabsf( fValue - s_prime );
454 
455     // un-normalize
456     return s_prime * (float) maxValue;
457 }
458 
459 struct AddressingTable
460 {
AddressingTableAddressingTable461     AddressingTable()
462     {
463         ct_assert( ( CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6 ) );
464         ct_assert( CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2 );
465 
466         mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]            = NoAddressFn;
467         mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]             = NoAddressFn;
468         mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]          = RepeatAddressFn;
469         mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]           = RepeatAddressFn;
470         mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]   = ClampToEdgeNearestFn;
471         mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]    = ClampToEdgeLinearFn;
472         mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ]           = ClampAddressFn;
473         mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]            = ClampAddressFn;
474         mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = MirroredRepeatAddressFn;
475         mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ]  = MirroredRepeatAddressFn;
476     }
477 
operator []AddressingTable478     AddressFn operator[]( image_sampler_data *sampler )
479     {
480         return mTable[ (int)sampler->addressing_mode - CL_ADDRESS_NONE ][ (int)sampler->filter_mode - CL_FILTER_NEAREST ];
481     }
482 
483     AddressFn mTable[ 6 ][ 2 ];
484 };
485 
486 static AddressingTable  sAddressingTable;
487 
is_sRGBA_order(cl_channel_order image_channel_order)488 bool is_sRGBA_order(cl_channel_order image_channel_order){
489     switch (image_channel_order) {
490         case CL_sRGB:
491         case CL_sRGBx:
492         case CL_sRGBA:
493         case CL_sBGRA:
494             return true;
495         default:
496             return false;
497     }
498 }
499 
500 // Format helpers
501 
has_alpha(cl_image_format * format)502 int has_alpha(cl_image_format *format) {
503     switch (format->image_channel_order) {
504         case CL_R:
505             return 0;
506         case CL_A:
507             return 1;
508         case CL_Rx:
509             return 0;
510         case CL_RG:
511             return 0;
512         case CL_RA:
513             return 1;
514         case CL_RGx:
515             return 0;
516         case CL_RGB:
517         case CL_sRGB:
518             return 0;
519         case CL_RGBx:
520         case CL_sRGBx:
521             return 0;
522         case CL_RGBA:
523             return 1;
524         case CL_BGRA:
525             return 1;
526         case CL_ARGB:
527             return 1;
528         case CL_INTENSITY:
529             return 1;
530         case CL_LUMINANCE:
531             return 0;
532 #ifdef CL_BGR1_APPLE
533         case CL_BGR1_APPLE: return 1;
534 #endif
535 #ifdef CL_1RGB_APPLE
536         case CL_1RGB_APPLE: return 1;
537 #endif
538         case CL_sRGBA:
539         case CL_sBGRA:
540             return 1;
541         case CL_DEPTH:
542             return 0;
543         default:
544             log_error("Invalid image channel order: %d\n", format->image_channel_order);
545             return 0;
546     }
547 
548 }
549 
550 #define PRINT_MAX_SIZE_LOGIC 0
551 
552 #define SWAP( _a, _b )      do{ _a ^= _b; _b ^= _a; _a ^= _b; }while(0)
553 #ifndef MAX
554     #define MAX( _a, _b )   ((_a) > (_b) ? (_a) : (_b))
555 #endif
556 
get_max_sizes(size_t * numberOfSizes,const int maxNumberOfSizes,size_t sizes[][3],size_t maxWidth,size_t maxHeight,size_t maxDepth,size_t maxArraySize,const cl_ulong maxIndividualAllocSize,const cl_ulong maxTotalAllocSize,cl_mem_object_type image_type,cl_image_format * format,int usingMaxPixelSizeBuffer)557 void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
558                    size_t sizes[][3], size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize,
559                    const cl_ulong maxIndividualAllocSize,       // CL_DEVICE_MAX_MEM_ALLOC_SIZE
560                    const cl_ulong maxTotalAllocSize,            // CL_DEVICE_GLOBAL_MEM_SIZE
561                    cl_mem_object_type image_type, cl_image_format *format, int usingMaxPixelSizeBuffer) {
562 
563     bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D);
564     bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY);
565 
566     // Validate we have a reasonable max depth for 3D
567     if (is3D && maxDepth < 2) {
568         log_error("ERROR: Requesting max image sizes for 3D images when max depth is < 2.\n");
569         *numberOfSizes = 0;
570         return;
571     }
572     // Validate we have a reasonable max array size for 1D & 2D image arrays
573     if (isArray && maxArraySize < 2) {
574         log_error("ERROR: Requesting max image sizes for an image array when max array size is < 1.\n");
575         *numberOfSizes = 0;
576         return;
577     }
578 
579     // Reduce the maximum because we are trying to test the max image dimensions, not the memory allocation
580     cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4;
581     cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4;
582     log_info("Note: max individual allocation adjusted down from %gMB to %gMB and max total allocation adjusted down from %gMB to %gMB.\n",
583              maxIndividualAllocSize/(1024.0*1024.0), adjustedMaxIndividualAllocSize/(1024.0*1024.0),
584              maxTotalAllocSize/(1024.0*1024.0), adjustedMaxTotalAllocSize/(1024.0*1024.0));
585 
586     // Cap our max allocation to 1.0GB.
587     // FIXME -- why?  In the interest of not taking a long time?  We should still test this stuff...
588     if (adjustedMaxTotalAllocSize > (cl_ulong)1024*1024*1024) {
589       adjustedMaxTotalAllocSize = (cl_ulong)1024*1024*1024;
590       log_info("Limiting max total allocation size to %gMB (down from %gMB) for test.\n",
591         adjustedMaxTotalAllocSize/(1024.0*1024.0), maxTotalAllocSize/(1024.0*1024.0));
592     }
593 
594     cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize;
595     if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize*2)
596         maxAllocSize = adjustedMaxTotalAllocSize/2;
597 
598     size_t raw_pixel_size = get_pixel_size(format);
599     // If the test will be creating input (src) buffer of type int4 or float4, number of pixels will be
600     // governed by sizeof(int4 or float4) and not sizeof(dest fomat)
601     // Also if pixel size is 12 bytes i.e. RGB or RGBx, we adjust it to 16 bytes as GPUs has no concept
602     // of 3 channel images. GPUs expand these to four channel RGBA.
603     if(usingMaxPixelSizeBuffer || raw_pixel_size == 12)
604       raw_pixel_size = 16;
605     size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;
606 
607     log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, per-allocation limit %gMB.\n",
608              maxWidth, maxHeight, isArray ? maxArraySize : maxDepth, raw_pixel_size, (maxAllocSize/(1024.0*1024.0)));
609 
610   // Keep track of the maximum sizes for each dimension
611   size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth };
612 
613   switch (image_type) {
614     case CL_MEM_OBJECT_IMAGE1D_ARRAY:
615       maximum_sizes[1] = maxArraySize;
616       maximum_sizes[2] = 1;
617       break;
618     case CL_MEM_OBJECT_IMAGE2D_ARRAY:
619       maximum_sizes[2] = maxArraySize;
620       break;
621   }
622 
623 
624   // Given one fixed sized dimension, this code finds one or two other dimensions,
625   // both with very small size, such that the size does not exceed the maximum
626   // passed to this function
627 
628 #if defined(__x86_64) || defined (__arm64__) || defined (__ppc64__)
629   size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 10, 11, 13, 15};
630 #else
631   size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 11, 13};
632 #endif
633 
634   static size_t other_size = 0;
635   enum { num_other_sizes = sizeof(other_sizes)/sizeof(size_t) };
636 
637   (*numberOfSizes) = 0;
638 
639   if (image_type == CL_MEM_OBJECT_IMAGE1D) {
640 
641     double M = maximum_sizes[0];
642 
643     // Store the size
644     sizes[(*numberOfSizes)][0] = (size_t)M;
645     sizes[(*numberOfSizes)][1] = 1;
646     sizes[(*numberOfSizes)][2] = 1;
647     ++(*numberOfSizes);
648   }
649 
650   else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D) {
651 
652     for (int fixed_dim=0;fixed_dim<2;++fixed_dim) {
653 
654       // Determine the size of the fixed dimension
655       double M = maximum_sizes[fixed_dim];
656       double A = max_pixels;
657 
658       int x0_dim = !fixed_dim;
659       double x0  = fmin(fmin(other_sizes[(other_size++)%num_other_sizes],A/M), maximum_sizes[x0_dim]);
660 
661       // Store the size
662       sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
663       sizes[(*numberOfSizes)][x0_dim]    = (size_t)x0;
664       sizes[(*numberOfSizes)][2]         = 1;
665       ++(*numberOfSizes);
666     }
667   }
668 
669   else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE3D) {
670 
671     // Iterate over dimensions, finding sizes for the non-fixed dimension
672     for (int fixed_dim=0;fixed_dim<3;++fixed_dim) {
673 
674       // Determine the size of the fixed dimension
675       double M = maximum_sizes[fixed_dim];
676       double A = max_pixels;
677 
678       // Find two other dimensions, x0 and x1
679       int x0_dim = (fixed_dim == 0) ? 1 : 0;
680       int x1_dim = (fixed_dim == 2) ? 1 : 2;
681 
682       // Choose two other sizes for these dimensions
683       double x0 = fmin(fmin(A/M,maximum_sizes[x0_dim]),other_sizes[(other_size++)%num_other_sizes]);
684       // GPUs have certain restrictions on minimum width (row alignment) of images which has given us issues
685       // testing small widths in this test (say we set width to 3 for testing, and compute size based on this width and decide
686       // it fits within vram ... but GPU driver decides that, due to row alignment requirements, it has to use
687       // width of 16 which doesnt fit in vram). For this purpose we are not testing width < 16 for this test.
688       if(x0_dim == 0 && x0 < 16)
689         x0 = 16;
690       double x1 = fmin(fmin(A/M/x0,maximum_sizes[x1_dim]),other_sizes[(other_size++)%num_other_sizes]);
691 
692       // Valid image sizes cannot be below 1. Due to the workaround for the xo_dim where x0 is overidden to 16
693       // there might not be enough space left for x1 dimension. This could be a fractional 0.x size that when cast to
694       // integer would result in a value 0. In these cases we clamp the size to a minimum of 1.
695       if ( x1 < 1 )
696         x1 = 1;
697 
698       // M and x0 cannot be '0' as they derive from clDeviceInfo calls
699       assert(x0 > 0 && M > 0);
700 
701       // Store the size
702       sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
703       sizes[(*numberOfSizes)][x0_dim]    = (size_t)x0;
704       sizes[(*numberOfSizes)][x1_dim]    = (size_t)x1;
705       ++(*numberOfSizes);
706     }
707   }
708 
709   // Log the results
710   for (int j=0; j<(int)(*numberOfSizes); j++) {
711     switch (image_type) {
712       case CL_MEM_OBJECT_IMAGE1D:
713         log_info(" size[%d] = [%ld] (%g MB image)\n",
714                  j, sizes[j][0], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
715         break;
716       case CL_MEM_OBJECT_IMAGE1D_ARRAY:
717       case CL_MEM_OBJECT_IMAGE2D:
718         log_info(" size[%d] = [%ld %ld] (%g MB image)\n",
719                  j, sizes[j][0], sizes[j][1], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
720         break;
721       case CL_MEM_OBJECT_IMAGE2D_ARRAY:
722       case CL_MEM_OBJECT_IMAGE3D:
723         log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n",
724                  j, sizes[j][0], sizes[j][1], sizes[j][2], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
725         break;
726     }
727   }
728 }
729 
get_max_absolute_error(cl_image_format * format,image_sampler_data * sampler)730 float get_max_absolute_error( cl_image_format *format, image_sampler_data *sampler) {
731     if (sampler->filter_mode == CL_FILTER_NEAREST)
732         return 0.0f;
733 
734     switch (format->image_channel_data_type) {
735         case CL_SNORM_INT8:
736             return 1.0f/127.0f;
737         case CL_UNORM_INT8:
738             return 1.0f/255.0f;
739         case CL_UNORM_INT16:
740             return 1.0f/65535.0f;
741         case CL_SNORM_INT16:
742             return 1.0f/32767.0f;
743         case CL_FLOAT:
744             return CL_FLT_MIN;
745 #ifdef  CL_SFIXED14_APPLE
746         case CL_SFIXED14_APPLE:
747             return 0x1.0p-14f;
748 #endif
749         default:
750             return 0.0f;
751     }
752 }
753 
get_max_relative_error(cl_image_format * format,image_sampler_data * sampler,int is3D,int isLinearFilter)754 float get_max_relative_error( cl_image_format *format, image_sampler_data *sampler, int is3D, int isLinearFilter )
755 {
756     float maxError = 0.0f;
757     float sampleCount = 1.0f;
758     if( isLinearFilter )
759         sampleCount =  is3D ? 8.0f : 4.0f;
760 
761     // Note that the ULP is defined here as the unit in the last place of the maximum
762     // magnitude sample used for filtering.
763 
764     // Section 8.3
765     switch( format->image_channel_data_type )
766     {
767             // The spec allows 2 ulps of error for normalized formats
768         case CL_SNORM_INT8:
769         case CL_UNORM_INT8:
770         case CL_SNORM_INT16:
771         case CL_UNORM_INT16:
772         case CL_UNORM_SHORT_565:
773         case CL_UNORM_SHORT_555:
774         case CL_UNORM_INT_101010:
775             maxError = 2*FLT_EPSILON*sampleCount;       // Maximum sampling error for round to zero normalization based on multiplication
776             // by reciprocal (using reciprocal generated in round to +inf mode, so that 1.0 matches spec)
777             break;
778 
779             // If the implementation supports these formats then it will have to allow rounding error here too,
780             // because not all 32-bit ints are exactly representable in float
781         case CL_SIGNED_INT32:
782         case CL_UNSIGNED_INT32:
783             maxError = 1*FLT_EPSILON;
784             break;
785     }
786 
787 
788     // Section 8.2
789     if( sampler->addressing_mode == CL_ADDRESS_REPEAT || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT || sampler->filter_mode != CL_FILTER_NEAREST || sampler->normalized_coords )
790 #if defined( __APPLE__ )
791     {
792         if( sampler->filter_mode != CL_FILTER_NEAREST )
793         {
794             // The maximum
795             if( gDeviceType == CL_DEVICE_TYPE_GPU )
796                 maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4);              // Some GPUs ain't so accurate
797             else
798                 // The standard method of 2d linear filtering delivers 4.0 ulps of error in round to nearest (8 in rtz).
799                 maxError += 4.0f * FLT_EPSILON;
800         }
801         else
802             maxError += 4.0f * FLT_EPSILON;    // normalized coordinates will introduce some error into the fractional part of the address, affecting results
803     }
804 #else
805     {
806 #if !defined(_WIN32)
807 #warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec
808 #endif
809         // The spec allows linear filters to return any result most of the time.
810         // That's fine for implementations but a problem for testing. After all
811         // users aren't going to like garbage images.  We have "picked a number"
812         // here that we are going to attempt to conform to. Implementations are
813         // free to pick another number, like infinity, if they like.
814         // We picked a number for you, to provide /some/ sanity
815         maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7);
816         // ...but this is what the spec allows:
817         // maxError = INFINITY;
818         // Please feel free to pick any positive number. (NaN wont work.)
819     }
820 #endif
821 
822     // The error calculation itself can introduce error
823     maxError += FLT_EPSILON * 2;
824 
825     return maxError;
826 }
827 
get_format_max_int(cl_image_format * format)828 size_t get_format_max_int( cl_image_format *format )
829 {
830     switch( format->image_channel_data_type )
831     {
832         case CL_SNORM_INT8:
833         case CL_SIGNED_INT8:
834             return 127;
835         case CL_UNORM_INT8:
836         case CL_UNSIGNED_INT8:
837             return 255;
838 
839         case CL_SNORM_INT16:
840         case CL_SIGNED_INT16:
841             return 32767;
842 
843         case CL_UNORM_INT16:
844         case CL_UNSIGNED_INT16:
845             return 65535;
846 
847         case CL_SIGNED_INT32:
848             return 2147483647L;
849 
850         case CL_UNSIGNED_INT32:
851             return 4294967295LL;
852 
853         case CL_UNORM_SHORT_565:
854         case CL_UNORM_SHORT_555:
855             return 31;
856 
857         case CL_UNORM_INT_101010:
858             return 1023;
859 
860         case CL_HALF_FLOAT:
861             return 1<<10;
862 
863 #ifdef CL_SFIXED14_APPLE
864         case CL_SFIXED14_APPLE:
865             return 16384;
866 #endif
867         default:
868             return 0;
869     }
870 }
871 
get_format_min_int(cl_image_format * format)872 int get_format_min_int( cl_image_format *format )
873 {
874     switch( format->image_channel_data_type )
875     {
876         case CL_SNORM_INT8:
877         case CL_SIGNED_INT8:
878             return -128;
879         case CL_UNORM_INT8:
880         case CL_UNSIGNED_INT8:
881             return 0;
882 
883         case CL_SNORM_INT16:
884         case CL_SIGNED_INT16:
885             return -32768;
886 
887         case CL_UNORM_INT16:
888         case CL_UNSIGNED_INT16:
889             return 0;
890 
891         case CL_SIGNED_INT32:
892             return -2147483648LL;
893 
894         case CL_UNSIGNED_INT32:
895             return 0;
896 
897         case CL_UNORM_SHORT_565:
898         case CL_UNORM_SHORT_555:
899         case CL_UNORM_INT_101010:
900             return 0;
901 
902         case CL_HALF_FLOAT: return -(1 << 10);
903 
904 #ifdef CL_SFIXED14_APPLE
905         case CL_SFIXED14_APPLE:
906             return -16384;
907 #endif
908 
909         default:
910             return 0;
911     }
912 }
913 
convert_half_to_float(unsigned short halfValue)914 float convert_half_to_float( unsigned short halfValue )
915 {
916     // We have to take care of a few special cases, but in general, we just extract
917     // the same components from the half that exist in the float and re-stuff them
918     // For a description of the actual half format, see http://en.wikipedia.org/wiki/Half_precision
919     // Note: we store these in 32-bit ints to make the bit manipulations easier later
920     int sign =     ( halfValue >> 15 ) & 0x0001;
921     int exponent = ( halfValue >> 10 ) & 0x001f;
922     int mantissa = ( halfValue )       & 0x03ff;
923 
924     // Note: we use a union here to be able to access the bits of a float directly
925     union
926     {
927         unsigned int bits;
928         float floatValue;
929     } outFloat;
930 
931     // Special cases first
932     if( exponent == 0 )
933     {
934         if( mantissa == 0 )
935         {
936             // If both exponent and mantissa are 0, the number is +/- 0
937             outFloat.bits  = sign << 31;
938             return outFloat.floatValue; // Already done!
939         }
940 
941         // If exponent is 0, it's a denormalized number, so we renormalize it
942         // Note: this is not terribly efficient, but oh well
943         while( ( mantissa & 0x00000400 ) == 0 )
944         {
945             mantissa <<= 1;
946             exponent--;
947         }
948 
949         // The first bit is implicit, so we take it off and inc the exponent accordingly
950         exponent++;
951         mantissa &= ~(0x00000400);
952     }
953     else if( exponent == 31 ) // Special-case "numbers"
954     {
955         // If the exponent is 31, it's a special case number (+/- infinity or NAN).
956         // If the mantissa is 0, it's infinity, else it's NAN, but in either case, the packing
957         // method is the same
958         outFloat.bits = ( sign << 31 ) | 0x7f800000 | ( mantissa << 13 );
959         return outFloat.floatValue;
960     }
961 
962     // Plain ol' normalized number, so adjust to the ranges a 32-bit float expects and repack
963     exponent += ( 127 - 15 );
964     mantissa <<= 13;
965 
966     outFloat.bits = ( sign << 31 ) | ( exponent << 23 ) | mantissa;
967     return outFloat.floatValue;
968 }
969 
970 
971 
convert_float_to_half(float f)972 cl_ushort convert_float_to_half( float f )
973 {
974     switch( gFloatToHalfRoundingMode )
975     {
976         case kRoundToNearestEven:
977             return float2half_rte( f );
978         case kRoundTowardZero:
979             return float2half_rtz( f );
980         default:
981             log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" );
982             exit(-1);
983             return 0xffff;
984     }
985 
986 }
987 
float2half_rte(float f)988 cl_ushort float2half_rte( float f )
989     {
990     union{ float f; cl_uint u; } u = {f};
991     cl_uint sign = (u.u >> 16) & 0x8000;
992     float x = fabsf(f);
993 
994     //Nan
995     if( x != x )
996     {
997         u.u >>= (24-11);
998         u.u &= 0x7fff;
999         u.u |= 0x0200;      //silence the NaN
1000         return u.u | sign;
1001                 }
1002 
1003     // overflow
1004     if( x >= MAKE_HEX_FLOAT(0x1.ffep15f, 0x1ffeL, 3) )
1005         return 0x7c00 | sign;
1006 
1007     // underflow
1008     if( x <= MAKE_HEX_FLOAT(0x1.0p-25f, 0x1L, -25) )
1009         return sign;    // The halfway case can return 0x0001 or 0. 0 is even.
1010 
1011     // very small
1012     if( x < MAKE_HEX_FLOAT(0x1.8p-24f, 0x18L, -28) )
1013         return sign | 1;
1014 
1015     // half denormal
1016     if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
1017     {
1018         u.f = x * MAKE_HEX_FLOAT(0x1.0p-125f, 0x1L, -125);
1019         return sign | u.u;
1020         }
1021 
1022     u.f *= MAKE_HEX_FLOAT(0x1.0p13f, 0x1L, 13);
1023     u.u &= 0x7f800000;
1024     x += u.f;
1025     u.f = x - u.f;
1026     u.f *= MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);
1027 
1028     return (u.u >> (24-11)) | sign;
1029     }
1030 
float2half_rtz(float f)1031 cl_ushort float2half_rtz( float f )
1032     {
1033     union{ float f; cl_uint u; } u = {f};
1034     cl_uint sign = (u.u >> 16) & 0x8000;
1035     float x = fabsf(f);
1036 
1037     //Nan
1038     if( x != x )
1039         {
1040         u.u >>= (24-11);
1041         u.u &= 0x7fff;
1042         u.u |= 0x0200;      //silence the NaN
1043         return u.u | sign;
1044         }
1045 
1046     // overflow
1047     if( x >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) )
1048         {
1049         if( x == INFINITY )
1050             return 0x7c00 | sign;
1051 
1052         return 0x7bff | sign;
1053         }
1054 
1055     // underflow
1056     if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
1057         return sign;    // The halfway case can return 0x0001 or 0. 0 is even.
1058 
1059     // half denormal
1060     if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
1061     {
1062         x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
1063         return (cl_ushort)((int) x | sign);
1064     }
1065 
1066     u.u &= 0xFFFFE000U;
1067     u.u -= 0x38000000U;
1068 
1069     return (u.u >> (24-11)) | sign;
1070 }
1071 
1072 class TEST
1073 {
1074 public:
1075     TEST();
1076 };
1077 
1078 static TEST t;
__vstore_half_rte(float f,size_t index,uint16_t * p)1079 void  __vstore_half_rte(float f, size_t index, uint16_t *p)
1080 {
1081     union{ unsigned int u; float f;} u;
1082 
1083     u.f = f;
1084     unsigned short r = (u.u >> 16) & 0x8000;
1085     u.u &= 0x7fffffff;
1086     if( u.u >= 0x33000000U )
1087     {
1088         if( u.u >= 0x47800000 )
1089         {
1090             if( u.u <= 0x7f800000 )
1091                 r |= 0x7c00;
1092             else
1093             {
1094                 r |= 0x7e00 | ( (u.u >> 13) & 0x3ff );
1095             }
1096         }
1097         else
1098         {
1099             float x = u.f;
1100             if( u.u < 0x38800000 )
1101                 u.u = 0x3f000000;
1102             else
1103                 u.u += 0x06800000;
1104             u.u &= 0x7f800000U;
1105             x += u.f;
1106             x -= u.f;
1107             u.f = x * MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);
1108             u.u >>= 13;
1109             r |= (unsigned short) u.u;
1110         }
1111     }
1112 
1113     ((unsigned short*)p)[index] = r;
1114 }
1115 
TEST()1116 TEST::TEST()
1117 {
1118     return;
1119     union
1120     {
1121         float f;
1122         uint32_t i;
1123     } test;
1124     uint16_t control, myval;
1125 
1126     log_info(" &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" );
1127     test.i = 0;
1128     do
1129     {
1130         if( ( test.i & 0xffffff ) == 0 )
1131         {
1132             if( ( test.i & 0xfffffff ) == 0 )
1133                 log_info( "*" );
1134             else
1135                 log_info( "." );
1136             fflush(stdout);
1137         }
1138         __vstore_half_rte( test.f, 0, &control );
1139         myval = convert_float_to_half( test.f );
1140         if( myval != control )
1141         {
1142             log_info( "\n******** ERROR: MyVal %04x control %04x source %12.24f\n", myval, control, test.f );
1143             log_info( "         source bits: %08x   %a\n", test.i, test.f );
1144             float t, c;
1145             c = convert_half_to_float( control );
1146             t = convert_half_to_float( myval );
1147             log_info( "         converted control: %12.24f myval: %12.24f\n", c, t );
1148         }
1149         test.i++;
1150     } while( test.i != 0 );
1151     log_info("\n &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" );
1152 
1153 }
1154 
get_image_size(image_descriptor const * imageInfo)1155 cl_ulong get_image_size( image_descriptor const *imageInfo )
1156 {
1157     cl_ulong imageSize;
1158 
1159     // Assumes rowPitch and slicePitch are always correctly defined
1160     if ( /*gTestMipmaps*/ imageInfo->num_mip_levels > 1 )
1161     {
1162       imageSize = (size_t) compute_mipmapped_image_size(*imageInfo);
1163     }
1164     else
1165     {
1166       switch (imageInfo->type)
1167       {
1168       case CL_MEM_OBJECT_IMAGE1D:
1169         imageSize = imageInfo->rowPitch;
1170         break;
1171       case CL_MEM_OBJECT_IMAGE2D:
1172         imageSize = imageInfo->height * imageInfo->rowPitch;
1173         break;
1174       case CL_MEM_OBJECT_IMAGE3D:
1175         imageSize = imageInfo->depth * imageInfo->slicePitch;
1176         break;
1177       case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1178         imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1179         break;
1180       case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1181         imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1182         break;
1183       default:
1184         log_error("ERROR: Cannot identify image type %x\n", imageInfo->type);
1185         abort();
1186       }
1187     }
1188     return imageSize;
1189 }
1190 
1191 // Calculate image size in megabytes (strictly, mebibytes). Result is rounded up.
get_image_size_mb(image_descriptor const * imageInfo)1192 cl_ulong get_image_size_mb( image_descriptor const *imageInfo )
1193 {
1194     cl_ulong imageSize = get_image_size( imageInfo );
1195     cl_ulong mb = imageSize / ( 1024 * 1024 );
1196     if ( imageSize % ( 1024 * 1024 ) > 0 )
1197     {
1198         mb += 1;
1199     }
1200     return  mb;
1201 }
1202 
1203 
1204 uint64_t gRoundingStartValue = 0;
1205 
1206 
escape_inf_nan_values(char * data,size_t allocSize)1207 void escape_inf_nan_values( char* data, size_t allocSize ) {
1208     // filter values with 8 not-quite-highest bits
1209     unsigned int *intPtr = (unsigned int *)data;
1210     for( size_t i = 0; i < allocSize >> 2; i++ )
1211     {
1212         if( ( intPtr[ i ] & 0x7F800000 ) == 0x7F800000 )
1213             intPtr[ i ] ^= 0x40000000;
1214     }
1215 
1216     // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits = 0x7C00 are special)
1217     unsigned short *shortPtr = (unsigned short *)data;
1218     for( size_t i = 0; i < allocSize >> 1; i++ )
1219     {
1220         if( ( shortPtr[ i ] & 0x7C00 ) == 0x7C00 )
1221             shortPtr[ i ] ^= 0x4000;
1222     }
1223 }
1224 
generate_random_image_data(image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d)1225 char * generate_random_image_data( image_descriptor *imageInfo, BufferOwningPtr<char> &P, MTdata d )
1226 {
1227     size_t allocSize = get_image_size( imageInfo );
1228     size_t pixelRowBytes = imageInfo->width * get_pixel_size( imageInfo->format );
1229     size_t i;
1230 
1231     if (imageInfo->num_mip_levels > 1)
1232       allocSize = compute_mipmapped_image_size(*imageInfo);
1233 
1234 #if defined (__APPLE__ )
1235     char *data = NULL;
1236     if (gDeviceType == CL_DEVICE_TYPE_CPU) {
1237         size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192;
1238 
1239         void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
1240         intptr_t data_end = (intptr_t)map + mapSize - 4096;
1241         data = (char *)(data_end - (intptr_t)allocSize);
1242 
1243         mprotect(map, 4096, PROT_NONE);
1244         mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
1245         P.reset(data, map, mapSize,allocSize);
1246     } else {
1247         data = (char *)malloc(allocSize);
1248         P.reset(data,NULL,0,allocSize);
1249     }
1250 #else
1251     P.reset( NULL ); // Free already allocated memory first, then try to allocate new block.
1252     char *data = (char *)align_malloc(allocSize, get_pixel_size(imageInfo->format));
1253     P.reset(data,NULL,0,allocSize, true);
1254 #endif
1255 
1256     if (data == NULL) {
1257       log_error( "ERROR: Unable to malloc %lu bytes for generate_random_image_data\n", allocSize );
1258       return 0;
1259     }
1260 
1261     if( gTestRounding )
1262     {
1263         // Special case: fill with a ramp from 0 to the size of the type
1264         size_t typeSize = get_format_type_size( imageInfo->format );
1265         switch( typeSize )
1266         {
1267             case 1:
1268             {
1269                 char *ptr = data;
1270                 for( i = 0; i < allocSize; i++ )
1271                     ptr[i] = (cl_char) (i + gRoundingStartValue);
1272             }
1273                 break;
1274             case 2:
1275             {
1276                 cl_short *ptr = (cl_short*) data;
1277                 for( i = 0; i < allocSize / 2; i++ )
1278                     ptr[i] = (cl_short) (i +  gRoundingStartValue);
1279             }
1280                 break;
1281             case 4:
1282             {
1283                 cl_int *ptr = (cl_int*) data;
1284                 for( i = 0; i < allocSize / 4; i++ )
1285                     ptr[i] = (cl_int) (i +  gRoundingStartValue);
1286             }
1287                 break;
1288         }
1289 
1290         // Note: inf or nan float values would cause problems, although we don't know this will
1291         // actually be a float, so we just know what to look for
1292         escape_inf_nan_values( data, allocSize );
1293         return data;
1294     }
1295 
1296     // Otherwise, we should be able to just fill with random bits no matter what
1297     cl_uint *p = (cl_uint*) data;
1298     for( i = 0; i + 4 <= allocSize; i += 4 )
1299         p[ i / 4 ] = genrand_int32(d);
1300 
1301     for( ; i < allocSize; i++ )
1302         data[i] = genrand_int32(d);
1303 
1304     // Note: inf or nan float values would cause problems, although we don't know this will
1305     // actually be a float, so we just know what to look for
1306     escape_inf_nan_values( data, allocSize );
1307 
1308     if ( /*!gTestMipmaps*/ imageInfo->num_mip_levels < 2 )
1309     {
1310       // Fill unused edges with -1, NaN for float
1311       if (imageInfo->rowPitch > pixelRowBytes)
1312       {
1313           size_t height = 0;
1314 
1315           switch (imageInfo->type)
1316           {
1317               case CL_MEM_OBJECT_IMAGE2D:
1318               case CL_MEM_OBJECT_IMAGE3D:
1319               case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1320                   height = imageInfo->height;
1321                   break;
1322               case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1323                   height = imageInfo->arraySize;
1324                   break;
1325             }
1326 
1327             // Fill in the row padding regions
1328             for( i = 0; i < height; i++ )
1329             {
1330                 size_t offset = i * imageInfo->rowPitch + pixelRowBytes;
1331                 size_t length = imageInfo->rowPitch - pixelRowBytes;
1332                 memset( data + offset, 0xff, length );
1333             }
1334       }
1335 
1336       // Fill in the slice padding regions, if necessary:
1337 
1338       size_t slice_dimension = imageInfo->height;
1339       if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
1340           slice_dimension = imageInfo->arraySize;
1341       }
1342 
1343       if (imageInfo->slicePitch > slice_dimension*imageInfo->rowPitch)
1344       {
1345           size_t depth = 0;
1346           switch (imageInfo->type)
1347           {
1348             case CL_MEM_OBJECT_IMAGE2D:
1349             case CL_MEM_OBJECT_IMAGE3D:
1350                 depth = imageInfo->depth;
1351                 break;
1352             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1353             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1354                 depth = imageInfo->arraySize;
1355                 break;
1356           }
1357 
1358           for( i = 0; i < depth; i++ )
1359           {
1360               size_t offset = i * imageInfo->slicePitch + slice_dimension*imageInfo->rowPitch;
1361               size_t length = imageInfo->slicePitch - slice_dimension*imageInfo->rowPitch;
1362               memset( data + offset, 0xff, length );
1363           }
1364       }
1365     }
1366 
1367     return data;
1368 }
1369 
1370 #define CLAMP_FLOAT( v ) ( fmaxf( fminf( v, 1.f ), -1.f ) )
1371 
1372 
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData,int lod)1373 void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1374                             int x, int y, int z, float *outData, int lod )
1375 {
1376     size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth;
1377     size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1378 
1379     if ( imageInfo->num_mip_levels > 1 )
1380     {
1381       switch(imageInfo->type)
1382       {
1383       case CL_MEM_OBJECT_IMAGE3D :
1384         depth_lod = ( imageInfo->depth >> lod ) ? ( imageInfo->depth >> lod ) : 1;
1385       case CL_MEM_OBJECT_IMAGE2D :
1386       case CL_MEM_OBJECT_IMAGE2D_ARRAY :
1387         height_lod = ( imageInfo->height >> lod ) ? ( imageInfo->height >> lod ) : 1;
1388       default :
1389         width_lod = ( imageInfo->width >> lod ) ? ( imageInfo->width >> lod ) : 1;
1390       }
1391       row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1392       if ( imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY )
1393         slice_pitch_lod = row_pitch_lod;
1394       else if ( imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1395         slice_pitch_lod = row_pitch_lod * height_lod;
1396     }
1397     else
1398     {
1399       row_pitch_lod = imageInfo->rowPitch;
1400       slice_pitch_lod = imageInfo->slicePitch;
1401     }
1402     if ( x < 0 || y < 0 || z < 0 || x >= (int)width_lod
1403                || ( height_lod != 0 && y >= (int)height_lod )
1404                || ( depth_lod != 0 && z >= (int)depth_lod )
1405                || ( imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize ) )
1406     {
1407             outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = outData[ 3 ] = 0;
1408             if (!has_alpha(imageInfo->format))
1409                 outData[3] = 1;
1410         return;
1411     }
1412 
1413     cl_image_format *format = imageInfo->format;
1414 
1415     unsigned int i;
1416     float tempData[ 4 ];
1417 
1418     // Advance to the right spot
1419     char *ptr = (char *)imageData;
1420     size_t pixelSize = get_pixel_size( format );
1421 
1422     ptr += z * slice_pitch_lod + y * row_pitch_lod + x * pixelSize;
1423 
1424     // OpenCL only supports reading floats from certain formats
1425     size_t channelCount = get_format_channel_count( format );
1426     switch( format->image_channel_data_type )
1427     {
1428         case CL_SNORM_INT8:
1429         {
1430             cl_char *dPtr = (cl_char *)ptr;
1431             for( i = 0; i < channelCount; i++ )
1432                 tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 127.0f );
1433             break;
1434         }
1435 
1436         case CL_UNORM_INT8:
1437         {
1438             unsigned char *dPtr = (unsigned char *)ptr;
1439             for( i = 0; i < channelCount; i++ ) {
1440                 if((is_sRGBA_order(imageInfo->format->image_channel_order)) && i<3) // only RGB need to be converted for sRGBA
1441                     tempData[ i ] = (float)sRGBunmap((float)dPtr[ i ] / 255.0f) ;
1442                 else
1443                     tempData[ i ] = (float)dPtr[ i ] / 255.0f;
1444             }
1445             break;
1446         }
1447 
1448         case CL_SIGNED_INT8:
1449         {
1450             cl_char *dPtr = (cl_char *)ptr;
1451             for( i = 0; i < channelCount; i++ )
1452                 tempData[ i ] =  (float)dPtr[ i ];
1453             break;
1454         }
1455 
1456         case CL_UNSIGNED_INT8:
1457         {
1458             cl_uchar *dPtr = (cl_uchar *)ptr;
1459             for( i = 0; i < channelCount; i++ )
1460                 tempData[ i ] = (float) dPtr[ i ];
1461             break;
1462         }
1463 
1464         case CL_SNORM_INT16:
1465         {
1466             cl_short *dPtr = (cl_short *)ptr;
1467             for( i = 0; i < channelCount; i++ )
1468                 tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 32767.0f );
1469             break;
1470         }
1471 
1472         case CL_UNORM_INT16:
1473         {
1474             cl_ushort *dPtr = (cl_ushort *)ptr;
1475             for( i = 0; i < channelCount; i++ )
1476                 tempData[ i ] = (float)dPtr[ i ] / 65535.0f;
1477             break;
1478         }
1479 
1480         case CL_SIGNED_INT16:
1481         {
1482             cl_short *dPtr = (cl_short *)ptr;
1483             for( i = 0; i < channelCount; i++ )
1484                 tempData[ i ] = (float)dPtr[ i ];
1485             break;
1486         }
1487 
1488         case CL_UNSIGNED_INT16:
1489         {
1490             cl_ushort *dPtr = (cl_ushort *)ptr;
1491             for( i = 0; i < channelCount; i++ )
1492                 tempData[ i ] = (float) dPtr[ i ];
1493             break;
1494         }
1495 
1496         case CL_HALF_FLOAT:
1497         {
1498             cl_ushort *dPtr = (cl_ushort *)ptr;
1499             for( i = 0; i < channelCount; i++ )
1500                 tempData[ i ] = convert_half_to_float( dPtr[ i ] );
1501             break;
1502         }
1503 
1504         case CL_SIGNED_INT32:
1505         {
1506             cl_int *dPtr = (cl_int *)ptr;
1507             for( i = 0; i < channelCount; i++ )
1508                 tempData[ i ] = (float)dPtr[ i ];
1509             break;
1510         }
1511 
1512         case CL_UNSIGNED_INT32:
1513         {
1514             cl_uint *dPtr = (cl_uint *)ptr;
1515             for( i = 0; i < channelCount; i++ )
1516                 tempData[ i ] = (float)dPtr[ i ];
1517             break;
1518         }
1519 
1520         case CL_UNORM_SHORT_565:
1521         {
1522             cl_ushort *dPtr = (cl_ushort *)ptr;
1523             tempData[ 0 ] = (float)( dPtr[ 0 ] >> 11 ) / (float)31;
1524             tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 63 ) / (float)63;
1525             tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31;
1526             break;
1527         }
1528 
1529         case CL_UNORM_SHORT_555:
1530         {
1531             cl_ushort *dPtr = (cl_ushort *)ptr;
1532             tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 31 ) / (float)31;
1533             tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 31 ) / (float)31;
1534             tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31;
1535             break;
1536         }
1537 
1538         case CL_UNORM_INT_101010:
1539         {
1540             cl_uint *dPtr = (cl_uint *)ptr;
1541             tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 20 ) & 0x3ff ) / (float)1023;
1542             tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 0x3ff ) / (float)1023;
1543             tempData[ 2 ] = (float)( dPtr[ 0 ] & 0x3ff ) / (float)1023;
1544             break;
1545         }
1546 
1547         case CL_FLOAT:
1548         {
1549             float *dPtr = (float *)ptr;
1550             for( i = 0; i < channelCount; i++ )
1551                 tempData[ i ] = (float)dPtr[ i ];
1552             break;
1553         }
1554 #ifdef  CL_SFIXED14_APPLE
1555         case CL_SFIXED14_APPLE:
1556         {
1557             cl_ushort *dPtr = (cl_ushort*) ptr;
1558             for( i = 0; i < channelCount; i++ )
1559                 tempData[i] = ((int) dPtr[i] - 16384) * 0x1.0p-14f;
1560             break;
1561         }
1562 #endif
1563     }
1564 
1565 
1566     outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = 0;
1567     outData[ 3 ] = 1;
1568 
1569     switch( format->image_channel_order )
1570     {
1571         case CL_A:
1572             outData[ 3 ] = tempData[ 0 ];
1573             break;
1574         case CL_R:
1575         case CL_Rx:
1576             outData[ 0 ] = tempData[ 0 ];
1577             break;
1578         case CL_RA:
1579             outData[ 0 ] = tempData[ 0 ];
1580             outData[ 3 ] = tempData[ 1 ];
1581             break;
1582         case CL_RG:
1583         case CL_RGx:
1584             outData[ 0 ] = tempData[ 0 ];
1585             outData[ 1 ] = tempData[ 1 ];
1586             break;
1587         case CL_RGB:
1588         case CL_RGBx:
1589         case CL_sRGB:
1590         case CL_sRGBx:
1591             outData[ 0 ] = tempData[ 0 ];
1592             outData[ 1 ] = tempData[ 1 ];
1593             outData[ 2 ] = tempData[ 2 ];
1594             break;
1595         case CL_RGBA:
1596             outData[ 0 ] = tempData[ 0 ];
1597             outData[ 1 ] = tempData[ 1 ];
1598             outData[ 2 ] = tempData[ 2 ];
1599             outData[ 3 ] = tempData[ 3 ];
1600             break;
1601         case CL_ARGB:
1602             outData[ 0 ] = tempData[ 1 ];
1603             outData[ 1 ] = tempData[ 2 ];
1604             outData[ 2 ] = tempData[ 3 ];
1605             outData[ 3 ] = tempData[ 0 ];
1606             break;
1607         case CL_BGRA:
1608         case CL_sBGRA:
1609             outData[ 0 ] = tempData[ 2 ];
1610             outData[ 1 ] = tempData[ 1 ];
1611             outData[ 2 ] = tempData[ 0 ];
1612             outData[ 3 ] = tempData[ 3 ];
1613             break;
1614         case CL_INTENSITY:
1615             outData[ 0 ] = tempData[ 0 ];
1616             outData[ 1 ] = tempData[ 0 ];
1617             outData[ 2 ] = tempData[ 0 ];
1618             outData[ 3 ] = tempData[ 0 ];
1619             break;
1620         case CL_LUMINANCE:
1621             outData[ 0 ] = tempData[ 0 ];
1622             outData[ 1 ] = tempData[ 0 ];
1623             outData[ 2 ] = tempData[ 0 ];
1624             break;
1625 #ifdef CL_1RGB_APPLE
1626         case CL_1RGB_APPLE:
1627             outData[ 0 ] = tempData[ 1 ];
1628             outData[ 1 ] = tempData[ 2 ];
1629             outData[ 2 ] = tempData[ 3 ];
1630             outData[ 3 ] = 1.0f;
1631             break;
1632 #endif
1633 #ifdef CL_BGR1_APPLE
1634         case CL_BGR1_APPLE:
1635             outData[ 0 ] = tempData[ 2 ];
1636             outData[ 1 ] = tempData[ 1 ];
1637             outData[ 2 ] = tempData[ 0 ];
1638             outData[ 3 ] = 1.0f;
1639             break;
1640 #endif
1641         case CL_sRGBA:
1642             outData[ 0 ] = tempData[ 0 ];
1643             outData[ 1 ] = tempData[ 1 ];
1644             outData[ 2 ] = tempData[ 2 ];
1645             outData[ 3 ] = tempData[ 3 ];
1646             break;
1647         case CL_DEPTH:
1648             outData[ 0 ] = tempData[ 0 ];
1649             break;
1650         default:
1651             log_error("Invalid format:");
1652             print_header(format, true);
1653             break;
1654     }
1655 }
1656 
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData)1657 void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1658                             int x, int y, int z, float *outData )
1659 {
1660   read_image_pixel_float( imageData, imageInfo, x, y, z, outData, 0 );
1661 }
1662 
get_integer_coords(float x,float y,float z,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1663 bool get_integer_coords( float x, float y, float z, size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ ) {
1664     return get_integer_coords_offset(x, y, z, 0.0f, 0.0f, 0.0f, width, height, depth, imageSampler, imageInfo, outX, outY, outZ);
1665 }
1666 
get_integer_coords_offset(float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1667 bool get_integer_coords_offset( float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
1668                                size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ )
1669 {
1670     AddressFn adFn = sAddressingTable[ imageSampler ];
1671 
1672     float refX = floorf( x ), refY = floorf( y ), refZ = floorf( z );
1673 
1674     // Handle sampler-directed coordinate normalization + clamping.  Note that
1675     // the array coordinate for image array types is expected to be
1676     // unnormalized, and is clamped to 0..arraySize-1.
1677     if( imageSampler->normalized_coords )
1678     {
1679         switch (imageSampler->addressing_mode)
1680         {
1681             case CL_ADDRESS_REPEAT:
1682                 x = RepeatNormalizedAddressFn( x, width );
1683                 if (height != 0) {
1684                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1685                         y = RepeatNormalizedAddressFn( y, height );
1686                 }
1687                 if (depth != 0) {
1688                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1689                         z = RepeatNormalizedAddressFn( z, depth );
1690                 }
1691 
1692                 if (xAddressOffset != 0.0) {
1693                     // Add in the offset
1694                     x += xAddressOffset;
1695                     // Handle wrapping
1696                     if (x > width)
1697                         x -= (float)width;
1698                     if (x < 0)
1699                         x += (float)width;
1700                 }
1701                 if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) {
1702                     // Add in the offset
1703                     y += yAddressOffset;
1704                     // Handle wrapping
1705                     if (y > height)
1706                         y -= (float)height;
1707                     if (y < 0)
1708                         y += (float)height;
1709                 }
1710                 if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) )  {
1711                     // Add in the offset
1712                     z += zAddressOffset;
1713                     // Handle wrapping
1714                     if (z > depth)
1715                         z -= (float)depth;
1716                     if (z < 0)
1717                         z += (float)depth;
1718                 }
1719                 break;
1720 
1721             case CL_ADDRESS_MIRRORED_REPEAT:
1722                 x = MirroredRepeatNormalizedAddressFn( x, width );
1723                 if (height != 0) {
1724                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1725                         y = MirroredRepeatNormalizedAddressFn( y, height );
1726                 }
1727                 if (depth != 0) {
1728                     if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1729                         z = MirroredRepeatNormalizedAddressFn( z, depth );
1730                 }
1731 
1732                 if (xAddressOffset != 0.0)
1733                 {
1734                     float temp = x + xAddressOffset;
1735                     if( temp > (float) width )
1736                         temp = (float) width - (temp - (float) width );
1737                     x = fabsf( temp );
1738                 }
1739                 if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) {
1740                     float temp = y + yAddressOffset;
1741                     if( temp > (float) height )
1742                         temp = (float) height - (temp - (float) height );
1743                     y = fabsf( temp );
1744                 }
1745                 if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) )  {
1746                     float temp = z + zAddressOffset;
1747                     if( temp > (float) depth )
1748                         temp = (float) depth - (temp - (float) depth );
1749                     z = fabsf( temp );
1750                 }
1751                 break;
1752 
1753             default:
1754                 // Also, remultiply to the original coords. This simulates any truncation in
1755                 // the pass to OpenCL
1756                 x *= (float)width;
1757                 x += xAddressOffset;
1758 
1759                 if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1760                 {
1761                     y *= (float)height;
1762                     y += yAddressOffset;
1763                 }
1764 
1765                 if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1766                 {
1767                     z *= (float)depth;
1768                     z += zAddressOffset;
1769                 }
1770                 break;
1771         }
1772     }
1773 
1774     // At this point, we're dealing with non-normalized coordinates.
1775 
1776     outX = adFn( floorf( x ), width );
1777 
1778     // 1D and 2D arrays require special care for the index coordinate:
1779 
1780     switch (imageInfo->type) {
1781         case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1782             outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f);
1783             outZ = 0.0f; /* don't care! */
1784             break;
1785         case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1786             outY = adFn( floorf( y ), height );
1787             outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f);
1788             break;
1789         default:
1790             // legacy path:
1791             if (height != 0)
1792                 outY = adFn( floorf( y ), height );
1793             if( depth != 0 )
1794                 outZ = adFn( floorf( z ), depth );
1795     }
1796 
1797     return !( (int)refX == outX && (int)refY == outY && (int)refZ == outZ );
1798 }
1799 
frac(float a)1800 static float frac(float a) {
1801     return a - floorf(a);
1802 }
1803 
1804 static inline void pixelMax( const float a[4], const float b[4], float *results );
pixelMax(const float a[4],const float b[4],float * results)1805 static inline void pixelMax( const float a[4], const float b[4], float *results )
1806 {
1807     for( int i = 0; i < 4; i++ )
1808         results[i] = errMax( fabsf(a[i]), fabsf(b[i]) );
1809 }
1810 
1811 // If containsDenorms is NULL, flush denorms to zero
1812 // if containsDenorms is not NULL, record whether there are any denorms
1813 static inline void  check_for_denorms(float a[4], int *containsDenorms );
check_for_denorms(float a[4],int * containsDenorms)1814 static inline void  check_for_denorms(float a[4], int *containsDenorms )
1815 {
1816     if( NULL == containsDenorms )
1817     {
1818         for( int i = 0; i < 4; i++ )
1819         {
1820             if( IsFloatSubnormal( a[i] ) )
1821                 a[i] = copysignf( 0.0f, a[i] );
1822         }
1823     }
1824     else
1825     {
1826         for( int i = 0; i < 4; i++ )
1827         {
1828             if( IsFloatSubnormal( a[i] ) )
1829             {
1830                 *containsDenorms = 1;
1831                 break;
1832             }
1833         }
1834     }
1835 }
1836 
calculate_array_index(float coord,float extent)1837 inline float calculate_array_index( float coord, float extent ) {
1838     // from Section 8.4 of the 1.2 Spec 'Selecting an Image from an Image Array'
1839     //
1840     // given coordinate 'w' that represents an index:
1841     // layer_index = clamp( rint(w), 0, image_array_size - 1)
1842 
1843     float ret = rintf( coord );
1844     ret = ret > extent ? extent : ret;
1845     ret = ret < 0.0f ? 0.0f : ret;
1846 
1847     return ret;
1848 }
1849 
1850 /*
1851  * Utility function to unnormalized a coordinate given a particular sampler.
1852  *
1853  * name     - the name of the coordinate, used for verbose debugging only
1854  * coord    - the coordinate requiring unnormalization
1855  * offset   - an addressing offset to be added to the coordinate
1856  * extent   - the max value for this coordinate (e.g. width for x)
1857  */
unnormalize_coordinate(const char * name,float coord,float offset,float extent,cl_addressing_mode addressing_mode,int verbose)1858 static float unnormalize_coordinate( const char* name, float coord,
1859     float offset, float extent, cl_addressing_mode addressing_mode, int verbose )
1860 {
1861     float ret = 0.0f;
1862 
1863     switch (addressing_mode) {
1864         case CL_ADDRESS_REPEAT:
1865             ret = RepeatNormalizedAddressFn( coord, extent );
1866 
1867             if ( verbose ) {
1868                 log_info( "\tRepeat filter denormalizes %s (%f) to %f\n",
1869                     name, coord, ret );
1870             }
1871 
1872             if (offset != 0.0) {
1873                 // Add in the offset, and handle wrapping.
1874                 ret += offset;
1875                 if (ret > extent) ret -= extent;
1876                 if (ret < 0.0) ret += extent;
1877             }
1878 
1879             if (verbose && offset != 0.0f) {
1880                 log_info( "\tAddress offset of %f added to get %f\n", offset, ret );
1881             }
1882             break;
1883 
1884         case CL_ADDRESS_MIRRORED_REPEAT:
1885             ret = MirroredRepeatNormalizedAddressFn( coord, extent );
1886 
1887             if ( verbose ) {
1888                 log_info( "\tMirrored repeat filter denormalizes %s (%f) to %f\n",
1889                     name, coord, ret );
1890             }
1891 
1892             if (offset != 0.0) {
1893                 float temp = ret + offset;
1894                 if( temp > extent )
1895                     temp = extent - (temp - extent );
1896                 ret = fabsf( temp );
1897             }
1898 
1899             if (verbose && offset != 0.0f) {
1900                 log_info( "\tAddress offset of %f added to get %f\n", offset, ret );
1901             }
1902             break;
1903 
1904         default:
1905 
1906             ret = coord * extent;
1907 
1908             if ( verbose ) {
1909                 log_info( "\tFilter denormalizes %s to %f (%f * %f)\n",
1910                     name, ret, coord, extent);
1911             }
1912 
1913             ret += offset;
1914 
1915             if (verbose && offset != 0.0f) {
1916                 log_info( "\tAddress offset of %f added to get %f\n", offset, ret );
1917             }
1918     }
1919 
1920     return ret;
1921 }
1922 
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)1923 FloatPixel sample_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1924                                     float x, float y, float z,
1925                                     image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms ) {
1926     return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms);
1927 }
1928 
1929 // returns max pixel value of the pixels touched
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1930 FloatPixel sample_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1931                                     float x, float y, float z,
1932                                     image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms , int lod) {
1933     return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms, lod);
1934 }
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1935 FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo,
1936                                            float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
1937                                            image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms , int lod)
1938 {
1939     AddressFn adFn = sAddressingTable[ imageSampler ];
1940     FloatPixel returnVal;
1941     size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth;
1942     size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1943 
1944     if ( imageInfo->num_mip_levels > 1 )
1945     {
1946       switch(imageInfo->type)
1947       {
1948       case CL_MEM_OBJECT_IMAGE3D :
1949         depth_lod = ( imageInfo->depth >> lod ) ? ( imageInfo->depth >> lod ) : 1;
1950       case CL_MEM_OBJECT_IMAGE2D :
1951       case CL_MEM_OBJECT_IMAGE2D_ARRAY :
1952         height_lod = ( imageInfo->height >> lod ) ? ( imageInfo->height >> lod ) : 1;
1953       default :
1954         width_lod = ( imageInfo->width >> lod ) ? ( imageInfo->width >> lod ) : 1;
1955       }
1956       row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1957       if ( imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY )
1958         slice_pitch_lod = row_pitch_lod;
1959       else if ( imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1960         slice_pitch_lod = row_pitch_lod * height_lod;
1961     }
1962     else
1963     {
1964       slice_pitch_lod = imageInfo->slicePitch;
1965       row_pitch_lod = imageInfo->rowPitch;
1966     }
1967 
1968     if( containsDenorms )
1969         *containsDenorms = 0;
1970 
1971     if( imageSampler->normalized_coords ) {
1972 
1973         // We need to unnormalize our coordinates differently depending on
1974         // the image type, but 'x' is always processed the same way.
1975 
1976         x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod,
1977             imageSampler->addressing_mode, verbose);
1978 
1979         switch (imageInfo->type) {
1980 
1981             // The image array types require special care:
1982 
1983             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1984                 z = 0; // don't care -- unused for 1D arrays
1985                 break;
1986 
1987             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1988                 y = unnormalize_coordinate("y", y, yAddressOffset, (float)height_lod,
1989                     imageSampler->addressing_mode, verbose);
1990                 break;
1991 
1992             // Everybody else:
1993 
1994             default:
1995                 y = unnormalize_coordinate("y", y, yAddressOffset, (float)height_lod,
1996                     imageSampler->addressing_mode, verbose);
1997                 z = unnormalize_coordinate("z", z, zAddressOffset, (float)depth_lod,
1998                     imageSampler->addressing_mode, verbose);
1999         }
2000 
2001     } else if ( verbose ) {
2002 
2003         switch (imageInfo->type) {
2004             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2005                 log_info("Starting coordinate: %f, array index %f\n", x, y);
2006                 break;
2007             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2008                 log_info("Starting coordinate: %f, %f, array index %f\n", x, y, z);
2009                 break;
2010             case CL_MEM_OBJECT_IMAGE1D:
2011             case CL_MEM_OBJECT_IMAGE1D_BUFFER:
2012                 log_info("Starting coordinate: %f\b", x);
2013                 break;
2014             case CL_MEM_OBJECT_IMAGE2D:
2015                 log_info("Starting coordinate: %f, %f\n", x, y);
2016                 break;
2017             case CL_MEM_OBJECT_IMAGE3D:
2018             default:
2019                 log_info("Starting coordinate: %f, %f, %f\n", x, y, z);
2020         }
2021     }
2022 
2023     // At this point, we have unnormalized coordinates.
2024 
2025     if( imageSampler->filter_mode == CL_FILTER_NEAREST )
2026     {
2027         int ix, iy, iz;
2028 
2029         // We apply the addressing function to the now-unnormalized
2030         // coordinates.  Note that the array cases again require special
2031         // care, per section 8.4 in the OpenCL 1.2 Specification.
2032 
2033         ix = adFn( floorf( x ), width_lod );
2034 
2035         switch (imageInfo->type) {
2036             case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2037                 iy = calculate_array_index( y, (float)(imageInfo->arraySize - 1) );
2038                 iz = 0;
2039                 if( verbose ) {
2040                   log_info("\tArray index %f evaluates to %d\n",y, iy );
2041                 }
2042                 break;
2043             case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2044                 iy = adFn( floorf( y ), height_lod );
2045                 iz = calculate_array_index( z, (float)(imageInfo->arraySize - 1) );
2046                 if( verbose ) {
2047                     log_info("\tArray index %f evaluates to %d\n",z, iz );
2048                 }
2049                 break;
2050             default:
2051                 iy = adFn( floorf( y ), height_lod );
2052                 if( depth_lod != 0 )
2053                     iz = adFn( floorf( z ), depth_lod );
2054                 else
2055                     iz = 0;
2056         }
2057 
2058         if( verbose ) {
2059             if( iz )
2060                 log_info( "\tReference integer coords calculated: { %d, %d, %d }\n", ix, iy, iz );
2061             else
2062                 log_info( "\tReference integer coords calculated: { %d, %d }\n", ix, iy );
2063         }
2064 
2065         read_image_pixel_float( imageData, imageInfo, ix, iy, iz, outData, lod );
2066         check_for_denorms( outData, containsDenorms );
2067         for( int i = 0; i < 4; i++ )
2068             returnVal.p[i] = fabsf( outData[i] );
2069         return returnVal;
2070     }
2071     else
2072     {
2073         // Linear filtering cases.
2074 
2075         size_t width = width_lod, height = height_lod, depth = depth_lod;
2076 
2077         // Image arrays can use 2D filtering, but require us to walk into the
2078         // image a certain number of slices before reading.
2079 
2080         if( depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY ||
2081                           imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
2082         {
2083             float array_index = 0;
2084 
2085             size_t layer_offset = 0;
2086 
2087             if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
2088                 array_index = calculate_array_index(z, (float)(imageInfo->arraySize - 1));
2089                 layer_offset = slice_pitch_lod * (size_t)array_index;
2090             }
2091             else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
2092                 array_index = calculate_array_index(y, (float)(imageInfo->arraySize - 1));
2093                 layer_offset = slice_pitch_lod * (size_t)array_index;
2094 
2095                 // Set up y and height so that the filtering below is correct
2096                 // 1D filtering on a single slice.
2097                 height = 1;
2098             }
2099 
2100             int x1 = adFn( floorf( x - 0.5f ), width );
2101             int y1 = 0;
2102             int x2 = adFn( floorf( x - 0.5f ) + 1, width );
2103             int y2 = 0;
2104             if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D) &&
2105                 (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
2106                 (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
2107                 y1 = adFn( floorf( y - 0.5f ), height );
2108                 y2 = adFn( floorf( y - 0.5f ) + 1, height );
2109             } else {
2110               y = 0.5f;
2111             }
2112 
2113             if( verbose ) {
2114                 log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{ %d, %d } and i1:{ %d, %d }\n", x1, y1, x2, y2 );
2115                 log_info( "\tArray coordinate is %f\n", array_index);
2116             }
2117 
2118             // Walk to beginning of the 'correct' slice, if needed.
2119             char* imgPtr = ((char*)imageData) + layer_offset;
2120 
2121             float upLeft[ 4 ], upRight[ 4 ], lowLeft[ 4 ], lowRight[ 4 ];
2122             float maxUp[4], maxLow[4];
2123             read_image_pixel_float( imgPtr, imageInfo, x1, y1, 0, upLeft, lod );
2124             read_image_pixel_float( imgPtr, imageInfo, x2, y1, 0, upRight, lod );
2125             check_for_denorms( upLeft, containsDenorms );
2126             check_for_denorms( upRight, containsDenorms );
2127             pixelMax( upLeft, upRight, maxUp );
2128             read_image_pixel_float( imgPtr, imageInfo, x1, y2, 0, lowLeft, lod );
2129             read_image_pixel_float( imgPtr, imageInfo, x2, y2, 0, lowRight, lod );
2130             check_for_denorms( lowLeft, containsDenorms );
2131             check_for_denorms( lowRight, containsDenorms );
2132             pixelMax( lowLeft, lowRight, maxLow );
2133             pixelMax( maxUp, maxLow, returnVal.p );
2134 
2135             if( verbose )
2136             {
2137                 if( NULL == containsDenorms )
2138                     log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" );
2139                 else
2140                     log_info( "\tSampled pixels (rgba order):\n" );
2141                 log_info( "\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1], upLeft[2], upLeft[3] );
2142                 log_info( "\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1], upRight[2], upRight[3] );
2143                 log_info( "\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1], lowLeft[2], lowLeft[3] );
2144                 log_info( "\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1], lowRight[2], lowRight[3] );
2145             }
2146 
2147             bool printMe = false;
2148             if( x1 <= 0 || x2 <= 0 || x1 >= (int)width-1 || x2 >= (int)width-1 )
2149                 printMe = true;
2150             if( y1 <= 0 || y2 <= 0 || y1 >= (int)height-1 || y2 >= (int)height-1 )
2151                 printMe = true;
2152 
2153             double weights[ 2 ][ 2 ];
2154 
2155             weights[ 0 ][ 0 ] = weights[ 0 ][ 1 ] = 1.0 - frac( x - 0.5f );
2156             weights[ 1 ][ 0 ] = weights[ 1 ][ 1 ] = frac( x - 0.5f );
2157             weights[ 0 ][ 0 ] *= 1.0 - frac( y - 0.5f );
2158             weights[ 1 ][ 0 ] *= 1.0 - frac( y - 0.5f );
2159             weights[ 0 ][ 1 ] *= frac( y - 0.5f );
2160             weights[ 1 ][ 1 ] *= frac( y - 0.5f );
2161 
2162             if( verbose )
2163                 log_info( "\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f\n",  frac( x - 0.5f ), frac( y - 0.5f ) );
2164 
2165             for( int i = 0; i < 3; i++ )
2166             {
2167                 outData[ i ] = (float)( ( upLeft[ i ] * weights[ 0 ][ 0 ] ) +
2168                                         ( upRight[ i ] * weights[ 1 ][ 0 ] ) +
2169                                         ( lowLeft[ i ] * weights[ 0 ][ 1 ] ) +
2170                                         ( lowRight[ i ] * weights[ 1 ][ 1 ] ));
2171                 // flush subnormal results to zero if necessary
2172                 if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN )
2173                     outData[i] = copysignf( 0.0f, outData[i] );
2174             }
2175             outData[ 3 ] = (float)( ( upLeft[ 3 ] * weights[ 0 ][ 0 ] ) +
2176                                    ( upRight[ 3 ] * weights[ 1 ][ 0 ] ) +
2177                                    ( lowLeft[ 3 ] * weights[ 0 ][ 1 ] ) +
2178                                    ( lowRight[ 3 ] * weights[ 1 ][ 1 ] ));
2179             // flush subnormal results to zero if necessary
2180             if( NULL == containsDenorms && fabs(outData[3]) < FLT_MIN )
2181                 outData[3] = copysignf( 0.0f, outData[3] );
2182         }
2183         else
2184         {
2185             // 3D linear filtering
2186             int x1 = adFn( floorf( x - 0.5f ), width_lod );
2187             int y1 = adFn( floorf( y - 0.5f ), height_lod );
2188             int z1 = adFn( floorf( z - 0.5f ), depth_lod );
2189             int x2 = adFn( floorf( x - 0.5f ) + 1, width_lod );
2190             int y2 = adFn( floorf( y - 0.5f ) + 1, height_lod );
2191             int z2 = adFn( floorf( z - 0.5f ) + 1, depth_lod );
2192 
2193             if( verbose )
2194                 log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{%d, %d, %d} and i1:{%d, %d, %d}\n", x1, y1, z1, x2, y2, z2 );
2195 
2196             float upLeftA[ 4 ], upRightA[ 4 ], lowLeftA[ 4 ], lowRightA[ 4 ];
2197             float upLeftB[ 4 ], upRightB[ 4 ], lowLeftB[ 4 ], lowRightB[ 4 ];
2198             float pixelMaxA[4], pixelMaxB[4];
2199             read_image_pixel_float( imageData, imageInfo, x1, y1, z1, upLeftA, lod );
2200             read_image_pixel_float( imageData, imageInfo, x2, y1, z1, upRightA, lod );
2201             check_for_denorms( upLeftA, containsDenorms );
2202             check_for_denorms( upRightA, containsDenorms );
2203             pixelMax( upLeftA, upRightA, pixelMaxA );
2204             read_image_pixel_float( imageData, imageInfo, x1, y2, z1, lowLeftA, lod );
2205             read_image_pixel_float( imageData, imageInfo, x2, y2, z1, lowRightA, lod );
2206             check_for_denorms( lowLeftA, containsDenorms );
2207             check_for_denorms( lowRightA, containsDenorms );
2208             pixelMax( lowLeftA, lowRightA, pixelMaxB );
2209             pixelMax( pixelMaxA, pixelMaxB, returnVal.p);
2210             read_image_pixel_float( imageData, imageInfo, x1, y1, z2, upLeftB, lod );
2211             read_image_pixel_float( imageData, imageInfo, x2, y1, z2, upRightB, lod );
2212             check_for_denorms( upLeftB, containsDenorms );
2213             check_for_denorms( upRightB, containsDenorms );
2214             pixelMax( upLeftB, upRightB, pixelMaxA );
2215             read_image_pixel_float( imageData, imageInfo, x1, y2, z2, lowLeftB, lod );
2216             read_image_pixel_float( imageData, imageInfo, x2, y2, z2, lowRightB, lod );
2217             check_for_denorms( lowLeftB, containsDenorms );
2218             check_for_denorms( lowRightB, containsDenorms );
2219             pixelMax( lowLeftB, lowRightB, pixelMaxB );
2220             pixelMax( pixelMaxA, pixelMaxB, pixelMaxA);
2221             pixelMax( pixelMaxA, returnVal.p, returnVal.p );
2222 
2223             if( verbose )
2224             {
2225                 if( NULL == containsDenorms )
2226                     log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" );
2227                 else
2228                     log_info( "\tSampled pixels (rgba order):\n" );
2229                 log_info( "\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1], upLeftA[2], upLeftA[3] );
2230                 log_info( "\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1], upRightA[2], upRightA[3] );
2231                 log_info( "\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1], lowLeftA[2], lowLeftA[3] );
2232                 log_info( "\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0], lowRightA[1], lowRightA[2], lowRightA[3] );
2233                 log_info( "\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1], upLeftB[2], upLeftB[3] );
2234                 log_info( "\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1], upRightB[2], upRightB[3] );
2235                 log_info( "\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1], lowLeftB[2], lowLeftB[3] );
2236                 log_info( "\t\tp111: %f, %f, %f, %f\n", lowRightB[0], lowRightB[1], lowRightB[2], lowRightB[3] );
2237             }
2238 
2239             double weights[ 2 ][ 2 ][ 2 ];
2240 
2241             float a = frac( x - 0.5f ), b = frac( y - 0.5f ), c = frac( z - 0.5f );
2242             weights[ 0 ][ 0 ][ 0 ] = weights[ 0 ][ 1 ][ 0 ] = weights[ 0 ][ 0 ][ 1 ] = weights[ 0 ][ 1 ][ 1 ] = 1.f - a;
2243             weights[ 1 ][ 0 ][ 0 ] = weights[ 1 ][ 1 ][ 0 ] = weights[ 1 ][ 0 ][ 1 ] = weights[ 1 ][ 1 ][ 1 ] = a;
2244             weights[ 0 ][ 0 ][ 0 ] *= 1.f - b;
2245             weights[ 1 ][ 0 ][ 0 ] *= 1.f - b;
2246             weights[ 0 ][ 0 ][ 1 ] *= 1.f - b;
2247             weights[ 1 ][ 0 ][ 1 ] *= 1.f - b;
2248             weights[ 0 ][ 1 ][ 0 ] *= b;
2249             weights[ 1 ][ 1 ][ 0 ] *= b;
2250             weights[ 0 ][ 1 ][ 1 ] *= b;
2251             weights[ 1 ][ 1 ][ 1 ] *= b;
2252             weights[ 0 ][ 0 ][ 0 ] *= 1.f - c;
2253             weights[ 0 ][ 1 ][ 0 ] *= 1.f - c;
2254             weights[ 1 ][ 0 ][ 0 ] *= 1.f - c;
2255             weights[ 1 ][ 1 ][ 0 ] *= 1.f - c;
2256             weights[ 0 ][ 0 ][ 1 ] *= c;
2257             weights[ 0 ][ 1 ][ 1 ] *= c;
2258             weights[ 1 ][ 0 ][ 1 ] *= c;
2259             weights[ 1 ][ 1 ][ 1 ] *= c;
2260 
2261             if( verbose )
2262                 log_info( "\tfrac( x - 0.5f ) = %f,  frac( y - 0.5f ) = %f, frac( z - 0.5f ) = %f\n",
2263                          frac( x - 0.5f ), frac( y - 0.5f ), frac( z - 0.5f )  );
2264 
2265             for( int i = 0; i < 3; i++ )
2266             {
2267                 outData[ i ] = (float)( ( upLeftA[ i ] * weights[ 0 ][ 0 ][ 0 ] ) +
2268                                         ( upRightA[ i ] * weights[ 1 ][ 0 ][ 0 ] ) +
2269                                         ( lowLeftA[ i ] * weights[ 0 ][ 1 ][ 0 ] ) +
2270                                         ( lowRightA[ i ] * weights[ 1 ][ 1 ][ 0 ] ) +
2271                                         ( upLeftB[ i ] * weights[ 0 ][ 0 ][ 1 ] ) +
2272                                         ( upRightB[ i ] * weights[ 1 ][ 0 ][ 1 ] ) +
2273                                         ( lowLeftB[ i ] * weights[ 0 ][ 1 ][ 1 ] ) +
2274                                         ( lowRightB[ i ] * weights[ 1 ][ 1 ][ 1 ] ));
2275                 // flush subnormal results to zero if necessary
2276                 if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN )
2277                     outData[i] = copysignf( 0.0f, outData[i] );
2278             }
2279             outData[ 3 ] = (float)( ( upLeftA[ 3 ] * weights[ 0 ][ 0 ][ 0 ] ) +
2280                                    ( upRightA[ 3 ] * weights[ 1 ][ 0 ][ 0 ] ) +
2281                                    ( lowLeftA[ 3 ] * weights[ 0 ][ 1 ][ 0 ] ) +
2282                                    ( lowRightA[ 3 ] * weights[ 1 ][ 1 ][ 0 ] ) +
2283                                    ( upLeftB[ 3 ] * weights[ 0 ][ 0 ][ 1 ] ) +
2284                                    ( upRightB[ 3 ] * weights[ 1 ][ 0 ][ 1 ] ) +
2285                                    ( lowLeftB[ 3 ] * weights[ 0 ][ 1 ][ 1 ] ) +
2286                                    ( lowRightB[ 3 ] * weights[ 1 ][ 1 ][ 1 ] ));
2287             // flush subnormal results to zero if necessary
2288             if( NULL == containsDenorms && fabs(outData[3]) < FLT_MIN )
2289                 outData[3] = copysignf( 0.0f, outData[3] );
2290         }
2291 
2292         return returnVal;
2293     }
2294 }
2295 
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)2296 FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo,
2297                                            float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
2298                                            image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms )
2299 {
2300   return sample_image_pixel_float_offset( imageData, imageInfo, x, y, z, xAddressOffset, yAddressOffset, zAddressOffset,
2301     imageSampler, outData, verbose, containsDenorms, 0);
2302 }
2303 
2304 
debug_find_vector_in_image(void * imagePtr,image_descriptor * imageInfo,void * vectorToFind,size_t vectorSize,int * outX,int * outY,int * outZ,size_t lod)2305 int debug_find_vector_in_image( void *imagePtr, image_descriptor *imageInfo,
2306                                void *vectorToFind, size_t vectorSize, int *outX, int *outY, int *outZ, size_t lod )
2307 {
2308     int foundCount = 0;
2309     char *iPtr = (char *)imagePtr;
2310     size_t width;
2311     size_t depth;
2312     size_t height;
2313     size_t row_pitch;
2314     size_t slice_pitch;
2315 
2316     switch (imageInfo->type)
2317     {
2318     case CL_MEM_OBJECT_IMAGE1D:
2319       width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2320       height = 1;
2321       depth = 1;
2322       break;
2323     case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2324       width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2325       height = 1;
2326       depth = imageInfo->arraySize;
2327       break;
2328     case CL_MEM_OBJECT_IMAGE2D:
2329       width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2330       height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2331       depth = 1;
2332       break;
2333     case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2334       width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2335       height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2336       depth = imageInfo->arraySize;
2337       break;
2338     case CL_MEM_OBJECT_IMAGE3D:
2339       width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2340       height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2341       depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
2342       break;
2343     }
2344 
2345     row_pitch = width * get_pixel_size( imageInfo->format );
2346     slice_pitch = row_pitch * height;
2347 
2348     for( size_t z = 0; z < depth; z++ )
2349     {
2350         for( size_t y = 0; y < height; y++ )
2351         {
2352             for( size_t x = 0; x < width; x++)
2353             {
2354                 if( memcmp( iPtr, vectorToFind, vectorSize ) == 0 )
2355                 {
2356                     if( foundCount == 0 )
2357                     {
2358                         *outX = (int)x;
2359                         if (outY != NULL)
2360                             *outY = (int)y;
2361                         if( outZ != NULL )
2362                             *outZ = (int)z;
2363                     }
2364                     foundCount++;
2365                 }
2366                 iPtr += vectorSize;
2367             }
2368             iPtr += row_pitch - ( width * vectorSize );
2369         }
2370         iPtr += slice_pitch - ( height * row_pitch );
2371     }
2372     return foundCount;
2373 }
2374 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,unsigned int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2375 int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
2376                               unsigned int *valuesToFind, int *outX, int *outY, int *outZ, int lod )
2377 {
2378     char vectorToFind[ 4 * 4 ];
2379     size_t vectorSize = get_format_channel_count( imageInfo->format );
2380 
2381 
2382     if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT8 )
2383     {
2384         unsigned char *p = (unsigned char *)vectorToFind;
2385         for( unsigned int i = 0; i < vectorSize; i++ )
2386             p[i] = (unsigned char)valuesToFind[i];
2387     }
2388     else if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT16 )
2389     {
2390         unsigned short *p = (unsigned short *)vectorToFind;
2391         for( unsigned int i = 0; i < vectorSize; i++ )
2392             p[i] = (unsigned short)valuesToFind[i];
2393         vectorSize *= 2;
2394     }
2395     else if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT32 )
2396     {
2397         unsigned int *p = (unsigned int *)vectorToFind;
2398         for( unsigned int i = 0; i < vectorSize; i++ )
2399             p[i] = (unsigned int)valuesToFind[i];
2400         vectorSize *= 4;
2401     }
2402     else
2403     {
2404         log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" );
2405         return false;
2406     }
2407     return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod );
2408 }
2409 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2410 int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
2411                               int *valuesToFind, int *outX, int *outY, int *outZ, int lod )
2412 {
2413     char vectorToFind[ 4 * 4 ];
2414     size_t vectorSize = get_format_channel_count( imageInfo->format );
2415 
2416     if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT8 )
2417     {
2418         char *p = (char *)vectorToFind;
2419         for( unsigned int i = 0; i < vectorSize; i++ )
2420             p[i] = (char)valuesToFind[i];
2421     }
2422     else if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT16 )
2423     {
2424         short *p = (short *)vectorToFind;
2425         for( unsigned int i = 0; i < vectorSize; i++ )
2426             p[i] = (short)valuesToFind[i];
2427         vectorSize *= 2;
2428     }
2429     else if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT32 )
2430     {
2431         int *p = (int *)vectorToFind;
2432         for( unsigned int i = 0; i < vectorSize; i++ )
2433             p[i] = (int)valuesToFind[i];
2434         vectorSize *= 4;
2435     }
2436     else
2437     {
2438         log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" );
2439         return false;
2440     }
2441     return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod );
2442 }
2443 
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,float * valuesToFind,int * outX,int * outY,int * outZ,int lod)2444 int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
2445                               float *valuesToFind, int *outX, int *outY, int *outZ, int lod )
2446 {
2447     char vectorToFind[ 4 * 4 ];
2448     float swizzled[4];
2449     memcpy( swizzled, valuesToFind, sizeof( swizzled ) );
2450     size_t vectorSize = get_pixel_size( imageInfo->format );
2451     pack_image_pixel( swizzled, imageInfo->format, vectorToFind );
2452     return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod );
2453 }
2454 
swizzle_vector_for_image(T * srcVector,const cl_image_format * imageFormat)2455 template <class T> void swizzle_vector_for_image( T *srcVector, const cl_image_format *imageFormat )
2456 {
2457     T temp;
2458     switch( imageFormat->image_channel_order )
2459     {
2460         case CL_A:
2461             srcVector[ 0 ] = srcVector[ 3 ];
2462             break;
2463         case CL_R:
2464         case CL_Rx:
2465         case CL_RG:
2466         case CL_RGx:
2467         case CL_RGB:
2468         case CL_RGBx:
2469         case CL_RGBA:
2470         case CL_sRGB:
2471         case CL_sRGBx:
2472         case CL_sRGBA:
2473             break;
2474         case CL_RA:
2475             srcVector[ 1 ] = srcVector[ 3 ];
2476             break;
2477         case CL_ARGB:
2478             temp = srcVector[ 3 ];
2479             srcVector[ 3 ] = srcVector[ 2 ];
2480             srcVector[ 2 ] = srcVector[ 1 ];
2481             srcVector[ 1 ] = srcVector[ 0 ];
2482             srcVector[ 0 ] = temp;
2483             break;
2484         case CL_BGRA:
2485         case CL_sBGRA:
2486             temp = srcVector[ 0 ];
2487             srcVector[ 0 ] = srcVector[ 2 ];
2488             srcVector[ 2 ] = temp;
2489             break;
2490         case CL_INTENSITY:
2491             srcVector[ 3 ] = srcVector[ 0 ];
2492             srcVector[ 2 ] = srcVector[ 0 ];
2493             srcVector[ 1 ] = srcVector[ 0 ];
2494             break;
2495         case CL_LUMINANCE:
2496             srcVector[ 2 ] = srcVector[ 0 ];
2497             srcVector[ 1 ] = srcVector[ 0 ];
2498             break;
2499 #ifdef CL_1RGB_APPLE
2500         case CL_1RGB_APPLE:
2501             temp = srcVector[ 3 ];
2502             srcVector[ 3 ] = srcVector[ 2 ];
2503             srcVector[ 2 ] = srcVector[ 1 ];
2504             srcVector[ 1 ] = srcVector[ 0 ];
2505             srcVector[ 0 ] = temp;
2506             break;
2507 #endif
2508 #ifdef CL_BGR1_APPLE
2509         case CL_BGR1_APPLE:
2510             temp = srcVector[ 0 ];
2511             srcVector[ 0 ] = srcVector[ 2 ];
2512             srcVector[ 2 ] = temp;
2513             break;
2514 #endif
2515     }
2516 }
2517 
2518 #define SATURATE( v, min, max ) ( v < min ? min : ( v > max ? max : v ) )
2519 
pack_image_pixel(unsigned int * srcVector,const cl_image_format * imageFormat,void * outData)2520 void pack_image_pixel( unsigned int *srcVector, const cl_image_format *imageFormat, void *outData )
2521 {
2522     swizzle_vector_for_image<unsigned int>( srcVector, imageFormat );
2523     size_t channelCount = get_format_channel_count( imageFormat );
2524 
2525     switch( imageFormat->image_channel_data_type )
2526     {
2527         case CL_UNSIGNED_INT8:
2528         {
2529             unsigned char *ptr = (unsigned char *)outData;
2530             for( unsigned int i = 0; i < channelCount; i++ )
2531                 ptr[ i ] = (unsigned char)SATURATE( srcVector[ i ], 0, 255 );
2532             break;
2533         }
2534         case CL_UNSIGNED_INT16:
2535         {
2536             unsigned short *ptr = (unsigned short *)outData;
2537             for( unsigned int i = 0; i < channelCount; i++ )
2538                 ptr[ i ] = (unsigned short)SATURATE( srcVector[ i ], 0, 65535 );
2539             break;
2540         }
2541         case CL_UNSIGNED_INT32:
2542         {
2543             unsigned int *ptr = (unsigned int *)outData;
2544             for( unsigned int i = 0; i < channelCount; i++ )
2545                 ptr[ i ] = (unsigned int)srcVector[ i ];
2546             break;
2547         }
2548         default:
2549             break;
2550     }
2551 }
2552 
pack_image_pixel(int * srcVector,const cl_image_format * imageFormat,void * outData)2553 void pack_image_pixel( int *srcVector, const cl_image_format *imageFormat, void *outData )
2554 {
2555     swizzle_vector_for_image<int>( srcVector, imageFormat );
2556     size_t chanelCount = get_format_channel_count( imageFormat );
2557 
2558     switch( imageFormat->image_channel_data_type )
2559     {
2560         case CL_SIGNED_INT8:
2561         {
2562             char *ptr = (char *)outData;
2563             for( unsigned int i = 0; i < chanelCount; i++ )
2564                 ptr[ i ] = (char)SATURATE( srcVector[ i ], -128, 127 );
2565             break;
2566         }
2567         case CL_SIGNED_INT16:
2568         {
2569             short *ptr = (short *)outData;
2570             for( unsigned int i = 0; i < chanelCount; i++ )
2571                 ptr[ i ] = (short)SATURATE( srcVector[ i ], -32768, 32767 );
2572             break;
2573         }
2574         case CL_SIGNED_INT32:
2575         {
2576             int *ptr = (int *)outData;
2577             for( unsigned int i = 0; i < chanelCount; i++ )
2578                 ptr[ i ] = (int)srcVector[ i ];
2579             break;
2580         }
2581         default:
2582             break;
2583     }
2584 }
2585 
round_to_even(float v)2586 int round_to_even( float v )
2587 {
2588     // clamp overflow
2589     if( v >= - (float) INT_MIN )
2590         return INT_MAX;
2591     if( v <= (float) INT_MIN )
2592         return INT_MIN;
2593 
2594     // round fractional values to integer value
2595     if( fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23) )
2596     {
2597         static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23), MAKE_HEX_FLOAT(-0x1.0p23f, -0x1L, 23) };
2598         float magicVal = magic[ v < 0.0f ];
2599         v += magicVal;
2600         v -= magicVal;
2601     }
2602 
2603     return (int) v;
2604 }
2605 
pack_image_pixel(float * srcVector,const cl_image_format * imageFormat,void * outData)2606 void pack_image_pixel( float *srcVector, const cl_image_format *imageFormat, void *outData )
2607 {
2608     swizzle_vector_for_image<float>( srcVector, imageFormat );
2609     size_t channelCount = get_format_channel_count( imageFormat );
2610     switch( imageFormat->image_channel_data_type )
2611     {
2612         case CL_HALF_FLOAT:
2613         {
2614             cl_ushort *ptr = (cl_ushort *)outData;
2615 
2616             switch( gFloatToHalfRoundingMode )
2617             {
2618                 case kRoundToNearestEven:
2619             for( unsigned int i = 0; i < channelCount; i++ )
2620                         ptr[ i ] = float2half_rte( srcVector[ i ] );
2621             break;
2622                 case kRoundTowardZero:
2623                     for( unsigned int i = 0; i < channelCount; i++ )
2624                         ptr[ i ] = float2half_rtz( srcVector[ i ] );
2625                     break;
2626                 default:
2627                     log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" );
2628                     exit(-1);
2629                     break;
2630         }
2631             break;
2632         }
2633 
2634         case CL_FLOAT:
2635         {
2636             cl_float *ptr = (cl_float *)outData;
2637             for( unsigned int i = 0; i < channelCount; i++ )
2638                 ptr[ i ] = srcVector[ i ];
2639             break;
2640         }
2641 
2642         case CL_SNORM_INT8:
2643         {
2644             cl_char *ptr = (cl_char *)outData;
2645             for( unsigned int i = 0; i < channelCount; i++ )
2646                 ptr[ i ] = (cl_char)NORMALIZE_SIGNED( srcVector[ i ], -127.0f, 127.f );
2647             break;
2648         }
2649         case CL_SNORM_INT16:
2650         {
2651             cl_short *ptr = (cl_short *)outData;
2652             for( unsigned int i = 0; i < channelCount; i++ )
2653                 ptr[ i ] = (short)NORMALIZE_SIGNED( srcVector[ i ], -32767.f, 32767.f  );
2654             break;
2655         }
2656         case CL_UNORM_INT8:
2657         {
2658             cl_uchar *ptr = (cl_uchar *)outData;
2659             if ( is_sRGBA_order(imageFormat->image_channel_order) )
2660             {
2661                 ptr[ 0 ] = (unsigned char)( sRGBmap( srcVector[ 0 ] ) + 0.5 );
2662                 ptr[ 1 ] = (unsigned char)( sRGBmap( srcVector[ 1 ] ) + 0.5 );
2663                 ptr[ 2 ] = (unsigned char)( sRGBmap( srcVector[ 2 ] ) + 0.5 );
2664                 if (channelCount == 4)
2665                     ptr[ 3 ] = (unsigned char)NORMALIZE( srcVector[ 3 ], 255.f );
2666             }
2667             else
2668             {
2669                 for( unsigned int i = 0; i < channelCount; i++ )
2670                     ptr[ i ] = (unsigned char)NORMALIZE( srcVector[ i ], 255.f );
2671             }
2672 #ifdef CL_1RGB_APPLE
2673             if( imageFormat->image_channel_order == CL_1RGB_APPLE )
2674                 ptr[0] = 255.0f;
2675 #endif
2676 #ifdef CL_BGR1_APPLE
2677             if( imageFormat->image_channel_order == CL_BGR1_APPLE )
2678                 ptr[3] = 255.0f;
2679 #endif
2680             break;
2681         }
2682         case CL_UNORM_INT16:
2683         {
2684             cl_ushort *ptr = (cl_ushort *)outData;
2685             for( unsigned int i = 0; i < channelCount; i++ )
2686                 ptr[ i ] = (unsigned short)NORMALIZE( srcVector[ i ], 65535.f );
2687             break;
2688         }
2689         case CL_UNORM_SHORT_555:
2690         {
2691             cl_ushort *ptr = (cl_ushort *)outData;
2692             ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 10 ) |
2693             ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 31.f ) & 31 ) << 5 ) |
2694             ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 );
2695             break;
2696         }
2697         case CL_UNORM_SHORT_565:
2698         {
2699             cl_ushort *ptr = (cl_ushort *)outData;
2700             ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 11 ) |
2701             ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 63.f ) & 63 ) << 5 ) |
2702             ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 );
2703             break;
2704         }
2705         case CL_UNORM_INT_101010:
2706         {
2707             cl_uint *ptr = (cl_uint *)outData;
2708             ptr[ 0 ] = ( ( (unsigned int)NORMALIZE( srcVector[ 0 ], 1023.f ) & 1023 ) << 20 ) |
2709             ( ( (unsigned int)NORMALIZE( srcVector[ 1 ], 1023.f ) & 1023 ) << 10 ) |
2710             ( ( (unsigned int)NORMALIZE( srcVector[ 2 ], 1023.f ) & 1023 ) << 0 );
2711             break;
2712         }
2713         case CL_SIGNED_INT8:
2714         {
2715             cl_char *ptr = (cl_char *)outData;
2716             for( unsigned int i = 0; i < channelCount; i++ )
2717                 ptr[ i ] = (cl_char)CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 );
2718             break;
2719         }
2720         case CL_SIGNED_INT16:
2721         {
2722             cl_short *ptr = (cl_short *)outData;
2723             for( unsigned int i = 0; i < channelCount; i++ )
2724                 ptr[ i ] = (short)CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767  );
2725             break;
2726         }
2727         case CL_SIGNED_INT32:
2728         {
2729             cl_int *ptr = (cl_int *)outData;
2730             for( unsigned int i = 0; i < channelCount; i++ )
2731                 ptr[ i ] = (int)CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX  );
2732             break;
2733         }
2734         case CL_UNSIGNED_INT8:
2735         {
2736             cl_uchar *ptr = (cl_uchar *)outData;
2737             for( unsigned int i = 0; i < channelCount; i++ )
2738                 ptr[ i ] = (cl_uchar)CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX );
2739             break;
2740         }
2741         case CL_UNSIGNED_INT16:
2742         {
2743             cl_ushort *ptr = (cl_ushort *)outData;
2744             for( unsigned int i = 0; i < channelCount; i++ )
2745                 ptr[ i ] = (cl_ushort)CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX );
2746             break;
2747         }
2748         case CL_UNSIGNED_INT32:
2749         {
2750             cl_uint *ptr = (cl_uint *)outData;
2751             for( unsigned int i = 0; i < channelCount; i++ )
2752                 ptr[ i ] = (cl_uint)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX  );
2753             break;
2754         }
2755 #ifdef CL_SFIXED14_APPLE
2756         case CL_SFIXED14_APPLE:
2757         {
2758             cl_ushort *ptr = (cl_ushort*)outData;
2759             for( unsigned int i = 0; i < channelCount; i++ )
2760             {
2761                 cl_float f = fmaxf( srcVector[i], -1.0f );
2762                 f = fminf( f, 3.0f );
2763                 cl_int d = rintf(f * 0x1.0p14f);
2764                 d += 16384;
2765                 if( d > CL_USHRT_MAX )
2766                     d = CL_USHRT_MAX;
2767                 ptr[i] = d;
2768             }
2769             break;
2770         }
2771 #endif
2772         default:
2773             log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type);
2774             exit(-1);
2775             break;
2776     }
2777 }
2778 
pack_image_pixel_error(const float * srcVector,const cl_image_format * imageFormat,const void * results,float * errors)2779 void pack_image_pixel_error( const float *srcVector, const cl_image_format *imageFormat, const void *results, float *errors )
2780 {
2781     size_t channelCount = get_format_channel_count( imageFormat );
2782     switch( imageFormat->image_channel_data_type )
2783     {
2784         case CL_HALF_FLOAT:
2785         {
2786             const cl_ushort *ptr = (const cl_ushort *)results;
2787 
2788             for( unsigned int i = 0; i < channelCount; i++ )
2789                 errors[i] = Ulp_Error_Half( ptr[i], srcVector[i] );
2790 
2791             break;
2792         }
2793 
2794         case CL_FLOAT:
2795         {
2796             const cl_ushort *ptr = (const cl_ushort *)results;
2797 
2798             for( unsigned int i = 0; i < channelCount; i++ )
2799                 errors[i] = Ulp_Error( ptr[i], srcVector[i] );
2800 
2801             break;
2802         }
2803 
2804         case CL_SNORM_INT8:
2805         {
2806             const cl_char *ptr = (const cl_char *)results;
2807 
2808             for( unsigned int i = 0; i < channelCount; i++ )
2809                 errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -127.0f, 127.f );
2810 
2811             break;
2812         }
2813         case CL_SNORM_INT16:
2814         {
2815             const cl_short *ptr = (const cl_short *)results;
2816 
2817             for( unsigned int i = 0; i < channelCount; i++ )
2818                 errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -32767.f, 32767.f  );
2819 
2820             break;
2821         }
2822         case CL_UNORM_INT8:
2823         {
2824             const cl_uchar *ptr = (const cl_uchar *)results;
2825 
2826             for( unsigned int i = 0; i < channelCount; i++ )
2827                 errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 255.f  );
2828 
2829             break;
2830         }
2831         case CL_UNORM_INT16:
2832         {
2833             const cl_ushort *ptr = (const cl_ushort *)results;
2834 
2835             for( unsigned int i = 0; i < channelCount; i++ )
2836                 errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 65535.f  );
2837 
2838             break;
2839         }
2840         case CL_UNORM_SHORT_555:
2841         {
2842             const cl_ushort *ptr = (const cl_ushort *)results;
2843 
2844             errors[0] = ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f );
2845             errors[1] = ((ptr[0] >>  5) & 31) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 31.f );
2846             errors[2] = ((ptr[0] >>  0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f );
2847 
2848             break;
2849         }
2850         case CL_UNORM_SHORT_565:
2851         {
2852             const cl_ushort *ptr = (const cl_ushort *)results;
2853 
2854             errors[0] = ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f );
2855             errors[1] = ((ptr[0] >>  5) & 63) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 63.f );
2856             errors[2] = ((ptr[0] >>  0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f );
2857 
2858             break;
2859         }
2860         case CL_UNORM_INT_101010:
2861         {
2862             const cl_uint *ptr = (const cl_uint *)results;
2863 
2864             errors[0] = ((ptr[0] >> 20) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 1023.f );
2865             errors[1] = ((ptr[0] >> 10) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 1023.f );
2866             errors[2] = ((ptr[0] >>  0) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 1023.f );
2867 
2868             break;
2869         }
2870         case CL_SIGNED_INT8:
2871         {
2872             const cl_char *ptr = (const cl_char *)results;
2873 
2874             for( unsigned int i = 0; i < channelCount; i++ )
2875                 errors[ i ] = ptr[i] - CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 );
2876 
2877             break;
2878         }
2879         case CL_SIGNED_INT16:
2880         {
2881             const cl_short *ptr = (const cl_short *)results;
2882             for( unsigned int i = 0; i < channelCount; i++ )
2883                 errors[i] = ptr[ i ] - CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767  );
2884             break;
2885         }
2886         case CL_SIGNED_INT32:
2887         {
2888             const cl_int *ptr = (const cl_int *)results;
2889             for( unsigned int i = 0; i < channelCount; i++ )
2890                 errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long) CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX  ));
2891             break;
2892         }
2893         case CL_UNSIGNED_INT8:
2894         {
2895             const cl_uchar *ptr = (const cl_uchar *)results;
2896             for( unsigned int i = 0; i < channelCount; i++ )
2897                 errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX );
2898             break;
2899         }
2900         case CL_UNSIGNED_INT16:
2901         {
2902             const cl_ushort *ptr = (const cl_ushort *)results;
2903             for( unsigned int i = 0; i < channelCount; i++ )
2904                 errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX );
2905             break;
2906         }
2907         case CL_UNSIGNED_INT32:
2908         {
2909             const cl_uint *ptr = (const cl_uint *)results;
2910             for( unsigned int i = 0; i < channelCount; i++ )
2911                 errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX  ));
2912             break;
2913         }
2914 #ifdef CL_SFIXED14_APPLE
2915         case CL_SFIXED14_APPLE:
2916         {
2917             const cl_ushort *ptr = (const cl_ushort *)results;
2918 
2919             for( unsigned int i = 0; i < channelCount; i++ )
2920                 errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( ((int) srcVector[ i ] - 16384), -16384.f, 49151.f  );
2921 
2922             break;
2923         }
2924 #endif
2925         default:
2926             log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type);
2927             exit(-1);
2928             break;
2929     }
2930 }
2931 
2932 
2933 //
2934 //  Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT
2935 //  This should be called lazily before attempting to verify image writes, otherwise an error will occur.
2936 //
DetectFloatToHalfRoundingMode(cl_command_queue q)2937 int  DetectFloatToHalfRoundingMode( cl_command_queue q )  // Returns CL_SUCCESS on success
2938 {
2939     cl_int err = CL_SUCCESS;
2940 
2941     if( gFloatToHalfRoundingMode == kDefaultRoundingMode )
2942     {
2943         // Some numbers near 0.5f, that we look at to see how the values are rounded.
2944         static const cl_uint  inData[4*4] = {   0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U, 0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U,
2945                                                 0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U, 0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U    };
2946         static const size_t count = sizeof( inData ) / (4*sizeof( inData[0] ));
2947         const float *inp = (const float*) inData;
2948         cl_context context = NULL;
2949 
2950     // Create an input buffer
2951         err = clGetCommandQueueInfo( q, CL_QUEUE_CONTEXT, sizeof(context), &context, NULL );
2952         if( err )
2953         {
2954             log_error( "Error:  could not get context from command queue in DetectFloatToHalfRoundingMode  (%d)", err );
2955             return err;
2956         }
2957 
2958         cl_mem inBuf = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, sizeof( inData ), (void*) inData, &err );
2959         if( NULL == inBuf || err )
2960         {
2961             log_error( "Error:  could not create input buffer in DetectFloatToHalfRoundingMode  (err: %d)", err );
2962             return err;
2963         }
2964 
2965     // Create a small output image
2966         cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT };
2967         cl_mem outImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &fmt, count, 1, 0, NULL, &err );
2968         if( NULL == outImage || err )
2969         {
2970             log_error( "Error:  could not create half float out image in DetectFloatToHalfRoundingMode  (err: %d)", err );
2971             clReleaseMemObject( inBuf );
2972             return err;
2973         }
2974 
2975     // Create our program, and a kernel
2976         const char *kernel[1] = {
2977             "kernel void detect_round( global float4 *in, write_only image2d_t out )\n"
2978             "{\n"
2979             "   write_imagef( out, (int2)(get_global_id(0),0), in[get_global_id(0)] );\n"
2980             "}\n" };
2981 
2982         clProgramWrapper program;
2983         err = create_single_kernel_helper_create_program(context, &program, 1, kernel);
2984 
2985         if( NULL == program || err )
2986         {
2987             log_error( "Error:  could not create program in DetectFloatToHalfRoundingMode (err: %d)", err );
2988             clReleaseMemObject( inBuf );
2989             clReleaseMemObject( outImage );
2990             return err;
2991         }
2992 
2993         cl_device_id device = NULL;
2994         err = clGetCommandQueueInfo( q, CL_QUEUE_DEVICE, sizeof(device), &device, NULL );
2995         if( err )
2996         {
2997             log_error( "Error:  could not get device from command queue in DetectFloatToHalfRoundingMode  (%d)", err );
2998             clReleaseMemObject( inBuf );
2999             clReleaseMemObject( outImage );
3000             return err;
3001         }
3002 
3003         err = clBuildProgram( program, 1, &device, "", NULL, NULL );
3004         if( err )
3005         {
3006             log_error( "Error:  could not build program in DetectFloatToHalfRoundingMode  (%d)", err );
3007             clReleaseMemObject( inBuf );
3008             clReleaseMemObject( outImage );
3009             return err;
3010         }
3011 
3012         cl_kernel k = clCreateKernel( program, "detect_round", &err );
3013         if( NULL == k || err )
3014         {
3015             log_error( "Error:  could not create kernel in DetectFloatToHalfRoundingMode  (%d)", err );
3016             clReleaseMemObject( inBuf );
3017             clReleaseMemObject( outImage );
3018             return err;
3019         }
3020 
3021         err = clSetKernelArg( k, 0, sizeof( cl_mem ), &inBuf );
3022         if( err )
3023         {
3024             log_error( "Error: could not set argument 0 of kernel in DetectFloatToHalfRoundingMode (%d)", err );
3025             clReleaseMemObject( inBuf );
3026             clReleaseMemObject( outImage );
3027             clReleaseKernel( k );
3028             return err;
3029         }
3030 
3031         err = clSetKernelArg( k, 1, sizeof( cl_mem ), &outImage );
3032         if( err )
3033         {
3034             log_error( "Error: could not set argument 1 of kernel in DetectFloatToHalfRoundingMode (%d)", err );
3035             clReleaseMemObject( inBuf );
3036             clReleaseMemObject( outImage );
3037             clReleaseKernel( k );
3038             return err;
3039         }
3040 
3041     // Run the kernel
3042         size_t global_work_size = count;
3043         err = clEnqueueNDRangeKernel( q, k, 1, NULL, &global_work_size, NULL, 0, NULL, NULL );
3044         if( err )
3045         {
3046             log_error( "Error: could not enqueue kernel in DetectFloatToHalfRoundingMode (%d)", err );
3047             clReleaseMemObject( inBuf );
3048             clReleaseMemObject( outImage );
3049             clReleaseKernel( k );
3050             return err;
3051         }
3052 
3053     // read the results
3054         cl_ushort outBuf[count*4];
3055         memset( outBuf, -1, sizeof( outBuf ) );
3056         size_t origin[3] = {0,0,0};
3057         size_t region[3] = {count,1,1};
3058         err = clEnqueueReadImage( q, outImage, CL_TRUE, origin, region, 0, 0, outBuf, 0, NULL, NULL );
3059         if( err )
3060         {
3061             log_error( "Error: could not read output image in DetectFloatToHalfRoundingMode (%d)", err );
3062             clReleaseMemObject( inBuf );
3063             clReleaseMemObject( outImage );
3064             clReleaseKernel( k );
3065             return err;
3066         }
3067 
3068     // Generate our list of reference results
3069         cl_ushort rte_ref[count*4];
3070         cl_ushort rtz_ref[count*4];
3071         for( size_t i = 0; i < 4 * count; i++ )
3072         {
3073             rte_ref[i] = float2half_rte( inp[i] );
3074             rtz_ref[i] = float2half_rtz( inp[i] );
3075         }
3076 
3077     // Verify that we got something in either rtz or rte mode
3078         if( 0 == memcmp( rte_ref, outBuf, sizeof( rte_ref )) )
3079         {
3080             log_info( "Autodetected float->half rounding mode to be rte\n" );
3081             gFloatToHalfRoundingMode = kRoundToNearestEven;
3082         }
3083         else if ( 0 == memcmp( rtz_ref, outBuf, sizeof( rtz_ref )) )
3084         {
3085             log_info( "Autodetected float->half rounding mode to be rtz\n" );
3086             gFloatToHalfRoundingMode = kRoundTowardZero;
3087         }
3088         else
3089         {
3090             log_error( "ERROR: float to half conversions proceed with invalid rounding mode!\n" );
3091             log_info( "\nfor:" );
3092             for( size_t i = 0; i < count; i++ )
3093                 log_info( " {%a, %a, %a, %a},", inp[4*i], inp[4*i+1], inp[4*i+2], inp[4*i+3] );
3094             log_info( "\ngot:" );
3095             for( size_t i = 0; i < count; i++ )
3096                 log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", outBuf[4*i], outBuf[4*i+1], outBuf[4*i+2], outBuf[4*i+3] );
3097             log_info( "\nrte:" );
3098             for( size_t i = 0; i < count; i++ )
3099                 log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rte_ref[4*i], rte_ref[4*i+1], rte_ref[4*i+2], rte_ref[4*i+3] );
3100             log_info( "\nrtz:" );
3101             for( size_t i = 0; i < count; i++ )
3102                 log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rtz_ref[4*i], rtz_ref[4*i+1], rtz_ref[4*i+2], rtz_ref[4*i+3] );
3103             log_info( "\n" );
3104             err = -1;
3105             gFloatToHalfRoundingMode = kRoundingModeCount;  // illegal value
3106         }
3107 
3108     // clean up
3109         clReleaseMemObject( inBuf );
3110         clReleaseMemObject( outImage );
3111         clReleaseKernel( k );
3112         return err;
3113     }
3114 
3115     // Make sure that the rounding mode was successfully detected, if we checked earlier
3116     if( gFloatToHalfRoundingMode != kRoundToNearestEven && gFloatToHalfRoundingMode != kRoundTowardZero)
3117         return -2;
3118 
3119     return err;
3120 }
3121 
create_random_image_data(ExplicitType dataType,image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d,bool image2DFromBuffer)3122 char *create_random_image_data( ExplicitType dataType, image_descriptor *imageInfo, BufferOwningPtr<char> &P, MTdata d, bool image2DFromBuffer )
3123 {
3124   size_t allocSize, numPixels;
3125   if ( /*gTestMipmaps*/ imageInfo->num_mip_levels > 1 )
3126   {
3127     allocSize = (size_t) (compute_mipmapped_image_size(*imageInfo) * 4 * get_explicit_type_size( dataType ))/get_pixel_size(imageInfo->format);
3128     numPixels = allocSize / (get_explicit_type_size( dataType ) * 4);
3129   }
3130   else
3131   {
3132     numPixels = (image2DFromBuffer? imageInfo->rowPitch: imageInfo->width) * imageInfo->height
3133       * (imageInfo->depth ? imageInfo->depth : 1)
3134       * (imageInfo->arraySize ? imageInfo->arraySize : 1);
3135     allocSize = numPixels * 4 * get_explicit_type_size( dataType );
3136   }
3137 
3138 #if 0 // DEBUG
3139     {
3140       fprintf(stderr,"--- create_random_image_data:\n");
3141       fprintf(stderr,"allocSize = %zu\n",allocSize);
3142       fprintf(stderr,"numPixels = %zu\n",numPixels);
3143       fprintf(stderr,"width = %zu\n",imageInfo->width);
3144       fprintf(stderr,"height = %zu\n",imageInfo->height);
3145       fprintf(stderr,"depth = %zu\n",imageInfo->depth);
3146       fprintf(stderr,"rowPitch = %zu\n",imageInfo->rowPitch);
3147       fprintf(stderr,"slicePitch = %zu\n",imageInfo->slicePitch);
3148       fprintf(stderr,"arraySize = %zu\n",imageInfo->arraySize);
3149       fprintf(stderr,"explicit_type_size = %zu\n",get_explicit_type_size(dataType));
3150     }
3151 #endif
3152 
3153 #if defined( __APPLE__ )
3154     char *data = NULL;
3155     if (gDeviceType == CL_DEVICE_TYPE_CPU) {
3156       size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192; // alloc two extra pages.
3157 
3158       void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
3159       if (map == MAP_FAILED)
3160       {
3161         perror("create_random_image_data: mmap");
3162         log_error("%s:%d: mmap failed, mapSize = %zu\n",__FILE__,__LINE__,mapSize);
3163       }
3164       intptr_t data_end = (intptr_t)map + mapSize - 4096;
3165       data = (char *)(data_end - (intptr_t)allocSize);
3166 
3167       mprotect(map, 4096, PROT_NONE);
3168       mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
3169       P.reset(data, map, mapSize);
3170     } else {
3171       data = (char *)malloc(allocSize);
3172       P.reset(data);
3173     }
3174 #else
3175     char *data = (char *)align_malloc(allocSize, get_pixel_size(imageInfo->format));
3176     P.reset(data,NULL,0,allocSize,true);
3177 #endif
3178 
3179     if (data == NULL) {
3180         log_error( "ERROR: Unable to malloc %lu bytes for create_random_image_data\n", allocSize );
3181         return NULL;
3182     }
3183 
3184     switch( dataType )
3185     {
3186         case kFloat:
3187         {
3188             float *inputValues = (float *)data;
3189             switch (imageInfo->format->image_channel_data_type)
3190             {
3191                 case CL_HALF_FLOAT:
3192                     {
3193                         // Generate data that is (mostly) inside the range of a half float
3194                         // const float HALF_MIN = 5.96046448e-08f;
3195                         const float HALF_MAX = 65504.0f;
3196 
3197                         size_t i = 0;
3198                         inputValues[ i++ ] = 0.f;
3199                         inputValues[ i++ ] = 1.f;
3200                         inputValues[ i++ ] = -1.f;
3201                         inputValues[ i++ ] = 2.f;
3202                         for( ; i < numPixels * 4; i++ )
3203                             inputValues[ i ] = get_random_float( -HALF_MAX - 2.f, HALF_MAX + 2.f, d );
3204                     }
3205                     break;
3206 #ifdef CL_SFIXED14_APPLE
3207                 case CL_SFIXED14_APPLE:
3208                     {
3209                         size_t i = 0;
3210                         if( numPixels * 4 >= 8 )
3211                         {
3212                             inputValues[ i++ ] = INFINITY;
3213                             inputValues[ i++ ] = 0x1.0p14f;
3214                             inputValues[ i++ ] = 0x1.0p31f;
3215                             inputValues[ i++ ] = 0x1.0p32f;
3216                             inputValues[ i++ ] = -INFINITY;
3217                             inputValues[ i++ ] = -0x1.0p14f;
3218                             inputValues[ i++ ] = -0x1.0p31f;
3219                             inputValues[ i++ ] = -0x1.1p31f;
3220                         }
3221                         for( ; i < numPixels * 4; i++ )
3222                             inputValues[ i ] = get_random_float( -1.1f, 3.1f, d );
3223                     }
3224                     break;
3225 #endif
3226                 case CL_FLOAT:
3227                     {
3228                         size_t i = 0;
3229                         inputValues[ i++ ] = INFINITY;
3230                         inputValues[ i++ ] = -INFINITY;
3231                         inputValues[ i++ ] = 0.0f;
3232                         inputValues[ i++ ] = 0.0f;
3233                         cl_uint *p = (cl_uint *)data;
3234                         for( ; i < numPixels * 4; i++ )
3235                             p[ i ] = genrand_int32(d);
3236                     }
3237                     break;
3238 
3239                 default:
3240                     size_t i = 0;
3241                     if( numPixels * 4 >= 36 )
3242                     {
3243                         inputValues[ i++ ] = 0.0f;
3244                         inputValues[ i++ ] = 0.5f;
3245                         inputValues[ i++ ] = 31.5f;
3246                         inputValues[ i++ ] = 32.0f;
3247                         inputValues[ i++ ] = 127.5f;
3248                         inputValues[ i++ ] = 128.0f;
3249                         inputValues[ i++ ] = 255.5f;
3250                         inputValues[ i++ ] = 256.0f;
3251                         inputValues[ i++ ] = 1023.5f;
3252                         inputValues[ i++ ] = 1024.0f;
3253                         inputValues[ i++ ] = 32767.5f;
3254                         inputValues[ i++ ] = 32768.0f;
3255                         inputValues[ i++ ] = 65535.5f;
3256                         inputValues[ i++ ] = 65536.0f;
3257                         inputValues[ i++ ] = 2147483648.0f;
3258                         inputValues[ i++ ] = 4294967296.0f;
3259                         inputValues[ i++ ] = MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 );
3260                         inputValues[ i++ ] = MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 );
3261                         inputValues[ i++ ] = -0.0f;
3262                         inputValues[ i++ ] = -0.5f;
3263                         inputValues[ i++ ] = -31.5f;
3264                         inputValues[ i++ ] = -32.0f;
3265                         inputValues[ i++ ] = -127.5f;
3266                         inputValues[ i++ ] = -128.0f;
3267                         inputValues[ i++ ] = -255.5f;
3268                         inputValues[ i++ ] = -256.0f;
3269                         inputValues[ i++ ] = -1023.5f;
3270                         inputValues[ i++ ] = -1024.0f;
3271                         inputValues[ i++ ] = -32767.5f;
3272                         inputValues[ i++ ] = -32768.0f;
3273                         inputValues[ i++ ] = -65535.5f;
3274                         inputValues[ i++ ] = -65536.0f;
3275                         inputValues[ i++ ] = -2147483648.0f;
3276                         inputValues[ i++ ] = -4294967296.0f;
3277                         inputValues[ i++ ] = -MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 );
3278                         inputValues[ i++ ] = -MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 );
3279                     }
3280                     if( is_format_signed(imageInfo->format) )
3281                     {
3282                         for( ; i < numPixels * 4; i++ )
3283                             inputValues[ i ] = get_random_float( -1.1f, 1.1f, d );
3284                     }
3285                     else
3286                     {
3287                         for( ; i < numPixels * 4; i++ )
3288                             inputValues[ i ] = get_random_float( -0.1f, 1.1f, d );
3289                     }
3290                     break;
3291             }
3292             break;
3293         }
3294 
3295         case kInt:
3296         {
3297             int *imageData = (int *)data;
3298 
3299             // We want to generate ints (mostly) in range of the target format
3300             int formatMin = get_format_min_int( imageInfo->format );
3301             size_t formatMax = get_format_max_int( imageInfo->format );
3302             if( formatMin == 0 )
3303             {
3304                 // Unsigned values, but we are only an int, so cap the actual max at the max of signed ints
3305                 if( formatMax > 2147483647L )
3306                     formatMax = 2147483647L;
3307             }
3308             // If the final format is small enough, give us a bit of room for out-of-range values to test
3309             if( formatMax < 2147483647L )
3310                 formatMax += 2;
3311             if( formatMin > -2147483648LL )
3312                 formatMin -= 2;
3313 
3314             // Now gen
3315             for( size_t i = 0; i < numPixels * 4; i++ )
3316             {
3317                 imageData[ i ] = random_in_range( formatMin, (int)formatMax, d );
3318             }
3319             break;
3320         }
3321 
3322         case kUInt:
3323         case kUnsignedInt:
3324         {
3325             unsigned int *imageData = (unsigned int *)data;
3326 
3327             // We want to generate ints (mostly) in range of the target format
3328             int formatMin = get_format_min_int( imageInfo->format );
3329             size_t formatMax = get_format_max_int( imageInfo->format );
3330             if( formatMin < 0 )
3331                 formatMin = 0;
3332             // If the final format is small enough, give us a bit of room for out-of-range values to test
3333             if( formatMax < 4294967295LL )
3334                 formatMax += 2;
3335 
3336             // Now gen
3337             for( size_t i = 0; i < numPixels * 4; i++ )
3338             {
3339                 imageData[ i ] = random_in_range( formatMin, (int)formatMax, d );
3340             }
3341             break;
3342         }
3343         default:
3344             // Unsupported source format
3345             delete [] data;
3346             return NULL;
3347     }
3348 
3349     return data;
3350 }
3351 
3352 /*
3353     deprecated
3354 bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t max, int &outValue )
3355 {
3356     int v = (int)value;
3357 
3358     switch(imageSampler->addressing_mode)
3359     {
3360         case CL_ADDRESS_REPEAT:
3361             outValue = v;
3362             while( v < 0 )
3363                 v += (int)max;
3364             while( v >= (int)max )
3365                 v -= (int)max;
3366             if( v != outValue )
3367             {
3368                 outValue = v;
3369                 return true;
3370             }
3371             return false;
3372 
3373         case CL_ADDRESS_MIRRORED_REPEAT:
3374             log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do we ever use this?
3375             exit(-1);
3376 
3377         default:
3378             if( v < 0 )
3379             {
3380                 outValue = 0;
3381                 return true;
3382             }
3383             if( v >= (int)max )
3384             {
3385                 outValue = (int)max - 1;
3386                 return true;
3387             }
3388             outValue = v;
3389             return false;
3390     }
3391 
3392 }
3393 */
3394 
get_sampler_kernel_code(image_sampler_data * imageSampler,char * outLine)3395 void get_sampler_kernel_code( image_sampler_data *imageSampler, char *outLine )
3396 {
3397     const char *normalized;
3398     const char *addressMode;
3399     const char *filterMode;
3400 
3401     if( imageSampler->addressing_mode == CL_ADDRESS_CLAMP )
3402         addressMode = "CLK_ADDRESS_CLAMP";
3403     else if( imageSampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE )
3404         addressMode = "CLK_ADDRESS_CLAMP_TO_EDGE";
3405     else if( imageSampler->addressing_mode == CL_ADDRESS_REPEAT )
3406         addressMode = "CLK_ADDRESS_REPEAT";
3407     else if( imageSampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT )
3408         addressMode = "CLK_ADDRESS_MIRRORED_REPEAT";
3409     else if( imageSampler->addressing_mode == CL_ADDRESS_NONE )
3410         addressMode = "CLK_ADDRESS_NONE";
3411     else
3412     {
3413         log_error( "**Error: Unknown addressing mode! Aborting...\n" );
3414         abort();
3415     }
3416 
3417     if( imageSampler->normalized_coords )
3418         normalized = "CLK_NORMALIZED_COORDS_TRUE";
3419     else
3420         normalized = "CLK_NORMALIZED_COORDS_FALSE";
3421 
3422     if( imageSampler->filter_mode == CL_FILTER_LINEAR )
3423         filterMode = "CLK_FILTER_LINEAR";
3424     else
3425         filterMode = "CLK_FILTER_NEAREST";
3426 
3427     sprintf( outLine, "    const sampler_t imageSampler = %s | %s | %s;\n", addressMode, filterMode, normalized );
3428 }
3429 
copy_image_data(image_descriptor * srcImageInfo,image_descriptor * dstImageInfo,void * imageValues,void * destImageValues,const size_t sourcePos[],const size_t destPos[],const size_t regionSize[])3430 void copy_image_data( image_descriptor *srcImageInfo, image_descriptor *dstImageInfo, void *imageValues, void *destImageValues,
3431                      const size_t sourcePos[], const size_t destPos[], const size_t regionSize[] )
3432 {
3433   //  assert( srcImageInfo->format == dstImageInfo->format );
3434 
3435   size_t src_mip_level_offset = 0, dst_mip_level_offset = 0;
3436   size_t sourcePos_lod[3], destPos_lod[3], src_lod, dst_lod;
3437   size_t src_row_pitch_lod, src_slice_pitch_lod;
3438   size_t dst_row_pitch_lod, dst_slice_pitch_lod;
3439 
3440   size_t pixelSize = get_pixel_size( srcImageInfo->format );
3441 
3442   sourcePos_lod[0] = sourcePos[0];
3443   sourcePos_lod[1] = sourcePos[1];
3444   sourcePos_lod[2] = sourcePos[2];
3445   destPos_lod[0] = destPos[0];
3446   destPos_lod[1] = destPos[1];
3447   destPos_lod[2] = destPos[2];
3448   src_row_pitch_lod = srcImageInfo->rowPitch;
3449   dst_row_pitch_lod = dstImageInfo->rowPitch;
3450   src_slice_pitch_lod = srcImageInfo->slicePitch;
3451   dst_slice_pitch_lod = dstImageInfo->slicePitch;
3452 
3453   if( srcImageInfo->num_mip_levels > 1)
3454   {
3455     size_t src_width_lod = 1/*srcImageInfo->width*/;
3456     size_t src_height_lod = 1/*srcImageInfo->height*/;
3457     size_t src_depth_lod = 1/*srcImageInfo->depth*/;
3458 
3459     switch( srcImageInfo->type )
3460     {
3461     case CL_MEM_OBJECT_IMAGE1D:
3462       src_lod = sourcePos[1];
3463       sourcePos_lod[1] = sourcePos_lod[2] = 0;
3464       src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1;
3465       break;
3466     case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3467     case CL_MEM_OBJECT_IMAGE2D:
3468       src_lod = sourcePos[2];
3469       sourcePos_lod[1] = sourcePos[1];
3470       sourcePos_lod[2] = 0;
3471       src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1;
3472       if( srcImageInfo->type == CL_MEM_OBJECT_IMAGE2D )
3473         src_height_lod = (srcImageInfo->height >> src_lod ) ? ( srcImageInfo->height >> src_lod ): 1;
3474       break;
3475     case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3476     case CL_MEM_OBJECT_IMAGE3D:
3477       src_lod = sourcePos[3];
3478       sourcePos_lod[1] = sourcePos[1];
3479       sourcePos_lod[2] = sourcePos[2];
3480       src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1;
3481       src_height_lod = (srcImageInfo->height >> src_lod ) ? ( srcImageInfo->height >> src_lod ): 1;
3482       if( srcImageInfo->type == CL_MEM_OBJECT_IMAGE3D )
3483         src_depth_lod = (srcImageInfo->depth >> src_lod ) ? ( srcImageInfo->depth >> src_lod ): 1;
3484       break;
3485 
3486     }
3487     src_mip_level_offset = compute_mip_level_offset( srcImageInfo, src_lod );
3488     src_row_pitch_lod = src_width_lod * get_pixel_size( srcImageInfo->format );
3489     src_slice_pitch_lod = src_row_pitch_lod * src_height_lod;
3490   }
3491 
3492   if( dstImageInfo->num_mip_levels > 1)
3493   {
3494     size_t dst_width_lod = 1/*dstImageInfo->width*/;
3495     size_t dst_height_lod = 1/*dstImageInfo->height*/;
3496     size_t dst_depth_lod = 1 /*dstImageInfo->depth*/;
3497     switch( dstImageInfo->type )
3498     {
3499     case CL_MEM_OBJECT_IMAGE1D:
3500       dst_lod = destPos[1];
3501       destPos_lod[1] = destPos_lod[2] = 0;
3502       dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1;
3503       break;
3504     case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3505     case CL_MEM_OBJECT_IMAGE2D:
3506       dst_lod = destPos[2];
3507       destPos_lod[1] = destPos[1];
3508       destPos_lod[2] = 0;
3509       dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1;
3510       if( dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D )
3511         dst_height_lod = (dstImageInfo->height >> dst_lod ) ? ( dstImageInfo->height >> dst_lod ): 1;
3512       break;
3513     case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3514     case CL_MEM_OBJECT_IMAGE3D:
3515       dst_lod = destPos[3];
3516       destPos_lod[1] = destPos[1];
3517       destPos_lod[2] = destPos[2];
3518       dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1;
3519       dst_height_lod = (dstImageInfo->height >> dst_lod ) ? ( dstImageInfo->height >> dst_lod ): 1;
3520       if( dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D )
3521         dst_depth_lod = (dstImageInfo->depth >> dst_lod ) ? ( dstImageInfo->depth >> dst_lod ): 1;
3522       break;
3523 
3524     }
3525     dst_mip_level_offset = compute_mip_level_offset( dstImageInfo, dst_lod );
3526     dst_row_pitch_lod = dst_width_lod * get_pixel_size( dstImageInfo->format);
3527     dst_slice_pitch_lod = dst_row_pitch_lod * dst_height_lod;
3528   }
3529 
3530   // Get initial pointers
3531   char *sourcePtr = (char *)imageValues + sourcePos_lod[ 2 ] * src_slice_pitch_lod + sourcePos_lod[ 1 ] * src_row_pitch_lod + pixelSize * sourcePos_lod[ 0 ] + src_mip_level_offset;
3532   char *destPtr = (char *)destImageValues + destPos_lod[ 2 ] * dst_slice_pitch_lod + destPos_lod[ 1 ] * dst_row_pitch_lod + pixelSize * destPos_lod[ 0 ] + dst_mip_level_offset;
3533 
3534   for( size_t z = 0; z < ( regionSize[ 2 ] > 0 ? regionSize[ 2 ] : 1 ); z++ )
3535   {
3536     char *rowSourcePtr = sourcePtr;
3537     char *rowDestPtr = destPtr;
3538     for( size_t y = 0; y < regionSize[ 1 ]; y++ )
3539     {
3540       memcpy( rowDestPtr, rowSourcePtr, pixelSize * regionSize[ 0 ] );
3541       rowSourcePtr += src_row_pitch_lod;
3542       rowDestPtr += dst_row_pitch_lod;
3543     }
3544 
3545     sourcePtr += src_slice_pitch_lod;
3546     destPtr += dst_slice_pitch_lod;
3547   }
3548 }
3549 
random_float(float low,float high,MTdata d)3550 float random_float(float low, float high, MTdata d)
3551 {
3552     float t = (float) genrand_real1(d);
3553     return (1.0f - t) * low + t * high;
3554 }
3555 
CoordWalker(void * coords,bool useFloats,size_t vecSize)3556 CoordWalker::CoordWalker( void * coords, bool useFloats, size_t vecSize )
3557 {
3558     if( useFloats )
3559     {
3560         mFloatCoords = (cl_float *)coords;
3561         mIntCoords = NULL;
3562     }
3563     else
3564     {
3565         mFloatCoords = NULL;
3566         mIntCoords = (cl_int *)coords;
3567     }
3568     mVecSize = vecSize;
3569 }
3570 
~CoordWalker()3571 CoordWalker::~CoordWalker()
3572 {
3573 }
3574 
Get(size_t idx,size_t el)3575 cl_float CoordWalker::Get( size_t idx, size_t el )
3576 {
3577     if( mIntCoords != NULL )
3578         return (cl_float)mIntCoords[ idx * mVecSize + el ];
3579     else
3580         return mFloatCoords[ idx * mVecSize + el ];
3581 }
3582 
3583 
print_read_header(cl_image_format * format,image_sampler_data * sampler,bool err,int t)3584 void print_read_header( cl_image_format *format, image_sampler_data *sampler, bool err, int t )
3585 {
3586     const char *addressMode = NULL;
3587     const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" };
3588 
3589     if( sampler->addressing_mode == CL_ADDRESS_CLAMP )
3590         addressMode = "CL_ADDRESS_CLAMP";
3591     else if( sampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE )
3592         addressMode = "CL_ADDRESS_CLAMP_TO_EDGE";
3593     else if( sampler->addressing_mode == CL_ADDRESS_REPEAT )
3594         addressMode = "CL_ADDRESS_REPEAT";
3595     else if( sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT )
3596         addressMode = "CL_ADDRESS_MIRRORED_REPEAT";
3597     else
3598         addressMode = "CL_ADDRESS_NONE";
3599 
3600     if( t )
3601     {
3602         if( err )
3603             log_error( "[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3604                       GetChannelTypeName( format->image_channel_data_type ),
3605                       (int)get_format_channel_count( format ),
3606                       sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3607                       addressMode,
3608                       normalizedNames[sampler->normalized_coords ? 1 : 0],
3609                       t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED" );
3610         else
3611             log_info( "[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3612                      GetChannelTypeName( format->image_channel_data_type ),
3613                      (int)get_format_channel_count( format ),
3614                      sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3615                      addressMode,
3616                      normalizedNames[sampler->normalized_coords ? 1 : 0],
3617                      t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED" );
3618     }
3619     else
3620     {
3621         if( err )
3622             log_error( "[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3623                       GetChannelTypeName( format->image_channel_data_type ),
3624                       (int)get_format_channel_count( format ),
3625                       sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3626                       addressMode,
3627                       normalizedNames[sampler->normalized_coords ? 1 : 0] );
3628         else
3629             log_info( "[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3630                      GetChannelTypeName( format->image_channel_data_type ),
3631                      (int)get_format_channel_count( format ),
3632                      sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3633                      addressMode,
3634                      normalizedNames[sampler->normalized_coords ? 1 : 0] );
3635     }
3636 
3637 }
3638 
print_write_header(cl_image_format * format,bool err=false)3639 void print_write_header( cl_image_format *format, bool err = false)
3640 {
3641     if( err )
3642         log_error( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3643                   GetChannelTypeName( format->image_channel_data_type ),
3644                   (int)get_format_channel_count( format ) );
3645     else
3646         log_info( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3647                  GetChannelTypeName( format->image_channel_data_type ),
3648                  (int)get_format_channel_count( format ) );
3649 }
3650 
3651 
print_header(cl_image_format * format,bool err=false)3652 void print_header( cl_image_format *format, bool err = false )
3653 {
3654     if (err) {
3655         log_error( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3656                   GetChannelTypeName( format->image_channel_data_type ),
3657                   (int)get_format_channel_count( format ) );
3658     } else {
3659         log_info( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3660                  GetChannelTypeName( format->image_channel_data_type ),
3661                  (int)get_format_channel_count( format ) );
3662     }
3663 }
3664 
find_format(cl_image_format * formatList,unsigned int numFormats,cl_image_format * formatToFind)3665 bool find_format( cl_image_format *formatList, unsigned int numFormats, cl_image_format *formatToFind )
3666 {
3667     for( unsigned int i = 0; i < numFormats; i++ )
3668     {
3669         if( formatList[ i ].image_channel_order == formatToFind->image_channel_order &&
3670            formatList[ i ].image_channel_data_type == formatToFind->image_channel_data_type )
3671             return true;
3672     }
3673     return false;
3674 }
3675 
build_required_image_formats(cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device,std::vector<cl_image_format> & formatsToSupport)3676 void build_required_image_formats(cl_mem_flags flags,
3677                                   cl_mem_object_type image_type,
3678                                   cl_device_id device,
3679                                   std::vector<cl_image_format>& formatsToSupport)
3680 {
3681 	Version version = get_device_cl_version(device);
3682 
3683 	formatsToSupport.clear();
3684 
3685 	// Required embedded formats.
3686 	static std::vector<cl_image_format> embeddedProfReadOrWriteFormats
3687 	{
3688 		{ CL_RGBA, CL_UNORM_INT8 },
3689 		{ CL_RGBA, CL_UNORM_INT16 },
3690 		{ CL_RGBA, CL_SIGNED_INT8 },
3691 		{ CL_RGBA, CL_SIGNED_INT16 },
3692 		{ CL_RGBA, CL_SIGNED_INT32 },
3693 		{ CL_RGBA, CL_UNSIGNED_INT8 },
3694 		{ CL_RGBA, CL_UNSIGNED_INT16 },
3695 		{ CL_RGBA, CL_UNSIGNED_INT32 },
3696 		{ CL_RGBA, CL_HALF_FLOAT },
3697 		{ CL_RGBA, CL_FLOAT },
3698 	};
3699 
3700 	/*
3701 		Required full profile formats.
3702 		This array does not contain any full profile
3703 		formats that have restrictions on when they
3704 		are required.
3705 	*/
3706 	static std::vector<cl_image_format> fullProfReadOrWriteFormats
3707 	{
3708 		{ CL_RGBA, CL_UNORM_INT8 },
3709 		{ CL_RGBA, CL_UNORM_INT16 },
3710 		{ CL_RGBA, CL_SIGNED_INT8 },
3711 		{ CL_RGBA, CL_SIGNED_INT16 },
3712 		{ CL_RGBA, CL_SIGNED_INT32 },
3713 		{ CL_RGBA, CL_UNSIGNED_INT8 },
3714 		{ CL_RGBA, CL_UNSIGNED_INT16 },
3715 		{ CL_RGBA, CL_UNSIGNED_INT32 },
3716 		{ CL_RGBA, CL_HALF_FLOAT },
3717 		{ CL_RGBA, CL_FLOAT },
3718 		{ CL_BGRA, CL_UNORM_INT8 },
3719 	};
3720 
3721 	/*
3722 		Required full profile formats specifically for 2.x.
3723 		This array does not contain any full profile
3724 		formats that have restrictions on when they
3725 		are required.
3726 	*/
3727 	static std::vector<cl_image_format> fullProf2XReadOrWriteFormats
3728 	{
3729 		{ CL_R, CL_UNORM_INT8 },
3730 		{ CL_R, CL_UNORM_INT16 },
3731 		{ CL_R, CL_SNORM_INT8 },
3732 		{ CL_R, CL_SNORM_INT16 },
3733 		{ CL_R, CL_SIGNED_INT8 },
3734 		{ CL_R, CL_SIGNED_INT16 },
3735 		{ CL_R, CL_SIGNED_INT32 },
3736 		{ CL_R, CL_UNSIGNED_INT8 },
3737 		{ CL_R, CL_UNSIGNED_INT16 },
3738 		{ CL_R, CL_UNSIGNED_INT32 },
3739 		{ CL_R, CL_HALF_FLOAT },
3740 		{ CL_R, CL_FLOAT },
3741 		{ CL_RG, CL_UNORM_INT8 },
3742 		{ CL_RG, CL_UNORM_INT16 },
3743 		{ CL_RG, CL_SNORM_INT8 },
3744 		{ CL_RG, CL_SNORM_INT16 },
3745 		{ CL_RG, CL_SIGNED_INT8 },
3746 		{ CL_RG, CL_SIGNED_INT16 },
3747 		{ CL_RG, CL_SIGNED_INT32 },
3748 		{ CL_RG, CL_UNSIGNED_INT8 },
3749 		{ CL_RG, CL_UNSIGNED_INT16 },
3750 		{ CL_RG, CL_UNSIGNED_INT32 },
3751 		{ CL_RG, CL_HALF_FLOAT },
3752 		{ CL_RG, CL_FLOAT },
3753 		{ CL_RGBA, CL_SNORM_INT8 },
3754 		{ CL_RGBA, CL_SNORM_INT16 },
3755 	};
3756 
3757 	/*
3758 		Required full profile formats for CL_DEPTH
3759 		(specifically 2.x).
3760 		There are cases whereby the format isn't required.
3761 	*/
3762 	static std::vector<cl_image_format> fullProf2XReadOrWriteDepthFormats
3763 	{
3764 		{ CL_DEPTH, CL_UNORM_INT16 },
3765 		{ CL_DEPTH, CL_FLOAT },
3766 	};
3767 
3768 	/*
3769 		Required full profile formats for CL_sRGB
3770 		(specifically 2.x).
3771 		There are cases whereby the format isn't required.
3772 	*/
3773 	static std::vector<cl_image_format> fullProf2XSRGBFormats
3774 	{
3775 		{ CL_sRGBA, CL_UNORM_INT8 },
3776 	};
3777 
3778 	// Embedded profile
3779 	if (gIsEmbedded)
3780 	{
3781 		copy(embeddedProfReadOrWriteFormats.begin(),
3782 		     embeddedProfReadOrWriteFormats.end(),
3783 		     back_inserter(formatsToSupport));
3784 	}
3785 	// Full profile
3786 	else
3787 	{
3788 		copy(fullProfReadOrWriteFormats.begin(),
3789 		     fullProfReadOrWriteFormats.end(),
3790 		     back_inserter(formatsToSupport));
3791 	}
3792 
3793 	// Full profile, OpenCL 2.0, 2.1, 2.2
3794 	if (!gIsEmbedded && version >= Version(2, 0) && version <= Version(2, 2))
3795 	{
3796 		copy(fullProf2XReadOrWriteFormats.begin(),
3797 		     fullProf2XReadOrWriteFormats.end(),
3798 		     back_inserter(formatsToSupport));
3799 
3800 		// Depth images are only required for 2DArray and 2D images
3801 		if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D)
3802 		{
3803 			copy(fullProf2XReadOrWriteDepthFormats.begin(),
3804 			     fullProf2XReadOrWriteDepthFormats.end(),
3805 			     back_inserter(formatsToSupport));
3806 		}
3807 
3808 		// sRGB is not required for 1DImage Buffers
3809 		if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)
3810 		{
3811 			// sRGB is only required for reading
3812 			if (flags == CL_MEM_READ_ONLY)
3813 			{
3814 				copy(fullProf2XSRGBFormats.begin(),
3815 				     fullProf2XSRGBFormats.end(),
3816 				     back_inserter(formatsToSupport));
3817 			}
3818 		}
3819 	}
3820 }
3821 
is_image_format_required(cl_image_format format,cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device)3822 bool is_image_format_required(cl_image_format format,
3823                               cl_mem_flags flags,
3824                               cl_mem_object_type image_type,
3825                               cl_device_id device)
3826 {
3827 	std::vector<cl_image_format> formatsToSupport;
3828 	build_required_image_formats(flags, image_type, device, formatsToSupport);
3829 
3830 	for (auto &formatItr: formatsToSupport)
3831 	{
3832 		if (formatItr.image_channel_order == format.image_channel_order &&
3833 		    formatItr.image_channel_data_type == format.image_channel_data_type)
3834 		{
3835 			return true;
3836 		}
3837 	}
3838 
3839 	return false;
3840 }
3841 
compute_max_mip_levels(size_t width,size_t height,size_t depth)3842 cl_uint compute_max_mip_levels( size_t width, size_t height, size_t depth)
3843 {
3844   cl_uint retMaxMipLevels=0, max_dim = 0;
3845 
3846   max_dim = width;
3847   max_dim = height > max_dim ? height : max_dim;
3848   max_dim = depth > max_dim ? depth : max_dim;
3849 
3850   while(max_dim) {
3851     retMaxMipLevels++;
3852     max_dim >>= 1;
3853   }
3854   return retMaxMipLevels;
3855 }
3856 
compute_mipmapped_image_size(image_descriptor imageInfo)3857 cl_ulong compute_mipmapped_image_size( image_descriptor imageInfo)
3858 {
3859   cl_ulong retSize = 0;
3860   size_t curr_width, curr_height, curr_depth, curr_array_size;
3861   curr_width = imageInfo.width;
3862   curr_height = imageInfo.height;
3863   curr_depth = imageInfo.depth;
3864   curr_array_size = imageInfo.arraySize;
3865 
3866   for (int i=0; i < (int) imageInfo.num_mip_levels; i++)
3867   {
3868     switch ( imageInfo.type )
3869     {
3870     case CL_MEM_OBJECT_IMAGE3D :
3871       retSize += (cl_ulong)curr_width * curr_height * curr_depth * get_pixel_size(imageInfo.format);
3872       break;
3873     case CL_MEM_OBJECT_IMAGE2D :
3874       retSize += (cl_ulong)curr_width * curr_height * get_pixel_size(imageInfo.format);
3875       break;
3876     case CL_MEM_OBJECT_IMAGE1D :
3877       retSize += (cl_ulong)curr_width * get_pixel_size(imageInfo.format);
3878       break;
3879     case CL_MEM_OBJECT_IMAGE1D_ARRAY :
3880       retSize += (cl_ulong)curr_width * curr_array_size * get_pixel_size(imageInfo.format);
3881       break;
3882     case CL_MEM_OBJECT_IMAGE2D_ARRAY :
3883       retSize += (cl_ulong)curr_width * curr_height * curr_array_size * get_pixel_size(imageInfo.format);
3884       break;
3885     }
3886 
3887     switch ( imageInfo.type )
3888     {
3889     case CL_MEM_OBJECT_IMAGE3D :
3890       curr_depth = curr_depth >> 1 ? curr_depth >> 1: 1;
3891     case CL_MEM_OBJECT_IMAGE2D :
3892     case CL_MEM_OBJECT_IMAGE2D_ARRAY :
3893       curr_height = curr_height >> 1? curr_height >> 1 : 1;
3894     case CL_MEM_OBJECT_IMAGE1D :
3895     case CL_MEM_OBJECT_IMAGE1D_ARRAY :
3896       curr_width = curr_width >> 1? curr_width >> 1 : 1;
3897     }
3898   }
3899 
3900   return retSize;
3901 }
3902 
compute_mip_level_offset(image_descriptor * imageInfo,size_t lod)3903 size_t compute_mip_level_offset( image_descriptor * imageInfo , size_t lod)
3904 {
3905   size_t retOffset = 0;
3906   size_t width, height,  depth;
3907   width = imageInfo->width;
3908   height = imageInfo->height;
3909   depth = imageInfo->depth;
3910 
3911   for(size_t i=0; i < lod; i++)
3912   {
3913     switch(imageInfo->type)
3914     {
3915     case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3916       retOffset += (size_t) width * height * imageInfo->arraySize * get_pixel_size( imageInfo->format );
3917       break;
3918     case CL_MEM_OBJECT_IMAGE3D:
3919       retOffset += (size_t) width * height * depth * get_pixel_size( imageInfo->format );
3920       break;
3921     case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3922       retOffset += (size_t) width * imageInfo->arraySize * get_pixel_size( imageInfo->format );
3923       break;
3924     case CL_MEM_OBJECT_IMAGE2D:
3925       retOffset += (size_t) width * height * get_pixel_size( imageInfo->format );
3926       break;
3927     case CL_MEM_OBJECT_IMAGE1D:
3928       retOffset += (size_t) width * get_pixel_size( imageInfo->format );
3929       break;
3930     }
3931 
3932     // Compute next lod dimensions
3933     switch(imageInfo->type)
3934     {
3935     case CL_MEM_OBJECT_IMAGE3D:
3936       depth = ( depth >> 1 ) ? ( depth >> 1 ) : 1;
3937     case CL_MEM_OBJECT_IMAGE2D:
3938     case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3939       height = ( height >> 1 ) ? ( height >> 1 ) : 1;
3940     case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3941     case CL_MEM_OBJECT_IMAGE1D:
3942       width = ( width >> 1 ) ? ( width >> 1 ) : 1;
3943     }
3944 
3945   }
3946   return retOffset;
3947 }
3948