1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "imageHelpers.h"
17 #include <limits.h>
18 #include <assert.h>
19 #if defined( __APPLE__ )
20 #include <sys/mman.h>
21 #endif
22 #if !defined (_WIN32) && !defined(__APPLE__)
23 #include <malloc.h>
24 #endif
25 #include <algorithm>
26 #include <iterator>
27 #if !defined (_WIN32)
28 #include <cmath>
29 #endif
30
31 RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode;
32
33 static cl_ushort float2half_rte( float f );
34 static cl_ushort float2half_rtz( float f );
35
36 cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT;
37 bool gTestRounding = false;
38 double
sRGBmap(float fc)39 sRGBmap(float fc)
40 {
41 double c = (double)fc;
42
43 #if !defined (_WIN32)
44 if (std::isnan(c))
45 c = 0.0;
46 #else
47 if (_isnan(c))
48 c = 0.0;
49 #endif
50
51 if (c > 1.0)
52 c = 1.0;
53 else if (c < 0.0)
54 c = 0.0;
55 else if (c < 0.0031308)
56 c = 12.92 * c;
57 else
58 c = (1055.0/1000.0) * pow(c, 5.0/12.0) - (55.0/1000.0);
59
60 return c * 255.0;
61 }
62
63 double
sRGBunmap(float fc)64 sRGBunmap(float fc)
65 {
66 double c = (double)fc;
67 double result;
68
69 if (c <= 0.04045)
70 result = c / 12.92;
71 else
72 result = pow((c + 0.055) / 1.055, 2.4);
73
74 return result;
75 }
76
77
get_format_type_size(const cl_image_format * format)78 size_t get_format_type_size( const cl_image_format *format )
79 {
80 return get_channel_data_type_size( format->image_channel_data_type );
81 }
82
get_channel_data_type_size(cl_channel_type channelType)83 size_t get_channel_data_type_size( cl_channel_type channelType )
84 {
85 switch( channelType )
86 {
87 case CL_SNORM_INT8:
88 case CL_UNORM_INT8:
89 case CL_SIGNED_INT8:
90 case CL_UNSIGNED_INT8:
91 return 1;
92
93 case CL_SNORM_INT16:
94 case CL_UNORM_INT16:
95 case CL_SIGNED_INT16:
96 case CL_UNSIGNED_INT16:
97 case CL_HALF_FLOAT:
98 #ifdef CL_SFIXED14_APPLE
99 case CL_SFIXED14_APPLE:
100 #endif
101 return sizeof( cl_short );
102
103 case CL_SIGNED_INT32:
104 case CL_UNSIGNED_INT32:
105 return sizeof( cl_int );
106
107 case CL_UNORM_SHORT_565:
108 case CL_UNORM_SHORT_555:
109 #ifdef OBSOLETE_FORAMT
110 case CL_UNORM_SHORT_565_REV:
111 case CL_UNORM_SHORT_555_REV:
112 #endif
113 return 2;
114
115 #ifdef OBSOLETE_FORAMT
116 case CL_UNORM_INT_8888:
117 case CL_UNORM_INT_8888_REV:
118 return 4;
119 #endif
120
121 case CL_UNORM_INT_101010:
122 #ifdef OBSOLETE_FORAMT
123 case CL_UNORM_INT_101010_REV:
124 #endif
125 return 4;
126
127 case CL_FLOAT:
128 return sizeof( cl_float );
129
130 default:
131 return 0;
132 }
133 }
134
get_format_channel_count(const cl_image_format * format)135 size_t get_format_channel_count( const cl_image_format *format )
136 {
137 return get_channel_order_channel_count( format->image_channel_order );
138 }
139
get_channel_order_channel_count(cl_channel_order order)140 size_t get_channel_order_channel_count( cl_channel_order order )
141 {
142 switch( order )
143 {
144 case CL_R:
145 case CL_A:
146 case CL_Rx:
147 case CL_INTENSITY:
148 case CL_LUMINANCE:
149 case CL_DEPTH:
150 case CL_DEPTH_STENCIL:
151 return 1;
152
153 case CL_RG:
154 case CL_RA:
155 case CL_RGx:
156 return 2;
157
158 case CL_RGB:
159 case CL_RGBx:
160 case CL_sRGB:
161 case CL_sRGBx:
162 return 3;
163
164 case CL_RGBA:
165 case CL_ARGB:
166 case CL_BGRA:
167 case CL_sRGBA:
168 case CL_sBGRA:
169 case CL_ABGR:
170 #ifdef CL_1RGB_APPLE
171 case CL_1RGB_APPLE:
172 #endif
173 #ifdef CL_BGR1_APPLE
174 case CL_BGR1_APPLE:
175 #endif
176 #ifdef CL_ABGR_APPLE
177 case CL_ABGR_APPLE:
178 #endif
179 return 4;
180
181 default:
182 log_error("%s does not support 0x%x\n",__FUNCTION__,order);
183 return 0;
184 }
185 }
186
get_channel_type_from_name(const char * name)187 cl_channel_type get_channel_type_from_name( const char *name )
188 {
189 struct {
190 cl_channel_type type;
191 const char *name;
192 } typeNames[] = {
193 { CL_SNORM_INT8, "CL_SNORM_INT8" },
194 { CL_SNORM_INT16, "CL_SNORM_INT16" },
195 { CL_UNORM_INT8, "CL_UNORM_INT8" },
196 { CL_UNORM_INT16, "CL_UNORM_INT16" },
197 { CL_UNORM_INT24, "CL_UNORM_INT24" },
198 { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" },
199 { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" },
200 { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" },
201 { CL_SIGNED_INT8, "CL_SIGNED_INT8" },
202 { CL_SIGNED_INT16, "CL_SIGNED_INT16" },
203 { CL_SIGNED_INT32, "CL_SIGNED_INT32" },
204 { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" },
205 { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" },
206 { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" },
207 { CL_HALF_FLOAT, "CL_HALF_FLOAT" },
208 { CL_FLOAT, "CL_FLOAT" },
209 #ifdef CL_SFIXED14_APPLE
210 { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" }
211 #endif
212 };
213 for( size_t i = 0; i < sizeof( typeNames ) / sizeof( typeNames[ 0 ] ); i++ )
214 {
215 if( strcmp( typeNames[ i ].name, name ) == 0 || strcmp( typeNames[ i ].name + 3, name ) == 0 )
216 return typeNames[ i ].type;
217 }
218 return (cl_channel_type)-1;
219 }
220
get_channel_order_from_name(const char * name)221 cl_channel_order get_channel_order_from_name( const char *name )
222 {
223 const struct
224 {
225 cl_channel_order order;
226 const char *name;
227 }orderNames[] =
228 {
229 { CL_R, "CL_R" },
230 { CL_A, "CL_A" },
231 { CL_Rx, "CL_Rx" },
232 { CL_RG, "CL_RG" },
233 { CL_RA, "CL_RA" },
234 { CL_RGx, "CL_RGx" },
235 { CL_RGB, "CL_RGB" },
236 { CL_RGBx, "CL_RGBx" },
237 { CL_RGBA, "CL_RGBA" },
238 { CL_BGRA, "CL_BGRA" },
239 { CL_ARGB, "CL_ARGB" },
240 { CL_INTENSITY, "CL_INTENSITY"},
241 { CL_LUMINANCE, "CL_LUMINANCE"},
242 { CL_DEPTH, "CL_DEPTH" },
243 { CL_DEPTH_STENCIL, "CL_DEPTH_STENCIL" },
244 { CL_sRGB, "CL_sRGB" },
245 { CL_sRGBx, "CL_sRGBx" },
246 { CL_sRGBA, "CL_sRGBA" },
247 { CL_sBGRA, "CL_sBGRA" },
248 { CL_ABGR, "CL_ABGR" },
249 #ifdef CL_1RGB_APPLE
250 { CL_1RGB_APPLE, "CL_1RGB_APPLE" },
251 #endif
252 #ifdef CL_BGR1_APPLE
253 { CL_BGR1_APPLE, "CL_BGR1_APPLE" },
254 #endif
255 };
256
257 for( size_t i = 0; i < sizeof( orderNames ) / sizeof( orderNames[ 0 ] ); i++ )
258 {
259 if( strcmp( orderNames[ i ].name, name ) == 0 || strcmp( orderNames[ i ].name + 3, name ) == 0 )
260 return orderNames[ i ].order;
261 }
262 return (cl_channel_order)-1;
263 }
264
265
is_format_signed(const cl_image_format * format)266 int is_format_signed( const cl_image_format *format )
267 {
268 switch( format->image_channel_data_type )
269 {
270 case CL_SNORM_INT8:
271 case CL_SIGNED_INT8:
272 case CL_SNORM_INT16:
273 case CL_SIGNED_INT16:
274 case CL_SIGNED_INT32:
275 case CL_HALF_FLOAT:
276 case CL_FLOAT:
277 #ifdef CL_SFIXED14_APPLE
278 case CL_SFIXED14_APPLE:
279 #endif
280 return 1;
281
282 default:
283 return 0;
284 }
285 }
286
get_pixel_size(cl_image_format * format)287 size_t get_pixel_size( cl_image_format *format )
288 {
289 switch( format->image_channel_data_type )
290 {
291 case CL_SNORM_INT8:
292 case CL_UNORM_INT8:
293 case CL_SIGNED_INT8:
294 case CL_UNSIGNED_INT8:
295 return get_format_channel_count( format );
296
297 case CL_SNORM_INT16:
298 case CL_UNORM_INT16:
299 case CL_SIGNED_INT16:
300 case CL_UNSIGNED_INT16:
301 case CL_HALF_FLOAT:
302 #ifdef CL_SFIXED14_APPLE
303 case CL_SFIXED14_APPLE:
304 #endif
305 return get_format_channel_count( format ) * sizeof( cl_ushort );
306
307 case CL_SIGNED_INT32:
308 case CL_UNSIGNED_INT32:
309 return get_format_channel_count( format ) * sizeof( cl_int );
310
311 case CL_UNORM_SHORT_565:
312 case CL_UNORM_SHORT_555:
313 #ifdef OBSOLETE_FORAMT
314 case CL_UNORM_SHORT_565_REV:
315 case CL_UNORM_SHORT_555_REV:
316 #endif
317 return 2;
318
319 #ifdef OBSOLETE_FORAMT
320 case CL_UNORM_INT_8888:
321 case CL_UNORM_INT_8888_REV:
322 return 4;
323 #endif
324
325 case CL_UNORM_INT_101010:
326 #ifdef OBSOLETE_FORAMT
327 case CL_UNORM_INT_101010_REV:
328 #endif
329 return 4;
330
331 case CL_FLOAT:
332 return get_format_channel_count( format ) * sizeof( cl_float );
333
334 default:
335 return 0;
336 }
337 }
338
get_8_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)339 int get_8_bit_image_format( cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat )
340 {
341 cl_image_format formatList[ 128 ];
342 unsigned int outFormatCount, i;
343 int error;
344
345
346 /* Make sure each image format is supported */
347 if ((error = clGetSupportedImageFormats( context, flags, objType, 128, formatList, &outFormatCount )))
348 return error;
349
350
351 /* Look for one that is an 8-bit format */
352 for( i = 0; i < outFormatCount; i++ )
353 {
354 if( formatList[ i ].image_channel_data_type == CL_SNORM_INT8 ||
355 formatList[ i ].image_channel_data_type == CL_UNORM_INT8 ||
356 formatList[ i ].image_channel_data_type == CL_SIGNED_INT8 ||
357 formatList[ i ].image_channel_data_type == CL_UNSIGNED_INT8 )
358 {
359 if ( !channelCount || ( channelCount && ( get_format_channel_count( &formatList[ i ] ) == channelCount ) ) )
360 {
361 *outFormat = formatList[ i ];
362 return 0;
363 }
364 }
365 }
366
367 return -1;
368 }
369
get_32_bit_image_format(cl_context context,cl_mem_object_type objType,cl_mem_flags flags,size_t channelCount,cl_image_format * outFormat)370 int get_32_bit_image_format( cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat )
371 {
372 cl_image_format formatList[ 128 ];
373 unsigned int outFormatCount, i;
374 int error;
375
376
377 /* Make sure each image format is supported */
378 if ((error = clGetSupportedImageFormats( context, flags, objType, 128, formatList, &outFormatCount )))
379 return error;
380
381 /* Look for one that is an 8-bit format */
382 for( i = 0; i < outFormatCount; i++ )
383 {
384 if( formatList[ i ].image_channel_data_type == CL_UNORM_INT_101010 ||
385 formatList[ i ].image_channel_data_type == CL_FLOAT ||
386 formatList[ i ].image_channel_data_type == CL_SIGNED_INT32 ||
387 formatList[ i ].image_channel_data_type == CL_UNSIGNED_INT32 )
388 {
389 if ( !channelCount || ( channelCount && ( get_format_channel_count( &formatList[ i ] ) == channelCount ) ) )
390 {
391 *outFormat = formatList[ i ];
392 return 0;
393 }
394 }
395 }
396
397 return -1;
398 }
399
random_log_in_range(int minV,int maxV,MTdata d)400 int random_log_in_range( int minV, int maxV, MTdata d )
401 {
402 double v = log2( ( (double)genrand_int32(d) / (double)0xffffffff ) + 1 );
403 int iv = (int)( (float)( maxV - minV ) * v );
404 return iv + minV;
405 }
406
407
408 // Define the addressing functions
409 typedef int (*AddressFn)( int value, size_t maxValue );
410
NoAddressFn(int value,size_t maxValue)411 int NoAddressFn( int value, size_t maxValue ) { return value; }
RepeatAddressFn(int value,size_t maxValue)412 int RepeatAddressFn( int value, size_t maxValue )
413 {
414 if( value < 0 )
415 value += (int)maxValue;
416 else if( value >= (int)maxValue )
417 value -= (int)maxValue;
418 return value;
419 }
MirroredRepeatAddressFn(int value,size_t maxValue)420 int MirroredRepeatAddressFn( int value, size_t maxValue )
421 {
422 if( value < 0 )
423 value = 0;
424 else if( (size_t) value >= maxValue )
425 value = (int) (maxValue - 1);
426 return value;
427 }
ClampAddressFn(int value,size_t maxValue)428 int ClampAddressFn( int value, size_t maxValue ) { return ( value < -1 ) ? -1 : ( ( value > (cl_long) maxValue ) ? (int)maxValue : value ); }
ClampToEdgeNearestFn(int value,size_t maxValue)429 int ClampToEdgeNearestFn( int value, size_t maxValue ) { return ( value < 0 ) ? 0 : ( ( (size_t)value > maxValue - 1 ) ? (int)maxValue - 1 : value ); }
430 AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn;
431
432 // Note: normalized coords get repeated in normalized space, not unnormalized space! hence the special case here
433 volatile float gFloatHome;
RepeatNormalizedAddressFn(float fValue,size_t maxValue)434 float RepeatNormalizedAddressFn( float fValue, size_t maxValue )
435 {
436 #ifndef _MSC_VER // Use original if not the VS compiler.
437 // General computation for repeat
438 return (fValue - floorf( fValue )) * (float) maxValue; // Reduce to [0, 1.f]
439 #else // Otherwise, use this instead:
440 // Home the subtraction to a float to break up the sequence of x87
441 // instructions emitted by the VS compiler.
442 gFloatHome = fValue - floorf(fValue);
443 return gFloatHome * (float)maxValue;
444 #endif
445 }
446
MirroredRepeatNormalizedAddressFn(float fValue,size_t maxValue)447 float MirroredRepeatNormalizedAddressFn( float fValue, size_t maxValue )
448 {
449 // Round to nearest multiple of two
450 float s_prime = 2.0f * rintf( fValue * 0.5f ); // Note halfway values flip flop here due to rte, but they both end up pointing the same place at the end of the day
451
452 // Reduce to [-1, 1], Apply mirroring -> [0, 1]
453 s_prime = fabsf( fValue - s_prime );
454
455 // un-normalize
456 return s_prime * (float) maxValue;
457 }
458
459 struct AddressingTable
460 {
AddressingTableAddressingTable461 AddressingTable()
462 {
463 ct_assert( ( CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6 ) );
464 ct_assert( CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2 );
465
466 mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = NoAddressFn;
467 mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = NoAddressFn;
468 mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = RepeatAddressFn;
469 mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = RepeatAddressFn;
470 mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = ClampToEdgeNearestFn;
471 mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = ClampToEdgeLinearFn;
472 mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = ClampAddressFn;
473 mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = ClampAddressFn;
474 mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = MirroredRepeatAddressFn;
475 mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = MirroredRepeatAddressFn;
476 }
477
operator []AddressingTable478 AddressFn operator[]( image_sampler_data *sampler )
479 {
480 return mTable[ (int)sampler->addressing_mode - CL_ADDRESS_NONE ][ (int)sampler->filter_mode - CL_FILTER_NEAREST ];
481 }
482
483 AddressFn mTable[ 6 ][ 2 ];
484 };
485
486 static AddressingTable sAddressingTable;
487
is_sRGBA_order(cl_channel_order image_channel_order)488 bool is_sRGBA_order(cl_channel_order image_channel_order){
489 switch (image_channel_order) {
490 case CL_sRGB:
491 case CL_sRGBx:
492 case CL_sRGBA:
493 case CL_sBGRA:
494 return true;
495 default:
496 return false;
497 }
498 }
499
500 // Format helpers
501
has_alpha(cl_image_format * format)502 int has_alpha(cl_image_format *format) {
503 switch (format->image_channel_order) {
504 case CL_R:
505 return 0;
506 case CL_A:
507 return 1;
508 case CL_Rx:
509 return 0;
510 case CL_RG:
511 return 0;
512 case CL_RA:
513 return 1;
514 case CL_RGx:
515 return 0;
516 case CL_RGB:
517 case CL_sRGB:
518 return 0;
519 case CL_RGBx:
520 case CL_sRGBx:
521 return 0;
522 case CL_RGBA:
523 return 1;
524 case CL_BGRA:
525 return 1;
526 case CL_ARGB:
527 return 1;
528 case CL_INTENSITY:
529 return 1;
530 case CL_LUMINANCE:
531 return 0;
532 #ifdef CL_BGR1_APPLE
533 case CL_BGR1_APPLE: return 1;
534 #endif
535 #ifdef CL_1RGB_APPLE
536 case CL_1RGB_APPLE: return 1;
537 #endif
538 case CL_sRGBA:
539 case CL_sBGRA:
540 return 1;
541 case CL_DEPTH:
542 return 0;
543 default:
544 log_error("Invalid image channel order: %d\n", format->image_channel_order);
545 return 0;
546 }
547
548 }
549
550 #define PRINT_MAX_SIZE_LOGIC 0
551
552 #define SWAP( _a, _b ) do{ _a ^= _b; _b ^= _a; _a ^= _b; }while(0)
553 #ifndef MAX
554 #define MAX( _a, _b ) ((_a) > (_b) ? (_a) : (_b))
555 #endif
556
get_max_sizes(size_t * numberOfSizes,const int maxNumberOfSizes,size_t sizes[][3],size_t maxWidth,size_t maxHeight,size_t maxDepth,size_t maxArraySize,const cl_ulong maxIndividualAllocSize,const cl_ulong maxTotalAllocSize,cl_mem_object_type image_type,cl_image_format * format,int usingMaxPixelSizeBuffer)557 void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes,
558 size_t sizes[][3], size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize,
559 const cl_ulong maxIndividualAllocSize, // CL_DEVICE_MAX_MEM_ALLOC_SIZE
560 const cl_ulong maxTotalAllocSize, // CL_DEVICE_GLOBAL_MEM_SIZE
561 cl_mem_object_type image_type, cl_image_format *format, int usingMaxPixelSizeBuffer) {
562
563 bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D);
564 bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY);
565
566 // Validate we have a reasonable max depth for 3D
567 if (is3D && maxDepth < 2) {
568 log_error("ERROR: Requesting max image sizes for 3D images when max depth is < 2.\n");
569 *numberOfSizes = 0;
570 return;
571 }
572 // Validate we have a reasonable max array size for 1D & 2D image arrays
573 if (isArray && maxArraySize < 2) {
574 log_error("ERROR: Requesting max image sizes for an image array when max array size is < 1.\n");
575 *numberOfSizes = 0;
576 return;
577 }
578
579 // Reduce the maximum because we are trying to test the max image dimensions, not the memory allocation
580 cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4;
581 cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4;
582 log_info("Note: max individual allocation adjusted down from %gMB to %gMB and max total allocation adjusted down from %gMB to %gMB.\n",
583 maxIndividualAllocSize/(1024.0*1024.0), adjustedMaxIndividualAllocSize/(1024.0*1024.0),
584 maxTotalAllocSize/(1024.0*1024.0), adjustedMaxTotalAllocSize/(1024.0*1024.0));
585
586 // Cap our max allocation to 1.0GB.
587 // FIXME -- why? In the interest of not taking a long time? We should still test this stuff...
588 if (adjustedMaxTotalAllocSize > (cl_ulong)1024*1024*1024) {
589 adjustedMaxTotalAllocSize = (cl_ulong)1024*1024*1024;
590 log_info("Limiting max total allocation size to %gMB (down from %gMB) for test.\n",
591 adjustedMaxTotalAllocSize/(1024.0*1024.0), maxTotalAllocSize/(1024.0*1024.0));
592 }
593
594 cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize;
595 if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize*2)
596 maxAllocSize = adjustedMaxTotalAllocSize/2;
597
598 size_t raw_pixel_size = get_pixel_size(format);
599 // If the test will be creating input (src) buffer of type int4 or float4, number of pixels will be
600 // governed by sizeof(int4 or float4) and not sizeof(dest fomat)
601 // Also if pixel size is 12 bytes i.e. RGB or RGBx, we adjust it to 16 bytes as GPUs has no concept
602 // of 3 channel images. GPUs expand these to four channel RGBA.
603 if(usingMaxPixelSizeBuffer || raw_pixel_size == 12)
604 raw_pixel_size = 16;
605 size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size;
606
607 log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, per-allocation limit %gMB.\n",
608 maxWidth, maxHeight, isArray ? maxArraySize : maxDepth, raw_pixel_size, (maxAllocSize/(1024.0*1024.0)));
609
610 // Keep track of the maximum sizes for each dimension
611 size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth };
612
613 switch (image_type) {
614 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
615 maximum_sizes[1] = maxArraySize;
616 maximum_sizes[2] = 1;
617 break;
618 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
619 maximum_sizes[2] = maxArraySize;
620 break;
621 }
622
623
624 // Given one fixed sized dimension, this code finds one or two other dimensions,
625 // both with very small size, such that the size does not exceed the maximum
626 // passed to this function
627
628 #if defined(__x86_64) || defined (__arm64__) || defined (__ppc64__)
629 size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 10, 11, 13, 15};
630 #else
631 size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 11, 13};
632 #endif
633
634 static size_t other_size = 0;
635 enum { num_other_sizes = sizeof(other_sizes)/sizeof(size_t) };
636
637 (*numberOfSizes) = 0;
638
639 if (image_type == CL_MEM_OBJECT_IMAGE1D) {
640
641 double M = maximum_sizes[0];
642
643 // Store the size
644 sizes[(*numberOfSizes)][0] = (size_t)M;
645 sizes[(*numberOfSizes)][1] = 1;
646 sizes[(*numberOfSizes)][2] = 1;
647 ++(*numberOfSizes);
648 }
649
650 else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D) {
651
652 for (int fixed_dim=0;fixed_dim<2;++fixed_dim) {
653
654 // Determine the size of the fixed dimension
655 double M = maximum_sizes[fixed_dim];
656 double A = max_pixels;
657
658 int x0_dim = !fixed_dim;
659 double x0 = fmin(fmin(other_sizes[(other_size++)%num_other_sizes],A/M), maximum_sizes[x0_dim]);
660
661 // Store the size
662 sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
663 sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
664 sizes[(*numberOfSizes)][2] = 1;
665 ++(*numberOfSizes);
666 }
667 }
668
669 else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE3D) {
670
671 // Iterate over dimensions, finding sizes for the non-fixed dimension
672 for (int fixed_dim=0;fixed_dim<3;++fixed_dim) {
673
674 // Determine the size of the fixed dimension
675 double M = maximum_sizes[fixed_dim];
676 double A = max_pixels;
677
678 // Find two other dimensions, x0 and x1
679 int x0_dim = (fixed_dim == 0) ? 1 : 0;
680 int x1_dim = (fixed_dim == 2) ? 1 : 2;
681
682 // Choose two other sizes for these dimensions
683 double x0 = fmin(fmin(A/M,maximum_sizes[x0_dim]),other_sizes[(other_size++)%num_other_sizes]);
684 // GPUs have certain restrictions on minimum width (row alignment) of images which has given us issues
685 // testing small widths in this test (say we set width to 3 for testing, and compute size based on this width and decide
686 // it fits within vram ... but GPU driver decides that, due to row alignment requirements, it has to use
687 // width of 16 which doesnt fit in vram). For this purpose we are not testing width < 16 for this test.
688 if(x0_dim == 0 && x0 < 16)
689 x0 = 16;
690 double x1 = fmin(fmin(A/M/x0,maximum_sizes[x1_dim]),other_sizes[(other_size++)%num_other_sizes]);
691
692 // Valid image sizes cannot be below 1. Due to the workaround for the xo_dim where x0 is overidden to 16
693 // there might not be enough space left for x1 dimension. This could be a fractional 0.x size that when cast to
694 // integer would result in a value 0. In these cases we clamp the size to a minimum of 1.
695 if ( x1 < 1 )
696 x1 = 1;
697
698 // M and x0 cannot be '0' as they derive from clDeviceInfo calls
699 assert(x0 > 0 && M > 0);
700
701 // Store the size
702 sizes[(*numberOfSizes)][fixed_dim] = (size_t)M;
703 sizes[(*numberOfSizes)][x0_dim] = (size_t)x0;
704 sizes[(*numberOfSizes)][x1_dim] = (size_t)x1;
705 ++(*numberOfSizes);
706 }
707 }
708
709 // Log the results
710 for (int j=0; j<(int)(*numberOfSizes); j++) {
711 switch (image_type) {
712 case CL_MEM_OBJECT_IMAGE1D:
713 log_info(" size[%d] = [%ld] (%g MB image)\n",
714 j, sizes[j][0], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
715 break;
716 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
717 case CL_MEM_OBJECT_IMAGE2D:
718 log_info(" size[%d] = [%ld %ld] (%g MB image)\n",
719 j, sizes[j][0], sizes[j][1], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
720 break;
721 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
722 case CL_MEM_OBJECT_IMAGE3D:
723 log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n",
724 j, sizes[j][0], sizes[j][1], sizes[j][2], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0));
725 break;
726 }
727 }
728 }
729
get_max_absolute_error(cl_image_format * format,image_sampler_data * sampler)730 float get_max_absolute_error( cl_image_format *format, image_sampler_data *sampler) {
731 if (sampler->filter_mode == CL_FILTER_NEAREST)
732 return 0.0f;
733
734 switch (format->image_channel_data_type) {
735 case CL_SNORM_INT8:
736 return 1.0f/127.0f;
737 case CL_UNORM_INT8:
738 return 1.0f/255.0f;
739 case CL_UNORM_INT16:
740 return 1.0f/65535.0f;
741 case CL_SNORM_INT16:
742 return 1.0f/32767.0f;
743 case CL_FLOAT:
744 return CL_FLT_MIN;
745 #ifdef CL_SFIXED14_APPLE
746 case CL_SFIXED14_APPLE:
747 return 0x1.0p-14f;
748 #endif
749 default:
750 return 0.0f;
751 }
752 }
753
get_max_relative_error(cl_image_format * format,image_sampler_data * sampler,int is3D,int isLinearFilter)754 float get_max_relative_error( cl_image_format *format, image_sampler_data *sampler, int is3D, int isLinearFilter )
755 {
756 float maxError = 0.0f;
757 float sampleCount = 1.0f;
758 if( isLinearFilter )
759 sampleCount = is3D ? 8.0f : 4.0f;
760
761 // Note that the ULP is defined here as the unit in the last place of the maximum
762 // magnitude sample used for filtering.
763
764 // Section 8.3
765 switch( format->image_channel_data_type )
766 {
767 // The spec allows 2 ulps of error for normalized formats
768 case CL_SNORM_INT8:
769 case CL_UNORM_INT8:
770 case CL_SNORM_INT16:
771 case CL_UNORM_INT16:
772 case CL_UNORM_SHORT_565:
773 case CL_UNORM_SHORT_555:
774 case CL_UNORM_INT_101010:
775 maxError = 2*FLT_EPSILON*sampleCount; // Maximum sampling error for round to zero normalization based on multiplication
776 // by reciprocal (using reciprocal generated in round to +inf mode, so that 1.0 matches spec)
777 break;
778
779 // If the implementation supports these formats then it will have to allow rounding error here too,
780 // because not all 32-bit ints are exactly representable in float
781 case CL_SIGNED_INT32:
782 case CL_UNSIGNED_INT32:
783 maxError = 1*FLT_EPSILON;
784 break;
785 }
786
787
788 // Section 8.2
789 if( sampler->addressing_mode == CL_ADDRESS_REPEAT || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT || sampler->filter_mode != CL_FILTER_NEAREST || sampler->normalized_coords )
790 #if defined( __APPLE__ )
791 {
792 if( sampler->filter_mode != CL_FILTER_NEAREST )
793 {
794 // The maximum
795 if( gDeviceType == CL_DEVICE_TYPE_GPU )
796 maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4); // Some GPUs ain't so accurate
797 else
798 // The standard method of 2d linear filtering delivers 4.0 ulps of error in round to nearest (8 in rtz).
799 maxError += 4.0f * FLT_EPSILON;
800 }
801 else
802 maxError += 4.0f * FLT_EPSILON; // normalized coordinates will introduce some error into the fractional part of the address, affecting results
803 }
804 #else
805 {
806 #if !defined(_WIN32)
807 #warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec
808 #endif
809 // The spec allows linear filters to return any result most of the time.
810 // That's fine for implementations but a problem for testing. After all
811 // users aren't going to like garbage images. We have "picked a number"
812 // here that we are going to attempt to conform to. Implementations are
813 // free to pick another number, like infinity, if they like.
814 // We picked a number for you, to provide /some/ sanity
815 maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7);
816 // ...but this is what the spec allows:
817 // maxError = INFINITY;
818 // Please feel free to pick any positive number. (NaN wont work.)
819 }
820 #endif
821
822 // The error calculation itself can introduce error
823 maxError += FLT_EPSILON * 2;
824
825 return maxError;
826 }
827
get_format_max_int(cl_image_format * format)828 size_t get_format_max_int( cl_image_format *format )
829 {
830 switch( format->image_channel_data_type )
831 {
832 case CL_SNORM_INT8:
833 case CL_SIGNED_INT8:
834 return 127;
835 case CL_UNORM_INT8:
836 case CL_UNSIGNED_INT8:
837 return 255;
838
839 case CL_SNORM_INT16:
840 case CL_SIGNED_INT16:
841 return 32767;
842
843 case CL_UNORM_INT16:
844 case CL_UNSIGNED_INT16:
845 return 65535;
846
847 case CL_SIGNED_INT32:
848 return 2147483647L;
849
850 case CL_UNSIGNED_INT32:
851 return 4294967295LL;
852
853 case CL_UNORM_SHORT_565:
854 case CL_UNORM_SHORT_555:
855 return 31;
856
857 case CL_UNORM_INT_101010:
858 return 1023;
859
860 case CL_HALF_FLOAT:
861 return 1<<10;
862
863 #ifdef CL_SFIXED14_APPLE
864 case CL_SFIXED14_APPLE:
865 return 16384;
866 #endif
867 default:
868 return 0;
869 }
870 }
871
get_format_min_int(cl_image_format * format)872 int get_format_min_int( cl_image_format *format )
873 {
874 switch( format->image_channel_data_type )
875 {
876 case CL_SNORM_INT8:
877 case CL_SIGNED_INT8:
878 return -128;
879 case CL_UNORM_INT8:
880 case CL_UNSIGNED_INT8:
881 return 0;
882
883 case CL_SNORM_INT16:
884 case CL_SIGNED_INT16:
885 return -32768;
886
887 case CL_UNORM_INT16:
888 case CL_UNSIGNED_INT16:
889 return 0;
890
891 case CL_SIGNED_INT32:
892 return -2147483648LL;
893
894 case CL_UNSIGNED_INT32:
895 return 0;
896
897 case CL_UNORM_SHORT_565:
898 case CL_UNORM_SHORT_555:
899 case CL_UNORM_INT_101010:
900 return 0;
901
902 case CL_HALF_FLOAT: return -(1 << 10);
903
904 #ifdef CL_SFIXED14_APPLE
905 case CL_SFIXED14_APPLE:
906 return -16384;
907 #endif
908
909 default:
910 return 0;
911 }
912 }
913
convert_half_to_float(unsigned short halfValue)914 float convert_half_to_float( unsigned short halfValue )
915 {
916 // We have to take care of a few special cases, but in general, we just extract
917 // the same components from the half that exist in the float and re-stuff them
918 // For a description of the actual half format, see http://en.wikipedia.org/wiki/Half_precision
919 // Note: we store these in 32-bit ints to make the bit manipulations easier later
920 int sign = ( halfValue >> 15 ) & 0x0001;
921 int exponent = ( halfValue >> 10 ) & 0x001f;
922 int mantissa = ( halfValue ) & 0x03ff;
923
924 // Note: we use a union here to be able to access the bits of a float directly
925 union
926 {
927 unsigned int bits;
928 float floatValue;
929 } outFloat;
930
931 // Special cases first
932 if( exponent == 0 )
933 {
934 if( mantissa == 0 )
935 {
936 // If both exponent and mantissa are 0, the number is +/- 0
937 outFloat.bits = sign << 31;
938 return outFloat.floatValue; // Already done!
939 }
940
941 // If exponent is 0, it's a denormalized number, so we renormalize it
942 // Note: this is not terribly efficient, but oh well
943 while( ( mantissa & 0x00000400 ) == 0 )
944 {
945 mantissa <<= 1;
946 exponent--;
947 }
948
949 // The first bit is implicit, so we take it off and inc the exponent accordingly
950 exponent++;
951 mantissa &= ~(0x00000400);
952 }
953 else if( exponent == 31 ) // Special-case "numbers"
954 {
955 // If the exponent is 31, it's a special case number (+/- infinity or NAN).
956 // If the mantissa is 0, it's infinity, else it's NAN, but in either case, the packing
957 // method is the same
958 outFloat.bits = ( sign << 31 ) | 0x7f800000 | ( mantissa << 13 );
959 return outFloat.floatValue;
960 }
961
962 // Plain ol' normalized number, so adjust to the ranges a 32-bit float expects and repack
963 exponent += ( 127 - 15 );
964 mantissa <<= 13;
965
966 outFloat.bits = ( sign << 31 ) | ( exponent << 23 ) | mantissa;
967 return outFloat.floatValue;
968 }
969
970
971
convert_float_to_half(float f)972 cl_ushort convert_float_to_half( float f )
973 {
974 switch( gFloatToHalfRoundingMode )
975 {
976 case kRoundToNearestEven:
977 return float2half_rte( f );
978 case kRoundTowardZero:
979 return float2half_rtz( f );
980 default:
981 log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" );
982 exit(-1);
983 return 0xffff;
984 }
985
986 }
987
float2half_rte(float f)988 cl_ushort float2half_rte( float f )
989 {
990 union{ float f; cl_uint u; } u = {f};
991 cl_uint sign = (u.u >> 16) & 0x8000;
992 float x = fabsf(f);
993
994 //Nan
995 if( x != x )
996 {
997 u.u >>= (24-11);
998 u.u &= 0x7fff;
999 u.u |= 0x0200; //silence the NaN
1000 return u.u | sign;
1001 }
1002
1003 // overflow
1004 if( x >= MAKE_HEX_FLOAT(0x1.ffep15f, 0x1ffeL, 3) )
1005 return 0x7c00 | sign;
1006
1007 // underflow
1008 if( x <= MAKE_HEX_FLOAT(0x1.0p-25f, 0x1L, -25) )
1009 return sign; // The halfway case can return 0x0001 or 0. 0 is even.
1010
1011 // very small
1012 if( x < MAKE_HEX_FLOAT(0x1.8p-24f, 0x18L, -28) )
1013 return sign | 1;
1014
1015 // half denormal
1016 if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
1017 {
1018 u.f = x * MAKE_HEX_FLOAT(0x1.0p-125f, 0x1L, -125);
1019 return sign | u.u;
1020 }
1021
1022 u.f *= MAKE_HEX_FLOAT(0x1.0p13f, 0x1L, 13);
1023 u.u &= 0x7f800000;
1024 x += u.f;
1025 u.f = x - u.f;
1026 u.f *= MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);
1027
1028 return (u.u >> (24-11)) | sign;
1029 }
1030
float2half_rtz(float f)1031 cl_ushort float2half_rtz( float f )
1032 {
1033 union{ float f; cl_uint u; } u = {f};
1034 cl_uint sign = (u.u >> 16) & 0x8000;
1035 float x = fabsf(f);
1036
1037 //Nan
1038 if( x != x )
1039 {
1040 u.u >>= (24-11);
1041 u.u &= 0x7fff;
1042 u.u |= 0x0200; //silence the NaN
1043 return u.u | sign;
1044 }
1045
1046 // overflow
1047 if( x >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) )
1048 {
1049 if( x == INFINITY )
1050 return 0x7c00 | sign;
1051
1052 return 0x7bff | sign;
1053 }
1054
1055 // underflow
1056 if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) )
1057 return sign; // The halfway case can return 0x0001 or 0. 0 is even.
1058
1059 // half denormal
1060 if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) )
1061 {
1062 x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24);
1063 return (cl_ushort)((int) x | sign);
1064 }
1065
1066 u.u &= 0xFFFFE000U;
1067 u.u -= 0x38000000U;
1068
1069 return (u.u >> (24-11)) | sign;
1070 }
1071
1072 class TEST
1073 {
1074 public:
1075 TEST();
1076 };
1077
1078 static TEST t;
__vstore_half_rte(float f,size_t index,uint16_t * p)1079 void __vstore_half_rte(float f, size_t index, uint16_t *p)
1080 {
1081 union{ unsigned int u; float f;} u;
1082
1083 u.f = f;
1084 unsigned short r = (u.u >> 16) & 0x8000;
1085 u.u &= 0x7fffffff;
1086 if( u.u >= 0x33000000U )
1087 {
1088 if( u.u >= 0x47800000 )
1089 {
1090 if( u.u <= 0x7f800000 )
1091 r |= 0x7c00;
1092 else
1093 {
1094 r |= 0x7e00 | ( (u.u >> 13) & 0x3ff );
1095 }
1096 }
1097 else
1098 {
1099 float x = u.f;
1100 if( u.u < 0x38800000 )
1101 u.u = 0x3f000000;
1102 else
1103 u.u += 0x06800000;
1104 u.u &= 0x7f800000U;
1105 x += u.f;
1106 x -= u.f;
1107 u.f = x * MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112);
1108 u.u >>= 13;
1109 r |= (unsigned short) u.u;
1110 }
1111 }
1112
1113 ((unsigned short*)p)[index] = r;
1114 }
1115
TEST()1116 TEST::TEST()
1117 {
1118 return;
1119 union
1120 {
1121 float f;
1122 uint32_t i;
1123 } test;
1124 uint16_t control, myval;
1125
1126 log_info(" &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" );
1127 test.i = 0;
1128 do
1129 {
1130 if( ( test.i & 0xffffff ) == 0 )
1131 {
1132 if( ( test.i & 0xfffffff ) == 0 )
1133 log_info( "*" );
1134 else
1135 log_info( "." );
1136 fflush(stdout);
1137 }
1138 __vstore_half_rte( test.f, 0, &control );
1139 myval = convert_float_to_half( test.f );
1140 if( myval != control )
1141 {
1142 log_info( "\n******** ERROR: MyVal %04x control %04x source %12.24f\n", myval, control, test.f );
1143 log_info( " source bits: %08x %a\n", test.i, test.f );
1144 float t, c;
1145 c = convert_half_to_float( control );
1146 t = convert_half_to_float( myval );
1147 log_info( " converted control: %12.24f myval: %12.24f\n", c, t );
1148 }
1149 test.i++;
1150 } while( test.i != 0 );
1151 log_info("\n &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" );
1152
1153 }
1154
get_image_size(image_descriptor const * imageInfo)1155 cl_ulong get_image_size( image_descriptor const *imageInfo )
1156 {
1157 cl_ulong imageSize;
1158
1159 // Assumes rowPitch and slicePitch are always correctly defined
1160 if ( /*gTestMipmaps*/ imageInfo->num_mip_levels > 1 )
1161 {
1162 imageSize = (size_t) compute_mipmapped_image_size(*imageInfo);
1163 }
1164 else
1165 {
1166 switch (imageInfo->type)
1167 {
1168 case CL_MEM_OBJECT_IMAGE1D:
1169 imageSize = imageInfo->rowPitch;
1170 break;
1171 case CL_MEM_OBJECT_IMAGE2D:
1172 imageSize = imageInfo->height * imageInfo->rowPitch;
1173 break;
1174 case CL_MEM_OBJECT_IMAGE3D:
1175 imageSize = imageInfo->depth * imageInfo->slicePitch;
1176 break;
1177 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1178 imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1179 break;
1180 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1181 imageSize = imageInfo->arraySize * imageInfo->slicePitch;
1182 break;
1183 default:
1184 log_error("ERROR: Cannot identify image type %x\n", imageInfo->type);
1185 abort();
1186 }
1187 }
1188 return imageSize;
1189 }
1190
1191 // Calculate image size in megabytes (strictly, mebibytes). Result is rounded up.
get_image_size_mb(image_descriptor const * imageInfo)1192 cl_ulong get_image_size_mb( image_descriptor const *imageInfo )
1193 {
1194 cl_ulong imageSize = get_image_size( imageInfo );
1195 cl_ulong mb = imageSize / ( 1024 * 1024 );
1196 if ( imageSize % ( 1024 * 1024 ) > 0 )
1197 {
1198 mb += 1;
1199 }
1200 return mb;
1201 }
1202
1203
1204 uint64_t gRoundingStartValue = 0;
1205
1206
escape_inf_nan_values(char * data,size_t allocSize)1207 void escape_inf_nan_values( char* data, size_t allocSize ) {
1208 // filter values with 8 not-quite-highest bits
1209 unsigned int *intPtr = (unsigned int *)data;
1210 for( size_t i = 0; i < allocSize >> 2; i++ )
1211 {
1212 if( ( intPtr[ i ] & 0x7F800000 ) == 0x7F800000 )
1213 intPtr[ i ] ^= 0x40000000;
1214 }
1215
1216 // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits = 0x7C00 are special)
1217 unsigned short *shortPtr = (unsigned short *)data;
1218 for( size_t i = 0; i < allocSize >> 1; i++ )
1219 {
1220 if( ( shortPtr[ i ] & 0x7C00 ) == 0x7C00 )
1221 shortPtr[ i ] ^= 0x4000;
1222 }
1223 }
1224
generate_random_image_data(image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d)1225 char * generate_random_image_data( image_descriptor *imageInfo, BufferOwningPtr<char> &P, MTdata d )
1226 {
1227 size_t allocSize = get_image_size( imageInfo );
1228 size_t pixelRowBytes = imageInfo->width * get_pixel_size( imageInfo->format );
1229 size_t i;
1230
1231 if (imageInfo->num_mip_levels > 1)
1232 allocSize = compute_mipmapped_image_size(*imageInfo);
1233
1234 #if defined (__APPLE__ )
1235 char *data = NULL;
1236 if (gDeviceType == CL_DEVICE_TYPE_CPU) {
1237 size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192;
1238
1239 void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
1240 intptr_t data_end = (intptr_t)map + mapSize - 4096;
1241 data = (char *)(data_end - (intptr_t)allocSize);
1242
1243 mprotect(map, 4096, PROT_NONE);
1244 mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
1245 P.reset(data, map, mapSize,allocSize);
1246 } else {
1247 data = (char *)malloc(allocSize);
1248 P.reset(data,NULL,0,allocSize);
1249 }
1250 #else
1251 P.reset( NULL ); // Free already allocated memory first, then try to allocate new block.
1252 char *data = (char *)align_malloc(allocSize, get_pixel_size(imageInfo->format));
1253 P.reset(data,NULL,0,allocSize, true);
1254 #endif
1255
1256 if (data == NULL) {
1257 log_error( "ERROR: Unable to malloc %lu bytes for generate_random_image_data\n", allocSize );
1258 return 0;
1259 }
1260
1261 if( gTestRounding )
1262 {
1263 // Special case: fill with a ramp from 0 to the size of the type
1264 size_t typeSize = get_format_type_size( imageInfo->format );
1265 switch( typeSize )
1266 {
1267 case 1:
1268 {
1269 char *ptr = data;
1270 for( i = 0; i < allocSize; i++ )
1271 ptr[i] = (cl_char) (i + gRoundingStartValue);
1272 }
1273 break;
1274 case 2:
1275 {
1276 cl_short *ptr = (cl_short*) data;
1277 for( i = 0; i < allocSize / 2; i++ )
1278 ptr[i] = (cl_short) (i + gRoundingStartValue);
1279 }
1280 break;
1281 case 4:
1282 {
1283 cl_int *ptr = (cl_int*) data;
1284 for( i = 0; i < allocSize / 4; i++ )
1285 ptr[i] = (cl_int) (i + gRoundingStartValue);
1286 }
1287 break;
1288 }
1289
1290 // Note: inf or nan float values would cause problems, although we don't know this will
1291 // actually be a float, so we just know what to look for
1292 escape_inf_nan_values( data, allocSize );
1293 return data;
1294 }
1295
1296 // Otherwise, we should be able to just fill with random bits no matter what
1297 cl_uint *p = (cl_uint*) data;
1298 for( i = 0; i + 4 <= allocSize; i += 4 )
1299 p[ i / 4 ] = genrand_int32(d);
1300
1301 for( ; i < allocSize; i++ )
1302 data[i] = genrand_int32(d);
1303
1304 // Note: inf or nan float values would cause problems, although we don't know this will
1305 // actually be a float, so we just know what to look for
1306 escape_inf_nan_values( data, allocSize );
1307
1308 if ( /*!gTestMipmaps*/ imageInfo->num_mip_levels < 2 )
1309 {
1310 // Fill unused edges with -1, NaN for float
1311 if (imageInfo->rowPitch > pixelRowBytes)
1312 {
1313 size_t height = 0;
1314
1315 switch (imageInfo->type)
1316 {
1317 case CL_MEM_OBJECT_IMAGE2D:
1318 case CL_MEM_OBJECT_IMAGE3D:
1319 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1320 height = imageInfo->height;
1321 break;
1322 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1323 height = imageInfo->arraySize;
1324 break;
1325 }
1326
1327 // Fill in the row padding regions
1328 for( i = 0; i < height; i++ )
1329 {
1330 size_t offset = i * imageInfo->rowPitch + pixelRowBytes;
1331 size_t length = imageInfo->rowPitch - pixelRowBytes;
1332 memset( data + offset, 0xff, length );
1333 }
1334 }
1335
1336 // Fill in the slice padding regions, if necessary:
1337
1338 size_t slice_dimension = imageInfo->height;
1339 if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
1340 slice_dimension = imageInfo->arraySize;
1341 }
1342
1343 if (imageInfo->slicePitch > slice_dimension*imageInfo->rowPitch)
1344 {
1345 size_t depth = 0;
1346 switch (imageInfo->type)
1347 {
1348 case CL_MEM_OBJECT_IMAGE2D:
1349 case CL_MEM_OBJECT_IMAGE3D:
1350 depth = imageInfo->depth;
1351 break;
1352 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1353 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1354 depth = imageInfo->arraySize;
1355 break;
1356 }
1357
1358 for( i = 0; i < depth; i++ )
1359 {
1360 size_t offset = i * imageInfo->slicePitch + slice_dimension*imageInfo->rowPitch;
1361 size_t length = imageInfo->slicePitch - slice_dimension*imageInfo->rowPitch;
1362 memset( data + offset, 0xff, length );
1363 }
1364 }
1365 }
1366
1367 return data;
1368 }
1369
1370 #define CLAMP_FLOAT( v ) ( fmaxf( fminf( v, 1.f ), -1.f ) )
1371
1372
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData,int lod)1373 void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1374 int x, int y, int z, float *outData, int lod )
1375 {
1376 size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth;
1377 size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1378
1379 if ( imageInfo->num_mip_levels > 1 )
1380 {
1381 switch(imageInfo->type)
1382 {
1383 case CL_MEM_OBJECT_IMAGE3D :
1384 depth_lod = ( imageInfo->depth >> lod ) ? ( imageInfo->depth >> lod ) : 1;
1385 case CL_MEM_OBJECT_IMAGE2D :
1386 case CL_MEM_OBJECT_IMAGE2D_ARRAY :
1387 height_lod = ( imageInfo->height >> lod ) ? ( imageInfo->height >> lod ) : 1;
1388 default :
1389 width_lod = ( imageInfo->width >> lod ) ? ( imageInfo->width >> lod ) : 1;
1390 }
1391 row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1392 if ( imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY )
1393 slice_pitch_lod = row_pitch_lod;
1394 else if ( imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1395 slice_pitch_lod = row_pitch_lod * height_lod;
1396 }
1397 else
1398 {
1399 row_pitch_lod = imageInfo->rowPitch;
1400 slice_pitch_lod = imageInfo->slicePitch;
1401 }
1402 if ( x < 0 || y < 0 || z < 0 || x >= (int)width_lod
1403 || ( height_lod != 0 && y >= (int)height_lod )
1404 || ( depth_lod != 0 && z >= (int)depth_lod )
1405 || ( imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize ) )
1406 {
1407 outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = outData[ 3 ] = 0;
1408 if (!has_alpha(imageInfo->format))
1409 outData[3] = 1;
1410 return;
1411 }
1412
1413 cl_image_format *format = imageInfo->format;
1414
1415 unsigned int i;
1416 float tempData[ 4 ];
1417
1418 // Advance to the right spot
1419 char *ptr = (char *)imageData;
1420 size_t pixelSize = get_pixel_size( format );
1421
1422 ptr += z * slice_pitch_lod + y * row_pitch_lod + x * pixelSize;
1423
1424 // OpenCL only supports reading floats from certain formats
1425 size_t channelCount = get_format_channel_count( format );
1426 switch( format->image_channel_data_type )
1427 {
1428 case CL_SNORM_INT8:
1429 {
1430 cl_char *dPtr = (cl_char *)ptr;
1431 for( i = 0; i < channelCount; i++ )
1432 tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 127.0f );
1433 break;
1434 }
1435
1436 case CL_UNORM_INT8:
1437 {
1438 unsigned char *dPtr = (unsigned char *)ptr;
1439 for( i = 0; i < channelCount; i++ ) {
1440 if((is_sRGBA_order(imageInfo->format->image_channel_order)) && i<3) // only RGB need to be converted for sRGBA
1441 tempData[ i ] = (float)sRGBunmap((float)dPtr[ i ] / 255.0f) ;
1442 else
1443 tempData[ i ] = (float)dPtr[ i ] / 255.0f;
1444 }
1445 break;
1446 }
1447
1448 case CL_SIGNED_INT8:
1449 {
1450 cl_char *dPtr = (cl_char *)ptr;
1451 for( i = 0; i < channelCount; i++ )
1452 tempData[ i ] = (float)dPtr[ i ];
1453 break;
1454 }
1455
1456 case CL_UNSIGNED_INT8:
1457 {
1458 cl_uchar *dPtr = (cl_uchar *)ptr;
1459 for( i = 0; i < channelCount; i++ )
1460 tempData[ i ] = (float) dPtr[ i ];
1461 break;
1462 }
1463
1464 case CL_SNORM_INT16:
1465 {
1466 cl_short *dPtr = (cl_short *)ptr;
1467 for( i = 0; i < channelCount; i++ )
1468 tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 32767.0f );
1469 break;
1470 }
1471
1472 case CL_UNORM_INT16:
1473 {
1474 cl_ushort *dPtr = (cl_ushort *)ptr;
1475 for( i = 0; i < channelCount; i++ )
1476 tempData[ i ] = (float)dPtr[ i ] / 65535.0f;
1477 break;
1478 }
1479
1480 case CL_SIGNED_INT16:
1481 {
1482 cl_short *dPtr = (cl_short *)ptr;
1483 for( i = 0; i < channelCount; i++ )
1484 tempData[ i ] = (float)dPtr[ i ];
1485 break;
1486 }
1487
1488 case CL_UNSIGNED_INT16:
1489 {
1490 cl_ushort *dPtr = (cl_ushort *)ptr;
1491 for( i = 0; i < channelCount; i++ )
1492 tempData[ i ] = (float) dPtr[ i ];
1493 break;
1494 }
1495
1496 case CL_HALF_FLOAT:
1497 {
1498 cl_ushort *dPtr = (cl_ushort *)ptr;
1499 for( i = 0; i < channelCount; i++ )
1500 tempData[ i ] = convert_half_to_float( dPtr[ i ] );
1501 break;
1502 }
1503
1504 case CL_SIGNED_INT32:
1505 {
1506 cl_int *dPtr = (cl_int *)ptr;
1507 for( i = 0; i < channelCount; i++ )
1508 tempData[ i ] = (float)dPtr[ i ];
1509 break;
1510 }
1511
1512 case CL_UNSIGNED_INT32:
1513 {
1514 cl_uint *dPtr = (cl_uint *)ptr;
1515 for( i = 0; i < channelCount; i++ )
1516 tempData[ i ] = (float)dPtr[ i ];
1517 break;
1518 }
1519
1520 case CL_UNORM_SHORT_565:
1521 {
1522 cl_ushort *dPtr = (cl_ushort *)ptr;
1523 tempData[ 0 ] = (float)( dPtr[ 0 ] >> 11 ) / (float)31;
1524 tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 63 ) / (float)63;
1525 tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31;
1526 break;
1527 }
1528
1529 case CL_UNORM_SHORT_555:
1530 {
1531 cl_ushort *dPtr = (cl_ushort *)ptr;
1532 tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 31 ) / (float)31;
1533 tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 31 ) / (float)31;
1534 tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31;
1535 break;
1536 }
1537
1538 case CL_UNORM_INT_101010:
1539 {
1540 cl_uint *dPtr = (cl_uint *)ptr;
1541 tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 20 ) & 0x3ff ) / (float)1023;
1542 tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 0x3ff ) / (float)1023;
1543 tempData[ 2 ] = (float)( dPtr[ 0 ] & 0x3ff ) / (float)1023;
1544 break;
1545 }
1546
1547 case CL_FLOAT:
1548 {
1549 float *dPtr = (float *)ptr;
1550 for( i = 0; i < channelCount; i++ )
1551 tempData[ i ] = (float)dPtr[ i ];
1552 break;
1553 }
1554 #ifdef CL_SFIXED14_APPLE
1555 case CL_SFIXED14_APPLE:
1556 {
1557 cl_ushort *dPtr = (cl_ushort*) ptr;
1558 for( i = 0; i < channelCount; i++ )
1559 tempData[i] = ((int) dPtr[i] - 16384) * 0x1.0p-14f;
1560 break;
1561 }
1562 #endif
1563 }
1564
1565
1566 outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = 0;
1567 outData[ 3 ] = 1;
1568
1569 switch( format->image_channel_order )
1570 {
1571 case CL_A:
1572 outData[ 3 ] = tempData[ 0 ];
1573 break;
1574 case CL_R:
1575 case CL_Rx:
1576 outData[ 0 ] = tempData[ 0 ];
1577 break;
1578 case CL_RA:
1579 outData[ 0 ] = tempData[ 0 ];
1580 outData[ 3 ] = tempData[ 1 ];
1581 break;
1582 case CL_RG:
1583 case CL_RGx:
1584 outData[ 0 ] = tempData[ 0 ];
1585 outData[ 1 ] = tempData[ 1 ];
1586 break;
1587 case CL_RGB:
1588 case CL_RGBx:
1589 case CL_sRGB:
1590 case CL_sRGBx:
1591 outData[ 0 ] = tempData[ 0 ];
1592 outData[ 1 ] = tempData[ 1 ];
1593 outData[ 2 ] = tempData[ 2 ];
1594 break;
1595 case CL_RGBA:
1596 outData[ 0 ] = tempData[ 0 ];
1597 outData[ 1 ] = tempData[ 1 ];
1598 outData[ 2 ] = tempData[ 2 ];
1599 outData[ 3 ] = tempData[ 3 ];
1600 break;
1601 case CL_ARGB:
1602 outData[ 0 ] = tempData[ 1 ];
1603 outData[ 1 ] = tempData[ 2 ];
1604 outData[ 2 ] = tempData[ 3 ];
1605 outData[ 3 ] = tempData[ 0 ];
1606 break;
1607 case CL_BGRA:
1608 case CL_sBGRA:
1609 outData[ 0 ] = tempData[ 2 ];
1610 outData[ 1 ] = tempData[ 1 ];
1611 outData[ 2 ] = tempData[ 0 ];
1612 outData[ 3 ] = tempData[ 3 ];
1613 break;
1614 case CL_INTENSITY:
1615 outData[ 0 ] = tempData[ 0 ];
1616 outData[ 1 ] = tempData[ 0 ];
1617 outData[ 2 ] = tempData[ 0 ];
1618 outData[ 3 ] = tempData[ 0 ];
1619 break;
1620 case CL_LUMINANCE:
1621 outData[ 0 ] = tempData[ 0 ];
1622 outData[ 1 ] = tempData[ 0 ];
1623 outData[ 2 ] = tempData[ 0 ];
1624 break;
1625 #ifdef CL_1RGB_APPLE
1626 case CL_1RGB_APPLE:
1627 outData[ 0 ] = tempData[ 1 ];
1628 outData[ 1 ] = tempData[ 2 ];
1629 outData[ 2 ] = tempData[ 3 ];
1630 outData[ 3 ] = 1.0f;
1631 break;
1632 #endif
1633 #ifdef CL_BGR1_APPLE
1634 case CL_BGR1_APPLE:
1635 outData[ 0 ] = tempData[ 2 ];
1636 outData[ 1 ] = tempData[ 1 ];
1637 outData[ 2 ] = tempData[ 0 ];
1638 outData[ 3 ] = 1.0f;
1639 break;
1640 #endif
1641 case CL_sRGBA:
1642 outData[ 0 ] = tempData[ 0 ];
1643 outData[ 1 ] = tempData[ 1 ];
1644 outData[ 2 ] = tempData[ 2 ];
1645 outData[ 3 ] = tempData[ 3 ];
1646 break;
1647 case CL_DEPTH:
1648 outData[ 0 ] = tempData[ 0 ];
1649 break;
1650 default:
1651 log_error("Invalid format:");
1652 print_header(format, true);
1653 break;
1654 }
1655 }
1656
read_image_pixel_float(void * imageData,image_descriptor * imageInfo,int x,int y,int z,float * outData)1657 void read_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1658 int x, int y, int z, float *outData )
1659 {
1660 read_image_pixel_float( imageData, imageInfo, x, y, z, outData, 0 );
1661 }
1662
get_integer_coords(float x,float y,float z,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1663 bool get_integer_coords( float x, float y, float z, size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ ) {
1664 return get_integer_coords_offset(x, y, z, 0.0f, 0.0f, 0.0f, width, height, depth, imageSampler, imageInfo, outX, outY, outZ);
1665 }
1666
get_integer_coords_offset(float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,size_t width,size_t height,size_t depth,image_sampler_data * imageSampler,image_descriptor * imageInfo,int & outX,int & outY,int & outZ)1667 bool get_integer_coords_offset( float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
1668 size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ )
1669 {
1670 AddressFn adFn = sAddressingTable[ imageSampler ];
1671
1672 float refX = floorf( x ), refY = floorf( y ), refZ = floorf( z );
1673
1674 // Handle sampler-directed coordinate normalization + clamping. Note that
1675 // the array coordinate for image array types is expected to be
1676 // unnormalized, and is clamped to 0..arraySize-1.
1677 if( imageSampler->normalized_coords )
1678 {
1679 switch (imageSampler->addressing_mode)
1680 {
1681 case CL_ADDRESS_REPEAT:
1682 x = RepeatNormalizedAddressFn( x, width );
1683 if (height != 0) {
1684 if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1685 y = RepeatNormalizedAddressFn( y, height );
1686 }
1687 if (depth != 0) {
1688 if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1689 z = RepeatNormalizedAddressFn( z, depth );
1690 }
1691
1692 if (xAddressOffset != 0.0) {
1693 // Add in the offset
1694 x += xAddressOffset;
1695 // Handle wrapping
1696 if (x > width)
1697 x -= (float)width;
1698 if (x < 0)
1699 x += (float)width;
1700 }
1701 if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) {
1702 // Add in the offset
1703 y += yAddressOffset;
1704 // Handle wrapping
1705 if (y > height)
1706 y -= (float)height;
1707 if (y < 0)
1708 y += (float)height;
1709 }
1710 if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) ) {
1711 // Add in the offset
1712 z += zAddressOffset;
1713 // Handle wrapping
1714 if (z > depth)
1715 z -= (float)depth;
1716 if (z < 0)
1717 z += (float)depth;
1718 }
1719 break;
1720
1721 case CL_ADDRESS_MIRRORED_REPEAT:
1722 x = MirroredRepeatNormalizedAddressFn( x, width );
1723 if (height != 0) {
1724 if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1725 y = MirroredRepeatNormalizedAddressFn( y, height );
1726 }
1727 if (depth != 0) {
1728 if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1729 z = MirroredRepeatNormalizedAddressFn( z, depth );
1730 }
1731
1732 if (xAddressOffset != 0.0)
1733 {
1734 float temp = x + xAddressOffset;
1735 if( temp > (float) width )
1736 temp = (float) width - (temp - (float) width );
1737 x = fabsf( temp );
1738 }
1739 if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) {
1740 float temp = y + yAddressOffset;
1741 if( temp > (float) height )
1742 temp = (float) height - (temp - (float) height );
1743 y = fabsf( temp );
1744 }
1745 if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) ) {
1746 float temp = z + zAddressOffset;
1747 if( temp > (float) depth )
1748 temp = (float) depth - (temp - (float) depth );
1749 z = fabsf( temp );
1750 }
1751 break;
1752
1753 default:
1754 // Also, remultiply to the original coords. This simulates any truncation in
1755 // the pass to OpenCL
1756 x *= (float)width;
1757 x += xAddressOffset;
1758
1759 if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY)
1760 {
1761 y *= (float)height;
1762 y += yAddressOffset;
1763 }
1764
1765 if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY)
1766 {
1767 z *= (float)depth;
1768 z += zAddressOffset;
1769 }
1770 break;
1771 }
1772 }
1773
1774 // At this point, we're dealing with non-normalized coordinates.
1775
1776 outX = adFn( floorf( x ), width );
1777
1778 // 1D and 2D arrays require special care for the index coordinate:
1779
1780 switch (imageInfo->type) {
1781 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1782 outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f);
1783 outZ = 0.0f; /* don't care! */
1784 break;
1785 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1786 outY = adFn( floorf( y ), height );
1787 outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f);
1788 break;
1789 default:
1790 // legacy path:
1791 if (height != 0)
1792 outY = adFn( floorf( y ), height );
1793 if( depth != 0 )
1794 outZ = adFn( floorf( z ), depth );
1795 }
1796
1797 return !( (int)refX == outX && (int)refY == outY && (int)refZ == outZ );
1798 }
1799
frac(float a)1800 static float frac(float a) {
1801 return a - floorf(a);
1802 }
1803
1804 static inline void pixelMax( const float a[4], const float b[4], float *results );
pixelMax(const float a[4],const float b[4],float * results)1805 static inline void pixelMax( const float a[4], const float b[4], float *results )
1806 {
1807 for( int i = 0; i < 4; i++ )
1808 results[i] = errMax( fabsf(a[i]), fabsf(b[i]) );
1809 }
1810
1811 // If containsDenorms is NULL, flush denorms to zero
1812 // if containsDenorms is not NULL, record whether there are any denorms
1813 static inline void check_for_denorms(float a[4], int *containsDenorms );
check_for_denorms(float a[4],int * containsDenorms)1814 static inline void check_for_denorms(float a[4], int *containsDenorms )
1815 {
1816 if( NULL == containsDenorms )
1817 {
1818 for( int i = 0; i < 4; i++ )
1819 {
1820 if( IsFloatSubnormal( a[i] ) )
1821 a[i] = copysignf( 0.0f, a[i] );
1822 }
1823 }
1824 else
1825 {
1826 for( int i = 0; i < 4; i++ )
1827 {
1828 if( IsFloatSubnormal( a[i] ) )
1829 {
1830 *containsDenorms = 1;
1831 break;
1832 }
1833 }
1834 }
1835 }
1836
calculate_array_index(float coord,float extent)1837 inline float calculate_array_index( float coord, float extent ) {
1838 // from Section 8.4 of the 1.2 Spec 'Selecting an Image from an Image Array'
1839 //
1840 // given coordinate 'w' that represents an index:
1841 // layer_index = clamp( rint(w), 0, image_array_size - 1)
1842
1843 float ret = rintf( coord );
1844 ret = ret > extent ? extent : ret;
1845 ret = ret < 0.0f ? 0.0f : ret;
1846
1847 return ret;
1848 }
1849
1850 /*
1851 * Utility function to unnormalized a coordinate given a particular sampler.
1852 *
1853 * name - the name of the coordinate, used for verbose debugging only
1854 * coord - the coordinate requiring unnormalization
1855 * offset - an addressing offset to be added to the coordinate
1856 * extent - the max value for this coordinate (e.g. width for x)
1857 */
unnormalize_coordinate(const char * name,float coord,float offset,float extent,cl_addressing_mode addressing_mode,int verbose)1858 static float unnormalize_coordinate( const char* name, float coord,
1859 float offset, float extent, cl_addressing_mode addressing_mode, int verbose )
1860 {
1861 float ret = 0.0f;
1862
1863 switch (addressing_mode) {
1864 case CL_ADDRESS_REPEAT:
1865 ret = RepeatNormalizedAddressFn( coord, extent );
1866
1867 if ( verbose ) {
1868 log_info( "\tRepeat filter denormalizes %s (%f) to %f\n",
1869 name, coord, ret );
1870 }
1871
1872 if (offset != 0.0) {
1873 // Add in the offset, and handle wrapping.
1874 ret += offset;
1875 if (ret > extent) ret -= extent;
1876 if (ret < 0.0) ret += extent;
1877 }
1878
1879 if (verbose && offset != 0.0f) {
1880 log_info( "\tAddress offset of %f added to get %f\n", offset, ret );
1881 }
1882 break;
1883
1884 case CL_ADDRESS_MIRRORED_REPEAT:
1885 ret = MirroredRepeatNormalizedAddressFn( coord, extent );
1886
1887 if ( verbose ) {
1888 log_info( "\tMirrored repeat filter denormalizes %s (%f) to %f\n",
1889 name, coord, ret );
1890 }
1891
1892 if (offset != 0.0) {
1893 float temp = ret + offset;
1894 if( temp > extent )
1895 temp = extent - (temp - extent );
1896 ret = fabsf( temp );
1897 }
1898
1899 if (verbose && offset != 0.0f) {
1900 log_info( "\tAddress offset of %f added to get %f\n", offset, ret );
1901 }
1902 break;
1903
1904 default:
1905
1906 ret = coord * extent;
1907
1908 if ( verbose ) {
1909 log_info( "\tFilter denormalizes %s to %f (%f * %f)\n",
1910 name, ret, coord, extent);
1911 }
1912
1913 ret += offset;
1914
1915 if (verbose && offset != 0.0f) {
1916 log_info( "\tAddress offset of %f added to get %f\n", offset, ret );
1917 }
1918 }
1919
1920 return ret;
1921 }
1922
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)1923 FloatPixel sample_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1924 float x, float y, float z,
1925 image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms ) {
1926 return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms);
1927 }
1928
1929 // returns max pixel value of the pixels touched
sample_image_pixel_float(void * imageData,image_descriptor * imageInfo,float x,float y,float z,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1930 FloatPixel sample_image_pixel_float( void *imageData, image_descriptor *imageInfo,
1931 float x, float y, float z,
1932 image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms , int lod) {
1933 return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms, lod);
1934 }
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms,int lod)1935 FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo,
1936 float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
1937 image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms , int lod)
1938 {
1939 AddressFn adFn = sAddressingTable[ imageSampler ];
1940 FloatPixel returnVal;
1941 size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth;
1942 size_t slice_pitch_lod = 0, row_pitch_lod = 0;
1943
1944 if ( imageInfo->num_mip_levels > 1 )
1945 {
1946 switch(imageInfo->type)
1947 {
1948 case CL_MEM_OBJECT_IMAGE3D :
1949 depth_lod = ( imageInfo->depth >> lod ) ? ( imageInfo->depth >> lod ) : 1;
1950 case CL_MEM_OBJECT_IMAGE2D :
1951 case CL_MEM_OBJECT_IMAGE2D_ARRAY :
1952 height_lod = ( imageInfo->height >> lod ) ? ( imageInfo->height >> lod ) : 1;
1953 default :
1954 width_lod = ( imageInfo->width >> lod ) ? ( imageInfo->width >> lod ) : 1;
1955 }
1956 row_pitch_lod = width_lod * get_pixel_size(imageInfo->format);
1957 if ( imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY )
1958 slice_pitch_lod = row_pitch_lod;
1959 else if ( imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
1960 slice_pitch_lod = row_pitch_lod * height_lod;
1961 }
1962 else
1963 {
1964 slice_pitch_lod = imageInfo->slicePitch;
1965 row_pitch_lod = imageInfo->rowPitch;
1966 }
1967
1968 if( containsDenorms )
1969 *containsDenorms = 0;
1970
1971 if( imageSampler->normalized_coords ) {
1972
1973 // We need to unnormalize our coordinates differently depending on
1974 // the image type, but 'x' is always processed the same way.
1975
1976 x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod,
1977 imageSampler->addressing_mode, verbose);
1978
1979 switch (imageInfo->type) {
1980
1981 // The image array types require special care:
1982
1983 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
1984 z = 0; // don't care -- unused for 1D arrays
1985 break;
1986
1987 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
1988 y = unnormalize_coordinate("y", y, yAddressOffset, (float)height_lod,
1989 imageSampler->addressing_mode, verbose);
1990 break;
1991
1992 // Everybody else:
1993
1994 default:
1995 y = unnormalize_coordinate("y", y, yAddressOffset, (float)height_lod,
1996 imageSampler->addressing_mode, verbose);
1997 z = unnormalize_coordinate("z", z, zAddressOffset, (float)depth_lod,
1998 imageSampler->addressing_mode, verbose);
1999 }
2000
2001 } else if ( verbose ) {
2002
2003 switch (imageInfo->type) {
2004 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2005 log_info("Starting coordinate: %f, array index %f\n", x, y);
2006 break;
2007 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2008 log_info("Starting coordinate: %f, %f, array index %f\n", x, y, z);
2009 break;
2010 case CL_MEM_OBJECT_IMAGE1D:
2011 case CL_MEM_OBJECT_IMAGE1D_BUFFER:
2012 log_info("Starting coordinate: %f\b", x);
2013 break;
2014 case CL_MEM_OBJECT_IMAGE2D:
2015 log_info("Starting coordinate: %f, %f\n", x, y);
2016 break;
2017 case CL_MEM_OBJECT_IMAGE3D:
2018 default:
2019 log_info("Starting coordinate: %f, %f, %f\n", x, y, z);
2020 }
2021 }
2022
2023 // At this point, we have unnormalized coordinates.
2024
2025 if( imageSampler->filter_mode == CL_FILTER_NEAREST )
2026 {
2027 int ix, iy, iz;
2028
2029 // We apply the addressing function to the now-unnormalized
2030 // coordinates. Note that the array cases again require special
2031 // care, per section 8.4 in the OpenCL 1.2 Specification.
2032
2033 ix = adFn( floorf( x ), width_lod );
2034
2035 switch (imageInfo->type) {
2036 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2037 iy = calculate_array_index( y, (float)(imageInfo->arraySize - 1) );
2038 iz = 0;
2039 if( verbose ) {
2040 log_info("\tArray index %f evaluates to %d\n",y, iy );
2041 }
2042 break;
2043 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2044 iy = adFn( floorf( y ), height_lod );
2045 iz = calculate_array_index( z, (float)(imageInfo->arraySize - 1) );
2046 if( verbose ) {
2047 log_info("\tArray index %f evaluates to %d\n",z, iz );
2048 }
2049 break;
2050 default:
2051 iy = adFn( floorf( y ), height_lod );
2052 if( depth_lod != 0 )
2053 iz = adFn( floorf( z ), depth_lod );
2054 else
2055 iz = 0;
2056 }
2057
2058 if( verbose ) {
2059 if( iz )
2060 log_info( "\tReference integer coords calculated: { %d, %d, %d }\n", ix, iy, iz );
2061 else
2062 log_info( "\tReference integer coords calculated: { %d, %d }\n", ix, iy );
2063 }
2064
2065 read_image_pixel_float( imageData, imageInfo, ix, iy, iz, outData, lod );
2066 check_for_denorms( outData, containsDenorms );
2067 for( int i = 0; i < 4; i++ )
2068 returnVal.p[i] = fabsf( outData[i] );
2069 return returnVal;
2070 }
2071 else
2072 {
2073 // Linear filtering cases.
2074
2075 size_t width = width_lod, height = height_lod, depth = depth_lod;
2076
2077 // Image arrays can use 2D filtering, but require us to walk into the
2078 // image a certain number of slices before reading.
2079
2080 if( depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY ||
2081 imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
2082 {
2083 float array_index = 0;
2084
2085 size_t layer_offset = 0;
2086
2087 if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
2088 array_index = calculate_array_index(z, (float)(imageInfo->arraySize - 1));
2089 layer_offset = slice_pitch_lod * (size_t)array_index;
2090 }
2091 else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
2092 array_index = calculate_array_index(y, (float)(imageInfo->arraySize - 1));
2093 layer_offset = slice_pitch_lod * (size_t)array_index;
2094
2095 // Set up y and height so that the filtering below is correct
2096 // 1D filtering on a single slice.
2097 height = 1;
2098 }
2099
2100 int x1 = adFn( floorf( x - 0.5f ), width );
2101 int y1 = 0;
2102 int x2 = adFn( floorf( x - 0.5f ) + 1, width );
2103 int y2 = 0;
2104 if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D) &&
2105 (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) &&
2106 (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
2107 y1 = adFn( floorf( y - 0.5f ), height );
2108 y2 = adFn( floorf( y - 0.5f ) + 1, height );
2109 } else {
2110 y = 0.5f;
2111 }
2112
2113 if( verbose ) {
2114 log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{ %d, %d } and i1:{ %d, %d }\n", x1, y1, x2, y2 );
2115 log_info( "\tArray coordinate is %f\n", array_index);
2116 }
2117
2118 // Walk to beginning of the 'correct' slice, if needed.
2119 char* imgPtr = ((char*)imageData) + layer_offset;
2120
2121 float upLeft[ 4 ], upRight[ 4 ], lowLeft[ 4 ], lowRight[ 4 ];
2122 float maxUp[4], maxLow[4];
2123 read_image_pixel_float( imgPtr, imageInfo, x1, y1, 0, upLeft, lod );
2124 read_image_pixel_float( imgPtr, imageInfo, x2, y1, 0, upRight, lod );
2125 check_for_denorms( upLeft, containsDenorms );
2126 check_for_denorms( upRight, containsDenorms );
2127 pixelMax( upLeft, upRight, maxUp );
2128 read_image_pixel_float( imgPtr, imageInfo, x1, y2, 0, lowLeft, lod );
2129 read_image_pixel_float( imgPtr, imageInfo, x2, y2, 0, lowRight, lod );
2130 check_for_denorms( lowLeft, containsDenorms );
2131 check_for_denorms( lowRight, containsDenorms );
2132 pixelMax( lowLeft, lowRight, maxLow );
2133 pixelMax( maxUp, maxLow, returnVal.p );
2134
2135 if( verbose )
2136 {
2137 if( NULL == containsDenorms )
2138 log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" );
2139 else
2140 log_info( "\tSampled pixels (rgba order):\n" );
2141 log_info( "\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1], upLeft[2], upLeft[3] );
2142 log_info( "\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1], upRight[2], upRight[3] );
2143 log_info( "\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1], lowLeft[2], lowLeft[3] );
2144 log_info( "\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1], lowRight[2], lowRight[3] );
2145 }
2146
2147 bool printMe = false;
2148 if( x1 <= 0 || x2 <= 0 || x1 >= (int)width-1 || x2 >= (int)width-1 )
2149 printMe = true;
2150 if( y1 <= 0 || y2 <= 0 || y1 >= (int)height-1 || y2 >= (int)height-1 )
2151 printMe = true;
2152
2153 double weights[ 2 ][ 2 ];
2154
2155 weights[ 0 ][ 0 ] = weights[ 0 ][ 1 ] = 1.0 - frac( x - 0.5f );
2156 weights[ 1 ][ 0 ] = weights[ 1 ][ 1 ] = frac( x - 0.5f );
2157 weights[ 0 ][ 0 ] *= 1.0 - frac( y - 0.5f );
2158 weights[ 1 ][ 0 ] *= 1.0 - frac( y - 0.5f );
2159 weights[ 0 ][ 1 ] *= frac( y - 0.5f );
2160 weights[ 1 ][ 1 ] *= frac( y - 0.5f );
2161
2162 if( verbose )
2163 log_info( "\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f\n", frac( x - 0.5f ), frac( y - 0.5f ) );
2164
2165 for( int i = 0; i < 3; i++ )
2166 {
2167 outData[ i ] = (float)( ( upLeft[ i ] * weights[ 0 ][ 0 ] ) +
2168 ( upRight[ i ] * weights[ 1 ][ 0 ] ) +
2169 ( lowLeft[ i ] * weights[ 0 ][ 1 ] ) +
2170 ( lowRight[ i ] * weights[ 1 ][ 1 ] ));
2171 // flush subnormal results to zero if necessary
2172 if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN )
2173 outData[i] = copysignf( 0.0f, outData[i] );
2174 }
2175 outData[ 3 ] = (float)( ( upLeft[ 3 ] * weights[ 0 ][ 0 ] ) +
2176 ( upRight[ 3 ] * weights[ 1 ][ 0 ] ) +
2177 ( lowLeft[ 3 ] * weights[ 0 ][ 1 ] ) +
2178 ( lowRight[ 3 ] * weights[ 1 ][ 1 ] ));
2179 // flush subnormal results to zero if necessary
2180 if( NULL == containsDenorms && fabs(outData[3]) < FLT_MIN )
2181 outData[3] = copysignf( 0.0f, outData[3] );
2182 }
2183 else
2184 {
2185 // 3D linear filtering
2186 int x1 = adFn( floorf( x - 0.5f ), width_lod );
2187 int y1 = adFn( floorf( y - 0.5f ), height_lod );
2188 int z1 = adFn( floorf( z - 0.5f ), depth_lod );
2189 int x2 = adFn( floorf( x - 0.5f ) + 1, width_lod );
2190 int y2 = adFn( floorf( y - 0.5f ) + 1, height_lod );
2191 int z2 = adFn( floorf( z - 0.5f ) + 1, depth_lod );
2192
2193 if( verbose )
2194 log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{%d, %d, %d} and i1:{%d, %d, %d}\n", x1, y1, z1, x2, y2, z2 );
2195
2196 float upLeftA[ 4 ], upRightA[ 4 ], lowLeftA[ 4 ], lowRightA[ 4 ];
2197 float upLeftB[ 4 ], upRightB[ 4 ], lowLeftB[ 4 ], lowRightB[ 4 ];
2198 float pixelMaxA[4], pixelMaxB[4];
2199 read_image_pixel_float( imageData, imageInfo, x1, y1, z1, upLeftA, lod );
2200 read_image_pixel_float( imageData, imageInfo, x2, y1, z1, upRightA, lod );
2201 check_for_denorms( upLeftA, containsDenorms );
2202 check_for_denorms( upRightA, containsDenorms );
2203 pixelMax( upLeftA, upRightA, pixelMaxA );
2204 read_image_pixel_float( imageData, imageInfo, x1, y2, z1, lowLeftA, lod );
2205 read_image_pixel_float( imageData, imageInfo, x2, y2, z1, lowRightA, lod );
2206 check_for_denorms( lowLeftA, containsDenorms );
2207 check_for_denorms( lowRightA, containsDenorms );
2208 pixelMax( lowLeftA, lowRightA, pixelMaxB );
2209 pixelMax( pixelMaxA, pixelMaxB, returnVal.p);
2210 read_image_pixel_float( imageData, imageInfo, x1, y1, z2, upLeftB, lod );
2211 read_image_pixel_float( imageData, imageInfo, x2, y1, z2, upRightB, lod );
2212 check_for_denorms( upLeftB, containsDenorms );
2213 check_for_denorms( upRightB, containsDenorms );
2214 pixelMax( upLeftB, upRightB, pixelMaxA );
2215 read_image_pixel_float( imageData, imageInfo, x1, y2, z2, lowLeftB, lod );
2216 read_image_pixel_float( imageData, imageInfo, x2, y2, z2, lowRightB, lod );
2217 check_for_denorms( lowLeftB, containsDenorms );
2218 check_for_denorms( lowRightB, containsDenorms );
2219 pixelMax( lowLeftB, lowRightB, pixelMaxB );
2220 pixelMax( pixelMaxA, pixelMaxB, pixelMaxA);
2221 pixelMax( pixelMaxA, returnVal.p, returnVal.p );
2222
2223 if( verbose )
2224 {
2225 if( NULL == containsDenorms )
2226 log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" );
2227 else
2228 log_info( "\tSampled pixels (rgba order):\n" );
2229 log_info( "\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1], upLeftA[2], upLeftA[3] );
2230 log_info( "\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1], upRightA[2], upRightA[3] );
2231 log_info( "\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1], lowLeftA[2], lowLeftA[3] );
2232 log_info( "\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0], lowRightA[1], lowRightA[2], lowRightA[3] );
2233 log_info( "\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1], upLeftB[2], upLeftB[3] );
2234 log_info( "\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1], upRightB[2], upRightB[3] );
2235 log_info( "\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1], lowLeftB[2], lowLeftB[3] );
2236 log_info( "\t\tp111: %f, %f, %f, %f\n", lowRightB[0], lowRightB[1], lowRightB[2], lowRightB[3] );
2237 }
2238
2239 double weights[ 2 ][ 2 ][ 2 ];
2240
2241 float a = frac( x - 0.5f ), b = frac( y - 0.5f ), c = frac( z - 0.5f );
2242 weights[ 0 ][ 0 ][ 0 ] = weights[ 0 ][ 1 ][ 0 ] = weights[ 0 ][ 0 ][ 1 ] = weights[ 0 ][ 1 ][ 1 ] = 1.f - a;
2243 weights[ 1 ][ 0 ][ 0 ] = weights[ 1 ][ 1 ][ 0 ] = weights[ 1 ][ 0 ][ 1 ] = weights[ 1 ][ 1 ][ 1 ] = a;
2244 weights[ 0 ][ 0 ][ 0 ] *= 1.f - b;
2245 weights[ 1 ][ 0 ][ 0 ] *= 1.f - b;
2246 weights[ 0 ][ 0 ][ 1 ] *= 1.f - b;
2247 weights[ 1 ][ 0 ][ 1 ] *= 1.f - b;
2248 weights[ 0 ][ 1 ][ 0 ] *= b;
2249 weights[ 1 ][ 1 ][ 0 ] *= b;
2250 weights[ 0 ][ 1 ][ 1 ] *= b;
2251 weights[ 1 ][ 1 ][ 1 ] *= b;
2252 weights[ 0 ][ 0 ][ 0 ] *= 1.f - c;
2253 weights[ 0 ][ 1 ][ 0 ] *= 1.f - c;
2254 weights[ 1 ][ 0 ][ 0 ] *= 1.f - c;
2255 weights[ 1 ][ 1 ][ 0 ] *= 1.f - c;
2256 weights[ 0 ][ 0 ][ 1 ] *= c;
2257 weights[ 0 ][ 1 ][ 1 ] *= c;
2258 weights[ 1 ][ 0 ][ 1 ] *= c;
2259 weights[ 1 ][ 1 ][ 1 ] *= c;
2260
2261 if( verbose )
2262 log_info( "\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f, frac( z - 0.5f ) = %f\n",
2263 frac( x - 0.5f ), frac( y - 0.5f ), frac( z - 0.5f ) );
2264
2265 for( int i = 0; i < 3; i++ )
2266 {
2267 outData[ i ] = (float)( ( upLeftA[ i ] * weights[ 0 ][ 0 ][ 0 ] ) +
2268 ( upRightA[ i ] * weights[ 1 ][ 0 ][ 0 ] ) +
2269 ( lowLeftA[ i ] * weights[ 0 ][ 1 ][ 0 ] ) +
2270 ( lowRightA[ i ] * weights[ 1 ][ 1 ][ 0 ] ) +
2271 ( upLeftB[ i ] * weights[ 0 ][ 0 ][ 1 ] ) +
2272 ( upRightB[ i ] * weights[ 1 ][ 0 ][ 1 ] ) +
2273 ( lowLeftB[ i ] * weights[ 0 ][ 1 ][ 1 ] ) +
2274 ( lowRightB[ i ] * weights[ 1 ][ 1 ][ 1 ] ));
2275 // flush subnormal results to zero if necessary
2276 if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN )
2277 outData[i] = copysignf( 0.0f, outData[i] );
2278 }
2279 outData[ 3 ] = (float)( ( upLeftA[ 3 ] * weights[ 0 ][ 0 ][ 0 ] ) +
2280 ( upRightA[ 3 ] * weights[ 1 ][ 0 ][ 0 ] ) +
2281 ( lowLeftA[ 3 ] * weights[ 0 ][ 1 ][ 0 ] ) +
2282 ( lowRightA[ 3 ] * weights[ 1 ][ 1 ][ 0 ] ) +
2283 ( upLeftB[ 3 ] * weights[ 0 ][ 0 ][ 1 ] ) +
2284 ( upRightB[ 3 ] * weights[ 1 ][ 0 ][ 1 ] ) +
2285 ( lowLeftB[ 3 ] * weights[ 0 ][ 1 ][ 1 ] ) +
2286 ( lowRightB[ 3 ] * weights[ 1 ][ 1 ][ 1 ] ));
2287 // flush subnormal results to zero if necessary
2288 if( NULL == containsDenorms && fabs(outData[3]) < FLT_MIN )
2289 outData[3] = copysignf( 0.0f, outData[3] );
2290 }
2291
2292 return returnVal;
2293 }
2294 }
2295
sample_image_pixel_float_offset(void * imageData,image_descriptor * imageInfo,float x,float y,float z,float xAddressOffset,float yAddressOffset,float zAddressOffset,image_sampler_data * imageSampler,float * outData,int verbose,int * containsDenorms)2296 FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo,
2297 float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset,
2298 image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms )
2299 {
2300 return sample_image_pixel_float_offset( imageData, imageInfo, x, y, z, xAddressOffset, yAddressOffset, zAddressOffset,
2301 imageSampler, outData, verbose, containsDenorms, 0);
2302 }
2303
2304
debug_find_vector_in_image(void * imagePtr,image_descriptor * imageInfo,void * vectorToFind,size_t vectorSize,int * outX,int * outY,int * outZ,size_t lod)2305 int debug_find_vector_in_image( void *imagePtr, image_descriptor *imageInfo,
2306 void *vectorToFind, size_t vectorSize, int *outX, int *outY, int *outZ, size_t lod )
2307 {
2308 int foundCount = 0;
2309 char *iPtr = (char *)imagePtr;
2310 size_t width;
2311 size_t depth;
2312 size_t height;
2313 size_t row_pitch;
2314 size_t slice_pitch;
2315
2316 switch (imageInfo->type)
2317 {
2318 case CL_MEM_OBJECT_IMAGE1D:
2319 width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2320 height = 1;
2321 depth = 1;
2322 break;
2323 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
2324 width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2325 height = 1;
2326 depth = imageInfo->arraySize;
2327 break;
2328 case CL_MEM_OBJECT_IMAGE2D:
2329 width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2330 height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2331 depth = 1;
2332 break;
2333 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
2334 width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2335 height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2336 depth = imageInfo->arraySize;
2337 break;
2338 case CL_MEM_OBJECT_IMAGE3D:
2339 width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1;
2340 height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
2341 depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
2342 break;
2343 }
2344
2345 row_pitch = width * get_pixel_size( imageInfo->format );
2346 slice_pitch = row_pitch * height;
2347
2348 for( size_t z = 0; z < depth; z++ )
2349 {
2350 for( size_t y = 0; y < height; y++ )
2351 {
2352 for( size_t x = 0; x < width; x++)
2353 {
2354 if( memcmp( iPtr, vectorToFind, vectorSize ) == 0 )
2355 {
2356 if( foundCount == 0 )
2357 {
2358 *outX = (int)x;
2359 if (outY != NULL)
2360 *outY = (int)y;
2361 if( outZ != NULL )
2362 *outZ = (int)z;
2363 }
2364 foundCount++;
2365 }
2366 iPtr += vectorSize;
2367 }
2368 iPtr += row_pitch - ( width * vectorSize );
2369 }
2370 iPtr += slice_pitch - ( height * row_pitch );
2371 }
2372 return foundCount;
2373 }
2374
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,unsigned int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2375 int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
2376 unsigned int *valuesToFind, int *outX, int *outY, int *outZ, int lod )
2377 {
2378 char vectorToFind[ 4 * 4 ];
2379 size_t vectorSize = get_format_channel_count( imageInfo->format );
2380
2381
2382 if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT8 )
2383 {
2384 unsigned char *p = (unsigned char *)vectorToFind;
2385 for( unsigned int i = 0; i < vectorSize; i++ )
2386 p[i] = (unsigned char)valuesToFind[i];
2387 }
2388 else if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT16 )
2389 {
2390 unsigned short *p = (unsigned short *)vectorToFind;
2391 for( unsigned int i = 0; i < vectorSize; i++ )
2392 p[i] = (unsigned short)valuesToFind[i];
2393 vectorSize *= 2;
2394 }
2395 else if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT32 )
2396 {
2397 unsigned int *p = (unsigned int *)vectorToFind;
2398 for( unsigned int i = 0; i < vectorSize; i++ )
2399 p[i] = (unsigned int)valuesToFind[i];
2400 vectorSize *= 4;
2401 }
2402 else
2403 {
2404 log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" );
2405 return false;
2406 }
2407 return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod );
2408 }
2409
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,int * valuesToFind,int * outX,int * outY,int * outZ,int lod)2410 int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
2411 int *valuesToFind, int *outX, int *outY, int *outZ, int lod )
2412 {
2413 char vectorToFind[ 4 * 4 ];
2414 size_t vectorSize = get_format_channel_count( imageInfo->format );
2415
2416 if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT8 )
2417 {
2418 char *p = (char *)vectorToFind;
2419 for( unsigned int i = 0; i < vectorSize; i++ )
2420 p[i] = (char)valuesToFind[i];
2421 }
2422 else if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT16 )
2423 {
2424 short *p = (short *)vectorToFind;
2425 for( unsigned int i = 0; i < vectorSize; i++ )
2426 p[i] = (short)valuesToFind[i];
2427 vectorSize *= 2;
2428 }
2429 else if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT32 )
2430 {
2431 int *p = (int *)vectorToFind;
2432 for( unsigned int i = 0; i < vectorSize; i++ )
2433 p[i] = (int)valuesToFind[i];
2434 vectorSize *= 4;
2435 }
2436 else
2437 {
2438 log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" );
2439 return false;
2440 }
2441 return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod );
2442 }
2443
debug_find_pixel_in_image(void * imagePtr,image_descriptor * imageInfo,float * valuesToFind,int * outX,int * outY,int * outZ,int lod)2444 int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo,
2445 float *valuesToFind, int *outX, int *outY, int *outZ, int lod )
2446 {
2447 char vectorToFind[ 4 * 4 ];
2448 float swizzled[4];
2449 memcpy( swizzled, valuesToFind, sizeof( swizzled ) );
2450 size_t vectorSize = get_pixel_size( imageInfo->format );
2451 pack_image_pixel( swizzled, imageInfo->format, vectorToFind );
2452 return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod );
2453 }
2454
swizzle_vector_for_image(T * srcVector,const cl_image_format * imageFormat)2455 template <class T> void swizzle_vector_for_image( T *srcVector, const cl_image_format *imageFormat )
2456 {
2457 T temp;
2458 switch( imageFormat->image_channel_order )
2459 {
2460 case CL_A:
2461 srcVector[ 0 ] = srcVector[ 3 ];
2462 break;
2463 case CL_R:
2464 case CL_Rx:
2465 case CL_RG:
2466 case CL_RGx:
2467 case CL_RGB:
2468 case CL_RGBx:
2469 case CL_RGBA:
2470 case CL_sRGB:
2471 case CL_sRGBx:
2472 case CL_sRGBA:
2473 break;
2474 case CL_RA:
2475 srcVector[ 1 ] = srcVector[ 3 ];
2476 break;
2477 case CL_ARGB:
2478 temp = srcVector[ 3 ];
2479 srcVector[ 3 ] = srcVector[ 2 ];
2480 srcVector[ 2 ] = srcVector[ 1 ];
2481 srcVector[ 1 ] = srcVector[ 0 ];
2482 srcVector[ 0 ] = temp;
2483 break;
2484 case CL_BGRA:
2485 case CL_sBGRA:
2486 temp = srcVector[ 0 ];
2487 srcVector[ 0 ] = srcVector[ 2 ];
2488 srcVector[ 2 ] = temp;
2489 break;
2490 case CL_INTENSITY:
2491 srcVector[ 3 ] = srcVector[ 0 ];
2492 srcVector[ 2 ] = srcVector[ 0 ];
2493 srcVector[ 1 ] = srcVector[ 0 ];
2494 break;
2495 case CL_LUMINANCE:
2496 srcVector[ 2 ] = srcVector[ 0 ];
2497 srcVector[ 1 ] = srcVector[ 0 ];
2498 break;
2499 #ifdef CL_1RGB_APPLE
2500 case CL_1RGB_APPLE:
2501 temp = srcVector[ 3 ];
2502 srcVector[ 3 ] = srcVector[ 2 ];
2503 srcVector[ 2 ] = srcVector[ 1 ];
2504 srcVector[ 1 ] = srcVector[ 0 ];
2505 srcVector[ 0 ] = temp;
2506 break;
2507 #endif
2508 #ifdef CL_BGR1_APPLE
2509 case CL_BGR1_APPLE:
2510 temp = srcVector[ 0 ];
2511 srcVector[ 0 ] = srcVector[ 2 ];
2512 srcVector[ 2 ] = temp;
2513 break;
2514 #endif
2515 }
2516 }
2517
2518 #define SATURATE( v, min, max ) ( v < min ? min : ( v > max ? max : v ) )
2519
pack_image_pixel(unsigned int * srcVector,const cl_image_format * imageFormat,void * outData)2520 void pack_image_pixel( unsigned int *srcVector, const cl_image_format *imageFormat, void *outData )
2521 {
2522 swizzle_vector_for_image<unsigned int>( srcVector, imageFormat );
2523 size_t channelCount = get_format_channel_count( imageFormat );
2524
2525 switch( imageFormat->image_channel_data_type )
2526 {
2527 case CL_UNSIGNED_INT8:
2528 {
2529 unsigned char *ptr = (unsigned char *)outData;
2530 for( unsigned int i = 0; i < channelCount; i++ )
2531 ptr[ i ] = (unsigned char)SATURATE( srcVector[ i ], 0, 255 );
2532 break;
2533 }
2534 case CL_UNSIGNED_INT16:
2535 {
2536 unsigned short *ptr = (unsigned short *)outData;
2537 for( unsigned int i = 0; i < channelCount; i++ )
2538 ptr[ i ] = (unsigned short)SATURATE( srcVector[ i ], 0, 65535 );
2539 break;
2540 }
2541 case CL_UNSIGNED_INT32:
2542 {
2543 unsigned int *ptr = (unsigned int *)outData;
2544 for( unsigned int i = 0; i < channelCount; i++ )
2545 ptr[ i ] = (unsigned int)srcVector[ i ];
2546 break;
2547 }
2548 default:
2549 break;
2550 }
2551 }
2552
pack_image_pixel(int * srcVector,const cl_image_format * imageFormat,void * outData)2553 void pack_image_pixel( int *srcVector, const cl_image_format *imageFormat, void *outData )
2554 {
2555 swizzle_vector_for_image<int>( srcVector, imageFormat );
2556 size_t chanelCount = get_format_channel_count( imageFormat );
2557
2558 switch( imageFormat->image_channel_data_type )
2559 {
2560 case CL_SIGNED_INT8:
2561 {
2562 char *ptr = (char *)outData;
2563 for( unsigned int i = 0; i < chanelCount; i++ )
2564 ptr[ i ] = (char)SATURATE( srcVector[ i ], -128, 127 );
2565 break;
2566 }
2567 case CL_SIGNED_INT16:
2568 {
2569 short *ptr = (short *)outData;
2570 for( unsigned int i = 0; i < chanelCount; i++ )
2571 ptr[ i ] = (short)SATURATE( srcVector[ i ], -32768, 32767 );
2572 break;
2573 }
2574 case CL_SIGNED_INT32:
2575 {
2576 int *ptr = (int *)outData;
2577 for( unsigned int i = 0; i < chanelCount; i++ )
2578 ptr[ i ] = (int)srcVector[ i ];
2579 break;
2580 }
2581 default:
2582 break;
2583 }
2584 }
2585
round_to_even(float v)2586 int round_to_even( float v )
2587 {
2588 // clamp overflow
2589 if( v >= - (float) INT_MIN )
2590 return INT_MAX;
2591 if( v <= (float) INT_MIN )
2592 return INT_MIN;
2593
2594 // round fractional values to integer value
2595 if( fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23) )
2596 {
2597 static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23), MAKE_HEX_FLOAT(-0x1.0p23f, -0x1L, 23) };
2598 float magicVal = magic[ v < 0.0f ];
2599 v += magicVal;
2600 v -= magicVal;
2601 }
2602
2603 return (int) v;
2604 }
2605
pack_image_pixel(float * srcVector,const cl_image_format * imageFormat,void * outData)2606 void pack_image_pixel( float *srcVector, const cl_image_format *imageFormat, void *outData )
2607 {
2608 swizzle_vector_for_image<float>( srcVector, imageFormat );
2609 size_t channelCount = get_format_channel_count( imageFormat );
2610 switch( imageFormat->image_channel_data_type )
2611 {
2612 case CL_HALF_FLOAT:
2613 {
2614 cl_ushort *ptr = (cl_ushort *)outData;
2615
2616 switch( gFloatToHalfRoundingMode )
2617 {
2618 case kRoundToNearestEven:
2619 for( unsigned int i = 0; i < channelCount; i++ )
2620 ptr[ i ] = float2half_rte( srcVector[ i ] );
2621 break;
2622 case kRoundTowardZero:
2623 for( unsigned int i = 0; i < channelCount; i++ )
2624 ptr[ i ] = float2half_rtz( srcVector[ i ] );
2625 break;
2626 default:
2627 log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" );
2628 exit(-1);
2629 break;
2630 }
2631 break;
2632 }
2633
2634 case CL_FLOAT:
2635 {
2636 cl_float *ptr = (cl_float *)outData;
2637 for( unsigned int i = 0; i < channelCount; i++ )
2638 ptr[ i ] = srcVector[ i ];
2639 break;
2640 }
2641
2642 case CL_SNORM_INT8:
2643 {
2644 cl_char *ptr = (cl_char *)outData;
2645 for( unsigned int i = 0; i < channelCount; i++ )
2646 ptr[ i ] = (cl_char)NORMALIZE_SIGNED( srcVector[ i ], -127.0f, 127.f );
2647 break;
2648 }
2649 case CL_SNORM_INT16:
2650 {
2651 cl_short *ptr = (cl_short *)outData;
2652 for( unsigned int i = 0; i < channelCount; i++ )
2653 ptr[ i ] = (short)NORMALIZE_SIGNED( srcVector[ i ], -32767.f, 32767.f );
2654 break;
2655 }
2656 case CL_UNORM_INT8:
2657 {
2658 cl_uchar *ptr = (cl_uchar *)outData;
2659 if ( is_sRGBA_order(imageFormat->image_channel_order) )
2660 {
2661 ptr[ 0 ] = (unsigned char)( sRGBmap( srcVector[ 0 ] ) + 0.5 );
2662 ptr[ 1 ] = (unsigned char)( sRGBmap( srcVector[ 1 ] ) + 0.5 );
2663 ptr[ 2 ] = (unsigned char)( sRGBmap( srcVector[ 2 ] ) + 0.5 );
2664 if (channelCount == 4)
2665 ptr[ 3 ] = (unsigned char)NORMALIZE( srcVector[ 3 ], 255.f );
2666 }
2667 else
2668 {
2669 for( unsigned int i = 0; i < channelCount; i++ )
2670 ptr[ i ] = (unsigned char)NORMALIZE( srcVector[ i ], 255.f );
2671 }
2672 #ifdef CL_1RGB_APPLE
2673 if( imageFormat->image_channel_order == CL_1RGB_APPLE )
2674 ptr[0] = 255.0f;
2675 #endif
2676 #ifdef CL_BGR1_APPLE
2677 if( imageFormat->image_channel_order == CL_BGR1_APPLE )
2678 ptr[3] = 255.0f;
2679 #endif
2680 break;
2681 }
2682 case CL_UNORM_INT16:
2683 {
2684 cl_ushort *ptr = (cl_ushort *)outData;
2685 for( unsigned int i = 0; i < channelCount; i++ )
2686 ptr[ i ] = (unsigned short)NORMALIZE( srcVector[ i ], 65535.f );
2687 break;
2688 }
2689 case CL_UNORM_SHORT_555:
2690 {
2691 cl_ushort *ptr = (cl_ushort *)outData;
2692 ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 10 ) |
2693 ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 31.f ) & 31 ) << 5 ) |
2694 ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 );
2695 break;
2696 }
2697 case CL_UNORM_SHORT_565:
2698 {
2699 cl_ushort *ptr = (cl_ushort *)outData;
2700 ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 11 ) |
2701 ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 63.f ) & 63 ) << 5 ) |
2702 ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 );
2703 break;
2704 }
2705 case CL_UNORM_INT_101010:
2706 {
2707 cl_uint *ptr = (cl_uint *)outData;
2708 ptr[ 0 ] = ( ( (unsigned int)NORMALIZE( srcVector[ 0 ], 1023.f ) & 1023 ) << 20 ) |
2709 ( ( (unsigned int)NORMALIZE( srcVector[ 1 ], 1023.f ) & 1023 ) << 10 ) |
2710 ( ( (unsigned int)NORMALIZE( srcVector[ 2 ], 1023.f ) & 1023 ) << 0 );
2711 break;
2712 }
2713 case CL_SIGNED_INT8:
2714 {
2715 cl_char *ptr = (cl_char *)outData;
2716 for( unsigned int i = 0; i < channelCount; i++ )
2717 ptr[ i ] = (cl_char)CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 );
2718 break;
2719 }
2720 case CL_SIGNED_INT16:
2721 {
2722 cl_short *ptr = (cl_short *)outData;
2723 for( unsigned int i = 0; i < channelCount; i++ )
2724 ptr[ i ] = (short)CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767 );
2725 break;
2726 }
2727 case CL_SIGNED_INT32:
2728 {
2729 cl_int *ptr = (cl_int *)outData;
2730 for( unsigned int i = 0; i < channelCount; i++ )
2731 ptr[ i ] = (int)CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX );
2732 break;
2733 }
2734 case CL_UNSIGNED_INT8:
2735 {
2736 cl_uchar *ptr = (cl_uchar *)outData;
2737 for( unsigned int i = 0; i < channelCount; i++ )
2738 ptr[ i ] = (cl_uchar)CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX );
2739 break;
2740 }
2741 case CL_UNSIGNED_INT16:
2742 {
2743 cl_ushort *ptr = (cl_ushort *)outData;
2744 for( unsigned int i = 0; i < channelCount; i++ )
2745 ptr[ i ] = (cl_ushort)CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX );
2746 break;
2747 }
2748 case CL_UNSIGNED_INT32:
2749 {
2750 cl_uint *ptr = (cl_uint *)outData;
2751 for( unsigned int i = 0; i < channelCount; i++ )
2752 ptr[ i ] = (cl_uint)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX );
2753 break;
2754 }
2755 #ifdef CL_SFIXED14_APPLE
2756 case CL_SFIXED14_APPLE:
2757 {
2758 cl_ushort *ptr = (cl_ushort*)outData;
2759 for( unsigned int i = 0; i < channelCount; i++ )
2760 {
2761 cl_float f = fmaxf( srcVector[i], -1.0f );
2762 f = fminf( f, 3.0f );
2763 cl_int d = rintf(f * 0x1.0p14f);
2764 d += 16384;
2765 if( d > CL_USHRT_MAX )
2766 d = CL_USHRT_MAX;
2767 ptr[i] = d;
2768 }
2769 break;
2770 }
2771 #endif
2772 default:
2773 log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type);
2774 exit(-1);
2775 break;
2776 }
2777 }
2778
pack_image_pixel_error(const float * srcVector,const cl_image_format * imageFormat,const void * results,float * errors)2779 void pack_image_pixel_error( const float *srcVector, const cl_image_format *imageFormat, const void *results, float *errors )
2780 {
2781 size_t channelCount = get_format_channel_count( imageFormat );
2782 switch( imageFormat->image_channel_data_type )
2783 {
2784 case CL_HALF_FLOAT:
2785 {
2786 const cl_ushort *ptr = (const cl_ushort *)results;
2787
2788 for( unsigned int i = 0; i < channelCount; i++ )
2789 errors[i] = Ulp_Error_Half( ptr[i], srcVector[i] );
2790
2791 break;
2792 }
2793
2794 case CL_FLOAT:
2795 {
2796 const cl_ushort *ptr = (const cl_ushort *)results;
2797
2798 for( unsigned int i = 0; i < channelCount; i++ )
2799 errors[i] = Ulp_Error( ptr[i], srcVector[i] );
2800
2801 break;
2802 }
2803
2804 case CL_SNORM_INT8:
2805 {
2806 const cl_char *ptr = (const cl_char *)results;
2807
2808 for( unsigned int i = 0; i < channelCount; i++ )
2809 errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -127.0f, 127.f );
2810
2811 break;
2812 }
2813 case CL_SNORM_INT16:
2814 {
2815 const cl_short *ptr = (const cl_short *)results;
2816
2817 for( unsigned int i = 0; i < channelCount; i++ )
2818 errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -32767.f, 32767.f );
2819
2820 break;
2821 }
2822 case CL_UNORM_INT8:
2823 {
2824 const cl_uchar *ptr = (const cl_uchar *)results;
2825
2826 for( unsigned int i = 0; i < channelCount; i++ )
2827 errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 255.f );
2828
2829 break;
2830 }
2831 case CL_UNORM_INT16:
2832 {
2833 const cl_ushort *ptr = (const cl_ushort *)results;
2834
2835 for( unsigned int i = 0; i < channelCount; i++ )
2836 errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 65535.f );
2837
2838 break;
2839 }
2840 case CL_UNORM_SHORT_555:
2841 {
2842 const cl_ushort *ptr = (const cl_ushort *)results;
2843
2844 errors[0] = ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f );
2845 errors[1] = ((ptr[0] >> 5) & 31) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 31.f );
2846 errors[2] = ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f );
2847
2848 break;
2849 }
2850 case CL_UNORM_SHORT_565:
2851 {
2852 const cl_ushort *ptr = (const cl_ushort *)results;
2853
2854 errors[0] = ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f );
2855 errors[1] = ((ptr[0] >> 5) & 63) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 63.f );
2856 errors[2] = ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f );
2857
2858 break;
2859 }
2860 case CL_UNORM_INT_101010:
2861 {
2862 const cl_uint *ptr = (const cl_uint *)results;
2863
2864 errors[0] = ((ptr[0] >> 20) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 1023.f );
2865 errors[1] = ((ptr[0] >> 10) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 1023.f );
2866 errors[2] = ((ptr[0] >> 0) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 1023.f );
2867
2868 break;
2869 }
2870 case CL_SIGNED_INT8:
2871 {
2872 const cl_char *ptr = (const cl_char *)results;
2873
2874 for( unsigned int i = 0; i < channelCount; i++ )
2875 errors[ i ] = ptr[i] - CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 );
2876
2877 break;
2878 }
2879 case CL_SIGNED_INT16:
2880 {
2881 const cl_short *ptr = (const cl_short *)results;
2882 for( unsigned int i = 0; i < channelCount; i++ )
2883 errors[i] = ptr[ i ] - CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767 );
2884 break;
2885 }
2886 case CL_SIGNED_INT32:
2887 {
2888 const cl_int *ptr = (const cl_int *)results;
2889 for( unsigned int i = 0; i < channelCount; i++ )
2890 errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long) CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX ));
2891 break;
2892 }
2893 case CL_UNSIGNED_INT8:
2894 {
2895 const cl_uchar *ptr = (const cl_uchar *)results;
2896 for( unsigned int i = 0; i < channelCount; i++ )
2897 errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX );
2898 break;
2899 }
2900 case CL_UNSIGNED_INT16:
2901 {
2902 const cl_ushort *ptr = (const cl_ushort *)results;
2903 for( unsigned int i = 0; i < channelCount; i++ )
2904 errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX );
2905 break;
2906 }
2907 case CL_UNSIGNED_INT32:
2908 {
2909 const cl_uint *ptr = (const cl_uint *)results;
2910 for( unsigned int i = 0; i < channelCount; i++ )
2911 errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX ));
2912 break;
2913 }
2914 #ifdef CL_SFIXED14_APPLE
2915 case CL_SFIXED14_APPLE:
2916 {
2917 const cl_ushort *ptr = (const cl_ushort *)results;
2918
2919 for( unsigned int i = 0; i < channelCount; i++ )
2920 errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( ((int) srcVector[ i ] - 16384), -16384.f, 49151.f );
2921
2922 break;
2923 }
2924 #endif
2925 default:
2926 log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type);
2927 exit(-1);
2928 break;
2929 }
2930 }
2931
2932
2933 //
2934 // Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT
2935 // This should be called lazily before attempting to verify image writes, otherwise an error will occur.
2936 //
DetectFloatToHalfRoundingMode(cl_command_queue q)2937 int DetectFloatToHalfRoundingMode( cl_command_queue q ) // Returns CL_SUCCESS on success
2938 {
2939 cl_int err = CL_SUCCESS;
2940
2941 if( gFloatToHalfRoundingMode == kDefaultRoundingMode )
2942 {
2943 // Some numbers near 0.5f, that we look at to see how the values are rounded.
2944 static const cl_uint inData[4*4] = { 0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U, 0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U,
2945 0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U, 0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U };
2946 static const size_t count = sizeof( inData ) / (4*sizeof( inData[0] ));
2947 const float *inp = (const float*) inData;
2948 cl_context context = NULL;
2949
2950 // Create an input buffer
2951 err = clGetCommandQueueInfo( q, CL_QUEUE_CONTEXT, sizeof(context), &context, NULL );
2952 if( err )
2953 {
2954 log_error( "Error: could not get context from command queue in DetectFloatToHalfRoundingMode (%d)", err );
2955 return err;
2956 }
2957
2958 cl_mem inBuf = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, sizeof( inData ), (void*) inData, &err );
2959 if( NULL == inBuf || err )
2960 {
2961 log_error( "Error: could not create input buffer in DetectFloatToHalfRoundingMode (err: %d)", err );
2962 return err;
2963 }
2964
2965 // Create a small output image
2966 cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT };
2967 cl_mem outImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &fmt, count, 1, 0, NULL, &err );
2968 if( NULL == outImage || err )
2969 {
2970 log_error( "Error: could not create half float out image in DetectFloatToHalfRoundingMode (err: %d)", err );
2971 clReleaseMemObject( inBuf );
2972 return err;
2973 }
2974
2975 // Create our program, and a kernel
2976 const char *kernel[1] = {
2977 "kernel void detect_round( global float4 *in, write_only image2d_t out )\n"
2978 "{\n"
2979 " write_imagef( out, (int2)(get_global_id(0),0), in[get_global_id(0)] );\n"
2980 "}\n" };
2981
2982 clProgramWrapper program;
2983 err = create_single_kernel_helper_create_program(context, &program, 1, kernel);
2984
2985 if( NULL == program || err )
2986 {
2987 log_error( "Error: could not create program in DetectFloatToHalfRoundingMode (err: %d)", err );
2988 clReleaseMemObject( inBuf );
2989 clReleaseMemObject( outImage );
2990 return err;
2991 }
2992
2993 cl_device_id device = NULL;
2994 err = clGetCommandQueueInfo( q, CL_QUEUE_DEVICE, sizeof(device), &device, NULL );
2995 if( err )
2996 {
2997 log_error( "Error: could not get device from command queue in DetectFloatToHalfRoundingMode (%d)", err );
2998 clReleaseMemObject( inBuf );
2999 clReleaseMemObject( outImage );
3000 return err;
3001 }
3002
3003 err = clBuildProgram( program, 1, &device, "", NULL, NULL );
3004 if( err )
3005 {
3006 log_error( "Error: could not build program in DetectFloatToHalfRoundingMode (%d)", err );
3007 clReleaseMemObject( inBuf );
3008 clReleaseMemObject( outImage );
3009 return err;
3010 }
3011
3012 cl_kernel k = clCreateKernel( program, "detect_round", &err );
3013 if( NULL == k || err )
3014 {
3015 log_error( "Error: could not create kernel in DetectFloatToHalfRoundingMode (%d)", err );
3016 clReleaseMemObject( inBuf );
3017 clReleaseMemObject( outImage );
3018 return err;
3019 }
3020
3021 err = clSetKernelArg( k, 0, sizeof( cl_mem ), &inBuf );
3022 if( err )
3023 {
3024 log_error( "Error: could not set argument 0 of kernel in DetectFloatToHalfRoundingMode (%d)", err );
3025 clReleaseMemObject( inBuf );
3026 clReleaseMemObject( outImage );
3027 clReleaseKernel( k );
3028 return err;
3029 }
3030
3031 err = clSetKernelArg( k, 1, sizeof( cl_mem ), &outImage );
3032 if( err )
3033 {
3034 log_error( "Error: could not set argument 1 of kernel in DetectFloatToHalfRoundingMode (%d)", err );
3035 clReleaseMemObject( inBuf );
3036 clReleaseMemObject( outImage );
3037 clReleaseKernel( k );
3038 return err;
3039 }
3040
3041 // Run the kernel
3042 size_t global_work_size = count;
3043 err = clEnqueueNDRangeKernel( q, k, 1, NULL, &global_work_size, NULL, 0, NULL, NULL );
3044 if( err )
3045 {
3046 log_error( "Error: could not enqueue kernel in DetectFloatToHalfRoundingMode (%d)", err );
3047 clReleaseMemObject( inBuf );
3048 clReleaseMemObject( outImage );
3049 clReleaseKernel( k );
3050 return err;
3051 }
3052
3053 // read the results
3054 cl_ushort outBuf[count*4];
3055 memset( outBuf, -1, sizeof( outBuf ) );
3056 size_t origin[3] = {0,0,0};
3057 size_t region[3] = {count,1,1};
3058 err = clEnqueueReadImage( q, outImage, CL_TRUE, origin, region, 0, 0, outBuf, 0, NULL, NULL );
3059 if( err )
3060 {
3061 log_error( "Error: could not read output image in DetectFloatToHalfRoundingMode (%d)", err );
3062 clReleaseMemObject( inBuf );
3063 clReleaseMemObject( outImage );
3064 clReleaseKernel( k );
3065 return err;
3066 }
3067
3068 // Generate our list of reference results
3069 cl_ushort rte_ref[count*4];
3070 cl_ushort rtz_ref[count*4];
3071 for( size_t i = 0; i < 4 * count; i++ )
3072 {
3073 rte_ref[i] = float2half_rte( inp[i] );
3074 rtz_ref[i] = float2half_rtz( inp[i] );
3075 }
3076
3077 // Verify that we got something in either rtz or rte mode
3078 if( 0 == memcmp( rte_ref, outBuf, sizeof( rte_ref )) )
3079 {
3080 log_info( "Autodetected float->half rounding mode to be rte\n" );
3081 gFloatToHalfRoundingMode = kRoundToNearestEven;
3082 }
3083 else if ( 0 == memcmp( rtz_ref, outBuf, sizeof( rtz_ref )) )
3084 {
3085 log_info( "Autodetected float->half rounding mode to be rtz\n" );
3086 gFloatToHalfRoundingMode = kRoundTowardZero;
3087 }
3088 else
3089 {
3090 log_error( "ERROR: float to half conversions proceed with invalid rounding mode!\n" );
3091 log_info( "\nfor:" );
3092 for( size_t i = 0; i < count; i++ )
3093 log_info( " {%a, %a, %a, %a},", inp[4*i], inp[4*i+1], inp[4*i+2], inp[4*i+3] );
3094 log_info( "\ngot:" );
3095 for( size_t i = 0; i < count; i++ )
3096 log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", outBuf[4*i], outBuf[4*i+1], outBuf[4*i+2], outBuf[4*i+3] );
3097 log_info( "\nrte:" );
3098 for( size_t i = 0; i < count; i++ )
3099 log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rte_ref[4*i], rte_ref[4*i+1], rte_ref[4*i+2], rte_ref[4*i+3] );
3100 log_info( "\nrtz:" );
3101 for( size_t i = 0; i < count; i++ )
3102 log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rtz_ref[4*i], rtz_ref[4*i+1], rtz_ref[4*i+2], rtz_ref[4*i+3] );
3103 log_info( "\n" );
3104 err = -1;
3105 gFloatToHalfRoundingMode = kRoundingModeCount; // illegal value
3106 }
3107
3108 // clean up
3109 clReleaseMemObject( inBuf );
3110 clReleaseMemObject( outImage );
3111 clReleaseKernel( k );
3112 return err;
3113 }
3114
3115 // Make sure that the rounding mode was successfully detected, if we checked earlier
3116 if( gFloatToHalfRoundingMode != kRoundToNearestEven && gFloatToHalfRoundingMode != kRoundTowardZero)
3117 return -2;
3118
3119 return err;
3120 }
3121
create_random_image_data(ExplicitType dataType,image_descriptor * imageInfo,BufferOwningPtr<char> & P,MTdata d,bool image2DFromBuffer)3122 char *create_random_image_data( ExplicitType dataType, image_descriptor *imageInfo, BufferOwningPtr<char> &P, MTdata d, bool image2DFromBuffer )
3123 {
3124 size_t allocSize, numPixels;
3125 if ( /*gTestMipmaps*/ imageInfo->num_mip_levels > 1 )
3126 {
3127 allocSize = (size_t) (compute_mipmapped_image_size(*imageInfo) * 4 * get_explicit_type_size( dataType ))/get_pixel_size(imageInfo->format);
3128 numPixels = allocSize / (get_explicit_type_size( dataType ) * 4);
3129 }
3130 else
3131 {
3132 numPixels = (image2DFromBuffer? imageInfo->rowPitch: imageInfo->width) * imageInfo->height
3133 * (imageInfo->depth ? imageInfo->depth : 1)
3134 * (imageInfo->arraySize ? imageInfo->arraySize : 1);
3135 allocSize = numPixels * 4 * get_explicit_type_size( dataType );
3136 }
3137
3138 #if 0 // DEBUG
3139 {
3140 fprintf(stderr,"--- create_random_image_data:\n");
3141 fprintf(stderr,"allocSize = %zu\n",allocSize);
3142 fprintf(stderr,"numPixels = %zu\n",numPixels);
3143 fprintf(stderr,"width = %zu\n",imageInfo->width);
3144 fprintf(stderr,"height = %zu\n",imageInfo->height);
3145 fprintf(stderr,"depth = %zu\n",imageInfo->depth);
3146 fprintf(stderr,"rowPitch = %zu\n",imageInfo->rowPitch);
3147 fprintf(stderr,"slicePitch = %zu\n",imageInfo->slicePitch);
3148 fprintf(stderr,"arraySize = %zu\n",imageInfo->arraySize);
3149 fprintf(stderr,"explicit_type_size = %zu\n",get_explicit_type_size(dataType));
3150 }
3151 #endif
3152
3153 #if defined( __APPLE__ )
3154 char *data = NULL;
3155 if (gDeviceType == CL_DEVICE_TYPE_CPU) {
3156 size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192; // alloc two extra pages.
3157
3158 void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
3159 if (map == MAP_FAILED)
3160 {
3161 perror("create_random_image_data: mmap");
3162 log_error("%s:%d: mmap failed, mapSize = %zu\n",__FILE__,__LINE__,mapSize);
3163 }
3164 intptr_t data_end = (intptr_t)map + mapSize - 4096;
3165 data = (char *)(data_end - (intptr_t)allocSize);
3166
3167 mprotect(map, 4096, PROT_NONE);
3168 mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE);
3169 P.reset(data, map, mapSize);
3170 } else {
3171 data = (char *)malloc(allocSize);
3172 P.reset(data);
3173 }
3174 #else
3175 char *data = (char *)align_malloc(allocSize, get_pixel_size(imageInfo->format));
3176 P.reset(data,NULL,0,allocSize,true);
3177 #endif
3178
3179 if (data == NULL) {
3180 log_error( "ERROR: Unable to malloc %lu bytes for create_random_image_data\n", allocSize );
3181 return NULL;
3182 }
3183
3184 switch( dataType )
3185 {
3186 case kFloat:
3187 {
3188 float *inputValues = (float *)data;
3189 switch (imageInfo->format->image_channel_data_type)
3190 {
3191 case CL_HALF_FLOAT:
3192 {
3193 // Generate data that is (mostly) inside the range of a half float
3194 // const float HALF_MIN = 5.96046448e-08f;
3195 const float HALF_MAX = 65504.0f;
3196
3197 size_t i = 0;
3198 inputValues[ i++ ] = 0.f;
3199 inputValues[ i++ ] = 1.f;
3200 inputValues[ i++ ] = -1.f;
3201 inputValues[ i++ ] = 2.f;
3202 for( ; i < numPixels * 4; i++ )
3203 inputValues[ i ] = get_random_float( -HALF_MAX - 2.f, HALF_MAX + 2.f, d );
3204 }
3205 break;
3206 #ifdef CL_SFIXED14_APPLE
3207 case CL_SFIXED14_APPLE:
3208 {
3209 size_t i = 0;
3210 if( numPixels * 4 >= 8 )
3211 {
3212 inputValues[ i++ ] = INFINITY;
3213 inputValues[ i++ ] = 0x1.0p14f;
3214 inputValues[ i++ ] = 0x1.0p31f;
3215 inputValues[ i++ ] = 0x1.0p32f;
3216 inputValues[ i++ ] = -INFINITY;
3217 inputValues[ i++ ] = -0x1.0p14f;
3218 inputValues[ i++ ] = -0x1.0p31f;
3219 inputValues[ i++ ] = -0x1.1p31f;
3220 }
3221 for( ; i < numPixels * 4; i++ )
3222 inputValues[ i ] = get_random_float( -1.1f, 3.1f, d );
3223 }
3224 break;
3225 #endif
3226 case CL_FLOAT:
3227 {
3228 size_t i = 0;
3229 inputValues[ i++ ] = INFINITY;
3230 inputValues[ i++ ] = -INFINITY;
3231 inputValues[ i++ ] = 0.0f;
3232 inputValues[ i++ ] = 0.0f;
3233 cl_uint *p = (cl_uint *)data;
3234 for( ; i < numPixels * 4; i++ )
3235 p[ i ] = genrand_int32(d);
3236 }
3237 break;
3238
3239 default:
3240 size_t i = 0;
3241 if( numPixels * 4 >= 36 )
3242 {
3243 inputValues[ i++ ] = 0.0f;
3244 inputValues[ i++ ] = 0.5f;
3245 inputValues[ i++ ] = 31.5f;
3246 inputValues[ i++ ] = 32.0f;
3247 inputValues[ i++ ] = 127.5f;
3248 inputValues[ i++ ] = 128.0f;
3249 inputValues[ i++ ] = 255.5f;
3250 inputValues[ i++ ] = 256.0f;
3251 inputValues[ i++ ] = 1023.5f;
3252 inputValues[ i++ ] = 1024.0f;
3253 inputValues[ i++ ] = 32767.5f;
3254 inputValues[ i++ ] = 32768.0f;
3255 inputValues[ i++ ] = 65535.5f;
3256 inputValues[ i++ ] = 65536.0f;
3257 inputValues[ i++ ] = 2147483648.0f;
3258 inputValues[ i++ ] = 4294967296.0f;
3259 inputValues[ i++ ] = MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 );
3260 inputValues[ i++ ] = MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 );
3261 inputValues[ i++ ] = -0.0f;
3262 inputValues[ i++ ] = -0.5f;
3263 inputValues[ i++ ] = -31.5f;
3264 inputValues[ i++ ] = -32.0f;
3265 inputValues[ i++ ] = -127.5f;
3266 inputValues[ i++ ] = -128.0f;
3267 inputValues[ i++ ] = -255.5f;
3268 inputValues[ i++ ] = -256.0f;
3269 inputValues[ i++ ] = -1023.5f;
3270 inputValues[ i++ ] = -1024.0f;
3271 inputValues[ i++ ] = -32767.5f;
3272 inputValues[ i++ ] = -32768.0f;
3273 inputValues[ i++ ] = -65535.5f;
3274 inputValues[ i++ ] = -65536.0f;
3275 inputValues[ i++ ] = -2147483648.0f;
3276 inputValues[ i++ ] = -4294967296.0f;
3277 inputValues[ i++ ] = -MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 );
3278 inputValues[ i++ ] = -MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 );
3279 }
3280 if( is_format_signed(imageInfo->format) )
3281 {
3282 for( ; i < numPixels * 4; i++ )
3283 inputValues[ i ] = get_random_float( -1.1f, 1.1f, d );
3284 }
3285 else
3286 {
3287 for( ; i < numPixels * 4; i++ )
3288 inputValues[ i ] = get_random_float( -0.1f, 1.1f, d );
3289 }
3290 break;
3291 }
3292 break;
3293 }
3294
3295 case kInt:
3296 {
3297 int *imageData = (int *)data;
3298
3299 // We want to generate ints (mostly) in range of the target format
3300 int formatMin = get_format_min_int( imageInfo->format );
3301 size_t formatMax = get_format_max_int( imageInfo->format );
3302 if( formatMin == 0 )
3303 {
3304 // Unsigned values, but we are only an int, so cap the actual max at the max of signed ints
3305 if( formatMax > 2147483647L )
3306 formatMax = 2147483647L;
3307 }
3308 // If the final format is small enough, give us a bit of room for out-of-range values to test
3309 if( formatMax < 2147483647L )
3310 formatMax += 2;
3311 if( formatMin > -2147483648LL )
3312 formatMin -= 2;
3313
3314 // Now gen
3315 for( size_t i = 0; i < numPixels * 4; i++ )
3316 {
3317 imageData[ i ] = random_in_range( formatMin, (int)formatMax, d );
3318 }
3319 break;
3320 }
3321
3322 case kUInt:
3323 case kUnsignedInt:
3324 {
3325 unsigned int *imageData = (unsigned int *)data;
3326
3327 // We want to generate ints (mostly) in range of the target format
3328 int formatMin = get_format_min_int( imageInfo->format );
3329 size_t formatMax = get_format_max_int( imageInfo->format );
3330 if( formatMin < 0 )
3331 formatMin = 0;
3332 // If the final format is small enough, give us a bit of room for out-of-range values to test
3333 if( formatMax < 4294967295LL )
3334 formatMax += 2;
3335
3336 // Now gen
3337 for( size_t i = 0; i < numPixels * 4; i++ )
3338 {
3339 imageData[ i ] = random_in_range( formatMin, (int)formatMax, d );
3340 }
3341 break;
3342 }
3343 default:
3344 // Unsupported source format
3345 delete [] data;
3346 return NULL;
3347 }
3348
3349 return data;
3350 }
3351
3352 /*
3353 deprecated
3354 bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t max, int &outValue )
3355 {
3356 int v = (int)value;
3357
3358 switch(imageSampler->addressing_mode)
3359 {
3360 case CL_ADDRESS_REPEAT:
3361 outValue = v;
3362 while( v < 0 )
3363 v += (int)max;
3364 while( v >= (int)max )
3365 v -= (int)max;
3366 if( v != outValue )
3367 {
3368 outValue = v;
3369 return true;
3370 }
3371 return false;
3372
3373 case CL_ADDRESS_MIRRORED_REPEAT:
3374 log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do we ever use this?
3375 exit(-1);
3376
3377 default:
3378 if( v < 0 )
3379 {
3380 outValue = 0;
3381 return true;
3382 }
3383 if( v >= (int)max )
3384 {
3385 outValue = (int)max - 1;
3386 return true;
3387 }
3388 outValue = v;
3389 return false;
3390 }
3391
3392 }
3393 */
3394
get_sampler_kernel_code(image_sampler_data * imageSampler,char * outLine)3395 void get_sampler_kernel_code( image_sampler_data *imageSampler, char *outLine )
3396 {
3397 const char *normalized;
3398 const char *addressMode;
3399 const char *filterMode;
3400
3401 if( imageSampler->addressing_mode == CL_ADDRESS_CLAMP )
3402 addressMode = "CLK_ADDRESS_CLAMP";
3403 else if( imageSampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE )
3404 addressMode = "CLK_ADDRESS_CLAMP_TO_EDGE";
3405 else if( imageSampler->addressing_mode == CL_ADDRESS_REPEAT )
3406 addressMode = "CLK_ADDRESS_REPEAT";
3407 else if( imageSampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT )
3408 addressMode = "CLK_ADDRESS_MIRRORED_REPEAT";
3409 else if( imageSampler->addressing_mode == CL_ADDRESS_NONE )
3410 addressMode = "CLK_ADDRESS_NONE";
3411 else
3412 {
3413 log_error( "**Error: Unknown addressing mode! Aborting...\n" );
3414 abort();
3415 }
3416
3417 if( imageSampler->normalized_coords )
3418 normalized = "CLK_NORMALIZED_COORDS_TRUE";
3419 else
3420 normalized = "CLK_NORMALIZED_COORDS_FALSE";
3421
3422 if( imageSampler->filter_mode == CL_FILTER_LINEAR )
3423 filterMode = "CLK_FILTER_LINEAR";
3424 else
3425 filterMode = "CLK_FILTER_NEAREST";
3426
3427 sprintf( outLine, " const sampler_t imageSampler = %s | %s | %s;\n", addressMode, filterMode, normalized );
3428 }
3429
copy_image_data(image_descriptor * srcImageInfo,image_descriptor * dstImageInfo,void * imageValues,void * destImageValues,const size_t sourcePos[],const size_t destPos[],const size_t regionSize[])3430 void copy_image_data( image_descriptor *srcImageInfo, image_descriptor *dstImageInfo, void *imageValues, void *destImageValues,
3431 const size_t sourcePos[], const size_t destPos[], const size_t regionSize[] )
3432 {
3433 // assert( srcImageInfo->format == dstImageInfo->format );
3434
3435 size_t src_mip_level_offset = 0, dst_mip_level_offset = 0;
3436 size_t sourcePos_lod[3], destPos_lod[3], src_lod, dst_lod;
3437 size_t src_row_pitch_lod, src_slice_pitch_lod;
3438 size_t dst_row_pitch_lod, dst_slice_pitch_lod;
3439
3440 size_t pixelSize = get_pixel_size( srcImageInfo->format );
3441
3442 sourcePos_lod[0] = sourcePos[0];
3443 sourcePos_lod[1] = sourcePos[1];
3444 sourcePos_lod[2] = sourcePos[2];
3445 destPos_lod[0] = destPos[0];
3446 destPos_lod[1] = destPos[1];
3447 destPos_lod[2] = destPos[2];
3448 src_row_pitch_lod = srcImageInfo->rowPitch;
3449 dst_row_pitch_lod = dstImageInfo->rowPitch;
3450 src_slice_pitch_lod = srcImageInfo->slicePitch;
3451 dst_slice_pitch_lod = dstImageInfo->slicePitch;
3452
3453 if( srcImageInfo->num_mip_levels > 1)
3454 {
3455 size_t src_width_lod = 1/*srcImageInfo->width*/;
3456 size_t src_height_lod = 1/*srcImageInfo->height*/;
3457 size_t src_depth_lod = 1/*srcImageInfo->depth*/;
3458
3459 switch( srcImageInfo->type )
3460 {
3461 case CL_MEM_OBJECT_IMAGE1D:
3462 src_lod = sourcePos[1];
3463 sourcePos_lod[1] = sourcePos_lod[2] = 0;
3464 src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1;
3465 break;
3466 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3467 case CL_MEM_OBJECT_IMAGE2D:
3468 src_lod = sourcePos[2];
3469 sourcePos_lod[1] = sourcePos[1];
3470 sourcePos_lod[2] = 0;
3471 src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1;
3472 if( srcImageInfo->type == CL_MEM_OBJECT_IMAGE2D )
3473 src_height_lod = (srcImageInfo->height >> src_lod ) ? ( srcImageInfo->height >> src_lod ): 1;
3474 break;
3475 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3476 case CL_MEM_OBJECT_IMAGE3D:
3477 src_lod = sourcePos[3];
3478 sourcePos_lod[1] = sourcePos[1];
3479 sourcePos_lod[2] = sourcePos[2];
3480 src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1;
3481 src_height_lod = (srcImageInfo->height >> src_lod ) ? ( srcImageInfo->height >> src_lod ): 1;
3482 if( srcImageInfo->type == CL_MEM_OBJECT_IMAGE3D )
3483 src_depth_lod = (srcImageInfo->depth >> src_lod ) ? ( srcImageInfo->depth >> src_lod ): 1;
3484 break;
3485
3486 }
3487 src_mip_level_offset = compute_mip_level_offset( srcImageInfo, src_lod );
3488 src_row_pitch_lod = src_width_lod * get_pixel_size( srcImageInfo->format );
3489 src_slice_pitch_lod = src_row_pitch_lod * src_height_lod;
3490 }
3491
3492 if( dstImageInfo->num_mip_levels > 1)
3493 {
3494 size_t dst_width_lod = 1/*dstImageInfo->width*/;
3495 size_t dst_height_lod = 1/*dstImageInfo->height*/;
3496 size_t dst_depth_lod = 1 /*dstImageInfo->depth*/;
3497 switch( dstImageInfo->type )
3498 {
3499 case CL_MEM_OBJECT_IMAGE1D:
3500 dst_lod = destPos[1];
3501 destPos_lod[1] = destPos_lod[2] = 0;
3502 dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1;
3503 break;
3504 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3505 case CL_MEM_OBJECT_IMAGE2D:
3506 dst_lod = destPos[2];
3507 destPos_lod[1] = destPos[1];
3508 destPos_lod[2] = 0;
3509 dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1;
3510 if( dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D )
3511 dst_height_lod = (dstImageInfo->height >> dst_lod ) ? ( dstImageInfo->height >> dst_lod ): 1;
3512 break;
3513 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3514 case CL_MEM_OBJECT_IMAGE3D:
3515 dst_lod = destPos[3];
3516 destPos_lod[1] = destPos[1];
3517 destPos_lod[2] = destPos[2];
3518 dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1;
3519 dst_height_lod = (dstImageInfo->height >> dst_lod ) ? ( dstImageInfo->height >> dst_lod ): 1;
3520 if( dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D )
3521 dst_depth_lod = (dstImageInfo->depth >> dst_lod ) ? ( dstImageInfo->depth >> dst_lod ): 1;
3522 break;
3523
3524 }
3525 dst_mip_level_offset = compute_mip_level_offset( dstImageInfo, dst_lod );
3526 dst_row_pitch_lod = dst_width_lod * get_pixel_size( dstImageInfo->format);
3527 dst_slice_pitch_lod = dst_row_pitch_lod * dst_height_lod;
3528 }
3529
3530 // Get initial pointers
3531 char *sourcePtr = (char *)imageValues + sourcePos_lod[ 2 ] * src_slice_pitch_lod + sourcePos_lod[ 1 ] * src_row_pitch_lod + pixelSize * sourcePos_lod[ 0 ] + src_mip_level_offset;
3532 char *destPtr = (char *)destImageValues + destPos_lod[ 2 ] * dst_slice_pitch_lod + destPos_lod[ 1 ] * dst_row_pitch_lod + pixelSize * destPos_lod[ 0 ] + dst_mip_level_offset;
3533
3534 for( size_t z = 0; z < ( regionSize[ 2 ] > 0 ? regionSize[ 2 ] : 1 ); z++ )
3535 {
3536 char *rowSourcePtr = sourcePtr;
3537 char *rowDestPtr = destPtr;
3538 for( size_t y = 0; y < regionSize[ 1 ]; y++ )
3539 {
3540 memcpy( rowDestPtr, rowSourcePtr, pixelSize * regionSize[ 0 ] );
3541 rowSourcePtr += src_row_pitch_lod;
3542 rowDestPtr += dst_row_pitch_lod;
3543 }
3544
3545 sourcePtr += src_slice_pitch_lod;
3546 destPtr += dst_slice_pitch_lod;
3547 }
3548 }
3549
random_float(float low,float high,MTdata d)3550 float random_float(float low, float high, MTdata d)
3551 {
3552 float t = (float) genrand_real1(d);
3553 return (1.0f - t) * low + t * high;
3554 }
3555
CoordWalker(void * coords,bool useFloats,size_t vecSize)3556 CoordWalker::CoordWalker( void * coords, bool useFloats, size_t vecSize )
3557 {
3558 if( useFloats )
3559 {
3560 mFloatCoords = (cl_float *)coords;
3561 mIntCoords = NULL;
3562 }
3563 else
3564 {
3565 mFloatCoords = NULL;
3566 mIntCoords = (cl_int *)coords;
3567 }
3568 mVecSize = vecSize;
3569 }
3570
~CoordWalker()3571 CoordWalker::~CoordWalker()
3572 {
3573 }
3574
Get(size_t idx,size_t el)3575 cl_float CoordWalker::Get( size_t idx, size_t el )
3576 {
3577 if( mIntCoords != NULL )
3578 return (cl_float)mIntCoords[ idx * mVecSize + el ];
3579 else
3580 return mFloatCoords[ idx * mVecSize + el ];
3581 }
3582
3583
print_read_header(cl_image_format * format,image_sampler_data * sampler,bool err,int t)3584 void print_read_header( cl_image_format *format, image_sampler_data *sampler, bool err, int t )
3585 {
3586 const char *addressMode = NULL;
3587 const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" };
3588
3589 if( sampler->addressing_mode == CL_ADDRESS_CLAMP )
3590 addressMode = "CL_ADDRESS_CLAMP";
3591 else if( sampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE )
3592 addressMode = "CL_ADDRESS_CLAMP_TO_EDGE";
3593 else if( sampler->addressing_mode == CL_ADDRESS_REPEAT )
3594 addressMode = "CL_ADDRESS_REPEAT";
3595 else if( sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT )
3596 addressMode = "CL_ADDRESS_MIRRORED_REPEAT";
3597 else
3598 addressMode = "CL_ADDRESS_NONE";
3599
3600 if( t )
3601 {
3602 if( err )
3603 log_error( "[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3604 GetChannelTypeName( format->image_channel_data_type ),
3605 (int)get_format_channel_count( format ),
3606 sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3607 addressMode,
3608 normalizedNames[sampler->normalized_coords ? 1 : 0],
3609 t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED" );
3610 else
3611 log_info( "[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3612 GetChannelTypeName( format->image_channel_data_type ),
3613 (int)get_format_channel_count( format ),
3614 sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3615 addressMode,
3616 normalizedNames[sampler->normalized_coords ? 1 : 0],
3617 t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED" );
3618 }
3619 else
3620 {
3621 if( err )
3622 log_error( "[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3623 GetChannelTypeName( format->image_channel_data_type ),
3624 (int)get_format_channel_count( format ),
3625 sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3626 addressMode,
3627 normalizedNames[sampler->normalized_coords ? 1 : 0] );
3628 else
3629 log_info( "[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ),
3630 GetChannelTypeName( format->image_channel_data_type ),
3631 (int)get_format_channel_count( format ),
3632 sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR",
3633 addressMode,
3634 normalizedNames[sampler->normalized_coords ? 1 : 0] );
3635 }
3636
3637 }
3638
print_write_header(cl_image_format * format,bool err=false)3639 void print_write_header( cl_image_format *format, bool err = false)
3640 {
3641 if( err )
3642 log_error( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3643 GetChannelTypeName( format->image_channel_data_type ),
3644 (int)get_format_channel_count( format ) );
3645 else
3646 log_info( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3647 GetChannelTypeName( format->image_channel_data_type ),
3648 (int)get_format_channel_count( format ) );
3649 }
3650
3651
print_header(cl_image_format * format,bool err=false)3652 void print_header( cl_image_format *format, bool err = false )
3653 {
3654 if (err) {
3655 log_error( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3656 GetChannelTypeName( format->image_channel_data_type ),
3657 (int)get_format_channel_count( format ) );
3658 } else {
3659 log_info( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ),
3660 GetChannelTypeName( format->image_channel_data_type ),
3661 (int)get_format_channel_count( format ) );
3662 }
3663 }
3664
find_format(cl_image_format * formatList,unsigned int numFormats,cl_image_format * formatToFind)3665 bool find_format( cl_image_format *formatList, unsigned int numFormats, cl_image_format *formatToFind )
3666 {
3667 for( unsigned int i = 0; i < numFormats; i++ )
3668 {
3669 if( formatList[ i ].image_channel_order == formatToFind->image_channel_order &&
3670 formatList[ i ].image_channel_data_type == formatToFind->image_channel_data_type )
3671 return true;
3672 }
3673 return false;
3674 }
3675
build_required_image_formats(cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device,std::vector<cl_image_format> & formatsToSupport)3676 void build_required_image_formats(cl_mem_flags flags,
3677 cl_mem_object_type image_type,
3678 cl_device_id device,
3679 std::vector<cl_image_format>& formatsToSupport)
3680 {
3681 Version version = get_device_cl_version(device);
3682
3683 formatsToSupport.clear();
3684
3685 // Required embedded formats.
3686 static std::vector<cl_image_format> embeddedProfReadOrWriteFormats
3687 {
3688 { CL_RGBA, CL_UNORM_INT8 },
3689 { CL_RGBA, CL_UNORM_INT16 },
3690 { CL_RGBA, CL_SIGNED_INT8 },
3691 { CL_RGBA, CL_SIGNED_INT16 },
3692 { CL_RGBA, CL_SIGNED_INT32 },
3693 { CL_RGBA, CL_UNSIGNED_INT8 },
3694 { CL_RGBA, CL_UNSIGNED_INT16 },
3695 { CL_RGBA, CL_UNSIGNED_INT32 },
3696 { CL_RGBA, CL_HALF_FLOAT },
3697 { CL_RGBA, CL_FLOAT },
3698 };
3699
3700 /*
3701 Required full profile formats.
3702 This array does not contain any full profile
3703 formats that have restrictions on when they
3704 are required.
3705 */
3706 static std::vector<cl_image_format> fullProfReadOrWriteFormats
3707 {
3708 { CL_RGBA, CL_UNORM_INT8 },
3709 { CL_RGBA, CL_UNORM_INT16 },
3710 { CL_RGBA, CL_SIGNED_INT8 },
3711 { CL_RGBA, CL_SIGNED_INT16 },
3712 { CL_RGBA, CL_SIGNED_INT32 },
3713 { CL_RGBA, CL_UNSIGNED_INT8 },
3714 { CL_RGBA, CL_UNSIGNED_INT16 },
3715 { CL_RGBA, CL_UNSIGNED_INT32 },
3716 { CL_RGBA, CL_HALF_FLOAT },
3717 { CL_RGBA, CL_FLOAT },
3718 { CL_BGRA, CL_UNORM_INT8 },
3719 };
3720
3721 /*
3722 Required full profile formats specifically for 2.x.
3723 This array does not contain any full profile
3724 formats that have restrictions on when they
3725 are required.
3726 */
3727 static std::vector<cl_image_format> fullProf2XReadOrWriteFormats
3728 {
3729 { CL_R, CL_UNORM_INT8 },
3730 { CL_R, CL_UNORM_INT16 },
3731 { CL_R, CL_SNORM_INT8 },
3732 { CL_R, CL_SNORM_INT16 },
3733 { CL_R, CL_SIGNED_INT8 },
3734 { CL_R, CL_SIGNED_INT16 },
3735 { CL_R, CL_SIGNED_INT32 },
3736 { CL_R, CL_UNSIGNED_INT8 },
3737 { CL_R, CL_UNSIGNED_INT16 },
3738 { CL_R, CL_UNSIGNED_INT32 },
3739 { CL_R, CL_HALF_FLOAT },
3740 { CL_R, CL_FLOAT },
3741 { CL_RG, CL_UNORM_INT8 },
3742 { CL_RG, CL_UNORM_INT16 },
3743 { CL_RG, CL_SNORM_INT8 },
3744 { CL_RG, CL_SNORM_INT16 },
3745 { CL_RG, CL_SIGNED_INT8 },
3746 { CL_RG, CL_SIGNED_INT16 },
3747 { CL_RG, CL_SIGNED_INT32 },
3748 { CL_RG, CL_UNSIGNED_INT8 },
3749 { CL_RG, CL_UNSIGNED_INT16 },
3750 { CL_RG, CL_UNSIGNED_INT32 },
3751 { CL_RG, CL_HALF_FLOAT },
3752 { CL_RG, CL_FLOAT },
3753 { CL_RGBA, CL_SNORM_INT8 },
3754 { CL_RGBA, CL_SNORM_INT16 },
3755 };
3756
3757 /*
3758 Required full profile formats for CL_DEPTH
3759 (specifically 2.x).
3760 There are cases whereby the format isn't required.
3761 */
3762 static std::vector<cl_image_format> fullProf2XReadOrWriteDepthFormats
3763 {
3764 { CL_DEPTH, CL_UNORM_INT16 },
3765 { CL_DEPTH, CL_FLOAT },
3766 };
3767
3768 /*
3769 Required full profile formats for CL_sRGB
3770 (specifically 2.x).
3771 There are cases whereby the format isn't required.
3772 */
3773 static std::vector<cl_image_format> fullProf2XSRGBFormats
3774 {
3775 { CL_sRGBA, CL_UNORM_INT8 },
3776 };
3777
3778 // Embedded profile
3779 if (gIsEmbedded)
3780 {
3781 copy(embeddedProfReadOrWriteFormats.begin(),
3782 embeddedProfReadOrWriteFormats.end(),
3783 back_inserter(formatsToSupport));
3784 }
3785 // Full profile
3786 else
3787 {
3788 copy(fullProfReadOrWriteFormats.begin(),
3789 fullProfReadOrWriteFormats.end(),
3790 back_inserter(formatsToSupport));
3791 }
3792
3793 // Full profile, OpenCL 2.0, 2.1, 2.2
3794 if (!gIsEmbedded && version >= Version(2, 0) && version <= Version(2, 2))
3795 {
3796 copy(fullProf2XReadOrWriteFormats.begin(),
3797 fullProf2XReadOrWriteFormats.end(),
3798 back_inserter(formatsToSupport));
3799
3800 // Depth images are only required for 2DArray and 2D images
3801 if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D)
3802 {
3803 copy(fullProf2XReadOrWriteDepthFormats.begin(),
3804 fullProf2XReadOrWriteDepthFormats.end(),
3805 back_inserter(formatsToSupport));
3806 }
3807
3808 // sRGB is not required for 1DImage Buffers
3809 if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)
3810 {
3811 // sRGB is only required for reading
3812 if (flags == CL_MEM_READ_ONLY)
3813 {
3814 copy(fullProf2XSRGBFormats.begin(),
3815 fullProf2XSRGBFormats.end(),
3816 back_inserter(formatsToSupport));
3817 }
3818 }
3819 }
3820 }
3821
is_image_format_required(cl_image_format format,cl_mem_flags flags,cl_mem_object_type image_type,cl_device_id device)3822 bool is_image_format_required(cl_image_format format,
3823 cl_mem_flags flags,
3824 cl_mem_object_type image_type,
3825 cl_device_id device)
3826 {
3827 std::vector<cl_image_format> formatsToSupport;
3828 build_required_image_formats(flags, image_type, device, formatsToSupport);
3829
3830 for (auto &formatItr: formatsToSupport)
3831 {
3832 if (formatItr.image_channel_order == format.image_channel_order &&
3833 formatItr.image_channel_data_type == format.image_channel_data_type)
3834 {
3835 return true;
3836 }
3837 }
3838
3839 return false;
3840 }
3841
compute_max_mip_levels(size_t width,size_t height,size_t depth)3842 cl_uint compute_max_mip_levels( size_t width, size_t height, size_t depth)
3843 {
3844 cl_uint retMaxMipLevels=0, max_dim = 0;
3845
3846 max_dim = width;
3847 max_dim = height > max_dim ? height : max_dim;
3848 max_dim = depth > max_dim ? depth : max_dim;
3849
3850 while(max_dim) {
3851 retMaxMipLevels++;
3852 max_dim >>= 1;
3853 }
3854 return retMaxMipLevels;
3855 }
3856
compute_mipmapped_image_size(image_descriptor imageInfo)3857 cl_ulong compute_mipmapped_image_size( image_descriptor imageInfo)
3858 {
3859 cl_ulong retSize = 0;
3860 size_t curr_width, curr_height, curr_depth, curr_array_size;
3861 curr_width = imageInfo.width;
3862 curr_height = imageInfo.height;
3863 curr_depth = imageInfo.depth;
3864 curr_array_size = imageInfo.arraySize;
3865
3866 for (int i=0; i < (int) imageInfo.num_mip_levels; i++)
3867 {
3868 switch ( imageInfo.type )
3869 {
3870 case CL_MEM_OBJECT_IMAGE3D :
3871 retSize += (cl_ulong)curr_width * curr_height * curr_depth * get_pixel_size(imageInfo.format);
3872 break;
3873 case CL_MEM_OBJECT_IMAGE2D :
3874 retSize += (cl_ulong)curr_width * curr_height * get_pixel_size(imageInfo.format);
3875 break;
3876 case CL_MEM_OBJECT_IMAGE1D :
3877 retSize += (cl_ulong)curr_width * get_pixel_size(imageInfo.format);
3878 break;
3879 case CL_MEM_OBJECT_IMAGE1D_ARRAY :
3880 retSize += (cl_ulong)curr_width * curr_array_size * get_pixel_size(imageInfo.format);
3881 break;
3882 case CL_MEM_OBJECT_IMAGE2D_ARRAY :
3883 retSize += (cl_ulong)curr_width * curr_height * curr_array_size * get_pixel_size(imageInfo.format);
3884 break;
3885 }
3886
3887 switch ( imageInfo.type )
3888 {
3889 case CL_MEM_OBJECT_IMAGE3D :
3890 curr_depth = curr_depth >> 1 ? curr_depth >> 1: 1;
3891 case CL_MEM_OBJECT_IMAGE2D :
3892 case CL_MEM_OBJECT_IMAGE2D_ARRAY :
3893 curr_height = curr_height >> 1? curr_height >> 1 : 1;
3894 case CL_MEM_OBJECT_IMAGE1D :
3895 case CL_MEM_OBJECT_IMAGE1D_ARRAY :
3896 curr_width = curr_width >> 1? curr_width >> 1 : 1;
3897 }
3898 }
3899
3900 return retSize;
3901 }
3902
compute_mip_level_offset(image_descriptor * imageInfo,size_t lod)3903 size_t compute_mip_level_offset( image_descriptor * imageInfo , size_t lod)
3904 {
3905 size_t retOffset = 0;
3906 size_t width, height, depth;
3907 width = imageInfo->width;
3908 height = imageInfo->height;
3909 depth = imageInfo->depth;
3910
3911 for(size_t i=0; i < lod; i++)
3912 {
3913 switch(imageInfo->type)
3914 {
3915 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3916 retOffset += (size_t) width * height * imageInfo->arraySize * get_pixel_size( imageInfo->format );
3917 break;
3918 case CL_MEM_OBJECT_IMAGE3D:
3919 retOffset += (size_t) width * height * depth * get_pixel_size( imageInfo->format );
3920 break;
3921 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3922 retOffset += (size_t) width * imageInfo->arraySize * get_pixel_size( imageInfo->format );
3923 break;
3924 case CL_MEM_OBJECT_IMAGE2D:
3925 retOffset += (size_t) width * height * get_pixel_size( imageInfo->format );
3926 break;
3927 case CL_MEM_OBJECT_IMAGE1D:
3928 retOffset += (size_t) width * get_pixel_size( imageInfo->format );
3929 break;
3930 }
3931
3932 // Compute next lod dimensions
3933 switch(imageInfo->type)
3934 {
3935 case CL_MEM_OBJECT_IMAGE3D:
3936 depth = ( depth >> 1 ) ? ( depth >> 1 ) : 1;
3937 case CL_MEM_OBJECT_IMAGE2D:
3938 case CL_MEM_OBJECT_IMAGE2D_ARRAY:
3939 height = ( height >> 1 ) ? ( height >> 1 ) : 1;
3940 case CL_MEM_OBJECT_IMAGE1D_ARRAY:
3941 case CL_MEM_OBJECT_IMAGE1D:
3942 width = ( width >> 1 ) ? ( width >> 1 ) : 1;
3943 }
3944
3945 }
3946 return retOffset;
3947 }
3948