// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "imageHelpers.h" #include #include #if defined( __APPLE__ ) #include #endif #if !defined (_WIN32) && !defined(__APPLE__) #include #endif #include #include #if !defined (_WIN32) #include #endif RoundingMode gFloatToHalfRoundingMode = kDefaultRoundingMode; static cl_ushort float2half_rte( float f ); static cl_ushort float2half_rtz( float f ); cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT; bool gTestRounding = false; double sRGBmap(float fc) { double c = (double)fc; #if !defined (_WIN32) if (std::isnan(c)) c = 0.0; #else if (_isnan(c)) c = 0.0; #endif if (c > 1.0) c = 1.0; else if (c < 0.0) c = 0.0; else if (c < 0.0031308) c = 12.92 * c; else c = (1055.0/1000.0) * pow(c, 5.0/12.0) - (55.0/1000.0); return c * 255.0; } double sRGBunmap(float fc) { double c = (double)fc; double result; if (c <= 0.04045) result = c / 12.92; else result = pow((c + 0.055) / 1.055, 2.4); return result; } size_t get_format_type_size( const cl_image_format *format ) { return get_channel_data_type_size( format->image_channel_data_type ); } size_t get_channel_data_type_size( cl_channel_type channelType ) { switch( channelType ) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: return 1; case CL_SNORM_INT16: case CL_UNORM_INT16: case CL_SIGNED_INT16: case CL_UNSIGNED_INT16: case CL_HALF_FLOAT: #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: #endif return sizeof( cl_short ); case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: return sizeof( cl_int ); case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: #ifdef OBSOLETE_FORAMT case CL_UNORM_SHORT_565_REV: case CL_UNORM_SHORT_555_REV: #endif return 2; #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_8888: case CL_UNORM_INT_8888_REV: return 4; #endif case CL_UNORM_INT_101010: #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_101010_REV: #endif return 4; case CL_FLOAT: return sizeof( cl_float ); default: return 0; } } size_t get_format_channel_count( const cl_image_format *format ) { return get_channel_order_channel_count( format->image_channel_order ); } size_t get_channel_order_channel_count( cl_channel_order order ) { switch( order ) { case CL_R: case CL_A: case CL_Rx: case CL_INTENSITY: case CL_LUMINANCE: case CL_DEPTH: case CL_DEPTH_STENCIL: return 1; case CL_RG: case CL_RA: case CL_RGx: return 2; case CL_RGB: case CL_RGBx: case CL_sRGB: case CL_sRGBx: return 3; case CL_RGBA: case CL_ARGB: case CL_BGRA: case CL_sRGBA: case CL_sBGRA: case CL_ABGR: #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: #endif #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: #endif #ifdef CL_ABGR_APPLE case CL_ABGR_APPLE: #endif return 4; default: log_error("%s does not support 0x%x\n",__FUNCTION__,order); return 0; } } cl_channel_type get_channel_type_from_name( const char *name ) { struct { cl_channel_type type; const char *name; } typeNames[] = { { CL_SNORM_INT8, "CL_SNORM_INT8" }, { CL_SNORM_INT16, "CL_SNORM_INT16" }, { CL_UNORM_INT8, "CL_UNORM_INT8" }, { CL_UNORM_INT16, "CL_UNORM_INT16" }, { CL_UNORM_INT24, "CL_UNORM_INT24" }, { CL_UNORM_SHORT_565, "CL_UNORM_SHORT_565" }, { CL_UNORM_SHORT_555, "CL_UNORM_SHORT_555" }, { CL_UNORM_INT_101010, "CL_UNORM_INT_101010" }, { CL_SIGNED_INT8, "CL_SIGNED_INT8" }, { CL_SIGNED_INT16, "CL_SIGNED_INT16" }, { CL_SIGNED_INT32, "CL_SIGNED_INT32" }, { CL_UNSIGNED_INT8, "CL_UNSIGNED_INT8" }, { CL_UNSIGNED_INT16, "CL_UNSIGNED_INT16" }, { CL_UNSIGNED_INT32, "CL_UNSIGNED_INT32" }, { CL_HALF_FLOAT, "CL_HALF_FLOAT" }, { CL_FLOAT, "CL_FLOAT" }, #ifdef CL_SFIXED14_APPLE { CL_SFIXED14_APPLE, "CL_SFIXED14_APPLE" } #endif }; for( size_t i = 0; i < sizeof( typeNames ) / sizeof( typeNames[ 0 ] ); i++ ) { if( strcmp( typeNames[ i ].name, name ) == 0 || strcmp( typeNames[ i ].name + 3, name ) == 0 ) return typeNames[ i ].type; } return (cl_channel_type)-1; } cl_channel_order get_channel_order_from_name( const char *name ) { const struct { cl_channel_order order; const char *name; }orderNames[] = { { CL_R, "CL_R" }, { CL_A, "CL_A" }, { CL_Rx, "CL_Rx" }, { CL_RG, "CL_RG" }, { CL_RA, "CL_RA" }, { CL_RGx, "CL_RGx" }, { CL_RGB, "CL_RGB" }, { CL_RGBx, "CL_RGBx" }, { CL_RGBA, "CL_RGBA" }, { CL_BGRA, "CL_BGRA" }, { CL_ARGB, "CL_ARGB" }, { CL_INTENSITY, "CL_INTENSITY"}, { CL_LUMINANCE, "CL_LUMINANCE"}, { CL_DEPTH, "CL_DEPTH" }, { CL_DEPTH_STENCIL, "CL_DEPTH_STENCIL" }, { CL_sRGB, "CL_sRGB" }, { CL_sRGBx, "CL_sRGBx" }, { CL_sRGBA, "CL_sRGBA" }, { CL_sBGRA, "CL_sBGRA" }, { CL_ABGR, "CL_ABGR" }, #ifdef CL_1RGB_APPLE { CL_1RGB_APPLE, "CL_1RGB_APPLE" }, #endif #ifdef CL_BGR1_APPLE { CL_BGR1_APPLE, "CL_BGR1_APPLE" }, #endif }; for( size_t i = 0; i < sizeof( orderNames ) / sizeof( orderNames[ 0 ] ); i++ ) { if( strcmp( orderNames[ i ].name, name ) == 0 || strcmp( orderNames[ i ].name + 3, name ) == 0 ) return orderNames[ i ].order; } return (cl_channel_order)-1; } int is_format_signed( const cl_image_format *format ) { switch( format->image_channel_data_type ) { case CL_SNORM_INT8: case CL_SIGNED_INT8: case CL_SNORM_INT16: case CL_SIGNED_INT16: case CL_SIGNED_INT32: case CL_HALF_FLOAT: case CL_FLOAT: #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: #endif return 1; default: return 0; } } size_t get_pixel_size( cl_image_format *format ) { switch( format->image_channel_data_type ) { case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SIGNED_INT8: case CL_UNSIGNED_INT8: return get_format_channel_count( format ); case CL_SNORM_INT16: case CL_UNORM_INT16: case CL_SIGNED_INT16: case CL_UNSIGNED_INT16: case CL_HALF_FLOAT: #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: #endif return get_format_channel_count( format ) * sizeof( cl_ushort ); case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: return get_format_channel_count( format ) * sizeof( cl_int ); case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: #ifdef OBSOLETE_FORAMT case CL_UNORM_SHORT_565_REV: case CL_UNORM_SHORT_555_REV: #endif return 2; #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_8888: case CL_UNORM_INT_8888_REV: return 4; #endif case CL_UNORM_INT_101010: #ifdef OBSOLETE_FORAMT case CL_UNORM_INT_101010_REV: #endif return 4; case CL_FLOAT: return get_format_channel_count( format ) * sizeof( cl_float ); default: return 0; } } int get_8_bit_image_format( cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat ) { cl_image_format formatList[ 128 ]; unsigned int outFormatCount, i; int error; /* Make sure each image format is supported */ if ((error = clGetSupportedImageFormats( context, flags, objType, 128, formatList, &outFormatCount ))) return error; /* Look for one that is an 8-bit format */ for( i = 0; i < outFormatCount; i++ ) { if( formatList[ i ].image_channel_data_type == CL_SNORM_INT8 || formatList[ i ].image_channel_data_type == CL_UNORM_INT8 || formatList[ i ].image_channel_data_type == CL_SIGNED_INT8 || formatList[ i ].image_channel_data_type == CL_UNSIGNED_INT8 ) { if ( !channelCount || ( channelCount && ( get_format_channel_count( &formatList[ i ] ) == channelCount ) ) ) { *outFormat = formatList[ i ]; return 0; } } } return -1; } int get_32_bit_image_format( cl_context context, cl_mem_object_type objType, cl_mem_flags flags, size_t channelCount, cl_image_format *outFormat ) { cl_image_format formatList[ 128 ]; unsigned int outFormatCount, i; int error; /* Make sure each image format is supported */ if ((error = clGetSupportedImageFormats( context, flags, objType, 128, formatList, &outFormatCount ))) return error; /* Look for one that is an 8-bit format */ for( i = 0; i < outFormatCount; i++ ) { if( formatList[ i ].image_channel_data_type == CL_UNORM_INT_101010 || formatList[ i ].image_channel_data_type == CL_FLOAT || formatList[ i ].image_channel_data_type == CL_SIGNED_INT32 || formatList[ i ].image_channel_data_type == CL_UNSIGNED_INT32 ) { if ( !channelCount || ( channelCount && ( get_format_channel_count( &formatList[ i ] ) == channelCount ) ) ) { *outFormat = formatList[ i ]; return 0; } } } return -1; } int random_log_in_range( int minV, int maxV, MTdata d ) { double v = log2( ( (double)genrand_int32(d) / (double)0xffffffff ) + 1 ); int iv = (int)( (float)( maxV - minV ) * v ); return iv + minV; } // Define the addressing functions typedef int (*AddressFn)( int value, size_t maxValue ); int NoAddressFn( int value, size_t maxValue ) { return value; } int RepeatAddressFn( int value, size_t maxValue ) { if( value < 0 ) value += (int)maxValue; else if( value >= (int)maxValue ) value -= (int)maxValue; return value; } int MirroredRepeatAddressFn( int value, size_t maxValue ) { if( value < 0 ) value = 0; else if( (size_t) value >= maxValue ) value = (int) (maxValue - 1); return value; } int ClampAddressFn( int value, size_t maxValue ) { return ( value < -1 ) ? -1 : ( ( value > (cl_long) maxValue ) ? (int)maxValue : value ); } int ClampToEdgeNearestFn( int value, size_t maxValue ) { return ( value < 0 ) ? 0 : ( ( (size_t)value > maxValue - 1 ) ? (int)maxValue - 1 : value ); } AddressFn ClampToEdgeLinearFn = ClampToEdgeNearestFn; // Note: normalized coords get repeated in normalized space, not unnormalized space! hence the special case here volatile float gFloatHome; float RepeatNormalizedAddressFn( float fValue, size_t maxValue ) { #ifndef _MSC_VER // Use original if not the VS compiler. // General computation for repeat return (fValue - floorf( fValue )) * (float) maxValue; // Reduce to [0, 1.f] #else // Otherwise, use this instead: // Home the subtraction to a float to break up the sequence of x87 // instructions emitted by the VS compiler. gFloatHome = fValue - floorf(fValue); return gFloatHome * (float)maxValue; #endif } float MirroredRepeatNormalizedAddressFn( float fValue, size_t maxValue ) { // Round to nearest multiple of two float s_prime = 2.0f * rintf( fValue * 0.5f ); // Note halfway values flip flop here due to rte, but they both end up pointing the same place at the end of the day // Reduce to [-1, 1], Apply mirroring -> [0, 1] s_prime = fabsf( fValue - s_prime ); // un-normalize return s_prime * (float) maxValue; } struct AddressingTable { AddressingTable() { ct_assert( ( CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6 ) ); ct_assert( CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2 ); mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = NoAddressFn; mTable[ CL_ADDRESS_NONE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = NoAddressFn; mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = RepeatAddressFn; mTable[ CL_ADDRESS_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = RepeatAddressFn; mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = ClampToEdgeNearestFn; mTable[ CL_ADDRESS_CLAMP_TO_EDGE - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = ClampToEdgeLinearFn; mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = ClampAddressFn; mTable[ CL_ADDRESS_CLAMP - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = ClampAddressFn; mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_NEAREST - CL_FILTER_NEAREST ] = MirroredRepeatAddressFn; mTable[ CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE ][ CL_FILTER_LINEAR - CL_FILTER_NEAREST ] = MirroredRepeatAddressFn; } AddressFn operator[]( image_sampler_data *sampler ) { return mTable[ (int)sampler->addressing_mode - CL_ADDRESS_NONE ][ (int)sampler->filter_mode - CL_FILTER_NEAREST ]; } AddressFn mTable[ 6 ][ 2 ]; }; static AddressingTable sAddressingTable; bool is_sRGBA_order(cl_channel_order image_channel_order){ switch (image_channel_order) { case CL_sRGB: case CL_sRGBx: case CL_sRGBA: case CL_sBGRA: return true; default: return false; } } // Format helpers int has_alpha(cl_image_format *format) { switch (format->image_channel_order) { case CL_R: return 0; case CL_A: return 1; case CL_Rx: return 0; case CL_RG: return 0; case CL_RA: return 1; case CL_RGx: return 0; case CL_RGB: case CL_sRGB: return 0; case CL_RGBx: case CL_sRGBx: return 0; case CL_RGBA: return 1; case CL_BGRA: return 1; case CL_ARGB: return 1; case CL_INTENSITY: return 1; case CL_LUMINANCE: return 0; #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: return 1; #endif #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: return 1; #endif case CL_sRGBA: case CL_sBGRA: return 1; case CL_DEPTH: return 0; default: log_error("Invalid image channel order: %d\n", format->image_channel_order); return 0; } } #define PRINT_MAX_SIZE_LOGIC 0 #define SWAP( _a, _b ) do{ _a ^= _b; _b ^= _a; _a ^= _b; }while(0) #ifndef MAX #define MAX( _a, _b ) ((_a) > (_b) ? (_a) : (_b)) #endif void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3], size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize, const cl_ulong maxIndividualAllocSize, // CL_DEVICE_MAX_MEM_ALLOC_SIZE const cl_ulong maxTotalAllocSize, // CL_DEVICE_GLOBAL_MEM_SIZE cl_mem_object_type image_type, cl_image_format *format, int usingMaxPixelSizeBuffer) { bool is3D = (image_type == CL_MEM_OBJECT_IMAGE3D); bool isArray = (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY); // Validate we have a reasonable max depth for 3D if (is3D && maxDepth < 2) { log_error("ERROR: Requesting max image sizes for 3D images when max depth is < 2.\n"); *numberOfSizes = 0; return; } // Validate we have a reasonable max array size for 1D & 2D image arrays if (isArray && maxArraySize < 2) { log_error("ERROR: Requesting max image sizes for an image array when max array size is < 1.\n"); *numberOfSizes = 0; return; } // Reduce the maximum because we are trying to test the max image dimensions, not the memory allocation cl_ulong adjustedMaxTotalAllocSize = maxTotalAllocSize / 4; cl_ulong adjustedMaxIndividualAllocSize = maxIndividualAllocSize / 4; log_info("Note: max individual allocation adjusted down from %gMB to %gMB and max total allocation adjusted down from %gMB to %gMB.\n", maxIndividualAllocSize/(1024.0*1024.0), adjustedMaxIndividualAllocSize/(1024.0*1024.0), maxTotalAllocSize/(1024.0*1024.0), adjustedMaxTotalAllocSize/(1024.0*1024.0)); // Cap our max allocation to 1.0GB. // FIXME -- why? In the interest of not taking a long time? We should still test this stuff... if (adjustedMaxTotalAllocSize > (cl_ulong)1024*1024*1024) { adjustedMaxTotalAllocSize = (cl_ulong)1024*1024*1024; log_info("Limiting max total allocation size to %gMB (down from %gMB) for test.\n", adjustedMaxTotalAllocSize/(1024.0*1024.0), maxTotalAllocSize/(1024.0*1024.0)); } cl_ulong maxAllocSize = adjustedMaxIndividualAllocSize; if (adjustedMaxTotalAllocSize < adjustedMaxIndividualAllocSize*2) maxAllocSize = adjustedMaxTotalAllocSize/2; size_t raw_pixel_size = get_pixel_size(format); // If the test will be creating input (src) buffer of type int4 or float4, number of pixels will be // governed by sizeof(int4 or float4) and not sizeof(dest fomat) // Also if pixel size is 12 bytes i.e. RGB or RGBx, we adjust it to 16 bytes as GPUs has no concept // of 3 channel images. GPUs expand these to four channel RGBA. if(usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16; size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size; log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, per-allocation limit %gMB.\n", maxWidth, maxHeight, isArray ? maxArraySize : maxDepth, raw_pixel_size, (maxAllocSize/(1024.0*1024.0))); // Keep track of the maximum sizes for each dimension size_t maximum_sizes[] = { maxWidth, maxHeight, maxDepth }; switch (image_type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: maximum_sizes[1] = maxArraySize; maximum_sizes[2] = 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: maximum_sizes[2] = maxArraySize; break; } // Given one fixed sized dimension, this code finds one or two other dimensions, // both with very small size, such that the size does not exceed the maximum // passed to this function #if defined(__x86_64) || defined (__arm64__) || defined (__ppc64__) size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 10, 11, 13, 15}; #else size_t other_sizes[] = { 2, 3, 5, 6, 7, 9, 11, 13}; #endif static size_t other_size = 0; enum { num_other_sizes = sizeof(other_sizes)/sizeof(size_t) }; (*numberOfSizes) = 0; if (image_type == CL_MEM_OBJECT_IMAGE1D) { double M = maximum_sizes[0]; // Store the size sizes[(*numberOfSizes)][0] = (size_t)M; sizes[(*numberOfSizes)][1] = 1; sizes[(*numberOfSizes)][2] = 1; ++(*numberOfSizes); } else if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D) { for (int fixed_dim=0;fixed_dim<2;++fixed_dim) { // Determine the size of the fixed dimension double M = maximum_sizes[fixed_dim]; double A = max_pixels; int x0_dim = !fixed_dim; double x0 = fmin(fmin(other_sizes[(other_size++)%num_other_sizes],A/M), maximum_sizes[x0_dim]); // Store the size sizes[(*numberOfSizes)][fixed_dim] = (size_t)M; sizes[(*numberOfSizes)][x0_dim] = (size_t)x0; sizes[(*numberOfSizes)][2] = 1; ++(*numberOfSizes); } } else if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE3D) { // Iterate over dimensions, finding sizes for the non-fixed dimension for (int fixed_dim=0;fixed_dim<3;++fixed_dim) { // Determine the size of the fixed dimension double M = maximum_sizes[fixed_dim]; double A = max_pixels; // Find two other dimensions, x0 and x1 int x0_dim = (fixed_dim == 0) ? 1 : 0; int x1_dim = (fixed_dim == 2) ? 1 : 2; // Choose two other sizes for these dimensions double x0 = fmin(fmin(A/M,maximum_sizes[x0_dim]),other_sizes[(other_size++)%num_other_sizes]); // GPUs have certain restrictions on minimum width (row alignment) of images which has given us issues // testing small widths in this test (say we set width to 3 for testing, and compute size based on this width and decide // it fits within vram ... but GPU driver decides that, due to row alignment requirements, it has to use // width of 16 which doesnt fit in vram). For this purpose we are not testing width < 16 for this test. if(x0_dim == 0 && x0 < 16) x0 = 16; double x1 = fmin(fmin(A/M/x0,maximum_sizes[x1_dim]),other_sizes[(other_size++)%num_other_sizes]); // Valid image sizes cannot be below 1. Due to the workaround for the xo_dim where x0 is overidden to 16 // there might not be enough space left for x1 dimension. This could be a fractional 0.x size that when cast to // integer would result in a value 0. In these cases we clamp the size to a minimum of 1. if ( x1 < 1 ) x1 = 1; // M and x0 cannot be '0' as they derive from clDeviceInfo calls assert(x0 > 0 && M > 0); // Store the size sizes[(*numberOfSizes)][fixed_dim] = (size_t)M; sizes[(*numberOfSizes)][x0_dim] = (size_t)x0; sizes[(*numberOfSizes)][x1_dim] = (size_t)x1; ++(*numberOfSizes); } } // Log the results for (int j=0; j<(int)(*numberOfSizes); j++) { switch (image_type) { case CL_MEM_OBJECT_IMAGE1D: log_info(" size[%d] = [%ld] (%g MB image)\n", j, sizes[j][0], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0)); break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D: log_info(" size[%d] = [%ld %ld] (%g MB image)\n", j, sizes[j][0], sizes[j][1], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0)); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE3D: log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n", j, sizes[j][0], sizes[j][1], sizes[j][2], raw_pixel_size*sizes[j][0]*sizes[j][1]*sizes[j][2]/(1024.0*1024.0)); break; } } } float get_max_absolute_error( cl_image_format *format, image_sampler_data *sampler) { if (sampler->filter_mode == CL_FILTER_NEAREST) return 0.0f; switch (format->image_channel_data_type) { case CL_SNORM_INT8: return 1.0f/127.0f; case CL_UNORM_INT8: return 1.0f/255.0f; case CL_UNORM_INT16: return 1.0f/65535.0f; case CL_SNORM_INT16: return 1.0f/32767.0f; case CL_FLOAT: return CL_FLT_MIN; #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: return 0x1.0p-14f; #endif default: return 0.0f; } } float get_max_relative_error( cl_image_format *format, image_sampler_data *sampler, int is3D, int isLinearFilter ) { float maxError = 0.0f; float sampleCount = 1.0f; if( isLinearFilter ) sampleCount = is3D ? 8.0f : 4.0f; // Note that the ULP is defined here as the unit in the last place of the maximum // magnitude sample used for filtering. // Section 8.3 switch( format->image_channel_data_type ) { // The spec allows 2 ulps of error for normalized formats case CL_SNORM_INT8: case CL_UNORM_INT8: case CL_SNORM_INT16: case CL_UNORM_INT16: case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: case CL_UNORM_INT_101010: maxError = 2*FLT_EPSILON*sampleCount; // Maximum sampling error for round to zero normalization based on multiplication // by reciprocal (using reciprocal generated in round to +inf mode, so that 1.0 matches spec) break; // If the implementation supports these formats then it will have to allow rounding error here too, // because not all 32-bit ints are exactly representable in float case CL_SIGNED_INT32: case CL_UNSIGNED_INT32: maxError = 1*FLT_EPSILON; break; } // Section 8.2 if( sampler->addressing_mode == CL_ADDRESS_REPEAT || sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT || sampler->filter_mode != CL_FILTER_NEAREST || sampler->normalized_coords ) #if defined( __APPLE__ ) { if( sampler->filter_mode != CL_FILTER_NEAREST ) { // The maximum if( gDeviceType == CL_DEVICE_TYPE_GPU ) maxError += MAKE_HEX_FLOAT(0x1.0p-4f, 0x1L, -4); // Some GPUs ain't so accurate else // The standard method of 2d linear filtering delivers 4.0 ulps of error in round to nearest (8 in rtz). maxError += 4.0f * FLT_EPSILON; } else maxError += 4.0f * FLT_EPSILON; // normalized coordinates will introduce some error into the fractional part of the address, affecting results } #else { #if !defined(_WIN32) #warning Implementations will likely wish to pick a max allowable sampling error policy here that is better than the spec #endif // The spec allows linear filters to return any result most of the time. // That's fine for implementations but a problem for testing. After all // users aren't going to like garbage images. We have "picked a number" // here that we are going to attempt to conform to. Implementations are // free to pick another number, like infinity, if they like. // We picked a number for you, to provide /some/ sanity maxError = MAKE_HEX_FLOAT(0x1.0p-7f, 0x1L, -7); // ...but this is what the spec allows: // maxError = INFINITY; // Please feel free to pick any positive number. (NaN wont work.) } #endif // The error calculation itself can introduce error maxError += FLT_EPSILON * 2; return maxError; } size_t get_format_max_int( cl_image_format *format ) { switch( format->image_channel_data_type ) { case CL_SNORM_INT8: case CL_SIGNED_INT8: return 127; case CL_UNORM_INT8: case CL_UNSIGNED_INT8: return 255; case CL_SNORM_INT16: case CL_SIGNED_INT16: return 32767; case CL_UNORM_INT16: case CL_UNSIGNED_INT16: return 65535; case CL_SIGNED_INT32: return 2147483647L; case CL_UNSIGNED_INT32: return 4294967295LL; case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: return 31; case CL_UNORM_INT_101010: return 1023; case CL_HALF_FLOAT: return 1<<10; #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: return 16384; #endif default: return 0; } } int get_format_min_int( cl_image_format *format ) { switch( format->image_channel_data_type ) { case CL_SNORM_INT8: case CL_SIGNED_INT8: return -128; case CL_UNORM_INT8: case CL_UNSIGNED_INT8: return 0; case CL_SNORM_INT16: case CL_SIGNED_INT16: return -32768; case CL_UNORM_INT16: case CL_UNSIGNED_INT16: return 0; case CL_SIGNED_INT32: return -2147483648LL; case CL_UNSIGNED_INT32: return 0; case CL_UNORM_SHORT_565: case CL_UNORM_SHORT_555: case CL_UNORM_INT_101010: return 0; case CL_HALF_FLOAT: return -(1 << 10); #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: return -16384; #endif default: return 0; } } float convert_half_to_float( unsigned short halfValue ) { // We have to take care of a few special cases, but in general, we just extract // the same components from the half that exist in the float and re-stuff them // For a description of the actual half format, see http://en.wikipedia.org/wiki/Half_precision // Note: we store these in 32-bit ints to make the bit manipulations easier later int sign = ( halfValue >> 15 ) & 0x0001; int exponent = ( halfValue >> 10 ) & 0x001f; int mantissa = ( halfValue ) & 0x03ff; // Note: we use a union here to be able to access the bits of a float directly union { unsigned int bits; float floatValue; } outFloat; // Special cases first if( exponent == 0 ) { if( mantissa == 0 ) { // If both exponent and mantissa are 0, the number is +/- 0 outFloat.bits = sign << 31; return outFloat.floatValue; // Already done! } // If exponent is 0, it's a denormalized number, so we renormalize it // Note: this is not terribly efficient, but oh well while( ( mantissa & 0x00000400 ) == 0 ) { mantissa <<= 1; exponent--; } // The first bit is implicit, so we take it off and inc the exponent accordingly exponent++; mantissa &= ~(0x00000400); } else if( exponent == 31 ) // Special-case "numbers" { // If the exponent is 31, it's a special case number (+/- infinity or NAN). // If the mantissa is 0, it's infinity, else it's NAN, but in either case, the packing // method is the same outFloat.bits = ( sign << 31 ) | 0x7f800000 | ( mantissa << 13 ); return outFloat.floatValue; } // Plain ol' normalized number, so adjust to the ranges a 32-bit float expects and repack exponent += ( 127 - 15 ); mantissa <<= 13; outFloat.bits = ( sign << 31 ) | ( exponent << 23 ) | mantissa; return outFloat.floatValue; } cl_ushort convert_float_to_half( float f ) { switch( gFloatToHalfRoundingMode ) { case kRoundToNearestEven: return float2half_rte( f ); case kRoundTowardZero: return float2half_rtz( f ); default: log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" ); exit(-1); return 0xffff; } } cl_ushort float2half_rte( float f ) { union{ float f; cl_uint u; } u = {f}; cl_uint sign = (u.u >> 16) & 0x8000; float x = fabsf(f); //Nan if( x != x ) { u.u >>= (24-11); u.u &= 0x7fff; u.u |= 0x0200; //silence the NaN return u.u | sign; } // overflow if( x >= MAKE_HEX_FLOAT(0x1.ffep15f, 0x1ffeL, 3) ) return 0x7c00 | sign; // underflow if( x <= MAKE_HEX_FLOAT(0x1.0p-25f, 0x1L, -25) ) return sign; // The halfway case can return 0x0001 or 0. 0 is even. // very small if( x < MAKE_HEX_FLOAT(0x1.8p-24f, 0x18L, -28) ) return sign | 1; // half denormal if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) ) { u.f = x * MAKE_HEX_FLOAT(0x1.0p-125f, 0x1L, -125); return sign | u.u; } u.f *= MAKE_HEX_FLOAT(0x1.0p13f, 0x1L, 13); u.u &= 0x7f800000; x += u.f; u.f = x - u.f; u.f *= MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112); return (u.u >> (24-11)) | sign; } cl_ushort float2half_rtz( float f ) { union{ float f; cl_uint u; } u = {f}; cl_uint sign = (u.u >> 16) & 0x8000; float x = fabsf(f); //Nan if( x != x ) { u.u >>= (24-11); u.u &= 0x7fff; u.u |= 0x0200; //silence the NaN return u.u | sign; } // overflow if( x >= MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) ) { if( x == INFINITY ) return 0x7c00 | sign; return 0x7bff | sign; } // underflow if( x < MAKE_HEX_FLOAT(0x1.0p-24f, 0x1L, -24) ) return sign; // The halfway case can return 0x0001 or 0. 0 is even. // half denormal if( x < MAKE_HEX_FLOAT(0x1.0p-14f, 0x1L, -14) ) { x *= MAKE_HEX_FLOAT(0x1.0p24f, 0x1L, 24); return (cl_ushort)((int) x | sign); } u.u &= 0xFFFFE000U; u.u -= 0x38000000U; return (u.u >> (24-11)) | sign; } class TEST { public: TEST(); }; static TEST t; void __vstore_half_rte(float f, size_t index, uint16_t *p) { union{ unsigned int u; float f;} u; u.f = f; unsigned short r = (u.u >> 16) & 0x8000; u.u &= 0x7fffffff; if( u.u >= 0x33000000U ) { if( u.u >= 0x47800000 ) { if( u.u <= 0x7f800000 ) r |= 0x7c00; else { r |= 0x7e00 | ( (u.u >> 13) & 0x3ff ); } } else { float x = u.f; if( u.u < 0x38800000 ) u.u = 0x3f000000; else u.u += 0x06800000; u.u &= 0x7f800000U; x += u.f; x -= u.f; u.f = x * MAKE_HEX_FLOAT(0x1.0p-112f, 0x1L, -112); u.u >>= 13; r |= (unsigned short) u.u; } } ((unsigned short*)p)[index] = r; } TEST::TEST() { return; union { float f; uint32_t i; } test; uint16_t control, myval; log_info(" &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" ); test.i = 0; do { if( ( test.i & 0xffffff ) == 0 ) { if( ( test.i & 0xfffffff ) == 0 ) log_info( "*" ); else log_info( "." ); fflush(stdout); } __vstore_half_rte( test.f, 0, &control ); myval = convert_float_to_half( test.f ); if( myval != control ) { log_info( "\n******** ERROR: MyVal %04x control %04x source %12.24f\n", myval, control, test.f ); log_info( " source bits: %08x %a\n", test.i, test.f ); float t, c; c = convert_half_to_float( control ); t = convert_half_to_float( myval ); log_info( " converted control: %12.24f myval: %12.24f\n", c, t ); } test.i++; } while( test.i != 0 ); log_info("\n &&&&&&&&&&&&&&&&&&&&&&&&&&&& TESTING HALFS &&&&&&&&&&&&&&&&&&&&\n" ); } cl_ulong get_image_size( image_descriptor const *imageInfo ) { cl_ulong imageSize; // Assumes rowPitch and slicePitch are always correctly defined if ( /*gTestMipmaps*/ imageInfo->num_mip_levels > 1 ) { imageSize = (size_t) compute_mipmapped_image_size(*imageInfo); } else { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: imageSize = imageInfo->rowPitch; break; case CL_MEM_OBJECT_IMAGE2D: imageSize = imageInfo->height * imageInfo->rowPitch; break; case CL_MEM_OBJECT_IMAGE3D: imageSize = imageInfo->depth * imageInfo->slicePitch; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: imageSize = imageInfo->arraySize * imageInfo->slicePitch; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: imageSize = imageInfo->arraySize * imageInfo->slicePitch; break; default: log_error("ERROR: Cannot identify image type %x\n", imageInfo->type); abort(); } } return imageSize; } // Calculate image size in megabytes (strictly, mebibytes). Result is rounded up. cl_ulong get_image_size_mb( image_descriptor const *imageInfo ) { cl_ulong imageSize = get_image_size( imageInfo ); cl_ulong mb = imageSize / ( 1024 * 1024 ); if ( imageSize % ( 1024 * 1024 ) > 0 ) { mb += 1; } return mb; } uint64_t gRoundingStartValue = 0; void escape_inf_nan_values( char* data, size_t allocSize ) { // filter values with 8 not-quite-highest bits unsigned int *intPtr = (unsigned int *)data; for( size_t i = 0; i < allocSize >> 2; i++ ) { if( ( intPtr[ i ] & 0x7F800000 ) == 0x7F800000 ) intPtr[ i ] ^= 0x40000000; } // Ditto with half floats (16-bit numbers with the 5 not-quite-highest bits = 0x7C00 are special) unsigned short *shortPtr = (unsigned short *)data; for( size_t i = 0; i < allocSize >> 1; i++ ) { if( ( shortPtr[ i ] & 0x7C00 ) == 0x7C00 ) shortPtr[ i ] ^= 0x4000; } } char * generate_random_image_data( image_descriptor *imageInfo, BufferOwningPtr &P, MTdata d ) { size_t allocSize = get_image_size( imageInfo ); size_t pixelRowBytes = imageInfo->width * get_pixel_size( imageInfo->format ); size_t i; if (imageInfo->num_mip_levels > 1) allocSize = compute_mipmapped_image_size(*imageInfo); #if defined (__APPLE__ ) char *data = NULL; if (gDeviceType == CL_DEVICE_TYPE_CPU) { size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192; void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0); intptr_t data_end = (intptr_t)map + mapSize - 4096; data = (char *)(data_end - (intptr_t)allocSize); mprotect(map, 4096, PROT_NONE); mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE); P.reset(data, map, mapSize,allocSize); } else { data = (char *)malloc(allocSize); P.reset(data,NULL,0,allocSize); } #else P.reset( NULL ); // Free already allocated memory first, then try to allocate new block. char *data = (char *)align_malloc(allocSize, get_pixel_size(imageInfo->format)); P.reset(data,NULL,0,allocSize, true); #endif if (data == NULL) { log_error( "ERROR: Unable to malloc %lu bytes for generate_random_image_data\n", allocSize ); return 0; } if( gTestRounding ) { // Special case: fill with a ramp from 0 to the size of the type size_t typeSize = get_format_type_size( imageInfo->format ); switch( typeSize ) { case 1: { char *ptr = data; for( i = 0; i < allocSize; i++ ) ptr[i] = (cl_char) (i + gRoundingStartValue); } break; case 2: { cl_short *ptr = (cl_short*) data; for( i = 0; i < allocSize / 2; i++ ) ptr[i] = (cl_short) (i + gRoundingStartValue); } break; case 4: { cl_int *ptr = (cl_int*) data; for( i = 0; i < allocSize / 4; i++ ) ptr[i] = (cl_int) (i + gRoundingStartValue); } break; } // Note: inf or nan float values would cause problems, although we don't know this will // actually be a float, so we just know what to look for escape_inf_nan_values( data, allocSize ); return data; } // Otherwise, we should be able to just fill with random bits no matter what cl_uint *p = (cl_uint*) data; for( i = 0; i + 4 <= allocSize; i += 4 ) p[ i / 4 ] = genrand_int32(d); for( ; i < allocSize; i++ ) data[i] = genrand_int32(d); // Note: inf or nan float values would cause problems, although we don't know this will // actually be a float, so we just know what to look for escape_inf_nan_values( data, allocSize ); if ( /*!gTestMipmaps*/ imageInfo->num_mip_levels < 2 ) { // Fill unused edges with -1, NaN for float if (imageInfo->rowPitch > pixelRowBytes) { size_t height = 0; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE3D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height = imageInfo->height; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: height = imageInfo->arraySize; break; } // Fill in the row padding regions for( i = 0; i < height; i++ ) { size_t offset = i * imageInfo->rowPitch + pixelRowBytes; size_t length = imageInfo->rowPitch - pixelRowBytes; memset( data + offset, 0xff, length ); } } // Fill in the slice padding regions, if necessary: size_t slice_dimension = imageInfo->height; if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { slice_dimension = imageInfo->arraySize; } if (imageInfo->slicePitch > slice_dimension*imageInfo->rowPitch) { size_t depth = 0; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE3D: depth = imageInfo->depth; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D_ARRAY: depth = imageInfo->arraySize; break; } for( i = 0; i < depth; i++ ) { size_t offset = i * imageInfo->slicePitch + slice_dimension*imageInfo->rowPitch; size_t length = imageInfo->slicePitch - slice_dimension*imageInfo->rowPitch; memset( data + offset, 0xff, length ); } } } return data; } #define CLAMP_FLOAT( v ) ( fmaxf( fminf( v, 1.f ), -1.f ) ) void read_image_pixel_float( void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData, int lod ) { size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth; size_t slice_pitch_lod = 0, row_pitch_lod = 0; if ( imageInfo->num_mip_levels > 1 ) { switch(imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D : depth_lod = ( imageInfo->depth >> lod ) ? ( imageInfo->depth >> lod ) : 1; case CL_MEM_OBJECT_IMAGE2D : case CL_MEM_OBJECT_IMAGE2D_ARRAY : height_lod = ( imageInfo->height >> lod ) ? ( imageInfo->height >> lod ) : 1; default : width_lod = ( imageInfo->width >> lod ) ? ( imageInfo->width >> lod ) : 1; } row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); if ( imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY ) slice_pitch_lod = row_pitch_lod; else if ( imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) slice_pitch_lod = row_pitch_lod * height_lod; } else { row_pitch_lod = imageInfo->rowPitch; slice_pitch_lod = imageInfo->slicePitch; } if ( x < 0 || y < 0 || z < 0 || x >= (int)width_lod || ( height_lod != 0 && y >= (int)height_lod ) || ( depth_lod != 0 && z >= (int)depth_lod ) || ( imageInfo->arraySize != 0 && z >= (int)imageInfo->arraySize ) ) { outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = outData[ 3 ] = 0; if (!has_alpha(imageInfo->format)) outData[3] = 1; return; } cl_image_format *format = imageInfo->format; unsigned int i; float tempData[ 4 ]; // Advance to the right spot char *ptr = (char *)imageData; size_t pixelSize = get_pixel_size( format ); ptr += z * slice_pitch_lod + y * row_pitch_lod + x * pixelSize; // OpenCL only supports reading floats from certain formats size_t channelCount = get_format_channel_count( format ); switch( format->image_channel_data_type ) { case CL_SNORM_INT8: { cl_char *dPtr = (cl_char *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 127.0f ); break; } case CL_UNORM_INT8: { unsigned char *dPtr = (unsigned char *)ptr; for( i = 0; i < channelCount; i++ ) { if((is_sRGBA_order(imageInfo->format->image_channel_order)) && i<3) // only RGB need to be converted for sRGBA tempData[ i ] = (float)sRGBunmap((float)dPtr[ i ] / 255.0f) ; else tempData[ i ] = (float)dPtr[ i ] / 255.0f; } break; } case CL_SIGNED_INT8: { cl_char *dPtr = (cl_char *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float)dPtr[ i ]; break; } case CL_UNSIGNED_INT8: { cl_uchar *dPtr = (cl_uchar *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float) dPtr[ i ]; break; } case CL_SNORM_INT16: { cl_short *dPtr = (cl_short *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = CLAMP_FLOAT( (float)dPtr[ i ] / 32767.0f ); break; } case CL_UNORM_INT16: { cl_ushort *dPtr = (cl_ushort *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float)dPtr[ i ] / 65535.0f; break; } case CL_SIGNED_INT16: { cl_short *dPtr = (cl_short *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float)dPtr[ i ]; break; } case CL_UNSIGNED_INT16: { cl_ushort *dPtr = (cl_ushort *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float) dPtr[ i ]; break; } case CL_HALF_FLOAT: { cl_ushort *dPtr = (cl_ushort *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = convert_half_to_float( dPtr[ i ] ); break; } case CL_SIGNED_INT32: { cl_int *dPtr = (cl_int *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float)dPtr[ i ]; break; } case CL_UNSIGNED_INT32: { cl_uint *dPtr = (cl_uint *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float)dPtr[ i ]; break; } case CL_UNORM_SHORT_565: { cl_ushort *dPtr = (cl_ushort *)ptr; tempData[ 0 ] = (float)( dPtr[ 0 ] >> 11 ) / (float)31; tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 63 ) / (float)63; tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31; break; } case CL_UNORM_SHORT_555: { cl_ushort *dPtr = (cl_ushort *)ptr; tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 31 ) / (float)31; tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 5 ) & 31 ) / (float)31; tempData[ 2 ] = (float)( dPtr[ 0 ] & 31 ) / (float)31; break; } case CL_UNORM_INT_101010: { cl_uint *dPtr = (cl_uint *)ptr; tempData[ 0 ] = (float)( ( dPtr[ 0 ] >> 20 ) & 0x3ff ) / (float)1023; tempData[ 1 ] = (float)( ( dPtr[ 0 ] >> 10 ) & 0x3ff ) / (float)1023; tempData[ 2 ] = (float)( dPtr[ 0 ] & 0x3ff ) / (float)1023; break; } case CL_FLOAT: { float *dPtr = (float *)ptr; for( i = 0; i < channelCount; i++ ) tempData[ i ] = (float)dPtr[ i ]; break; } #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { cl_ushort *dPtr = (cl_ushort*) ptr; for( i = 0; i < channelCount; i++ ) tempData[i] = ((int) dPtr[i] - 16384) * 0x1.0p-14f; break; } #endif } outData[ 0 ] = outData[ 1 ] = outData[ 2 ] = 0; outData[ 3 ] = 1; switch( format->image_channel_order ) { case CL_A: outData[ 3 ] = tempData[ 0 ]; break; case CL_R: case CL_Rx: outData[ 0 ] = tempData[ 0 ]; break; case CL_RA: outData[ 0 ] = tempData[ 0 ]; outData[ 3 ] = tempData[ 1 ]; break; case CL_RG: case CL_RGx: outData[ 0 ] = tempData[ 0 ]; outData[ 1 ] = tempData[ 1 ]; break; case CL_RGB: case CL_RGBx: case CL_sRGB: case CL_sRGBx: outData[ 0 ] = tempData[ 0 ]; outData[ 1 ] = tempData[ 1 ]; outData[ 2 ] = tempData[ 2 ]; break; case CL_RGBA: outData[ 0 ] = tempData[ 0 ]; outData[ 1 ] = tempData[ 1 ]; outData[ 2 ] = tempData[ 2 ]; outData[ 3 ] = tempData[ 3 ]; break; case CL_ARGB: outData[ 0 ] = tempData[ 1 ]; outData[ 1 ] = tempData[ 2 ]; outData[ 2 ] = tempData[ 3 ]; outData[ 3 ] = tempData[ 0 ]; break; case CL_BGRA: case CL_sBGRA: outData[ 0 ] = tempData[ 2 ]; outData[ 1 ] = tempData[ 1 ]; outData[ 2 ] = tempData[ 0 ]; outData[ 3 ] = tempData[ 3 ]; break; case CL_INTENSITY: outData[ 0 ] = tempData[ 0 ]; outData[ 1 ] = tempData[ 0 ]; outData[ 2 ] = tempData[ 0 ]; outData[ 3 ] = tempData[ 0 ]; break; case CL_LUMINANCE: outData[ 0 ] = tempData[ 0 ]; outData[ 1 ] = tempData[ 0 ]; outData[ 2 ] = tempData[ 0 ]; break; #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: outData[ 0 ] = tempData[ 1 ]; outData[ 1 ] = tempData[ 2 ]; outData[ 2 ] = tempData[ 3 ]; outData[ 3 ] = 1.0f; break; #endif #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: outData[ 0 ] = tempData[ 2 ]; outData[ 1 ] = tempData[ 1 ]; outData[ 2 ] = tempData[ 0 ]; outData[ 3 ] = 1.0f; break; #endif case CL_sRGBA: outData[ 0 ] = tempData[ 0 ]; outData[ 1 ] = tempData[ 1 ]; outData[ 2 ] = tempData[ 2 ]; outData[ 3 ] = tempData[ 3 ]; break; case CL_DEPTH: outData[ 0 ] = tempData[ 0 ]; break; default: log_error("Invalid format:"); print_header(format, true); break; } } void read_image_pixel_float( void *imageData, image_descriptor *imageInfo, int x, int y, int z, float *outData ) { read_image_pixel_float( imageData, imageInfo, x, y, z, outData, 0 ); } bool get_integer_coords( float x, float y, float z, size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ ) { return get_integer_coords_offset(x, y, z, 0.0f, 0.0f, 0.0f, width, height, depth, imageSampler, imageInfo, outX, outY, outZ); } bool get_integer_coords_offset( float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, size_t width, size_t height, size_t depth, image_sampler_data *imageSampler, image_descriptor *imageInfo, int &outX, int &outY, int &outZ ) { AddressFn adFn = sAddressingTable[ imageSampler ]; float refX = floorf( x ), refY = floorf( y ), refZ = floorf( z ); // Handle sampler-directed coordinate normalization + clamping. Note that // the array coordinate for image array types is expected to be // unnormalized, and is clamped to 0..arraySize-1. if( imageSampler->normalized_coords ) { switch (imageSampler->addressing_mode) { case CL_ADDRESS_REPEAT: x = RepeatNormalizedAddressFn( x, width ); if (height != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) y = RepeatNormalizedAddressFn( y, height ); } if (depth != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) z = RepeatNormalizedAddressFn( z, depth ); } if (xAddressOffset != 0.0) { // Add in the offset x += xAddressOffset; // Handle wrapping if (x > width) x -= (float)width; if (x < 0) x += (float)width; } if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) { // Add in the offset y += yAddressOffset; // Handle wrapping if (y > height) y -= (float)height; if (y < 0) y += (float)height; } if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) ) { // Add in the offset z += zAddressOffset; // Handle wrapping if (z > depth) z -= (float)depth; if (z < 0) z += (float)depth; } break; case CL_ADDRESS_MIRRORED_REPEAT: x = MirroredRepeatNormalizedAddressFn( x, width ); if (height != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) y = MirroredRepeatNormalizedAddressFn( y, height ); } if (depth != 0) { if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) z = MirroredRepeatNormalizedAddressFn( z, depth ); } if (xAddressOffset != 0.0) { float temp = x + xAddressOffset; if( temp > (float) width ) temp = (float) width - (temp - (float) width ); x = fabsf( temp ); } if ( (yAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) ) { float temp = y + yAddressOffset; if( temp > (float) height ) temp = (float) height - (temp - (float) height ); y = fabsf( temp ); } if ( (zAddressOffset != 0.0) && (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) ) { float temp = z + zAddressOffset; if( temp > (float) depth ) temp = (float) depth - (temp - (float) depth ); z = fabsf( temp ); } break; default: // Also, remultiply to the original coords. This simulates any truncation in // the pass to OpenCL x *= (float)width; x += xAddressOffset; if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) { y *= (float)height; y += yAddressOffset; } if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) { z *= (float)depth; z += zAddressOffset; } break; } } // At this point, we're dealing with non-normalized coordinates. outX = adFn( floorf( x ), width ); // 1D and 2D arrays require special care for the index coordinate: switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f); outZ = 0.0f; /* don't care! */ break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: outY = adFn( floorf( y ), height ); outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f); break; default: // legacy path: if (height != 0) outY = adFn( floorf( y ), height ); if( depth != 0 ) outZ = adFn( floorf( z ), depth ); } return !( (int)refX == outX && (int)refY == outY && (int)refZ == outZ ); } static float frac(float a) { return a - floorf(a); } static inline void pixelMax( const float a[4], const float b[4], float *results ); static inline void pixelMax( const float a[4], const float b[4], float *results ) { for( int i = 0; i < 4; i++ ) results[i] = errMax( fabsf(a[i]), fabsf(b[i]) ); } // If containsDenorms is NULL, flush denorms to zero // if containsDenorms is not NULL, record whether there are any denorms static inline void check_for_denorms(float a[4], int *containsDenorms ); static inline void check_for_denorms(float a[4], int *containsDenorms ) { if( NULL == containsDenorms ) { for( int i = 0; i < 4; i++ ) { if( IsFloatSubnormal( a[i] ) ) a[i] = copysignf( 0.0f, a[i] ); } } else { for( int i = 0; i < 4; i++ ) { if( IsFloatSubnormal( a[i] ) ) { *containsDenorms = 1; break; } } } } inline float calculate_array_index( float coord, float extent ) { // from Section 8.4 of the 1.2 Spec 'Selecting an Image from an Image Array' // // given coordinate 'w' that represents an index: // layer_index = clamp( rint(w), 0, image_array_size - 1) float ret = rintf( coord ); ret = ret > extent ? extent : ret; ret = ret < 0.0f ? 0.0f : ret; return ret; } /* * Utility function to unnormalized a coordinate given a particular sampler. * * name - the name of the coordinate, used for verbose debugging only * coord - the coordinate requiring unnormalization * offset - an addressing offset to be added to the coordinate * extent - the max value for this coordinate (e.g. width for x) */ static float unnormalize_coordinate( const char* name, float coord, float offset, float extent, cl_addressing_mode addressing_mode, int verbose ) { float ret = 0.0f; switch (addressing_mode) { case CL_ADDRESS_REPEAT: ret = RepeatNormalizedAddressFn( coord, extent ); if ( verbose ) { log_info( "\tRepeat filter denormalizes %s (%f) to %f\n", name, coord, ret ); } if (offset != 0.0) { // Add in the offset, and handle wrapping. ret += offset; if (ret > extent) ret -= extent; if (ret < 0.0) ret += extent; } if (verbose && offset != 0.0f) { log_info( "\tAddress offset of %f added to get %f\n", offset, ret ); } break; case CL_ADDRESS_MIRRORED_REPEAT: ret = MirroredRepeatNormalizedAddressFn( coord, extent ); if ( verbose ) { log_info( "\tMirrored repeat filter denormalizes %s (%f) to %f\n", name, coord, ret ); } if (offset != 0.0) { float temp = ret + offset; if( temp > extent ) temp = extent - (temp - extent ); ret = fabsf( temp ); } if (verbose && offset != 0.0f) { log_info( "\tAddress offset of %f added to get %f\n", offset, ret ); } break; default: ret = coord * extent; if ( verbose ) { log_info( "\tFilter denormalizes %s to %f (%f * %f)\n", name, ret, coord, extent); } ret += offset; if (verbose && offset != 0.0f) { log_info( "\tAddress offset of %f added to get %f\n", offset, ret ); } } return ret; } FloatPixel sample_image_pixel_float( void *imageData, image_descriptor *imageInfo, float x, float y, float z, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms ) { return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms); } // returns max pixel value of the pixels touched FloatPixel sample_image_pixel_float( void *imageData, image_descriptor *imageInfo, float x, float y, float z, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms , int lod) { return sample_image_pixel_float_offset(imageData, imageInfo, x, y, z, 0.0f, 0.0f, 0.0f, imageSampler, outData, verbose, containsDenorms, lod); } FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo, float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms , int lod) { AddressFn adFn = sAddressingTable[ imageSampler ]; FloatPixel returnVal; size_t width_lod = imageInfo->width, height_lod = imageInfo->height, depth_lod = imageInfo->depth; size_t slice_pitch_lod = 0, row_pitch_lod = 0; if ( imageInfo->num_mip_levels > 1 ) { switch(imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D : depth_lod = ( imageInfo->depth >> lod ) ? ( imageInfo->depth >> lod ) : 1; case CL_MEM_OBJECT_IMAGE2D : case CL_MEM_OBJECT_IMAGE2D_ARRAY : height_lod = ( imageInfo->height >> lod ) ? ( imageInfo->height >> lod ) : 1; default : width_lod = ( imageInfo->width >> lod ) ? ( imageInfo->width >> lod ) : 1; } row_pitch_lod = width_lod * get_pixel_size(imageInfo->format); if ( imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY ) slice_pitch_lod = row_pitch_lod; else if ( imageInfo->type == CL_MEM_OBJECT_IMAGE3D || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) slice_pitch_lod = row_pitch_lod * height_lod; } else { slice_pitch_lod = imageInfo->slicePitch; row_pitch_lod = imageInfo->rowPitch; } if( containsDenorms ) *containsDenorms = 0; if( imageSampler->normalized_coords ) { // We need to unnormalize our coordinates differently depending on // the image type, but 'x' is always processed the same way. x = unnormalize_coordinate("x", x, xAddressOffset, (float)width_lod, imageSampler->addressing_mode, verbose); switch (imageInfo->type) { // The image array types require special care: case CL_MEM_OBJECT_IMAGE1D_ARRAY: z = 0; // don't care -- unused for 1D arrays break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: y = unnormalize_coordinate("y", y, yAddressOffset, (float)height_lod, imageSampler->addressing_mode, verbose); break; // Everybody else: default: y = unnormalize_coordinate("y", y, yAddressOffset, (float)height_lod, imageSampler->addressing_mode, verbose); z = unnormalize_coordinate("z", z, zAddressOffset, (float)depth_lod, imageSampler->addressing_mode, verbose); } } else if ( verbose ) { switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: log_info("Starting coordinate: %f, array index %f\n", x, y); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: log_info("Starting coordinate: %f, %f, array index %f\n", x, y, z); break; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: log_info("Starting coordinate: %f\b", x); break; case CL_MEM_OBJECT_IMAGE2D: log_info("Starting coordinate: %f, %f\n", x, y); break; case CL_MEM_OBJECT_IMAGE3D: default: log_info("Starting coordinate: %f, %f, %f\n", x, y, z); } } // At this point, we have unnormalized coordinates. if( imageSampler->filter_mode == CL_FILTER_NEAREST ) { int ix, iy, iz; // We apply the addressing function to the now-unnormalized // coordinates. Note that the array cases again require special // care, per section 8.4 in the OpenCL 1.2 Specification. ix = adFn( floorf( x ), width_lod ); switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: iy = calculate_array_index( y, (float)(imageInfo->arraySize - 1) ); iz = 0; if( verbose ) { log_info("\tArray index %f evaluates to %d\n",y, iy ); } break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: iy = adFn( floorf( y ), height_lod ); iz = calculate_array_index( z, (float)(imageInfo->arraySize - 1) ); if( verbose ) { log_info("\tArray index %f evaluates to %d\n",z, iz ); } break; default: iy = adFn( floorf( y ), height_lod ); if( depth_lod != 0 ) iz = adFn( floorf( z ), depth_lod ); else iz = 0; } if( verbose ) { if( iz ) log_info( "\tReference integer coords calculated: { %d, %d, %d }\n", ix, iy, iz ); else log_info( "\tReference integer coords calculated: { %d, %d }\n", ix, iy ); } read_image_pixel_float( imageData, imageInfo, ix, iy, iz, outData, lod ); check_for_denorms( outData, containsDenorms ); for( int i = 0; i < 4; i++ ) returnVal.p[i] = fabsf( outData[i] ); return returnVal; } else { // Linear filtering cases. size_t width = width_lod, height = height_lod, depth = depth_lod; // Image arrays can use 2D filtering, but require us to walk into the // image a certain number of slices before reading. if( depth == 0 || imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY || imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { float array_index = 0; size_t layer_offset = 0; if (imageInfo->type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { array_index = calculate_array_index(z, (float)(imageInfo->arraySize - 1)); layer_offset = slice_pitch_lod * (size_t)array_index; } else if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { array_index = calculate_array_index(y, (float)(imageInfo->arraySize - 1)); layer_offset = slice_pitch_lod * (size_t)array_index; // Set up y and height so that the filtering below is correct // 1D filtering on a single slice. height = 1; } int x1 = adFn( floorf( x - 0.5f ), width ); int y1 = 0; int x2 = adFn( floorf( x - 0.5f ) + 1, width ); int y2 = 0; if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER)) { y1 = adFn( floorf( y - 0.5f ), height ); y2 = adFn( floorf( y - 0.5f ) + 1, height ); } else { y = 0.5f; } if( verbose ) { log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{ %d, %d } and i1:{ %d, %d }\n", x1, y1, x2, y2 ); log_info( "\tArray coordinate is %f\n", array_index); } // Walk to beginning of the 'correct' slice, if needed. char* imgPtr = ((char*)imageData) + layer_offset; float upLeft[ 4 ], upRight[ 4 ], lowLeft[ 4 ], lowRight[ 4 ]; float maxUp[4], maxLow[4]; read_image_pixel_float( imgPtr, imageInfo, x1, y1, 0, upLeft, lod ); read_image_pixel_float( imgPtr, imageInfo, x2, y1, 0, upRight, lod ); check_for_denorms( upLeft, containsDenorms ); check_for_denorms( upRight, containsDenorms ); pixelMax( upLeft, upRight, maxUp ); read_image_pixel_float( imgPtr, imageInfo, x1, y2, 0, lowLeft, lod ); read_image_pixel_float( imgPtr, imageInfo, x2, y2, 0, lowRight, lod ); check_for_denorms( lowLeft, containsDenorms ); check_for_denorms( lowRight, containsDenorms ); pixelMax( lowLeft, lowRight, maxLow ); pixelMax( maxUp, maxLow, returnVal.p ); if( verbose ) { if( NULL == containsDenorms ) log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" ); else log_info( "\tSampled pixels (rgba order):\n" ); log_info( "\t\tp00: %f, %f, %f, %f\n", upLeft[0], upLeft[1], upLeft[2], upLeft[3] ); log_info( "\t\tp01: %f, %f, %f, %f\n", upRight[0], upRight[1], upRight[2], upRight[3] ); log_info( "\t\tp10: %f, %f, %f, %f\n", lowLeft[0], lowLeft[1], lowLeft[2], lowLeft[3] ); log_info( "\t\tp11: %f, %f, %f, %f\n", lowRight[0], lowRight[1], lowRight[2], lowRight[3] ); } bool printMe = false; if( x1 <= 0 || x2 <= 0 || x1 >= (int)width-1 || x2 >= (int)width-1 ) printMe = true; if( y1 <= 0 || y2 <= 0 || y1 >= (int)height-1 || y2 >= (int)height-1 ) printMe = true; double weights[ 2 ][ 2 ]; weights[ 0 ][ 0 ] = weights[ 0 ][ 1 ] = 1.0 - frac( x - 0.5f ); weights[ 1 ][ 0 ] = weights[ 1 ][ 1 ] = frac( x - 0.5f ); weights[ 0 ][ 0 ] *= 1.0 - frac( y - 0.5f ); weights[ 1 ][ 0 ] *= 1.0 - frac( y - 0.5f ); weights[ 0 ][ 1 ] *= frac( y - 0.5f ); weights[ 1 ][ 1 ] *= frac( y - 0.5f ); if( verbose ) log_info( "\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f\n", frac( x - 0.5f ), frac( y - 0.5f ) ); for( int i = 0; i < 3; i++ ) { outData[ i ] = (float)( ( upLeft[ i ] * weights[ 0 ][ 0 ] ) + ( upRight[ i ] * weights[ 1 ][ 0 ] ) + ( lowLeft[ i ] * weights[ 0 ][ 1 ] ) + ( lowRight[ i ] * weights[ 1 ][ 1 ] )); // flush subnormal results to zero if necessary if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN ) outData[i] = copysignf( 0.0f, outData[i] ); } outData[ 3 ] = (float)( ( upLeft[ 3 ] * weights[ 0 ][ 0 ] ) + ( upRight[ 3 ] * weights[ 1 ][ 0 ] ) + ( lowLeft[ 3 ] * weights[ 0 ][ 1 ] ) + ( lowRight[ 3 ] * weights[ 1 ][ 1 ] )); // flush subnormal results to zero if necessary if( NULL == containsDenorms && fabs(outData[3]) < FLT_MIN ) outData[3] = copysignf( 0.0f, outData[3] ); } else { // 3D linear filtering int x1 = adFn( floorf( x - 0.5f ), width_lod ); int y1 = adFn( floorf( y - 0.5f ), height_lod ); int z1 = adFn( floorf( z - 0.5f ), depth_lod ); int x2 = adFn( floorf( x - 0.5f ) + 1, width_lod ); int y2 = adFn( floorf( y - 0.5f ) + 1, height_lod ); int z2 = adFn( floorf( z - 0.5f ) + 1, depth_lod ); if( verbose ) log_info( "\tActual integer coords used (i = floor(x-.5)): i0:{%d, %d, %d} and i1:{%d, %d, %d}\n", x1, y1, z1, x2, y2, z2 ); float upLeftA[ 4 ], upRightA[ 4 ], lowLeftA[ 4 ], lowRightA[ 4 ]; float upLeftB[ 4 ], upRightB[ 4 ], lowLeftB[ 4 ], lowRightB[ 4 ]; float pixelMaxA[4], pixelMaxB[4]; read_image_pixel_float( imageData, imageInfo, x1, y1, z1, upLeftA, lod ); read_image_pixel_float( imageData, imageInfo, x2, y1, z1, upRightA, lod ); check_for_denorms( upLeftA, containsDenorms ); check_for_denorms( upRightA, containsDenorms ); pixelMax( upLeftA, upRightA, pixelMaxA ); read_image_pixel_float( imageData, imageInfo, x1, y2, z1, lowLeftA, lod ); read_image_pixel_float( imageData, imageInfo, x2, y2, z1, lowRightA, lod ); check_for_denorms( lowLeftA, containsDenorms ); check_for_denorms( lowRightA, containsDenorms ); pixelMax( lowLeftA, lowRightA, pixelMaxB ); pixelMax( pixelMaxA, pixelMaxB, returnVal.p); read_image_pixel_float( imageData, imageInfo, x1, y1, z2, upLeftB, lod ); read_image_pixel_float( imageData, imageInfo, x2, y1, z2, upRightB, lod ); check_for_denorms( upLeftB, containsDenorms ); check_for_denorms( upRightB, containsDenorms ); pixelMax( upLeftB, upRightB, pixelMaxA ); read_image_pixel_float( imageData, imageInfo, x1, y2, z2, lowLeftB, lod ); read_image_pixel_float( imageData, imageInfo, x2, y2, z2, lowRightB, lod ); check_for_denorms( lowLeftB, containsDenorms ); check_for_denorms( lowRightB, containsDenorms ); pixelMax( lowLeftB, lowRightB, pixelMaxB ); pixelMax( pixelMaxA, pixelMaxB, pixelMaxA); pixelMax( pixelMaxA, returnVal.p, returnVal.p ); if( verbose ) { if( NULL == containsDenorms ) log_info( "\tSampled pixels (rgba order, denorms flushed to zero):\n" ); else log_info( "\tSampled pixels (rgba order):\n" ); log_info( "\t\tp000: %f, %f, %f, %f\n", upLeftA[0], upLeftA[1], upLeftA[2], upLeftA[3] ); log_info( "\t\tp001: %f, %f, %f, %f\n", upRightA[0], upRightA[1], upRightA[2], upRightA[3] ); log_info( "\t\tp010: %f, %f, %f, %f\n", lowLeftA[0], lowLeftA[1], lowLeftA[2], lowLeftA[3] ); log_info( "\t\tp011: %f, %f, %f, %f\n\n", lowRightA[0], lowRightA[1], lowRightA[2], lowRightA[3] ); log_info( "\t\tp100: %f, %f, %f, %f\n", upLeftB[0], upLeftB[1], upLeftB[2], upLeftB[3] ); log_info( "\t\tp101: %f, %f, %f, %f\n", upRightB[0], upRightB[1], upRightB[2], upRightB[3] ); log_info( "\t\tp110: %f, %f, %f, %f\n", lowLeftB[0], lowLeftB[1], lowLeftB[2], lowLeftB[3] ); log_info( "\t\tp111: %f, %f, %f, %f\n", lowRightB[0], lowRightB[1], lowRightB[2], lowRightB[3] ); } double weights[ 2 ][ 2 ][ 2 ]; float a = frac( x - 0.5f ), b = frac( y - 0.5f ), c = frac( z - 0.5f ); weights[ 0 ][ 0 ][ 0 ] = weights[ 0 ][ 1 ][ 0 ] = weights[ 0 ][ 0 ][ 1 ] = weights[ 0 ][ 1 ][ 1 ] = 1.f - a; weights[ 1 ][ 0 ][ 0 ] = weights[ 1 ][ 1 ][ 0 ] = weights[ 1 ][ 0 ][ 1 ] = weights[ 1 ][ 1 ][ 1 ] = a; weights[ 0 ][ 0 ][ 0 ] *= 1.f - b; weights[ 1 ][ 0 ][ 0 ] *= 1.f - b; weights[ 0 ][ 0 ][ 1 ] *= 1.f - b; weights[ 1 ][ 0 ][ 1 ] *= 1.f - b; weights[ 0 ][ 1 ][ 0 ] *= b; weights[ 1 ][ 1 ][ 0 ] *= b; weights[ 0 ][ 1 ][ 1 ] *= b; weights[ 1 ][ 1 ][ 1 ] *= b; weights[ 0 ][ 0 ][ 0 ] *= 1.f - c; weights[ 0 ][ 1 ][ 0 ] *= 1.f - c; weights[ 1 ][ 0 ][ 0 ] *= 1.f - c; weights[ 1 ][ 1 ][ 0 ] *= 1.f - c; weights[ 0 ][ 0 ][ 1 ] *= c; weights[ 0 ][ 1 ][ 1 ] *= c; weights[ 1 ][ 0 ][ 1 ] *= c; weights[ 1 ][ 1 ][ 1 ] *= c; if( verbose ) log_info( "\tfrac( x - 0.5f ) = %f, frac( y - 0.5f ) = %f, frac( z - 0.5f ) = %f\n", frac( x - 0.5f ), frac( y - 0.5f ), frac( z - 0.5f ) ); for( int i = 0; i < 3; i++ ) { outData[ i ] = (float)( ( upLeftA[ i ] * weights[ 0 ][ 0 ][ 0 ] ) + ( upRightA[ i ] * weights[ 1 ][ 0 ][ 0 ] ) + ( lowLeftA[ i ] * weights[ 0 ][ 1 ][ 0 ] ) + ( lowRightA[ i ] * weights[ 1 ][ 1 ][ 0 ] ) + ( upLeftB[ i ] * weights[ 0 ][ 0 ][ 1 ] ) + ( upRightB[ i ] * weights[ 1 ][ 0 ][ 1 ] ) + ( lowLeftB[ i ] * weights[ 0 ][ 1 ][ 1 ] ) + ( lowRightB[ i ] * weights[ 1 ][ 1 ][ 1 ] )); // flush subnormal results to zero if necessary if( NULL == containsDenorms && fabs(outData[i]) < FLT_MIN ) outData[i] = copysignf( 0.0f, outData[i] ); } outData[ 3 ] = (float)( ( upLeftA[ 3 ] * weights[ 0 ][ 0 ][ 0 ] ) + ( upRightA[ 3 ] * weights[ 1 ][ 0 ][ 0 ] ) + ( lowLeftA[ 3 ] * weights[ 0 ][ 1 ][ 0 ] ) + ( lowRightA[ 3 ] * weights[ 1 ][ 1 ][ 0 ] ) + ( upLeftB[ 3 ] * weights[ 0 ][ 0 ][ 1 ] ) + ( upRightB[ 3 ] * weights[ 1 ][ 0 ][ 1 ] ) + ( lowLeftB[ 3 ] * weights[ 0 ][ 1 ][ 1 ] ) + ( lowRightB[ 3 ] * weights[ 1 ][ 1 ][ 1 ] )); // flush subnormal results to zero if necessary if( NULL == containsDenorms && fabs(outData[3]) < FLT_MIN ) outData[3] = copysignf( 0.0f, outData[3] ); } return returnVal; } } FloatPixel sample_image_pixel_float_offset( void *imageData, image_descriptor *imageInfo, float x, float y, float z, float xAddressOffset, float yAddressOffset, float zAddressOffset, image_sampler_data *imageSampler, float *outData, int verbose, int *containsDenorms ) { return sample_image_pixel_float_offset( imageData, imageInfo, x, y, z, xAddressOffset, yAddressOffset, zAddressOffset, imageSampler, outData, verbose, containsDenorms, 0); } int debug_find_vector_in_image( void *imagePtr, image_descriptor *imageInfo, void *vectorToFind, size_t vectorSize, int *outX, int *outY, int *outZ, size_t lod ) { int foundCount = 0; char *iPtr = (char *)imagePtr; size_t width; size_t depth; size_t height; size_t row_pitch; size_t slice_pitch; switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = 1; depth = 1; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = 1; depth = imageInfo->arraySize; break; case CL_MEM_OBJECT_IMAGE2D: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; depth = 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; depth = imageInfo->arraySize; break; case CL_MEM_OBJECT_IMAGE3D: width = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height = (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; break; } row_pitch = width * get_pixel_size( imageInfo->format ); slice_pitch = row_pitch * height; for( size_t z = 0; z < depth; z++ ) { for( size_t y = 0; y < height; y++ ) { for( size_t x = 0; x < width; x++) { if( memcmp( iPtr, vectorToFind, vectorSize ) == 0 ) { if( foundCount == 0 ) { *outX = (int)x; if (outY != NULL) *outY = (int)y; if( outZ != NULL ) *outZ = (int)z; } foundCount++; } iPtr += vectorSize; } iPtr += row_pitch - ( width * vectorSize ); } iPtr += slice_pitch - ( height * row_pitch ); } return foundCount; } int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo, unsigned int *valuesToFind, int *outX, int *outY, int *outZ, int lod ) { char vectorToFind[ 4 * 4 ]; size_t vectorSize = get_format_channel_count( imageInfo->format ); if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT8 ) { unsigned char *p = (unsigned char *)vectorToFind; for( unsigned int i = 0; i < vectorSize; i++ ) p[i] = (unsigned char)valuesToFind[i]; } else if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT16 ) { unsigned short *p = (unsigned short *)vectorToFind; for( unsigned int i = 0; i < vectorSize; i++ ) p[i] = (unsigned short)valuesToFind[i]; vectorSize *= 2; } else if( imageInfo->format->image_channel_data_type == CL_UNSIGNED_INT32 ) { unsigned int *p = (unsigned int *)vectorToFind; for( unsigned int i = 0; i < vectorSize; i++ ) p[i] = (unsigned int)valuesToFind[i]; vectorSize *= 4; } else { log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" ); return false; } return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod ); } int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo, int *valuesToFind, int *outX, int *outY, int *outZ, int lod ) { char vectorToFind[ 4 * 4 ]; size_t vectorSize = get_format_channel_count( imageInfo->format ); if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT8 ) { char *p = (char *)vectorToFind; for( unsigned int i = 0; i < vectorSize; i++ ) p[i] = (char)valuesToFind[i]; } else if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT16 ) { short *p = (short *)vectorToFind; for( unsigned int i = 0; i < vectorSize; i++ ) p[i] = (short)valuesToFind[i]; vectorSize *= 2; } else if( imageInfo->format->image_channel_data_type == CL_SIGNED_INT32 ) { int *p = (int *)vectorToFind; for( unsigned int i = 0; i < vectorSize; i++ ) p[i] = (int)valuesToFind[i]; vectorSize *= 4; } else { log_info( "WARNING: Unable to search for debug pixel: invalid image format\n" ); return false; } return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod ); } int debug_find_pixel_in_image( void *imagePtr, image_descriptor *imageInfo, float *valuesToFind, int *outX, int *outY, int *outZ, int lod ) { char vectorToFind[ 4 * 4 ]; float swizzled[4]; memcpy( swizzled, valuesToFind, sizeof( swizzled ) ); size_t vectorSize = get_pixel_size( imageInfo->format ); pack_image_pixel( swizzled, imageInfo->format, vectorToFind ); return debug_find_vector_in_image( imagePtr, imageInfo, vectorToFind, vectorSize, outX, outY, outZ, lod ); } template void swizzle_vector_for_image( T *srcVector, const cl_image_format *imageFormat ) { T temp; switch( imageFormat->image_channel_order ) { case CL_A: srcVector[ 0 ] = srcVector[ 3 ]; break; case CL_R: case CL_Rx: case CL_RG: case CL_RGx: case CL_RGB: case CL_RGBx: case CL_RGBA: case CL_sRGB: case CL_sRGBx: case CL_sRGBA: break; case CL_RA: srcVector[ 1 ] = srcVector[ 3 ]; break; case CL_ARGB: temp = srcVector[ 3 ]; srcVector[ 3 ] = srcVector[ 2 ]; srcVector[ 2 ] = srcVector[ 1 ]; srcVector[ 1 ] = srcVector[ 0 ]; srcVector[ 0 ] = temp; break; case CL_BGRA: case CL_sBGRA: temp = srcVector[ 0 ]; srcVector[ 0 ] = srcVector[ 2 ]; srcVector[ 2 ] = temp; break; case CL_INTENSITY: srcVector[ 3 ] = srcVector[ 0 ]; srcVector[ 2 ] = srcVector[ 0 ]; srcVector[ 1 ] = srcVector[ 0 ]; break; case CL_LUMINANCE: srcVector[ 2 ] = srcVector[ 0 ]; srcVector[ 1 ] = srcVector[ 0 ]; break; #ifdef CL_1RGB_APPLE case CL_1RGB_APPLE: temp = srcVector[ 3 ]; srcVector[ 3 ] = srcVector[ 2 ]; srcVector[ 2 ] = srcVector[ 1 ]; srcVector[ 1 ] = srcVector[ 0 ]; srcVector[ 0 ] = temp; break; #endif #ifdef CL_BGR1_APPLE case CL_BGR1_APPLE: temp = srcVector[ 0 ]; srcVector[ 0 ] = srcVector[ 2 ]; srcVector[ 2 ] = temp; break; #endif } } #define SATURATE( v, min, max ) ( v < min ? min : ( v > max ? max : v ) ) void pack_image_pixel( unsigned int *srcVector, const cl_image_format *imageFormat, void *outData ) { swizzle_vector_for_image( srcVector, imageFormat ); size_t channelCount = get_format_channel_count( imageFormat ); switch( imageFormat->image_channel_data_type ) { case CL_UNSIGNED_INT8: { unsigned char *ptr = (unsigned char *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (unsigned char)SATURATE( srcVector[ i ], 0, 255 ); break; } case CL_UNSIGNED_INT16: { unsigned short *ptr = (unsigned short *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (unsigned short)SATURATE( srcVector[ i ], 0, 65535 ); break; } case CL_UNSIGNED_INT32: { unsigned int *ptr = (unsigned int *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (unsigned int)srcVector[ i ]; break; } default: break; } } void pack_image_pixel( int *srcVector, const cl_image_format *imageFormat, void *outData ) { swizzle_vector_for_image( srcVector, imageFormat ); size_t chanelCount = get_format_channel_count( imageFormat ); switch( imageFormat->image_channel_data_type ) { case CL_SIGNED_INT8: { char *ptr = (char *)outData; for( unsigned int i = 0; i < chanelCount; i++ ) ptr[ i ] = (char)SATURATE( srcVector[ i ], -128, 127 ); break; } case CL_SIGNED_INT16: { short *ptr = (short *)outData; for( unsigned int i = 0; i < chanelCount; i++ ) ptr[ i ] = (short)SATURATE( srcVector[ i ], -32768, 32767 ); break; } case CL_SIGNED_INT32: { int *ptr = (int *)outData; for( unsigned int i = 0; i < chanelCount; i++ ) ptr[ i ] = (int)srcVector[ i ]; break; } default: break; } } int round_to_even( float v ) { // clamp overflow if( v >= - (float) INT_MIN ) return INT_MAX; if( v <= (float) INT_MIN ) return INT_MIN; // round fractional values to integer value if( fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23) ) { static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23), MAKE_HEX_FLOAT(-0x1.0p23f, -0x1L, 23) }; float magicVal = magic[ v < 0.0f ]; v += magicVal; v -= magicVal; } return (int) v; } void pack_image_pixel( float *srcVector, const cl_image_format *imageFormat, void *outData ) { swizzle_vector_for_image( srcVector, imageFormat ); size_t channelCount = get_format_channel_count( imageFormat ); switch( imageFormat->image_channel_data_type ) { case CL_HALF_FLOAT: { cl_ushort *ptr = (cl_ushort *)outData; switch( gFloatToHalfRoundingMode ) { case kRoundToNearestEven: for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = float2half_rte( srcVector[ i ] ); break; case kRoundTowardZero: for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = float2half_rtz( srcVector[ i ] ); break; default: log_error( "ERROR: Test internal error -- unhandled or unknown float->half rounding mode.\n" ); exit(-1); break; } break; } case CL_FLOAT: { cl_float *ptr = (cl_float *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = srcVector[ i ]; break; } case CL_SNORM_INT8: { cl_char *ptr = (cl_char *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (cl_char)NORMALIZE_SIGNED( srcVector[ i ], -127.0f, 127.f ); break; } case CL_SNORM_INT16: { cl_short *ptr = (cl_short *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (short)NORMALIZE_SIGNED( srcVector[ i ], -32767.f, 32767.f ); break; } case CL_UNORM_INT8: { cl_uchar *ptr = (cl_uchar *)outData; if ( is_sRGBA_order(imageFormat->image_channel_order) ) { ptr[ 0 ] = (unsigned char)( sRGBmap( srcVector[ 0 ] ) + 0.5 ); ptr[ 1 ] = (unsigned char)( sRGBmap( srcVector[ 1 ] ) + 0.5 ); ptr[ 2 ] = (unsigned char)( sRGBmap( srcVector[ 2 ] ) + 0.5 ); if (channelCount == 4) ptr[ 3 ] = (unsigned char)NORMALIZE( srcVector[ 3 ], 255.f ); } else { for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (unsigned char)NORMALIZE( srcVector[ i ], 255.f ); } #ifdef CL_1RGB_APPLE if( imageFormat->image_channel_order == CL_1RGB_APPLE ) ptr[0] = 255.0f; #endif #ifdef CL_BGR1_APPLE if( imageFormat->image_channel_order == CL_BGR1_APPLE ) ptr[3] = 255.0f; #endif break; } case CL_UNORM_INT16: { cl_ushort *ptr = (cl_ushort *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (unsigned short)NORMALIZE( srcVector[ i ], 65535.f ); break; } case CL_UNORM_SHORT_555: { cl_ushort *ptr = (cl_ushort *)outData; ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 10 ) | ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 31.f ) & 31 ) << 5 ) | ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 ); break; } case CL_UNORM_SHORT_565: { cl_ushort *ptr = (cl_ushort *)outData; ptr[ 0 ] = ( ( (unsigned short)NORMALIZE( srcVector[ 0 ], 31.f ) & 31 ) << 11 ) | ( ( (unsigned short)NORMALIZE( srcVector[ 1 ], 63.f ) & 63 ) << 5 ) | ( ( (unsigned short)NORMALIZE( srcVector[ 2 ], 31.f ) & 31 ) << 0 ); break; } case CL_UNORM_INT_101010: { cl_uint *ptr = (cl_uint *)outData; ptr[ 0 ] = ( ( (unsigned int)NORMALIZE( srcVector[ 0 ], 1023.f ) & 1023 ) << 20 ) | ( ( (unsigned int)NORMALIZE( srcVector[ 1 ], 1023.f ) & 1023 ) << 10 ) | ( ( (unsigned int)NORMALIZE( srcVector[ 2 ], 1023.f ) & 1023 ) << 0 ); break; } case CL_SIGNED_INT8: { cl_char *ptr = (cl_char *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (cl_char)CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 ); break; } case CL_SIGNED_INT16: { cl_short *ptr = (cl_short *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (short)CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767 ); break; } case CL_SIGNED_INT32: { cl_int *ptr = (cl_int *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (int)CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX ); break; } case CL_UNSIGNED_INT8: { cl_uchar *ptr = (cl_uchar *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (cl_uchar)CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX ); break; } case CL_UNSIGNED_INT16: { cl_ushort *ptr = (cl_ushort *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (cl_ushort)CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX ); break; } case CL_UNSIGNED_INT32: { cl_uint *ptr = (cl_uint *)outData; for( unsigned int i = 0; i < channelCount; i++ ) ptr[ i ] = (cl_uint)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX ); break; } #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { cl_ushort *ptr = (cl_ushort*)outData; for( unsigned int i = 0; i < channelCount; i++ ) { cl_float f = fmaxf( srcVector[i], -1.0f ); f = fminf( f, 3.0f ); cl_int d = rintf(f * 0x1.0p14f); d += 16384; if( d > CL_USHRT_MAX ) d = CL_USHRT_MAX; ptr[i] = d; } break; } #endif default: log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type); exit(-1); break; } } void pack_image_pixel_error( const float *srcVector, const cl_image_format *imageFormat, const void *results, float *errors ) { size_t channelCount = get_format_channel_count( imageFormat ); switch( imageFormat->image_channel_data_type ) { case CL_HALF_FLOAT: { const cl_ushort *ptr = (const cl_ushort *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = Ulp_Error_Half( ptr[i], srcVector[i] ); break; } case CL_FLOAT: { const cl_ushort *ptr = (const cl_ushort *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = Ulp_Error( ptr[i], srcVector[i] ); break; } case CL_SNORM_INT8: { const cl_char *ptr = (const cl_char *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -127.0f, 127.f ); break; } case CL_SNORM_INT16: { const cl_short *ptr = (const cl_short *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( srcVector[ i ], -32767.f, 32767.f ); break; } case CL_UNORM_INT8: { const cl_uchar *ptr = (const cl_uchar *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 255.f ); break; } case CL_UNORM_INT16: { const cl_ushort *ptr = (const cl_ushort *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = ptr[i] - NORMALIZE_UNROUNDED( srcVector[ i ], 65535.f ); break; } case CL_UNORM_SHORT_555: { const cl_ushort *ptr = (const cl_ushort *)results; errors[0] = ((ptr[0] >> 10) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f ); errors[1] = ((ptr[0] >> 5) & 31) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 31.f ); errors[2] = ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f ); break; } case CL_UNORM_SHORT_565: { const cl_ushort *ptr = (const cl_ushort *)results; errors[0] = ((ptr[0] >> 11) & 31) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 31.f ); errors[1] = ((ptr[0] >> 5) & 63) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 63.f ); errors[2] = ((ptr[0] >> 0) & 31) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 31.f ); break; } case CL_UNORM_INT_101010: { const cl_uint *ptr = (const cl_uint *)results; errors[0] = ((ptr[0] >> 20) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 0 ], 1023.f ); errors[1] = ((ptr[0] >> 10) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 1 ], 1023.f ); errors[2] = ((ptr[0] >> 0) & 1023) - NORMALIZE_UNROUNDED( srcVector[ 2 ], 1023.f ); break; } case CL_SIGNED_INT8: { const cl_char *ptr = (const cl_char *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[ i ] = ptr[i] - CONVERT_INT( srcVector[ i ], -127.0f, 127.f, 127 ); break; } case CL_SIGNED_INT16: { const cl_short *ptr = (const cl_short *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = ptr[ i ] - CONVERT_INT( srcVector[ i ], -32767.f, 32767.f, 32767 ); break; } case CL_SIGNED_INT32: { const cl_int *ptr = (const cl_int *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long) CONVERT_INT( srcVector[ i ], MAKE_HEX_FLOAT( -0x1.0p31f, -1, 31), MAKE_HEX_FLOAT( 0x1.fffffep30f, 0x1fffffe, 30-23), CL_INT_MAX )); break; } case CL_UNSIGNED_INT8: { const cl_uchar *ptr = (const cl_uchar *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 255.f, CL_UCHAR_MAX ); break; } case CL_UNSIGNED_INT16: { const cl_ushort *ptr = (const cl_ushort *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = (cl_int) ptr[ i ] - (cl_int) CONVERT_UINT( srcVector[ i ], 32767.f, CL_USHRT_MAX ); break; } case CL_UNSIGNED_INT32: { const cl_uint *ptr = (const cl_uint *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = (cl_float)((cl_long) ptr[ i ] - (cl_long)CONVERT_UINT( srcVector[ i ], MAKE_HEX_FLOAT( 0x1.fffffep31f, 0x1fffffe, 31-23), CL_UINT_MAX )); break; } #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { const cl_ushort *ptr = (const cl_ushort *)results; for( unsigned int i = 0; i < channelCount; i++ ) errors[i] = ptr[i] - NORMALIZE_SIGNED_UNROUNDED( ((int) srcVector[ i ] - 16384), -16384.f, 49151.f ); break; } #endif default: log_error( "INTERNAL ERROR: unknown format (%d)\n", imageFormat->image_channel_data_type); exit(-1); break; } } // // Autodetect which rounding mode is used for image writes to CL_HALF_FLOAT // This should be called lazily before attempting to verify image writes, otherwise an error will occur. // int DetectFloatToHalfRoundingMode( cl_command_queue q ) // Returns CL_SUCCESS on success { cl_int err = CL_SUCCESS; if( gFloatToHalfRoundingMode == kDefaultRoundingMode ) { // Some numbers near 0.5f, that we look at to see how the values are rounded. static const cl_uint inData[4*4] = { 0x3f000fffU, 0x3f001000U, 0x3f001001U, 0U, 0x3f001fffU, 0x3f002000U, 0x3f002001U, 0U, 0x3f002fffU, 0x3f003000U, 0x3f003001U, 0U, 0x3f003fffU, 0x3f004000U, 0x3f004001U, 0U }; static const size_t count = sizeof( inData ) / (4*sizeof( inData[0] )); const float *inp = (const float*) inData; cl_context context = NULL; // Create an input buffer err = clGetCommandQueueInfo( q, CL_QUEUE_CONTEXT, sizeof(context), &context, NULL ); if( err ) { log_error( "Error: could not get context from command queue in DetectFloatToHalfRoundingMode (%d)", err ); return err; } cl_mem inBuf = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR, sizeof( inData ), (void*) inData, &err ); if( NULL == inBuf || err ) { log_error( "Error: could not create input buffer in DetectFloatToHalfRoundingMode (err: %d)", err ); return err; } // Create a small output image cl_image_format fmt = { CL_RGBA, CL_HALF_FLOAT }; cl_mem outImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &fmt, count, 1, 0, NULL, &err ); if( NULL == outImage || err ) { log_error( "Error: could not create half float out image in DetectFloatToHalfRoundingMode (err: %d)", err ); clReleaseMemObject( inBuf ); return err; } // Create our program, and a kernel const char *kernel[1] = { "kernel void detect_round( global float4 *in, write_only image2d_t out )\n" "{\n" " write_imagef( out, (int2)(get_global_id(0),0), in[get_global_id(0)] );\n" "}\n" }; clProgramWrapper program; err = create_single_kernel_helper_create_program(context, &program, 1, kernel); if( NULL == program || err ) { log_error( "Error: could not create program in DetectFloatToHalfRoundingMode (err: %d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); return err; } cl_device_id device = NULL; err = clGetCommandQueueInfo( q, CL_QUEUE_DEVICE, sizeof(device), &device, NULL ); if( err ) { log_error( "Error: could not get device from command queue in DetectFloatToHalfRoundingMode (%d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); return err; } err = clBuildProgram( program, 1, &device, "", NULL, NULL ); if( err ) { log_error( "Error: could not build program in DetectFloatToHalfRoundingMode (%d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); return err; } cl_kernel k = clCreateKernel( program, "detect_round", &err ); if( NULL == k || err ) { log_error( "Error: could not create kernel in DetectFloatToHalfRoundingMode (%d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); return err; } err = clSetKernelArg( k, 0, sizeof( cl_mem ), &inBuf ); if( err ) { log_error( "Error: could not set argument 0 of kernel in DetectFloatToHalfRoundingMode (%d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); clReleaseKernel( k ); return err; } err = clSetKernelArg( k, 1, sizeof( cl_mem ), &outImage ); if( err ) { log_error( "Error: could not set argument 1 of kernel in DetectFloatToHalfRoundingMode (%d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); clReleaseKernel( k ); return err; } // Run the kernel size_t global_work_size = count; err = clEnqueueNDRangeKernel( q, k, 1, NULL, &global_work_size, NULL, 0, NULL, NULL ); if( err ) { log_error( "Error: could not enqueue kernel in DetectFloatToHalfRoundingMode (%d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); clReleaseKernel( k ); return err; } // read the results cl_ushort outBuf[count*4]; memset( outBuf, -1, sizeof( outBuf ) ); size_t origin[3] = {0,0,0}; size_t region[3] = {count,1,1}; err = clEnqueueReadImage( q, outImage, CL_TRUE, origin, region, 0, 0, outBuf, 0, NULL, NULL ); if( err ) { log_error( "Error: could not read output image in DetectFloatToHalfRoundingMode (%d)", err ); clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); clReleaseKernel( k ); return err; } // Generate our list of reference results cl_ushort rte_ref[count*4]; cl_ushort rtz_ref[count*4]; for( size_t i = 0; i < 4 * count; i++ ) { rte_ref[i] = float2half_rte( inp[i] ); rtz_ref[i] = float2half_rtz( inp[i] ); } // Verify that we got something in either rtz or rte mode if( 0 == memcmp( rte_ref, outBuf, sizeof( rte_ref )) ) { log_info( "Autodetected float->half rounding mode to be rte\n" ); gFloatToHalfRoundingMode = kRoundToNearestEven; } else if ( 0 == memcmp( rtz_ref, outBuf, sizeof( rtz_ref )) ) { log_info( "Autodetected float->half rounding mode to be rtz\n" ); gFloatToHalfRoundingMode = kRoundTowardZero; } else { log_error( "ERROR: float to half conversions proceed with invalid rounding mode!\n" ); log_info( "\nfor:" ); for( size_t i = 0; i < count; i++ ) log_info( " {%a, %a, %a, %a},", inp[4*i], inp[4*i+1], inp[4*i+2], inp[4*i+3] ); log_info( "\ngot:" ); for( size_t i = 0; i < count; i++ ) log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", outBuf[4*i], outBuf[4*i+1], outBuf[4*i+2], outBuf[4*i+3] ); log_info( "\nrte:" ); for( size_t i = 0; i < count; i++ ) log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rte_ref[4*i], rte_ref[4*i+1], rte_ref[4*i+2], rte_ref[4*i+3] ); log_info( "\nrtz:" ); for( size_t i = 0; i < count; i++ ) log_info( " {0x%4.4x, 0x%4.4x, 0x%4.4x, 0x%4.4x},", rtz_ref[4*i], rtz_ref[4*i+1], rtz_ref[4*i+2], rtz_ref[4*i+3] ); log_info( "\n" ); err = -1; gFloatToHalfRoundingMode = kRoundingModeCount; // illegal value } // clean up clReleaseMemObject( inBuf ); clReleaseMemObject( outImage ); clReleaseKernel( k ); return err; } // Make sure that the rounding mode was successfully detected, if we checked earlier if( gFloatToHalfRoundingMode != kRoundToNearestEven && gFloatToHalfRoundingMode != kRoundTowardZero) return -2; return err; } char *create_random_image_data( ExplicitType dataType, image_descriptor *imageInfo, BufferOwningPtr &P, MTdata d, bool image2DFromBuffer ) { size_t allocSize, numPixels; if ( /*gTestMipmaps*/ imageInfo->num_mip_levels > 1 ) { allocSize = (size_t) (compute_mipmapped_image_size(*imageInfo) * 4 * get_explicit_type_size( dataType ))/get_pixel_size(imageInfo->format); numPixels = allocSize / (get_explicit_type_size( dataType ) * 4); } else { numPixels = (image2DFromBuffer? imageInfo->rowPitch: imageInfo->width) * imageInfo->height * (imageInfo->depth ? imageInfo->depth : 1) * (imageInfo->arraySize ? imageInfo->arraySize : 1); allocSize = numPixels * 4 * get_explicit_type_size( dataType ); } #if 0 // DEBUG { fprintf(stderr,"--- create_random_image_data:\n"); fprintf(stderr,"allocSize = %zu\n",allocSize); fprintf(stderr,"numPixels = %zu\n",numPixels); fprintf(stderr,"width = %zu\n",imageInfo->width); fprintf(stderr,"height = %zu\n",imageInfo->height); fprintf(stderr,"depth = %zu\n",imageInfo->depth); fprintf(stderr,"rowPitch = %zu\n",imageInfo->rowPitch); fprintf(stderr,"slicePitch = %zu\n",imageInfo->slicePitch); fprintf(stderr,"arraySize = %zu\n",imageInfo->arraySize); fprintf(stderr,"explicit_type_size = %zu\n",get_explicit_type_size(dataType)); } #endif #if defined( __APPLE__ ) char *data = NULL; if (gDeviceType == CL_DEVICE_TYPE_CPU) { size_t mapSize = ((allocSize + 4095L) & -4096L) + 8192; // alloc two extra pages. void *map = mmap(0, mapSize, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0); if (map == MAP_FAILED) { perror("create_random_image_data: mmap"); log_error("%s:%d: mmap failed, mapSize = %zu\n",__FILE__,__LINE__,mapSize); } intptr_t data_end = (intptr_t)map + mapSize - 4096; data = (char *)(data_end - (intptr_t)allocSize); mprotect(map, 4096, PROT_NONE); mprotect((void *)((char *)map + mapSize - 4096), 4096, PROT_NONE); P.reset(data, map, mapSize); } else { data = (char *)malloc(allocSize); P.reset(data); } #else char *data = (char *)align_malloc(allocSize, get_pixel_size(imageInfo->format)); P.reset(data,NULL,0,allocSize,true); #endif if (data == NULL) { log_error( "ERROR: Unable to malloc %lu bytes for create_random_image_data\n", allocSize ); return NULL; } switch( dataType ) { case kFloat: { float *inputValues = (float *)data; switch (imageInfo->format->image_channel_data_type) { case CL_HALF_FLOAT: { // Generate data that is (mostly) inside the range of a half float // const float HALF_MIN = 5.96046448e-08f; const float HALF_MAX = 65504.0f; size_t i = 0; inputValues[ i++ ] = 0.f; inputValues[ i++ ] = 1.f; inputValues[ i++ ] = -1.f; inputValues[ i++ ] = 2.f; for( ; i < numPixels * 4; i++ ) inputValues[ i ] = get_random_float( -HALF_MAX - 2.f, HALF_MAX + 2.f, d ); } break; #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: { size_t i = 0; if( numPixels * 4 >= 8 ) { inputValues[ i++ ] = INFINITY; inputValues[ i++ ] = 0x1.0p14f; inputValues[ i++ ] = 0x1.0p31f; inputValues[ i++ ] = 0x1.0p32f; inputValues[ i++ ] = -INFINITY; inputValues[ i++ ] = -0x1.0p14f; inputValues[ i++ ] = -0x1.0p31f; inputValues[ i++ ] = -0x1.1p31f; } for( ; i < numPixels * 4; i++ ) inputValues[ i ] = get_random_float( -1.1f, 3.1f, d ); } break; #endif case CL_FLOAT: { size_t i = 0; inputValues[ i++ ] = INFINITY; inputValues[ i++ ] = -INFINITY; inputValues[ i++ ] = 0.0f; inputValues[ i++ ] = 0.0f; cl_uint *p = (cl_uint *)data; for( ; i < numPixels * 4; i++ ) p[ i ] = genrand_int32(d); } break; default: size_t i = 0; if( numPixels * 4 >= 36 ) { inputValues[ i++ ] = 0.0f; inputValues[ i++ ] = 0.5f; inputValues[ i++ ] = 31.5f; inputValues[ i++ ] = 32.0f; inputValues[ i++ ] = 127.5f; inputValues[ i++ ] = 128.0f; inputValues[ i++ ] = 255.5f; inputValues[ i++ ] = 256.0f; inputValues[ i++ ] = 1023.5f; inputValues[ i++ ] = 1024.0f; inputValues[ i++ ] = 32767.5f; inputValues[ i++ ] = 32768.0f; inputValues[ i++ ] = 65535.5f; inputValues[ i++ ] = 65536.0f; inputValues[ i++ ] = 2147483648.0f; inputValues[ i++ ] = 4294967296.0f; inputValues[ i++ ] = MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 ); inputValues[ i++ ] = MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 ); inputValues[ i++ ] = -0.0f; inputValues[ i++ ] = -0.5f; inputValues[ i++ ] = -31.5f; inputValues[ i++ ] = -32.0f; inputValues[ i++ ] = -127.5f; inputValues[ i++ ] = -128.0f; inputValues[ i++ ] = -255.5f; inputValues[ i++ ] = -256.0f; inputValues[ i++ ] = -1023.5f; inputValues[ i++ ] = -1024.0f; inputValues[ i++ ] = -32767.5f; inputValues[ i++ ] = -32768.0f; inputValues[ i++ ] = -65535.5f; inputValues[ i++ ] = -65536.0f; inputValues[ i++ ] = -2147483648.0f; inputValues[ i++ ] = -4294967296.0f; inputValues[ i++ ] = -MAKE_HEX_FLOAT( 0x1.0p63f, 1, 63 ); inputValues[ i++ ] = -MAKE_HEX_FLOAT( 0x1.0p64f, 1, 64 ); } if( is_format_signed(imageInfo->format) ) { for( ; i < numPixels * 4; i++ ) inputValues[ i ] = get_random_float( -1.1f, 1.1f, d ); } else { for( ; i < numPixels * 4; i++ ) inputValues[ i ] = get_random_float( -0.1f, 1.1f, d ); } break; } break; } case kInt: { int *imageData = (int *)data; // We want to generate ints (mostly) in range of the target format int formatMin = get_format_min_int( imageInfo->format ); size_t formatMax = get_format_max_int( imageInfo->format ); if( formatMin == 0 ) { // Unsigned values, but we are only an int, so cap the actual max at the max of signed ints if( formatMax > 2147483647L ) formatMax = 2147483647L; } // If the final format is small enough, give us a bit of room for out-of-range values to test if( formatMax < 2147483647L ) formatMax += 2; if( formatMin > -2147483648LL ) formatMin -= 2; // Now gen for( size_t i = 0; i < numPixels * 4; i++ ) { imageData[ i ] = random_in_range( formatMin, (int)formatMax, d ); } break; } case kUInt: case kUnsignedInt: { unsigned int *imageData = (unsigned int *)data; // We want to generate ints (mostly) in range of the target format int formatMin = get_format_min_int( imageInfo->format ); size_t formatMax = get_format_max_int( imageInfo->format ); if( formatMin < 0 ) formatMin = 0; // If the final format is small enough, give us a bit of room for out-of-range values to test if( formatMax < 4294967295LL ) formatMax += 2; // Now gen for( size_t i = 0; i < numPixels * 4; i++ ) { imageData[ i ] = random_in_range( formatMin, (int)formatMax, d ); } break; } default: // Unsupported source format delete [] data; return NULL; } return data; } /* deprecated bool clamp_image_coord( image_sampler_data *imageSampler, float value, size_t max, int &outValue ) { int v = (int)value; switch(imageSampler->addressing_mode) { case CL_ADDRESS_REPEAT: outValue = v; while( v < 0 ) v += (int)max; while( v >= (int)max ) v -= (int)max; if( v != outValue ) { outValue = v; return true; } return false; case CL_ADDRESS_MIRRORED_REPEAT: log_info( "ERROR: unimplemented for CL_ADDRESS_MIRRORED_REPEAT. Do we ever use this? exit(-1); default: if( v < 0 ) { outValue = 0; return true; } if( v >= (int)max ) { outValue = (int)max - 1; return true; } outValue = v; return false; } } */ void get_sampler_kernel_code( image_sampler_data *imageSampler, char *outLine ) { const char *normalized; const char *addressMode; const char *filterMode; if( imageSampler->addressing_mode == CL_ADDRESS_CLAMP ) addressMode = "CLK_ADDRESS_CLAMP"; else if( imageSampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE ) addressMode = "CLK_ADDRESS_CLAMP_TO_EDGE"; else if( imageSampler->addressing_mode == CL_ADDRESS_REPEAT ) addressMode = "CLK_ADDRESS_REPEAT"; else if( imageSampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT ) addressMode = "CLK_ADDRESS_MIRRORED_REPEAT"; else if( imageSampler->addressing_mode == CL_ADDRESS_NONE ) addressMode = "CLK_ADDRESS_NONE"; else { log_error( "**Error: Unknown addressing mode! Aborting...\n" ); abort(); } if( imageSampler->normalized_coords ) normalized = "CLK_NORMALIZED_COORDS_TRUE"; else normalized = "CLK_NORMALIZED_COORDS_FALSE"; if( imageSampler->filter_mode == CL_FILTER_LINEAR ) filterMode = "CLK_FILTER_LINEAR"; else filterMode = "CLK_FILTER_NEAREST"; sprintf( outLine, " const sampler_t imageSampler = %s | %s | %s;\n", addressMode, filterMode, normalized ); } void copy_image_data( image_descriptor *srcImageInfo, image_descriptor *dstImageInfo, void *imageValues, void *destImageValues, const size_t sourcePos[], const size_t destPos[], const size_t regionSize[] ) { // assert( srcImageInfo->format == dstImageInfo->format ); size_t src_mip_level_offset = 0, dst_mip_level_offset = 0; size_t sourcePos_lod[3], destPos_lod[3], src_lod, dst_lod; size_t src_row_pitch_lod, src_slice_pitch_lod; size_t dst_row_pitch_lod, dst_slice_pitch_lod; size_t pixelSize = get_pixel_size( srcImageInfo->format ); sourcePos_lod[0] = sourcePos[0]; sourcePos_lod[1] = sourcePos[1]; sourcePos_lod[2] = sourcePos[2]; destPos_lod[0] = destPos[0]; destPos_lod[1] = destPos[1]; destPos_lod[2] = destPos[2]; src_row_pitch_lod = srcImageInfo->rowPitch; dst_row_pitch_lod = dstImageInfo->rowPitch; src_slice_pitch_lod = srcImageInfo->slicePitch; dst_slice_pitch_lod = dstImageInfo->slicePitch; if( srcImageInfo->num_mip_levels > 1) { size_t src_width_lod = 1/*srcImageInfo->width*/; size_t src_height_lod = 1/*srcImageInfo->height*/; size_t src_depth_lod = 1/*srcImageInfo->depth*/; switch( srcImageInfo->type ) { case CL_MEM_OBJECT_IMAGE1D: src_lod = sourcePos[1]; sourcePos_lod[1] = sourcePos_lod[2] = 0; src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D: src_lod = sourcePos[2]; sourcePos_lod[1] = sourcePos[1]; sourcePos_lod[2] = 0; src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1; if( srcImageInfo->type == CL_MEM_OBJECT_IMAGE2D ) src_height_lod = (srcImageInfo->height >> src_lod ) ? ( srcImageInfo->height >> src_lod ): 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE3D: src_lod = sourcePos[3]; sourcePos_lod[1] = sourcePos[1]; sourcePos_lod[2] = sourcePos[2]; src_width_lod = (srcImageInfo->width >> src_lod ) ? ( srcImageInfo->width >> src_lod ): 1; src_height_lod = (srcImageInfo->height >> src_lod ) ? ( srcImageInfo->height >> src_lod ): 1; if( srcImageInfo->type == CL_MEM_OBJECT_IMAGE3D ) src_depth_lod = (srcImageInfo->depth >> src_lod ) ? ( srcImageInfo->depth >> src_lod ): 1; break; } src_mip_level_offset = compute_mip_level_offset( srcImageInfo, src_lod ); src_row_pitch_lod = src_width_lod * get_pixel_size( srcImageInfo->format ); src_slice_pitch_lod = src_row_pitch_lod * src_height_lod; } if( dstImageInfo->num_mip_levels > 1) { size_t dst_width_lod = 1/*dstImageInfo->width*/; size_t dst_height_lod = 1/*dstImageInfo->height*/; size_t dst_depth_lod = 1 /*dstImageInfo->depth*/; switch( dstImageInfo->type ) { case CL_MEM_OBJECT_IMAGE1D: dst_lod = destPos[1]; destPos_lod[1] = destPos_lod[2] = 0; dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1; break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D: dst_lod = destPos[2]; destPos_lod[1] = destPos[1]; destPos_lod[2] = 0; dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1; if( dstImageInfo->type == CL_MEM_OBJECT_IMAGE2D ) dst_height_lod = (dstImageInfo->height >> dst_lod ) ? ( dstImageInfo->height >> dst_lod ): 1; break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE3D: dst_lod = destPos[3]; destPos_lod[1] = destPos[1]; destPos_lod[2] = destPos[2]; dst_width_lod = (dstImageInfo->width >> dst_lod ) ? ( dstImageInfo->width >> dst_lod ): 1; dst_height_lod = (dstImageInfo->height >> dst_lod ) ? ( dstImageInfo->height >> dst_lod ): 1; if( dstImageInfo->type == CL_MEM_OBJECT_IMAGE3D ) dst_depth_lod = (dstImageInfo->depth >> dst_lod ) ? ( dstImageInfo->depth >> dst_lod ): 1; break; } dst_mip_level_offset = compute_mip_level_offset( dstImageInfo, dst_lod ); dst_row_pitch_lod = dst_width_lod * get_pixel_size( dstImageInfo->format); dst_slice_pitch_lod = dst_row_pitch_lod * dst_height_lod; } // Get initial pointers char *sourcePtr = (char *)imageValues + sourcePos_lod[ 2 ] * src_slice_pitch_lod + sourcePos_lod[ 1 ] * src_row_pitch_lod + pixelSize * sourcePos_lod[ 0 ] + src_mip_level_offset; char *destPtr = (char *)destImageValues + destPos_lod[ 2 ] * dst_slice_pitch_lod + destPos_lod[ 1 ] * dst_row_pitch_lod + pixelSize * destPos_lod[ 0 ] + dst_mip_level_offset; for( size_t z = 0; z < ( regionSize[ 2 ] > 0 ? regionSize[ 2 ] : 1 ); z++ ) { char *rowSourcePtr = sourcePtr; char *rowDestPtr = destPtr; for( size_t y = 0; y < regionSize[ 1 ]; y++ ) { memcpy( rowDestPtr, rowSourcePtr, pixelSize * regionSize[ 0 ] ); rowSourcePtr += src_row_pitch_lod; rowDestPtr += dst_row_pitch_lod; } sourcePtr += src_slice_pitch_lod; destPtr += dst_slice_pitch_lod; } } float random_float(float low, float high, MTdata d) { float t = (float) genrand_real1(d); return (1.0f - t) * low + t * high; } CoordWalker::CoordWalker( void * coords, bool useFloats, size_t vecSize ) { if( useFloats ) { mFloatCoords = (cl_float *)coords; mIntCoords = NULL; } else { mFloatCoords = NULL; mIntCoords = (cl_int *)coords; } mVecSize = vecSize; } CoordWalker::~CoordWalker() { } cl_float CoordWalker::Get( size_t idx, size_t el ) { if( mIntCoords != NULL ) return (cl_float)mIntCoords[ idx * mVecSize + el ]; else return mFloatCoords[ idx * mVecSize + el ]; } void print_read_header( cl_image_format *format, image_sampler_data *sampler, bool err, int t ) { const char *addressMode = NULL; const char *normalizedNames[2] = { "UNNORMALIZED", "NORMALIZED" }; if( sampler->addressing_mode == CL_ADDRESS_CLAMP ) addressMode = "CL_ADDRESS_CLAMP"; else if( sampler->addressing_mode == CL_ADDRESS_CLAMP_TO_EDGE ) addressMode = "CL_ADDRESS_CLAMP_TO_EDGE"; else if( sampler->addressing_mode == CL_ADDRESS_REPEAT ) addressMode = "CL_ADDRESS_REPEAT"; else if( sampler->addressing_mode == CL_ADDRESS_MIRRORED_REPEAT ) addressMode = "CL_ADDRESS_MIRRORED_REPEAT"; else addressMode = "CL_ADDRESS_NONE"; if( t ) { if( err ) log_error( "[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0], t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED" ); else log_info( "[%-7s %-24s %d] - %s - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0], t == 1 ? "TRANSPOSED" : "NON-TRANSPOSED" ); } else { if( err ) log_error( "[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0] ); else log_info( "[%-7s %-24s %d] - %s - %s - %s\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ), sampler->filter_mode == CL_FILTER_NEAREST ? "CL_FILTER_NEAREST" : "CL_FILTER_LINEAR", addressMode, normalizedNames[sampler->normalized_coords ? 1 : 0] ); } } void print_write_header( cl_image_format *format, bool err = false) { if( err ) log_error( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ) ); else log_info( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ) ); } void print_header( cl_image_format *format, bool err = false ) { if (err) { log_error( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ) ); } else { log_info( "[%-7s %-24s %d]\n", GetChannelOrderName( format->image_channel_order ), GetChannelTypeName( format->image_channel_data_type ), (int)get_format_channel_count( format ) ); } } bool find_format( cl_image_format *formatList, unsigned int numFormats, cl_image_format *formatToFind ) { for( unsigned int i = 0; i < numFormats; i++ ) { if( formatList[ i ].image_channel_order == formatToFind->image_channel_order && formatList[ i ].image_channel_data_type == formatToFind->image_channel_data_type ) return true; } return false; } void build_required_image_formats(cl_mem_flags flags, cl_mem_object_type image_type, cl_device_id device, std::vector& formatsToSupport) { Version version = get_device_cl_version(device); formatsToSupport.clear(); // Required embedded formats. static std::vector embeddedProfReadOrWriteFormats { { CL_RGBA, CL_UNORM_INT8 }, { CL_RGBA, CL_UNORM_INT16 }, { CL_RGBA, CL_SIGNED_INT8 }, { CL_RGBA, CL_SIGNED_INT16 }, { CL_RGBA, CL_SIGNED_INT32 }, { CL_RGBA, CL_UNSIGNED_INT8 }, { CL_RGBA, CL_UNSIGNED_INT16 }, { CL_RGBA, CL_UNSIGNED_INT32 }, { CL_RGBA, CL_HALF_FLOAT }, { CL_RGBA, CL_FLOAT }, }; /* Required full profile formats. This array does not contain any full profile formats that have restrictions on when they are required. */ static std::vector fullProfReadOrWriteFormats { { CL_RGBA, CL_UNORM_INT8 }, { CL_RGBA, CL_UNORM_INT16 }, { CL_RGBA, CL_SIGNED_INT8 }, { CL_RGBA, CL_SIGNED_INT16 }, { CL_RGBA, CL_SIGNED_INT32 }, { CL_RGBA, CL_UNSIGNED_INT8 }, { CL_RGBA, CL_UNSIGNED_INT16 }, { CL_RGBA, CL_UNSIGNED_INT32 }, { CL_RGBA, CL_HALF_FLOAT }, { CL_RGBA, CL_FLOAT }, { CL_BGRA, CL_UNORM_INT8 }, }; /* Required full profile formats specifically for 2.x. This array does not contain any full profile formats that have restrictions on when they are required. */ static std::vector fullProf2XReadOrWriteFormats { { CL_R, CL_UNORM_INT8 }, { CL_R, CL_UNORM_INT16 }, { CL_R, CL_SNORM_INT8 }, { CL_R, CL_SNORM_INT16 }, { CL_R, CL_SIGNED_INT8 }, { CL_R, CL_SIGNED_INT16 }, { CL_R, CL_SIGNED_INT32 }, { CL_R, CL_UNSIGNED_INT8 }, { CL_R, CL_UNSIGNED_INT16 }, { CL_R, CL_UNSIGNED_INT32 }, { CL_R, CL_HALF_FLOAT }, { CL_R, CL_FLOAT }, { CL_RG, CL_UNORM_INT8 }, { CL_RG, CL_UNORM_INT16 }, { CL_RG, CL_SNORM_INT8 }, { CL_RG, CL_SNORM_INT16 }, { CL_RG, CL_SIGNED_INT8 }, { CL_RG, CL_SIGNED_INT16 }, { CL_RG, CL_SIGNED_INT32 }, { CL_RG, CL_UNSIGNED_INT8 }, { CL_RG, CL_UNSIGNED_INT16 }, { CL_RG, CL_UNSIGNED_INT32 }, { CL_RG, CL_HALF_FLOAT }, { CL_RG, CL_FLOAT }, { CL_RGBA, CL_SNORM_INT8 }, { CL_RGBA, CL_SNORM_INT16 }, }; /* Required full profile formats for CL_DEPTH (specifically 2.x). There are cases whereby the format isn't required. */ static std::vector fullProf2XReadOrWriteDepthFormats { { CL_DEPTH, CL_UNORM_INT16 }, { CL_DEPTH, CL_FLOAT }, }; /* Required full profile formats for CL_sRGB (specifically 2.x). There are cases whereby the format isn't required. */ static std::vector fullProf2XSRGBFormats { { CL_sRGBA, CL_UNORM_INT8 }, }; // Embedded profile if (gIsEmbedded) { copy(embeddedProfReadOrWriteFormats.begin(), embeddedProfReadOrWriteFormats.end(), back_inserter(formatsToSupport)); } // Full profile else { copy(fullProfReadOrWriteFormats.begin(), fullProfReadOrWriteFormats.end(), back_inserter(formatsToSupport)); } // Full profile, OpenCL 2.0, 2.1, 2.2 if (!gIsEmbedded && version >= Version(2, 0) && version <= Version(2, 2)) { copy(fullProf2XReadOrWriteFormats.begin(), fullProf2XReadOrWriteFormats.end(), back_inserter(formatsToSupport)); // Depth images are only required for 2DArray and 2D images if (image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D) { copy(fullProf2XReadOrWriteDepthFormats.begin(), fullProf2XReadOrWriteDepthFormats.end(), back_inserter(formatsToSupport)); } // sRGB is not required for 1DImage Buffers if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) { // sRGB is only required for reading if (flags == CL_MEM_READ_ONLY) { copy(fullProf2XSRGBFormats.begin(), fullProf2XSRGBFormats.end(), back_inserter(formatsToSupport)); } } } } bool is_image_format_required(cl_image_format format, cl_mem_flags flags, cl_mem_object_type image_type, cl_device_id device) { std::vector formatsToSupport; build_required_image_formats(flags, image_type, device, formatsToSupport); for (auto &formatItr: formatsToSupport) { if (formatItr.image_channel_order == format.image_channel_order && formatItr.image_channel_data_type == format.image_channel_data_type) { return true; } } return false; } cl_uint compute_max_mip_levels( size_t width, size_t height, size_t depth) { cl_uint retMaxMipLevels=0, max_dim = 0; max_dim = width; max_dim = height > max_dim ? height : max_dim; max_dim = depth > max_dim ? depth : max_dim; while(max_dim) { retMaxMipLevels++; max_dim >>= 1; } return retMaxMipLevels; } cl_ulong compute_mipmapped_image_size( image_descriptor imageInfo) { cl_ulong retSize = 0; size_t curr_width, curr_height, curr_depth, curr_array_size; curr_width = imageInfo.width; curr_height = imageInfo.height; curr_depth = imageInfo.depth; curr_array_size = imageInfo.arraySize; for (int i=0; i < (int) imageInfo.num_mip_levels; i++) { switch ( imageInfo.type ) { case CL_MEM_OBJECT_IMAGE3D : retSize += (cl_ulong)curr_width * curr_height * curr_depth * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE2D : retSize += (cl_ulong)curr_width * curr_height * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE1D : retSize += (cl_ulong)curr_width * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE1D_ARRAY : retSize += (cl_ulong)curr_width * curr_array_size * get_pixel_size(imageInfo.format); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY : retSize += (cl_ulong)curr_width * curr_height * curr_array_size * get_pixel_size(imageInfo.format); break; } switch ( imageInfo.type ) { case CL_MEM_OBJECT_IMAGE3D : curr_depth = curr_depth >> 1 ? curr_depth >> 1: 1; case CL_MEM_OBJECT_IMAGE2D : case CL_MEM_OBJECT_IMAGE2D_ARRAY : curr_height = curr_height >> 1? curr_height >> 1 : 1; case CL_MEM_OBJECT_IMAGE1D : case CL_MEM_OBJECT_IMAGE1D_ARRAY : curr_width = curr_width >> 1? curr_width >> 1 : 1; } } return retSize; } size_t compute_mip_level_offset( image_descriptor * imageInfo , size_t lod) { size_t retOffset = 0; size_t width, height, depth; width = imageInfo->width; height = imageInfo->height; depth = imageInfo->depth; for(size_t i=0; i < lod; i++) { switch(imageInfo->type) { case CL_MEM_OBJECT_IMAGE2D_ARRAY: retOffset += (size_t) width * height * imageInfo->arraySize * get_pixel_size( imageInfo->format ); break; case CL_MEM_OBJECT_IMAGE3D: retOffset += (size_t) width * height * depth * get_pixel_size( imageInfo->format ); break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: retOffset += (size_t) width * imageInfo->arraySize * get_pixel_size( imageInfo->format ); break; case CL_MEM_OBJECT_IMAGE2D: retOffset += (size_t) width * height * get_pixel_size( imageInfo->format ); break; case CL_MEM_OBJECT_IMAGE1D: retOffset += (size_t) width * get_pixel_size( imageInfo->format ); break; } // Compute next lod dimensions switch(imageInfo->type) { case CL_MEM_OBJECT_IMAGE3D: depth = ( depth >> 1 ) ? ( depth >> 1 ) : 1; case CL_MEM_OBJECT_IMAGE2D: case CL_MEM_OBJECT_IMAGE2D_ARRAY: height = ( height >> 1 ) ? ( height >> 1 ) : 1; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE1D: width = ( width >> 1 ) ? ( width >> 1 ) : 1; } } return retOffset; }