/*
 * Copyright 2020-2022 Matias N. Goldberg
 * Copyright 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#version 310 es

#if defined(GL_ES) && GL_ES == 1
    // GLSL ES restricts the const keyword to compile-time constants, whereas
    // desktop GLSL also accepts it on run-time constants. This shader uses
    // const on run-time values, so the keyword is defined away here.
    #define const
#endif

// Shared-memory fence followed by a workgroup execution barrier.
#define __sharedOnlyBarrier memoryBarrierShared();barrier();

%s // include "CrossPlatformSettings_piece_all.glsl"

// One (min, max) pair per thread of the 4x4x4 workgroup.
shared float2 g_minMaxValues[4u * 4u * 4u];
// One pair of merged index masks per block; 4x4 blocks per workgroup.
shared uint2 g_mask[4u * 4u];

layout( location = 0 ) uniform uint2 params;

// params.x selects the source channel: 0 -> x, 1 -> y, anything else -> w.
#define p_channelIdx params.x
// params.y != 0 packs the endpoints as SNORM instead of UNORM.
#define p_useSNorm params.y

uniform sampler2D srcTex;

layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

layout( local_size_x = 4, local_size_y = 4, local_size_z = 4 ) in;

/// Encodes one 4x4 texel block of a single source channel into four 16-bit
/// words: two 8-bit endpoints followed by sixteen 3-bit indices.
///
/// Work split:
/// - A block holds 16 texels and each thread handles 4 of them, so 4 threads
///   (told apart by gl_LocalInvocationID.x) cooperate on one block.
/// - Every thread produces 2 partial index masks; the 8 partial masks of a
///   block are OR-ed together in shared memory into 2.
/// - The thread with gl_LocalInvocationID.x == 0 stores the finished block at
///   texel gl_GlobalInvocationID.yz of dstTexture.
///
/// Why 4 texels per thread rather than 1, 2 or 16? It is a sweet spot:
/// - Very short threads cannot fill expensive GPUs with enough work
///   (dispatch bound).
/// - Many threads mean more synchronization overhead (evaluating min/max,
///   merging masks) and more LDS usage, which reduces occupancy.
/// - Long threads (e.g. 1 thread per block) miss parallelism opportunities.
void main()
{
    const uint laneInBlock = gl_LocalInvocationID.x;

    // Top-left texel of the 4x4 source block owned by this (y, z) pair.
    const uint2 blockOrigin = gl_GlobalInvocationID.yz << 2u;

    // Fetch the four texels at x = 0..3, y = laneInBlock inside the block and
    // keep the selected channel scaled by 255.
    float4 srcPixel;
    for( uint col = 0u; col < 4u; ++col )
    {
        const uint2 texelPos = blockOrigin + uint2( col, laneInBlock );
        const float4 texel = OGRE_Load2D( srcTex, int2( texelPos ), 0 ).xyzw;

        float channel;
        if( p_channelIdx == 0u )
            channel = texel.x;
        else if( p_channelIdx == 1u )
            channel = texel.y;
        else
            channel = texel.w;

        srcPixel[col] = channel * 255.0f;
    }

    // Range of this thread's own four values.
    float minVal = min( min3( srcPixel.x, srcPixel.y, srcPixel.z ), srcPixel.w );
    float maxVal = max( max3( srcPixel.x, srcPixel.y, srcPixel.z ), srcPixel.w );

    // Shared-memory slots: 4 consecutive range entries and 1 mask entry per block.
    const uint rangeSlotBase = gl_LocalInvocationID.z * 16u + gl_LocalInvocationID.y * 4u;
    const uint maskSlot = gl_LocalInvocationID.z * 4u + gl_LocalInvocationID.y;

    g_minMaxValues[rangeSlotBase + laneInBlock] = float2( minVal, maxVal );
    // All four threads of the block store the same zero value here.
    g_mask[maskSlot] = uint2( 0u, 0u );

    __sharedOnlyBarrier;

    // Every thread of the block folds the four published ranges so that all
    // of them end up with the block-wide min/max.
    for( uint lane = 0u; lane < 4u; ++lane )
    {
        const float2 laneRange = g_minMaxValues[rangeSlotBase + lane];
        minVal = min( laneRange.x, minVal );
        maxVal = max( laneRange.y, maxVal );
    }

    const float dist = maxVal - minVal;
    const float dist2 = dist * 2.0f;
    const float dist4 = dist * 4.0f;

    float bias;
    if( dist < 8.0f )
        bias = dist - 1.0f;
    else
        bias = trunc( dist * 0.5f ) + 2.0f;
    bias -= minVal * 7.0f;

    uint mask0 = 0u;
    uint mask1 = 0u;

    for( uint px = 0u; px < 4u; ++px )
    {
        float scaled = srcPixel[px] * 7.0f + bias;

        // Pick the linear level, from 0 (value == min) up to 7 (value == max),
        // by testing the 4 / 2 / 1 multiples of dist in turn.
        int level = 0;
        if( scaled >= dist4 )
        {
            level = 4;
            scaled -= dist4;
        }
        if( scaled >= dist2 )
        {
            level += 2;
            scaled -= dist2;
        }
        if( scaled >= dist )
            level += 1;

        // Convert the linear level to a DXT index, where indices 0 and 1 are
        // the two extremal points (0 <-> level 7, 1 <-> level 0).
        int ind = ( -level ) & 7;
        if( ind < 2 )
            ind ^= 1;

        // Index bits start at bit 16 of the 64-bit (mask0, mask1) pair, three
        // bits per texel; the entry that starts at bit 31 straddles both words.
        const uint code = uint( ind );
        const uint bitPos = 16u + ( laneInBlock * 4u + px ) * 3u;
        if( bitPos >= 32u )
        {
            mask1 |= code << ( bitPos - 32u );
        }
        else
        {
            mask0 |= code << bitPos;
            if( bitPos + 3u > 32u )
                mask1 |= code >> ( 32u - bitPos );
        }
    }

    // Merge this thread's partial masks into the block's shared masks,
    // skipping the atomic when there is nothing to contribute.
    if( mask0 != 0u )
        atomicOr( g_mask[maskSlot].x, mask0 );
    if( mask1 != 0u )
        atomicOr( g_mask[maskSlot].y, mask1 );

    __sharedOnlyBarrier;

    // Only one thread per block writes the result. No barrier follows, so
    // leaving early here is safe.
    if( laneInBlock != 0u )
        return;

    // Endpoints, back in the 0..1 range: max goes in the first byte, min in
    // the second.
    const float hiEndpoint = maxVal * ( 1.0f / 255.0f );
    const float loEndpoint = minVal * ( 1.0f / 255.0f );

    uint endpointWord;
    if( p_useSNorm != 0u )
    {
        endpointWord = packSnorm4x8(
            float4( hiEndpoint * 2.0f - 1.0f, loEndpoint * 2.0f - 1.0f, 0.0f, 0.0f ) );
    }
    else
    {
        endpointWord = packUnorm4x8( float4( hiEndpoint, loEndpoint, 0.0f, 0.0f ) );
    }

    // The low 16 bits of the first mask word are never populated (indices
    // start at bit 16), so only its upper half is stored.
    const uint2 mergedMask = g_mask[maskSlot];
    const uint4 blockWords = uint4( endpointWord,            //
                                    mergedMask.x >> 16u,     //
                                    mergedMask.y & 0xFFFFu,  //
                                    mergedMask.y >> 16u );

    imageStore( dstTexture, int2( gl_GlobalInvocationID.yz ), blockWords );
}