/*
 * Copyright 2020-2022 Matias N. Goldberg
 * Copyright 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#version 310 es

#if defined(GL_ES) && GL_ES == 1
    // Desktop GLSL allows the const keyword for either compile-time or
    // run-time constants. GLSL ES only allows the keyword for compile-time
    // constants. Since we use const on run-time constants, define it to
    // nothing.
    #define const
#endif

// The next line is a placeholder that gets substituted with the contents of the
// named file before this shader is compiled.
// NOTE(review): presumably substituted through printf-style formatting, so do not
// add other format specifiers to this file - confirm against the loader.
%s // include "CrossPlatformSettings_piece_all.glsl"

#define FLT_MAX 340282346638528859811704183484516925440.0f

/// Maximum number of refinement passes run by main() for each block
layout( location = 0 ) uniform uint p_numRefinements;

/// Source image; each invocation reads one 4x4 texel block from mip 0
uniform sampler2D srcTex;

/// Destination image; one RGBA16UI texel is written per compressed block
layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

/// Lookup tables indexed by an 8-bit channel value. Element [0] is used for the
/// max endpoint and element [1] for the min endpoint.
/// NOTE(review): presumably the optimal single-colour 5-bit / 6-bit endpoint
/// pairs (stb_dxt's OMatch5 / OMatch6) - confirm against the code filling the buffer.
layout( std430, binding = 1 ) readonly restrict buffer globalBuffer
{
    float2 c_oMatch5[256];
    float2 c_oMatch6[256];
};

layout( local_size_x = 8,  //
        local_size_y = 8,  //
        local_size_z = 1 ) in;

/// Expands an RGB565 colour (stored as a float holding an integer value) into
/// three channels in the 0..255 range.
/// floor( x * 8.25 ) and floor( x * 4.0625 ) replicate the top bits of a 5-bit /
/// 6-bit value into the low bits of the 8-bit result.
float3 rgb565to888( float rgb565 )
{
    float3 retVal;
    retVal.x = floor( rgb565 / 2048.0f );
    retVal.y = floor( mod( rgb565, 2048.0f ) / 32.0f );
    retVal.z = floor( mod( rgb565, 32.0f ) );
    return floor( retVal * float3( 8.25f, 4.0625f, 8.25f ) );
}

/// Quantizes a colour in the 0..255 range to RGB565 (round to nearest) and
/// returns it packed as r * 2048 + g * 32 + b in a float.
float rgb888to565( float3 rgbValue )
{
    rgbValue.rb = floor( rgbValue.rb * 31.0f / 255.0f + 0.5f );
    rgbValue.g = floor( rgbValue.g * 63.0f / 255.0f + 0.5f );

    return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b;
}

/// Linear interpolation at the 1/3 point between a and b, using the rounding
/// type selected by STB_DXT_USE_ROUNDING_BIAS
float3 lerp13( float3 a, float3 b )
{
#ifdef STB_DXT_USE_ROUNDING_BIAS
    // with rounding bias
    return a + floor( ( b - a ) * ( 1.0f / 3.0f ) + 0.5f );
#else
    // without rounding bias
    return floor( ( 2.0f * a + b ) / 3.0f );
#endif
}

/// Unpacks a block of 4 colours from two 16-bit endpoints
/// @param colours [out]
///     [0] = c0, [1] = c1, [2] = 1/3 of the way from c0 to c1,
///     [3] = 1/3 of the way from c1 to c0. All in the 0..255 range.
/// @param c0
///     First endpoint, in RGB565
/// @param c1
///     Second endpoint, in RGB565
void EvalColors( out float3 colours[4], float c0, float c1 )
{
    colours[0] = rgb565to888( c0 );
    colours[1] = rgb565to888( c1 );
    colours[2] = lerp13( colours[0], colours[1] );
    colours[3] = lerp13( colours[1], colours[0] );
}

/** The color optimization function. (Clever code, part 1)
    Estimates the principal axis of the block's colours (power iteration on the
    covariance matrix) and picks the two pixels that are the furthest apart
    along that axis as endpoints.
@param srcPixelsBlock
    The 16 pixels of the block, each packed with packUnorm4x8
@param outMinEndp16 [out]
    Minimum endpoint, in RGB565
@param outMaxEndp16 [out]
    Maximum endpoint, in RGB565
*/
void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 )
{
    // determine color distribution
    float3 avgColour;
    float3 minColour;
    float3 maxColour;

    avgColour = minColour = maxColour = unpackUnorm4x8( srcPixelsBlock[0] ).xyz;
    for( int i = 1; i < 16; ++i )
    {
        const float3 currColourUnorm = unpackUnorm4x8( srcPixelsBlock[i] ).xyz;
        avgColour += currColourUnorm;
        minColour = min( minColour, currColourUnorm );
        maxColour = max( maxColour, currColourUnorm );
    }

    // Move from the 0..1 range to the 0..255 range
    avgColour = round( avgColour * 255.0f / 16.0f );
    maxColour *= 255.0f;
    minColour *= 255.0f;

    // determine covariance matrix. Only the upper triangle of the symmetric
    // 3x3 matrix is stored: [ rr, rg, rb, gg, gb, bb ]
    float cov[6];
    for( int i = 0; i < 6; ++i )
        cov[i] = 0.0f;

    for( int i = 0; i < 16; ++i )
    {
        const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
        float3 rgbDiff = currColour - avgColour;

        cov[0] += rgbDiff.r * rgbDiff.r;
        cov[1] += rgbDiff.r * rgbDiff.g;
        cov[2] += rgbDiff.r * rgbDiff.b;
        cov[3] += rgbDiff.g * rgbDiff.g;
        cov[4] += rgbDiff.g * rgbDiff.b;
        cov[5] += rgbDiff.b * rgbDiff.b;
    }

    // scale down the covariance matrix, find principal axis via power iter
    for( int i = 0; i < 6; ++i )
        cov[i] /= 255.0f;

    // Initial guess for the axis: the diagonal of the colour bounding box
    float3 vF = maxColour - minColour;

    const int nIterPower = 4;
    for( int iter = 0; iter < nIterPower; ++iter )
    {
        const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2];
        const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4];
        const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5];

        vF.r = r;
        vF.g = g;
        vF.b = b;
    }

    float magn = max3( abs( vF.r ), abs( vF.g ), abs( vF.b ) );
    float3 v;

    if( magn < 4.0f )
    {                  // too small, default to luminance
        v.r = 299.0f;  // JPEG YCbCr luma coefs, scaled by 1000.
        v.g = 587.0f;
        v.b = 114.0f;
    }
    else
    {
        // Normalize so that the largest component has magnitude 512
        v = trunc( vF * ( 512.0f / magn ) );
    }

    // Pick colors at extreme points
    float3 minEndpoint, maxEndpoint;
    float minDot = FLT_MAX;
    float maxDot = -FLT_MAX;
    for( int i = 0; i < 16; ++i )
    {
        const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
        const float dotValue = dot( currColour, v );

        if( dotValue < minDot )
        {
            minDot = dotValue;
            minEndpoint = currColour;
        }

        if( dotValue > maxDot )
        {
            maxDot = dotValue;
            maxEndpoint = currColour;
        }
    }

    outMinEndp16 = rgb888to565( minEndpoint );
    outMaxEndp16 = rgb888to565( maxEndpoint );
}

/// Maps a projected value to the 2-bit palette index whose interval contains it.
/// @param dotValue
///     Pixel colour projected onto the endpoint axis (must use the same scale as
///     the three thresholds)
/// @param c0Point
///     Threshold between palette entries 1 and 3
/// @param halfPoint
///     Threshold between palette entries 3 and 2
/// @param c3Point
///     Threshold between palette entries 2 and 0
/// @return
///     Palette index in range [0; 4)
uint SelectStep( float dotValue, float c0Point, float halfPoint, float c3Point )
{
    if( dotValue < halfPoint )
        return ( dotValue < c0Point ) ? 1u : 3u;
    else
        return ( dotValue < c3Point ) ? 2u : 0u;
}

/// The color matching function.
/// Projects every pixel onto the axis colour[0] - colour[1] and assigns it the
/// closest of the 4 palette entries.
/// @param srcPixelsBlock
///     The 16 pixels of the block, each packed with packUnorm4x8
/// @param colour
///     The 4 palette colours, as produced by EvalColors
/// @return
///     32-bit mask with 2 bits per pixel; pixel i lives in bits [2i; 2i + 1]
uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] )
{
    uint mask = 0u;
    float3 dir = colour[0] - colour[1];
    float stops[4];

    for( int i = 0; i < 4; ++i )
        stops[i] = dot( colour[i], dir );

    // Along 'dir' the palette is ordered 1, 3, 2, 0. These are the midpoints
    // between consecutive entries.
    float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
    float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
    float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );

#ifndef BC1_DITHER
    // the version without dithering is straightforward
    // (iterates from pixel 15 down to 0 so that pixel 0 ends in the lowest bits)
    for( uint i = 16u; i-- > 0u; )
    {
        const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

        const float dotValue = dot( currColour, dir );
        mask <<= 2u;
        mask |= SelectStep( dotValue, c0Point, halfPoint, c3Point );
    }
#else
    // with floyd-steinberg dithering
    // ep1 = quantization error of the row being processed
    // ep2 = quantization error of the previous row
    float4 ep1 = float4( 0, 0, 0, 0 );
    float4 ep2 = float4( 0, 0, 0, 0 );

    // The diffused error is accumulated in 1/16th units, so the thresholds
    // are scaled to match
    c0Point *= 16.0f;
    halfPoint *= 16.0f;
    c3Point *= 16.0f;

    for( uint y = 0u; y < 4u; ++y )
    {
        float ditherDot;
        uint lmask, stepIdx;

        float3 currColour;
        float dotValue;

        // Column 0
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
        stepIdx = SelectStep( ditherDot, c0Point, halfPoint, c3Point );
        ep1[0] = dotValue - stops[stepIdx];
        lmask = stepIdx;

        // Column 1
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
        stepIdx = SelectStep( ditherDot, c0Point, halfPoint, c3Point );
        ep1[1] = dotValue - stops[stepIdx];
        lmask |= stepIdx << 2u;

        // Column 2
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
        stepIdx = SelectStep( ditherDot, c0Point, halfPoint, c3Point );
        ep1[2] = dotValue - stops[stepIdx];
        lmask |= stepIdx << 4u;

        // Column 3 (must read pixel index 3 of the row, not index 2 again)
        currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f;
        dotValue = dot( currColour, dir );

        ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
        stepIdx = SelectStep( ditherDot, c0Point, halfPoint, c3Point );
        ep1[3] = dotValue - stops[stepIdx];
        lmask |= stepIdx << 6u;

        // Each row contributes 8 bits to the mask
        mask |= lmask << ( y * 8u );
        {
            float4 tmp = ep1;
            ep1 = ep2;
            ep2 = tmp;
        }  // swap
    }
#endif

    return mask;
}

/// The refinement function. (Clever code, part 2)
/// Tries to optimize colors to suit block contents better.
/// (By solving a least squares system via normal equations+Cramer's rule)
/// @param srcPixelsBlock
///     The 16 pixels of the block, each packed with packUnorm4x8
/// @param mask
///     Current per-pixel palette indices, 2 bits per pixel
/// @param inOutMinEndp16 [in/out]
///     Minimum endpoint, in RGB565. Overwritten with the refined value.
/// @param inOutMaxEndp16 [in/out]
///     Maximum endpoint, in RGB565. Overwritten with the refined value.
/// @return
///     True if at least one of the endpoints changed
bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
                  inout float inOutMaxEndp16 )
{
    float newMin16, newMax16;
    const float oldMin = inOutMinEndp16;
    const float oldMax = inOutMaxEndp16;

    if( ( mask ^ ( mask << 2u ) ) < 4u )  // all pixels have the same index?
    {
        // yes, linear system would be singular; solve using optimal
        // single-color match on average color

        // Starting at 8/255 adds 0.5 once the sum is scaled by 255/16, so the
        // floor() below rounds to nearest
        float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f );
        for( int i = 0; i < 16; ++i )
            rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz;

        rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) );

        newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f +  //
                   c_oMatch6[uint( rgbVal.g )][0] * 32.0f +    //
                   c_oMatch5[uint( rgbVal.b )][0];
        newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f +  //
                   c_oMatch6[uint( rgbVal.g )][1] * 32.0f +    //
                   c_oMatch5[uint( rgbVal.b )][1];
    }
    else
    {
        // Weight of the max endpoint for each palette index (in thirds).
        // The weight of the min endpoint is w2 = 3 - w1.
        const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f );
        const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f );
        // ^some magic to save a lot of multiplies in the accumulating loop...
        // (precomputed products of weights for least squares system, packed into a
        // single accumulator as w1 * w1 * 65536 + w2 * w2 * 256 + w1 * w2)

        float akku = 0.0f;
        uint cm = mask;
        float3 at1 = float3( 0, 0, 0 );
        float3 at2 = float3( 0, 0, 0 );
        for( int i = 0; i < 16; ++i, cm >>= 2u )
        {
            const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

            const uint stepIdx = cm & 3u;
            const float w1 = w1Tab[stepIdx];
            akku += prods[stepIdx];
            at1 += currColour * w1;
            at2 += currColour;
        }

        // sum( colour * w2 ) = sum( colour * ( 3 - w1 ) )
        at2 = 3.0f * at2 - at1;

        // extract solutions and decide solvability
        // The three sums are unpacked with the same bases used to pack them in
        // 'prods' (65536 and 256). akku is at most 16 * 589824, which a 32-bit
        // float represents exactly.
        const float xx = floor( akku / 65536.0f );
        const float yy = floor( mod( akku, 65536.0f ) / 256.0f );
        const float xy = mod( akku, 256.0f );

        // Scale factors mapping the solution directly to 5 bits (r, b) and 6 bits (g)
        float2 f_rb_g;
        f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy );
        f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

        // solve.
        const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ),
                                        float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
        newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

        const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ),
                                        float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
        newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
    }

    inOutMinEndp16 = newMin16;
    inOutMaxEndp16 = newMax16;

    return oldMin != newMin16 || oldMax != newMax16;
}

#ifdef BC1_DITHER
/// Quantizes 'srcValue' which is originally in 888 (full range),
/// converting it to 565 and then back to 888 (quantized)
float3 quant( float3 srcValue )
{
    srcValue = clamp( srcValue, 0.0f, 255.0f );
    // Convert 888 -> 565
    srcValue = floor( srcValue * float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f ) + 0.5f );
    // Convert 565 -> 888 back
    srcValue = floor( srcValue * float3( 8.25f, 4.0625f, 8.25f ) );

    return srcValue;
}

/// Produces a version of the block quantized to what RGB565 can represent,
/// diffusing the quantization error to neighbouring pixels (Floyd-Steinberg
/// weights 7, 3, 5 and 1 over 16).
/// @param srcPixBlck
///     The 16 pixels of the block, each packed with packUnorm4x8
/// @param dthPixBlck [out]
///     The 16 dithered pixels, packed with packUnorm4x8 (alpha is set to 1)
void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] )
{
    // ep1 = quantization error of the row being processed
    // ep2 = quantization error of the previous row
    float3 ep1[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) );
    float3 ep2[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) );

    // y is the index of the first pixel of each row
    for( uint y = 0u; y < 16u; y += 4u )
    {
        float3 srcPixel, dithPixel;

        // Column 0
        srcPixel = unpackUnorm4x8( srcPixBlck[y + 0u] ).xyz * 255.0f;
        dithPixel = quant( srcPixel + trunc( ( 3.0f * ep2[1] + 5.0f * ep2[0] ) * ( 1.0f / 16.0f ) ) );
        ep1[0] = srcPixel - dithPixel;
        dthPixBlck[y + 0u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        // Column 1
        srcPixel = unpackUnorm4x8( srcPixBlck[y + 1u] ).xyz * 255.0f;
        dithPixel = quant(
            srcPixel + trunc( ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] ) * ( 1.0f / 16.0f ) ) );
        ep1[1] = srcPixel - dithPixel;
        dthPixBlck[y + 1u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        // Column 2
        srcPixel = unpackUnorm4x8( srcPixBlck[y + 2u] ).xyz * 255.0f;
        dithPixel = quant(
            srcPixel + trunc( ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] ) * ( 1.0f / 16.0f ) ) );
        ep1[2] = srcPixel - dithPixel;
        dthPixBlck[y + 2u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        // Column 3
        srcPixel = unpackUnorm4x8( srcPixBlck[y + 3u] ).xyz * 255.0f;
        dithPixel = quant( srcPixel + trunc( ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] ) * ( 1.0f / 16.0f ) ) );
        ep1[3] = srcPixel - dithPixel;
        dthPixBlck[y + 3u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) );

        // swap( ep1, ep2 )
        for( uint i = 0u; i < 4u; ++i )
        {
            float3 tmp = ep1[i];
            ep1[i] = ep2[i];
            ep2[i] = tmp;
        }
    }
}
#endif

/// Entry point. Each invocation compresses the 4x4 texel block located at
/// gl_GlobalInvocationID.xy * 4 in srcTex and writes the result to
/// dstTexture at gl_GlobalInvocationID.xy as:
///     x = max endpoint (RGB565), y = min endpoint (RGB565),
///     z = lower 16 bits of the index mask, w = upper 16 bits of the index mask
void main()
{
    uint srcPixelsBlock[16];

    bool bAllColoursEqual = true;

    // Load the whole 4x4 block
    const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
    for( uint i = 0u; i < 16u; ++i )
    {
        const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u );
        const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz;
        srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) );
        bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
    }

    float maxEndp16, minEndp16;
    uint mask = 0u;

    if( bAllColoursEqual )
    {
        // Solid colour block: take both endpoints from the lookup tables and
        // make every pixel use palette index 2
        const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f );
        mask = 0xAAAAAAAAu;
        maxEndp16 =
            c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
        minEndp16 =
            c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
    }
    else
    {
#ifdef BC1_DITHER
        uint ditherPixelsBlock[16];
        // first step: compute dithered version for PCA if desired
        DitherBlock( srcPixelsBlock, ditherPixelsBlock );
#else
#    define ditherPixelsBlock srcPixelsBlock
#endif

        // second step: pca+map along principal axis
        OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 );
        if( minEndp16 != maxEndp16 )
        {
            float3 colours[4];
            EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
            mask = MatchColorsBlock( srcPixelsBlock, colours );
        }

        // third step: refine (multiple times if requested)
        bool bStopRefinement = false;
        for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i )
        {
            const uint lastMask = mask;

            if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) )
            {
                if( minEndp16 != maxEndp16 )
                {
                    float3 colours[4];
                    EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
                    mask = MatchColorsBlock( srcPixelsBlock, colours );
                }
                else
                {
                    // Both endpoints collapsed into the same colour
                    mask = 0u;
                    bStopRefinement = true;
                }
            }

            // Stop early once the indices no longer change
            bStopRefinement = mask == lastMask || bStopRefinement;
        }
    }

    // write the color block
    if( maxEndp16 < minEndp16 )
    {
        // Endpoints are swapped so that the first one is the largest. Flipping
        // the lowest bit of every index (0 <-> 1, 2 <-> 3) keeps each pixel
        // pointing at the same colour.
        const float tmpValue = minEndp16;
        minEndp16 = maxEndp16;
        maxEndp16 = tmpValue;
        mask ^= 0x55555555u;
    }

    uint4 outputBytes;
    outputBytes.x = uint( maxEndp16 );
    outputBytes.y = uint( minEndp16 );
    outputBytes.z = mask & 0xFFFFu;
    outputBytes.w = mask >> 16u;

    uint2 dstUV = gl_GlobalInvocationID.xy;
    imageStore( dstTexture, int2( dstUV ), outputBytes );
}