• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright 2020-2022 Matias N. Goldberg
3 * Copyright 2022 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
#version 310 es

#if defined( GL_ES ) && GL_ES == 1
	// GLSL ES only accepts 'const' on compile-time constant expressions, while
	// desktop GLSL also accepts it on values that are only known at run time.
	// This shader marks run-time values as const, so on ES the keyword is
	// erased altogether.
	#define const
#endif

%s // include "CrossPlatformSettings_piece_all.glsl"

// Largest finite 32-bit float, spelled out in full decimal form
#define FLT_MAX 340282346638528859811704183484516925440.0f

// Upper bound on the number of endpoint refinement passes performed per block
layout( location = 0 ) uniform uint p_numRefinements;

// Image to compress; each invocation reads one 4x4 texel block from mip 0
uniform sampler2D srcTex;

// Output; each invocation stores one texel: (endpoint, endpoint, mask low 16 bits, mask high 16 bits)
layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture;

// Single-colour lookup tables indexed by an 8-bit channel value.
// Component [0] feeds the max endpoint and component [1] the min endpoint.
// c_oMatch5 is used for the red & blue channels, c_oMatch6 for green.
layout( std430, binding = 1 ) readonly restrict buffer globalBuffer
{
	float2 c_oMatch5[256];
	float2 c_oMatch6[256];
};

// One invocation per 4x4 block, 8x8 blocks per workgroup
layout( local_size_x = 8, local_size_y = 8, local_size_z = 1 ) in;
53
/// Expands a packed RGB565 value (stored in a float) to 8 bits per channel,
/// each channel in range [0; 255]
float3 rgb565to888( float rgb565 )
{
	// Split the packed value into its 5:6:5 channels using float arithmetic only
	const float red5 = floor( rgb565 / 2048.0f );
	const float green6 = floor( mod( rgb565, 2048.0f ) / 32.0f );
	const float blue5 = floor( mod( rgb565, 32.0f ) );

	// floor( v * 8.25 ) == ( v << 3 ) | ( v >> 2 ) for 5-bit values
	// floor( v * 4.0625 ) == ( v << 2 ) | ( v >> 4 ) for 6-bit values
	return floor( float3( red5, green6, blue5 ) * float3( 8.25f, 4.0625f, 8.25f ) );
}
62
/// Quantizes an 8-bit-per-channel colour (range [0; 255]) to RGB565, rounding
/// each channel to nearest, and returns the packed value stored in a float
float rgb888to565( float3 rgbValue )
{
	const float3 quantized = floor( rgbValue * float3( 31.0f, 63.0f, 31.0f ) / 255.0f + 0.5f );

	// red occupies bits 11..15, green bits 5..10, blue bits 0..4
	return quantized.r * 2048.0f + quantized.g * 32.0f + quantized.b;
}
70
// Returns the point one third of the way from a to b (i.e. 2/3 a + 1/3 b),
// rounded according to STB_DXT_USE_ROUNDING_BIAS
float3 lerp13( float3 a, float3 b )
{
#ifdef STB_DXT_USE_ROUNDING_BIAS
	// biased variant: round the offset from a to nearest
	const float3 delta = b - a;
	return a + floor( delta * ( 1.0f / 3.0f ) + 0.5f );
#else
	// unbiased variant: round the weighted sum down
	const float3 weightedSum = 2.0f * a + b;
	return floor( weightedSum / 3.0f );
#endif
}
82
/// Builds the 4-entry palette of a block from its two RGB565 endpoints:
/// entries 0 and 1 are the endpoints themselves, entry 2 sits a third of the
/// way from c0 to c1 and entry 3 a third of the way from c1 to c0
void EvalColors( out float3 colours[4], float c0, float c1 )
{
	const float3 endpoint0 = rgb565to888( c0 );
	const float3 endpoint1 = rgb565to888( c1 );

	colours[0] = endpoint0;
	colours[1] = endpoint1;
	colours[2] = lerp13( endpoint0, endpoint1 );
	colours[3] = lerp13( endpoint1, endpoint0 );
}
91
/** The color optimization function. (Clever code, part 1)
	Estimates the dominant colour axis of the block (power iteration on the
	covariance matrix) and returns the two block pixels lying furthest apart
	along that axis, quantized to RGB565.
@param srcPixelsBlock
	The 16 pixels of the block, each packed with packUnorm4x8
@param outMinEndp16 [out]
	Minimum endpoint, in RGB565
@param outMaxEndp16 [out]
	Maximum endpoint, in RGB565
*/
void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 )
{
	// Gather the sum and the per-channel bounds of the block (still in UNORM range)
	const float3 firstColour = unpackUnorm4x8( srcPixelsBlock[0] ).xyz;
	float3 colourSum = firstColour;
	float3 lowest = firstColour;
	float3 highest = firstColour;
	for( int i = 1; i < 16; ++i )
	{
		const float3 pixelUnorm = unpackUnorm4x8( srcPixelsBlock[i] ).xyz;
		colourSum += pixelUnorm;
		lowest = min( lowest, pixelUnorm );
		highest = max( highest, pixelUnorm );
	}

	// Move everything to the [0; 255] range
	const float3 mean = round( colourSum * 255.0f / 16.0f );
	highest *= 255.0f;
	lowest *= 255.0f;

	// Upper triangle of the symmetric covariance matrix: rr, rg, rb, gg, gb, bb
	float cov[6] = float[6]( 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f );

	for( int i = 0; i < 16; ++i )
	{
		const float3 pixel = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
		float3 d = pixel - mean;

		cov[0] += d.r * d.r;
		cov[1] += d.r * d.g;
		cov[2] += d.r * d.b;
		cov[3] += d.g * d.g;
		cov[4] += d.g * d.b;
		cov[5] += d.b * d.b;
	}

	// Scale the matrix down before running the power iteration
	for( int i = 0; i < 6; ++i )
		cov[i] /= 255.0f;

	// Power iteration, seeded with the diagonal of the colour bounding box
	float3 axis = highest - lowest;

	const int nIterPower = 4;
	for( int iter = 0; iter < nIterPower; ++iter )
	{
		axis = float3( axis.r * cov[0] + axis.g * cov[1] + axis.b * cov[2],
					   axis.r * cov[1] + axis.g * cov[3] + axis.b * cov[4],
					   axis.r * cov[2] + axis.g * cov[4] + axis.b * cov[5] );
	}

	float magn = max3( abs( axis.r ), abs( axis.g ), abs( axis.b ) );

	// Default to luminance (JPEG YCbCr luma coefs, scaled by 1000); used when
	// the estimated axis is too small to be meaningful
	float3 v = float3( 299.0f, 587.0f, 114.0f );
	if( !( magn < 4.0f ) )
	{
		// Rescale so that the largest component has magnitude 512
		v = trunc( axis * ( 512.0f / magn ) );
	}

	// Pick the pixels with the smallest and largest projection onto the axis
	float3 minEndpoint, maxEndpoint;
	float smallestDot = FLT_MAX;
	float largestDot = -FLT_MAX;
	for( int i = 0; i < 16; ++i )
	{
		const float3 pixel = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
		const float projection = dot( pixel, v );

		if( projection < smallestDot )
		{
			smallestDot = projection;
			minEndpoint = pixel;
		}

		if( projection > largestDot )
		{
			largestDot = projection;
			maxEndpoint = pixel;
		}
	}

	outMinEndp16 = rgb888to565( minEndpoint );
	outMaxEndp16 = rgb888to565( maxEndpoint );
}
193
// Maps a value projected onto the endpoint axis to a 2-bit palette index.
// The three thresholds split the axis into four intervals; from the lowest
// interval to the highest the indices returned are 1, 3, 2, 0.
uint bc1SelectIndex( float projected, float c0Point, float halfPoint, float c3Point )
{
	if( projected < halfPoint )
		return ( projected < c0Point ) ? 1u : 3u;
	else
		return ( projected < c3Point ) ? 2u : 0u;
}

// The color matching function
// Assigns a palette index to every pixel of the block by projecting it onto
// the axis running from colour[1] to colour[0].
//
// srcPixelsBlock: the 16 pixels of the block, each packed with packUnorm4x8
// colour: the 4-entry palette, as produced by EvalColors
// Returns the 32-bit index mask: 2 bits per pixel, pixel 0 in the lowest bits
uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] )
{
	uint mask = 0u;
	float3 dir = colour[0] - colour[1];
	float stops[4];

	for( int i = 0; i < 4; ++i )
		stops[i] = dot( colour[i], dir );

	// Decision thresholds: midpoints between neighbouring palette entries.
	// Along 'dir' the palette is nominally ordered colour[1], [3], [2], [0]
	float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
	float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
	float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );

#ifndef BC1_DITHER
	// the version without dithering is straightforward
	// (walk the pixels backwards so that pixel 0 ends up in the lowest bits)
	for( uint i = 16u; i-- > 0u; )
	{
		const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

		const float dotValue = dot( currColour, dir );
		mask <<= 2u;
		mask |= bc1SelectIndex( dotValue, c0Point, halfPoint, c3Point );
	}
#else
	// with floyd-steinberg dithering
	// ep1 holds the errors of the row being processed, ep2 those of the row above
	float4 ep1 = float4( 0, 0, 0, 0 );
	float4 ep2 = float4( 0, 0, 0, 0 );

	// ditherDot below is scaled by 16 (the error weights sum to 16), so the
	// thresholds must be scaled too
	c0Point *= 16.0f;
	halfPoint *= 16.0f;
	c3Point *= 16.0f;

	for( uint y = 0u; y < 4u; ++y )
	{
		float ditherDot;
		uint lmask, idx;

		float3 currColour;
		float dotValue;

		// column 0: no left neighbour
		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );

		ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
		idx = bc1SelectIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[0] = dotValue - stops[idx];
		lmask = idx;

		// column 1
		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );

		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
		idx = bc1SelectIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[1] = dotValue - stops[idx];
		lmask |= idx << 2u;

		// column 2
		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );

		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
		idx = bc1SelectIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[2] = dotValue - stops[idx];
		lmask |= idx << 4u;

		// column 3: no right neighbour in the row above.
		// This must read pixel 3 of the row (not pixel 2 again), otherwise the
		// last column of the block is encoded from the wrong source pixel
		currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f;
		dotValue = dot( currColour, dir );

		ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
		idx = bc1SelectIndex( ditherDot, c0Point, halfPoint, c3Point );
		ep1[3] = dotValue - stops[idx];
		lmask |= idx << 6u;

		// each row contributes 8 bits to the mask
		mask |= lmask << ( y * 8u );
		{
			float4 tmp = ep1;
			ep1 = ep2;
			ep2 = tmp;
		}  // swap
	}
#endif

	return mask;
}
293
// The refinement function. (Clever code, part 2)
// Tries to optimize colors to suit block contents better.
// (By solving a least squares system via normal equations+Cramer's rule)
//
// srcPixelsBlock: the 16 pixels of the block, each packed with packUnorm4x8
// mask: current index mask of the block, 2 bits per pixel
// inOutMinEndp16 / inOutMaxEndp16: RGB565 endpoints; overwritten with the
//	refined endpoints
// Returns true if at least one of the endpoints changed
bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
				  inout float inOutMaxEndp16 )
{
	float newMin16, newMax16;
	const float oldMin = inOutMinEndp16;
	const float oldMax = inOutMaxEndp16;

	if( ( mask ^ ( mask << 2u ) ) < 4u )  // all pixels have the same index?
	{
		// yes, linear system would be singular; solve using optimal
		// single-color match on average color
		// (starting the sum at 8/255 makes the division by 16 round to nearest)
		float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f );
		for( int i = 0; i < 16; ++i )
			rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz;

		rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) );

		newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f +  //
				   c_oMatch6[uint( rgbVal.g )][0] * 32.0f +    //
				   c_oMatch5[uint( rgbVal.b )][0];
		newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f +  //
				   c_oMatch6[uint( rgbVal.g )][1] * 32.0f +    //
				   c_oMatch5[uint( rgbVal.b )][1];
	}
	else
	{
		// Weight of the max endpoint for each palette index (out of 3)
		const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f );
		// 0x090000, 0x000900, 0x040102, 0x010402: w1*w1, w2*w2 and w1*w2
		// (with w2 = 3 - w1) packed as three 8-bit fields
		const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f );
		// ^some magic to save a lot of multiplies in the accumulating loop...
		// (precomputed products of weights for least squares system, accumulated inside one 32-bit
		// register)

		float akku = 0.0f;
		uint cm = mask;
		float3 at1 = float3( 0, 0, 0 );
		float3 at2 = float3( 0, 0, 0 );
		for( int i = 0; i < 16; ++i, cm >>= 2u )
		{
			const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;

			const uint idx = cm & 3u;
			const float w1 = w1Tab[idx];
			akku += prods[idx];
			at1 += currColour * w1;
			at2 += currColour;
		}

		// at2 accumulated the plain colour sum; turn it into sum( colour * w2 )
		at2 = 3.0f * at2 - at1;

		// extract solutions and decide solvability
		// The fields sit at multiples of 65536, 256 and 1, so that is what
		// must be used to unpack them. (65535 only gave the same results
		// because xx + xy never exceeds 144 and so never carried.)
		const float xx = floor( akku / 65536.0f );
		const float yy = floor( mod( akku, 65536.0f ) / 256.0f );
		const float xy = mod( akku, 256.0f );

		// Reciprocal of the determinant, folded together with the scale that
		// converts from [0; 255] * 3 to 5 bits (x) and 6 bits (y)
		float2 f_rb_g;
		f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy );
		f_rb_g.y = f_rb_g.x * 63.0f / 31.0f;

		// solve.
		const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ),
										float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
		newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z;

		const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ),
										float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) );
		newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z;
	}

	inOutMinEndp16 = newMin16;
	inOutMaxEndp16 = newMax16;

	return oldMin != newMin16 || oldMax != newMax16;
}
370
371#ifdef BC1_DITHER
/// Returns 'srcValue' (888, nominally in range [0; 255]) as it would look after
/// a round trip through 565: the input is clamped, rounded to the nearest 565
/// value and then expanded back to 888
float3 quant( float3 srcValue )
{
	const float3 clamped = clamp( srcValue, 0.0f, 255.0f );

	// 888 -> 565, rounding to nearest
	const float3 as565 =
		floor( clamped * float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f ) + 0.5f );

	// 565 -> 888
	return floor( as565 * float3( 8.25f, 4.0625f, 8.25f ) );
}
384
/// Produces a copy of the block quantized to 565 precision, with the
/// quantization error diffused to neighbouring pixels (weights 7, 3, 5 and 1
/// out of 16). Alpha of every output pixel is set to 1.
/// @param srcPixBlck	The 16 source pixels, each packed with packUnorm4x8
/// @param dthPixBlck	[out] The 16 dithered pixels, packed with packUnorm4x8
void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] )
{
	// Per-column quantization error of the row being processed...
	float3 rowErr[4] =
		float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) );
	// ... and of the row above it
	float3 prevErr[4] =
		float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) );

	for( uint rowStart = 0u; rowStart < 16u; rowStart += 4u )
	{
		float3 original, dithered, carried;

		// Column 0 (no left neighbour)
		original = unpackUnorm4x8( srcPixBlck[rowStart + 0u] ).xyz * 255.0f;
		carried = trunc( ( 3.0f * prevErr[1] + 5.0f * prevErr[0] ) * ( 1.0f / 16.0f ) );
		dithered = quant( original + carried );
		rowErr[0] = original - dithered;
		dthPixBlck[rowStart + 0u] = packUnorm4x8( float4( dithered * ( 1.0f / 255.0f ), 1.0f ) );

		// Column 1
		original = unpackUnorm4x8( srcPixBlck[rowStart + 1u] ).xyz * 255.0f;
		carried = trunc( ( 7.0f * rowErr[0] + 3.0f * prevErr[2] + 5.0f * prevErr[1] + prevErr[0] ) *
						 ( 1.0f / 16.0f ) );
		dithered = quant( original + carried );
		rowErr[1] = original - dithered;
		dthPixBlck[rowStart + 1u] = packUnorm4x8( float4( dithered * ( 1.0f / 255.0f ), 1.0f ) );

		// Column 2
		original = unpackUnorm4x8( srcPixBlck[rowStart + 2u] ).xyz * 255.0f;
		carried = trunc( ( 7.0f * rowErr[1] + 3.0f * prevErr[3] + 5.0f * prevErr[2] + prevErr[1] ) *
						 ( 1.0f / 16.0f ) );
		dithered = quant( original + carried );
		rowErr[2] = original - dithered;
		dthPixBlck[rowStart + 2u] = packUnorm4x8( float4( dithered * ( 1.0f / 255.0f ), 1.0f ) );

		// Column 3 (no upper-right neighbour)
		original = unpackUnorm4x8( srcPixBlck[rowStart + 3u] ).xyz * 255.0f;
		carried = trunc( ( 7.0f * rowErr[2] + 5.0f * prevErr[3] + prevErr[2] ) * ( 1.0f / 16.0f ) );
		dithered = quant( original + carried );
		rowErr[3] = original - dithered;
		dthPixBlck[rowStart + 3u] = packUnorm4x8( float4( dithered * ( 1.0f / 255.0f ), 1.0f ) );

		// The row just processed becomes the "row above" of the next one
		for( uint col = 0u; col < 4u; ++col )
		{
			float3 swapTmp = rowErr[col];
			rowErr[col] = prevErr[col];
			prevErr[col] = swapTmp;
		}
	}
}
425#endif
426
// Entry point: each invocation encodes one 4x4 block of srcTex and stores the
// result as a single texel of dstTexture, laid out as
// ( larger endpoint, smaller endpoint, mask bits 0..15, mask bits 16..31 )
void main()
{
	uint srcPixelsBlock[16];

	bool bAllColoursEqual = true;

	// Load the whole 4x4 block
	// (pixels are stored row by row, packed to 8 bits per channel with alpha forced to 1)
	const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u;
	for( uint i = 0u; i < 16u; ++i )
	{
		const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u );
		const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz;
		srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) );
		bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i];
	}

	float maxEndp16, minEndp16;
	uint mask = 0u;

	if( bAllColoursEqual )
	{
		// Single-colour block: look the endpoints up directly.
		// Round to nearest rather than truncate: unpackUnorm4x8() * 255 is not
		// guaranteed to land exactly on the integer, and a result slightly
		// below it would otherwise select the table entry of the wrong colour
		const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f + 0.5f );
		mask = 0xAAAAAAAAu;  // every pixel uses palette index 2
		maxEndp16 =
			c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0];
		minEndp16 =
			c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1];
	}
	else
	{
#ifdef BC1_DITHER
		uint ditherPixelsBlock[16];
		// first step: compute dithered version for PCA if desired
		DitherBlock( srcPixelsBlock, ditherPixelsBlock );
#else
#	define ditherPixelsBlock srcPixelsBlock
#endif

		// second step: pca+map along principal axis
		OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 );
		if( minEndp16 != maxEndp16 )
		{
			float3 colours[4];
			EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
			mask = MatchColorsBlock( srcPixelsBlock, colours );
		}

		// third step: refine (multiple times if requested)
		// Stops early once the mask no longer changes or the endpoints collapse
		bool bStopRefinement = false;
		for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i )
		{
			const uint lastMask = mask;

			if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) )
			{
				if( minEndp16 != maxEndp16 )
				{
					float3 colours[4];
					EvalColors( colours, maxEndp16, minEndp16 );  // Note min/max are inverted
					mask = MatchColorsBlock( srcPixelsBlock, colours );
				}
				else
				{
					// Both endpoints are the same colour: every pixel uses index 0
					mask = 0u;
					bStopRefinement = true;
				}
			}

			bStopRefinement = mask == lastMask || bStopRefinement;
		}
	}

	// write the color block
	// The larger endpoint must be stored first. Swapping the endpoints swaps
	// index 0 with 1 and index 2 with 3, which is the same as flipping the low
	// bit of every 2-bit index
	if( maxEndp16 < minEndp16 )
	{
		const float tmpValue = minEndp16;
		minEndp16 = maxEndp16;
		maxEndp16 = tmpValue;
		mask ^= 0x55555555u;
	}

	uint4 outputBytes;
	outputBytes.x = uint( maxEndp16 );
	outputBytes.y = uint( minEndp16 );
	outputBytes.z = mask & 0xFFFFu;
	outputBytes.w = mask >> 16u;

	uint2 dstUV = gl_GlobalInvocationID.xy;
	imageStore( dstTexture, int2( dstUV ), outputBytes );
}
517