//
// Copyright 2019 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#version 450 core

#extension GL_GOOGLE_include_directive : require

// Per-variant configuration.  Exactly one branch is selected by the variant
// macro supplied at compile time:
//   DECODE_*  - which ETC2/EAC payloads main() decodes,
//   ENCODE_*  - which BC encoders main() runs on the decoded texels,
//   OUTFORMAT - image format name associated with the variant,
//   R11       - not referenced in this file; presumably consumed by the
//               included etc_decoder.h (TODO confirm).
#if EtcRgb8ToBC1
#define OUTFORMAT rg32ui
#define DECODE_RGBA 1
#define ENCODE_RGBA 1
#elif EtcRgba8ToBC3
#define DECODE_RGBA 1
#define ENCODE_RGBA 1
#define OUTFORMAT rgba32ui
#elif EtcR11ToBC4
#define DECODE_R11 1
#define ENCODE_R11 1
#define OUTFORMAT rg32ui
#define R11 1
#elif EtcRg11ToBC5
#define DECODE_R11 1
#define ENCODE_R11 1
#define DECODE_G11 1
#define ENCODE_G11 1
#define OUTFORMAT rgba32ui
#define R11 1
#elif EtcR11ToR8
#define DECODE_R11 1
#define OUTFORMAT r8ui
#define R11 1
#elif EtcRg11ToRG8
#define DECODE_R11 1
#define DECODE_G11 1
#define OUTFORMAT rg8ui
#define R11 1
#else //EtcToRGBA
#define DECODE_RGBA 1
#define OUTFORMAT rgba8ui
#endif

// Any variant that re-encodes to BC needs the subgroup helpers below and
// writes one result per 4x4 block instead of one per texel.
#if ENCODE_RGBA || ENCODE_R11
#define SUBGROUP_OP 1
#define TRANSCODE 1
#endif


#if SUBGROUP_OP
#extension GL_KHR_shader_subgroup_clustered : enable
#extension GL_KHR_shader_subgroup_shuffle : enable
#endif



// 64 invocations per workgroup: four 4x4 blocks of 16 texels each
// (see build_coord()).
layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;
layout(binding = 0) uniform highp usamplerBuffer uInputBuffer;
// NOTE(review): OUTFORMAT is defined for every variant above but is not used
// here; the format is hard-coded to rgba32ui.  Confirm this is intentional.
layout(binding = 1, rgba32ui) writeonly uniform uimage2D uOutput;


layout(push_constant) uniform imagInfo {
    // When transcoding to BC, width and height must be aligned to the block
    // size because full block data is required.  When decoding to RGBA the
    // full block does not have to be written out.
    // offsetX and offsetY must both be multiples of four.
    uint offsetX;      // destination x offset, in texels
    uint offsetY;      // destination y offset, in texels
    int texelOffset;   // added to the block index when fetching from uInputBuffer
    uint width;        // invocations at x >= width exit early
    uint height;       // invocations at y >= height exit early
    uint alphaBits;    // main() distinguishes the values 1 and 8
    uint isSigned;     // not referenced in this file; presumably read by etc_decoder.h
    uint isEacRg;      // non-zero: a second (G) EAC channel is decoded/encoded
};

#include "third_party/etc_decoder/etc_decoder.h"

// Maps the invocation to its texel coordinate.
// A workgroup covers an 8x8 texel area made of four 4x4 blocks; each run of
// 16 consecutive invocations handles one block, row by row.
ivec2 build_coord()
{
    uint lane = gl_LocalInvocationID.x;
    uint blockIndex = lane >> 4u;   // 0..3: which 4x4 block of the 8x8 area
    uint texelIndex = lane & 0xfu;  // 0..15: texel inside that block

    // Bit 0 of the block index selects the right half (+4 in x),
    // bit 1 selects the bottom half (2 * 2 = +4 in y).
    uvec2 blockOrigin = uvec2(4 * (blockIndex & 0x1), 2 * (blockIndex & 0x2));
    uvec2 texelInBlock = uvec2(texelIndex & 0x3, texelIndex >> 0x2);

    return ivec2(gl_WorkGroupID.xy * 8 + blockOrigin + texelInBlock);
}

// Reverses the byte order of a 32-bit word.
uint flip_endian(uint v)
{
    uvec4 bytes = (uvec4(v) >> uvec4(0, 8, 16, 24)) & 0xffu;
    return (bytes.x << 24u) | (bytes.y << 16u) | (bytes.z << 8u) | bytes.w;
}

// Reverses the byte order of a 64-bit value held as two words: each word is
// byte-swapped and the two words trade places.
uvec2 flip_endian(uvec2 v)
{
    uint swappedY = flip_endian(v.y);
    uint swappedX = flip_endian(v.x);
    return uvec2(swappedY, swappedX);
}

#if SUBGROUP_OP
// Returns the 2-bit BC1 selector for `color`, given the two endpoints.
// The color is projected onto the minColor->maxColor axis and quantized to
// 4 levels (opaque) or 3 levels (transparent).
uint GetIndicesRGB(vec3 color, vec3 minColor, vec3 maxColor, bool transparent)
{
    vec3 axis = maxColor - minColor;
    float projMin = dot(minColor, axis);
    float projMax = dot(maxColor, axis);
    float proj = dot(color, axis);

    float t = clamp((proj - projMin) / (projMax - projMin), 0.0, 1.0);
    int level = int(round( t * (transparent ? 2.0 : 3.0)));

    // BC1 index mapping (opaque case)
    //   color0: maxColor
    //   color1: minColor
    //   color2: (2/3)*maxColor + (1/3)*minColor
    //   color3: (1/3)*maxColor + (2/3)*minColor
    // so level -> selector is:
    //   0 -> 1
    //   1 -> 3
    //   2 -> 2
    //   3 -> 0
    // Transparent case
    //   color0: minColor
    //   color1: maxColor
    //   color2: (1/2)*maxColor + (1/2)*minColor
    //   color3: 0
    // so level -> selector is:
    //   0 -> 0
    //   1 -> 2
    //   2 -> 1
    // Both tables are packed two bits per entry into a small constant.
    uint selectorTable = transparent ? 0x18u : 0x2du;
    return bitfieldExtract( selectorTable, level * 2, 2);
}

// Select the end points using PCA.
// All statistics are gathered over the 16-invocation cluster that the calling
// invocation belongs to.  On return minColor / maxColor hold the colors of the
// cluster members with the smallest / largest projection onto the chosen axis.
void ComputeMaxMinColor(uvec3 rgbColor, inout uvec3 minColor, inout uvec3 maxColor) {
    // Deviation of this texel from the cluster mean.
    ivec3 deviation;
    if( alphaBits == 1 ) {
        // Divide by the number of participating invocations (with rounding)
        // rather than by a fixed 16.
        int activeCount = subgroupClusteredAdd(1, 16);
        uvec3 colorSum = subgroupClusteredAdd(rgbColor, 16);
        ivec3 mean = ivec3((colorSum * 2 + activeCount)/ (2*activeCount));
        deviation = ivec3(rgbColor) - mean;
    }
    else {
        // Rounded division of the sum by 16.
        uvec3 colorSum = subgroupClusteredAdd(rgbColor, 16);
        deviation = ivec3(rgbColor) - ivec3((colorSum + 8) >> 4);
    }
    vec3 covRow0 = vec3(subgroupClusteredAdd(deviation.r * deviation, 16));        // rr, rg, rb
    vec3 covRest = vec3(subgroupClusteredAdd(deviation.ggb * deviation.gbb, 16));  // gg, gb, bb
    // Start the iteration from the per-channel range of the cluster.
    vec3 axis = vec3(subgroupClusteredMax(rgbColor, 16) - subgroupClusteredMin(rgbColor, 16));

    // Assemble the symmetric covariance matrix.
    mat3 covariance = mat3(covRow0,                          // rr, rg, rb
                           vec3(covRow0.y, covRest.xy),      // rg, gg, gb
                           vec3(covRow0.z, covRest.yz));     // rb, gb, bb
    // Normalized power iteration.
    // Power iteration may give a wrong result in some special cases.
    float eigenvalue = 0.0f;
    for( int iteration = 0; iteration < 4; ++iteration ) {
        axis = covariance * axis;
        eigenvalue = sqrt(dot(axis, axis));
        if( eigenvalue > 0.0f ) {
            axis *= 1.0f/eigenvalue;
        }
    }
    const float kDefaultLuminanceThreshold = 4.0f * 255;
    const float kQuantizeRange = 0.512f;

    if (eigenvalue < kDefaultLuminanceThreshold) {
        // Too little variance along the estimated axis: fall back to fixed
        // luminance weights.
        axis = vec3(0.299f, 0.587f, 0.114f);
    }
    else {
        // Rescale so that the largest component has magnitude kQuantizeRange.
        float largest = max(max(abs(axis.r), abs(axis.g)), abs(axis.b));
        axis *= kQuantizeRange / largest;
    }

    // Project every texel onto the axis and find the extreme projections.
    float projection = dot(vec3(rgbColor), axis);
    float lowest = subgroupClusteredMin(projection, 16);
    float highest = subgroupClusteredMax(projection, 16);
    // Invocations that hold an extreme advertise their subgroup invocation id
    // (all others contribute 0); the max picks one of them.
    uvec2 candidate = uvec2(projection == lowest? gl_SubgroupInvocationID : 0,
                            projection == highest? gl_SubgroupInvocationID : 0);
    uvec2 chosenLane = subgroupClusteredMax(candidate, 16);
    minColor = subgroupShuffle(rgbColor, chosenLane.x);
    maxColor = subgroupShuffle(rgbColor, chosenLane.y);
}

// Returns the 3-bit BC4 selector for `alpha` given the block's end points.
// The caller must ensure minAlpha != maxAlpha (the range is used as divisor).
uint GetIndicesAlpha(int alpha, int minAlpha, int maxAlpha)
{
    float range = float(maxAlpha-minAlpha);
    float scaled = (alpha - minAlpha)/range*7.0f;
    int level = int(round(clamp(scaled, 0.0, 7.0)));
    // BC4 palette:
    //   0 : maxAlpha
    //   1 : minAlpha
    //   2 : 6/7*maxAlpha + 1/7*minAlpha
    //   3 : 5/7*maxAlpha + 2/7*minAlpha
    //   4 : 4/7*maxAlpha + 3/7*minAlpha
    //   5 : 3/7*maxAlpha + 4/7*minAlpha
    //   6 : 2/7*maxAlpha + 5/7*minAlpha
    //   7 : 1/7*maxAlpha + 6/7*minAlpha
    // so level -> selector is:
    //   0 -> 1
    //   1 -> 7
    //   2 -> 6
    //   3 -> 5
    //   4 -> 4
    //   5 -> 3
    //   6 -> 2
    //   7 -> 0
    // The table is packed four bits per entry.
    return bitfieldExtract(0x2345671u, level * 4, 4);
}

// Minimum and maximum of `alpha` over the 16-invocation cluster.
void ComputeMaxMin(int alpha, inout int minAlpha, inout int maxAlpha) {
    minAlpha = subgroupClusteredMin(alpha, 16);
    maxAlpha = subgroupClusteredMax(alpha, 16);
}

// Encodes one single-channel 4x4 block into 64 bits (returned as two words).
// `value` is this invocation's texel, `pid` its index (0..15) inside the block.
// Layout: bits 0..7 = max end point, bits 8..15 = min end point, followed by
// sixteen 3-bit selectors starting at bit 16.
uvec2 EncodeBC4(int value, uint pid) {
    int lowEnd, highEnd;
    ComputeMaxMin(value, lowEnd, highEnd);
    // A flat block keeps selector 0 for every texel.
    uint selector = 0;
    if( lowEnd != highEnd )
        selector = GetIndicesAlpha(value, lowEnd, highEnd);

    // Place this texel's selector at bit 16 + 3 * pid of the 64-bit block.
    // The selector of pid 5 starts at bit 31 and therefore contributes to
    // both words.
    uvec2 selectorBits = uvec2( pid <= 5 ? selector << ( 16 + 3 * pid ) : 0x0,
                                pid >= 5 ? ( selector << 29 ) >> ( 45 - 3 * pid ) : 0x0 );

    // Merge the selectors of all texels in the block.
    selectorBits = subgroupClusteredOr( selectorBits, 16);
    return uvec2((highEnd & 0xff) | ((lowEnd & 0xff) << 8) | selectorBits.x, selectorBits.y);
}
#endif


// Quantizes an 8-bit-per-channel color to 5:6:5 with rounding.
uvec3 scaleColorToRGB565(uvec3 color) {
    const vec3 kTo565 = vec3(31.0/255.0, 63.0/255.0, 31.0/255.0);
    return uvec3(round(vec3(color) * kTo565));
}

// This function simulates hardware behavior.
// Only a few values differ from the golden reference.
// Expands a 5:6:5 color to 8 bits per channel: every channel is shifted to the
// top of the byte and its own high bits are OR-ed into the vacated low bits.
// No masking is applied, so the inputs are assumed to already fit in 5/6/5 bits.
uvec3 convertRGB565ToRGB888(uvec3 color) {
    return uvec3(color.x << 3 | (color.x >> 2),
                 color.y << 2 | (color.y >> 4),
                 color.z << 3 | (color.z >> 2));
}

// Packs a 5:6:5 color into one word: r in bits 11..15, g in bits 5..10,
// b in bits 0..4.  No masking is applied to the inputs.
uint packRGB565(uvec3 color565) {
    return color565.r << 11 | ( color565.g << 5 ) | color565.b;
}

// Quantizes both end points to RGB565 and, when they collapse to the same
// RGB565 value, tries to move one end point to an adjacent RGB565 value
// (not optimal) so that all colors are interpolated between the two end points.
// On return minColor and maxColor hold RGB565 channel values, not 8-bit ones.
void modifyMinMax(inout uvec3 minColor, inout uvec3 maxColor) {
    uvec3 minColor565 = scaleColorToRGB565(minColor);
    uvec3 maxColor565 = scaleColorToRGB565(maxColor);
    if( all(equal(minColor565, maxColor565)) ) {
        // The 8-bit color the shared RGB565 value expands back to.
        uvec3 simulatedColor = convertRGB565ToRGB888(minColor565);
        // Per channel: on which side of the expanded color each original
        // end point lies (-1, 0 or +1).
        ivec3 signMax = sign(ivec3(maxColor) - ivec3(simulatedColor));
        ivec3 signMin = sign(ivec3(minColor) - ivec3(simulatedColor));
        // A channel is corrected only when both end points lie strictly on
        // the same side of the expanded color.
        bvec3 needCorrect = greaterThan(signMax * signMin, ivec3(0, 0, 0));
        bvec3 positive = greaterThan(signMin, ivec3(0, 0, 0));
        // Both above: raise the max end point by one RGB565 step.
        maxColor565.r += needCorrect.r && positive.r ? 1 : 0;
        maxColor565.g += needCorrect.g && positive.g ? 1 : 0;
        maxColor565.b += needCorrect.b && positive.b ? 1 : 0;
        // Both below: lower the min end point by one RGB565 step.
        minColor565.r -= needCorrect.r && !positive.r ? 1 : 0;
        minColor565.g -= needCorrect.g && !positive.g ? 1 : 0;
        minColor565.b -= needCorrect.b && !positive.b ? 1 : 0;
    }
    minColor = minColor565;
    maxColor = maxColor565;
}

// Exchanges the values of a and b.
void swap( inout uint a, inout uint b) {
    uint t = a;
    a = b;
    b = t;
}

// Entry point.  Each invocation decodes one texel of an ETC2/EAC block.
// Depending on the variant macros it then either writes that texel directly
// or cooperates with the other 15 invocations of its 4x4 block to produce a
// BC-encoded block.
void main()
{
    ivec2 coord = build_coord();
    // Invocations outside the image do nothing.
    if( any(greaterThanEqual(coord, ivec2(width, height)) ))
        return;

    ivec2 tile_coord = coord >> 2;   // index of the 4x4 block
    ivec2 pixel_coord = coord & 3;   // position inside the block
    // Index with x as the major axis; this is the one passed to the decoder.
    int linear_pixel = 4 * pixel_coord.x + pixel_coord.y;
    // Index with y as the major axis; used for the BC selector bit positions.
    int pid = 4 * pixel_coord.y + pixel_coord.x;
    // One texel of the input buffer per block, (width+3)>>2 blocks per row.
    uvec4 payload = texelFetch(uInputBuffer, tile_coord.y * int((width+3)>>2) + tile_coord.x + texelOffset);

    ivec4 result;
#if DECODE_RGBA
    // With 8-bit alpha the color data is in the second half of the payload
    // and the alpha data (decoded below) in the first half.
    uvec2 color_payload = flip_endian(alphaBits == 8 ? payload.zw : payload.xy);
    // Only possible with 1-bit alpha, and only when bit 1 of the second
    // payload word is clear.
    bool nonOpaque = alphaBits == 1 && (color_payload.y & 2u) == 0u;
    // DecodeRGB comes from the included decoder header; whether it modifies
    // this argument is not visible here, so a separate copy is kept.
    bool punchthrough = nonOpaque;
    result = DecodeRGB(pixel_coord, color_payload, linear_pixel, punchthrough);
    if( alphaBits == 8 ) {
        uvec2 alpha_payload = flip_endian(payload.xy);
        result.a = decode_etc2_alpha(alpha_payload, linear_pixel);
    }
#endif

#if DECODE_R11
    // First channel from the first half of the payload, optional second
    // channel from the second half.
    result.r = decode_etc2_alpha(flip_endian(payload.xy), linear_pixel);
    if( isEacRg != 0 ) {
        result.g = decode_etc2_alpha(flip_endian(payload.zw), linear_pixel);
    }
#endif

    uvec4 finalResult;
#if ENCODE_RGBA
    uvec3 minColor, maxColor;
    uint indices = 0;
    uint color565 = 0;

    // Encode the alpha component first. On some AMD GPUs, we see a very
    // strange issue where doing this later produces incorrect results in the
    // subgroup operations. See b/300672851 for details.
    finalResult.ba = alphaBits == 8 ? EncodeBC4(result.a, pid) : uvec2(0,0);

    // Encode the RGB component
    // With 1-bit alpha, texels whose alpha is 0 do not take part in the end
    // point selection.
    bool controlFlag = alphaBits != 1 || result.a > 0;
    if( controlFlag )
    {
        ComputeMaxMinColor(uvec3(result.r, result.g, result.b), minColor, maxColor);
        // After this call minColor / maxColor hold RGB565 channel values.
        modifyMinMax(minColor, maxColor);
        uint minColor565 = packRGB565(minColor);
        uint maxColor565 = packRGB565(maxColor);

        // Identical end points keep selector 0; this also avoids the zero
        // divisor inside GetIndicesRGB.
        if( minColor565 != maxColor565 ) {
            indices = GetIndicesRGB(vec3(result.r, result.g, result.b),
                                    vec3(convertRGB565ToRGB888(minColor)),
                                    vec3(convertRGB565ToRGB888(maxColor)),
                                    nonOpaque);
        }
        // The packed "max" can compare smaller than the packed "min"; the
        // selectors were computed for the unswapped order, so mirror them.
        bool flip = maxColor565 < minColor565;
        if( flip ) {
            indices ^= 1;
            // In the non-opaque case only 0 <-> 1 must be exchanged; the XOR
            // also turned 2 into 3, so restore it to 2.
            if( nonOpaque && indices == 3 ){
                indices = 2;
            }
        }
        // Opaque blocks store the larger packed color in the low half,
        // non-opaque blocks the smaller one.
        if( flip != nonOpaque ) {
            swap(maxColor565, minColor565);
        }
        color565 = maxColor565 | (minColor565<<16);
    }
    if( alphaBits == 1 ) {
        // Texels that skipped the branch above have no end points; take them
        // from a block member that did run it (if there is one).
        int active_lane_index = subgroupClusteredMax(controlFlag ? int(gl_SubgroupInvocationID) : -1, 16);
        if(active_lane_index != -1) {
            color565 = subgroupShuffle(color565, active_lane_index);
        }
        // Selector 3 for texels with zero alpha in a punch-through block.
        if( punchthrough && result.a == 0 ) {
            indices = 3;
        }
    }
    // Gather the sixteen 2-bit selectors of the block into one word.
    uint mask = subgroupClusteredOr(indices << (2*pid), 16);

    finalResult.rg = uvec2(color565, mask);
    if( alphaBits == 8 ) {
        // Put the alpha block (currently in .ba) first, the color block second.
        finalResult = finalResult.barg;
    }
#endif

#if ENCODE_R11
    finalResult.rg = EncodeBC4(result.r, pid);
    if( isEacRg != 0 )
        finalResult.ba = EncodeBC4(result.g, pid);
#endif



#if TRANSCODE
    // One write per block: only the first texel of each 4x4 block stores the
    // encoded result, addressed in block units.
    if( pid == 0 ) {
        tile_coord += ivec2(offsetX/4, offsetY/4);
        imageStore(uOutput, tile_coord, finalResult);
    }
#else
    // Plain decode: every invocation stores its own texel.
    coord += ivec2(offsetX, offsetY);
    imageStore(uOutput, coord, uvec4(result));
#endif
}