• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//
2// Copyright 2019 The ANGLE Project Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style license that can be
4// found in the LICENSE file.
5
6#version 450 core
7
8#extension GL_GOOGLE_include_directive : require
9
// Shader variant configuration.
//
// The first Etc* macro that evaluates to non-zero selects the variant; when
// none does, the shader falls back to plain ETC -> RGBA8 decoding.
//   DECODE_*  : which ETC2/EAC payloads main() decodes.
//   ENCODE_*  : which BC payloads main() produces (transcoding variants only).
//   OUTFORMAT : image format name associated with the variant's output.
//   R11       : set for the EAC R11/RG11 variants before etc_decoder.h is
//               included (NOTE(review): presumably consumed there - confirm).
#if EtcRgb8ToBC1
#define DECODE_RGBA 1
#define ENCODE_RGBA 1
#define OUTFORMAT rg32ui
#elif EtcRgba8ToBC3
#define DECODE_RGBA 1
#define ENCODE_RGBA 1
#define OUTFORMAT rgba32ui
#elif EtcR11ToBC4
#define R11 1
#define DECODE_R11 1
#define ENCODE_R11 1
#define OUTFORMAT rg32ui
#elif EtcRg11ToBC5
#define R11 1
#define DECODE_R11 1
#define DECODE_G11 1
#define ENCODE_R11 1
#define ENCODE_G11 1
#define OUTFORMAT rgba32ui
#elif EtcR11ToR8
#define R11 1
#define DECODE_R11 1
#define OUTFORMAT r8ui
#elif EtcRg11ToRG8
#define R11 1
#define DECODE_R11 1
#define DECODE_G11 1
#define OUTFORMAT rg8ui
#else  // EtcToRGBA
#define DECODE_RGBA 1
#define OUTFORMAT rgba8ui
#endif

// Every variant that encodes BC output is a transcode and relies on subgroup
// operations to gather the 16 texels of a block.
#if ENCODE_RGBA || ENCODE_R11
#define SUBGROUP_OP 1
#define TRANSCODE 1
#endif

// Clustered reductions and shuffles are only needed by the encoders.
#if SUBGROUP_OP
#extension GL_KHR_shader_subgroup_clustered : enable
#extension GL_KHR_shader_subgroup_shuffle : enable
#endif
54
55
56
// 64 invocations per workgroup: four 4x4 blocks of 16 texels each
// (see build_coord()). local_size_y and local_size_z default to 1.
layout(local_size_x = 64) in;

// Compressed source data; main() fetches one uvec4 texel per block.
layout(binding = 0) uniform highp usamplerBuffer uInputBuffer;

// Destination image.
// NOTE(review): the format is hard-coded to rgba32ui although every variant
// defines OUTFORMAT, which is not referenced anywhere in this file - confirm
// that this is intentional.
layout(rgba32ui, binding = 1) writeonly uniform uimage2D uOutput;
60
61
// Per-dispatch parameters.
//
// When transcoding to BC, width and height must be aligned to the block size
// because complete block data is required. When decoding to RGBA, partial
// blocks are fine since only in-range texels are written.
// offsetX and offsetY must both be multiples of four.
layout(push_constant) uniform imagInfo
{
    uint offsetX, offsetY;  // added to the destination coordinate
    int texelOffset;        // added to the block index used to fetch from uInputBuffer
    uint width, height;     // extent in texels; invocations at or beyond it exit early
    uint alphaBits;         // main() special-cases 8 (separate alpha payload) and 1 (punchthrough)
    uint isSigned;          // not read in this file (NOTE(review): presumably used by etc_decoder.h)
    uint isEacRg;           // non-zero: a second channel is decoded/encoded from payload.zw
};
75
76#include "third_party/etc_decoder/etc_decoder.h"
77
// Returns the texel coordinate handled by the current invocation.
// A workgroup covers an 8x8 texel area made of four 4x4 blocks; 16 consecutive
// local invocation IDs belong to the same block, and inside a block the IDs
// walk the texels row by row.
ivec2 build_coord()
{
    uint localId    = gl_LocalInvocationID.x;
    uint blockIndex = localId >> 4u;   // which of the four blocks
    uint texelIndex = localId & 0xfu;  // texel inside that block

    // Bit 0 of the block index picks the column, bit 1 picks the row.
    uvec2 blockOrigin  = uvec2(4u * (blockIndex & 0x1u), 2u * (blockIndex & 0x2u));
    uvec2 texelInBlock = uvec2(texelIndex & 0x3u, texelIndex >> 2u);

    return ivec2(gl_WorkGroupID.xy * 8u + blockOrigin + texelInBlock);
}
88
// Reverses the byte order of a 32-bit word.
uint flip_endian(uint v)
{
    return ((v & 0x000000ffu) << 24u) |
           ((v & 0x0000ff00u) << 8u) |
           ((v & 0x00ff0000u) >> 8u) |
           (v >> 24u);
}
95
// Reverses the byte order of a 64-bit value stored as two 32-bit words:
// the words trade places and each one is byte-swapped.
uvec2 flip_endian(uvec2 v)
{
    uint first  = flip_endian(v.y);
    uint second = flip_endian(v.x);
    return uvec2(first, second);
}
100
101#if SUBGROUP_OP
// Returns the 2-bit BC1 selector for `color`, given the two endpoint colors.
// The color is projected onto the minColor -> maxColor axis and quantized to
// 4 levels (opaque block) or 3 levels (block containing transparency).
// minColor and maxColor must differ, otherwise the projection divides by zero.
uint GetIndicesRGB(vec3 color, vec3 minColor, vec3 maxColor, bool transparent)
{
    vec3 axis     = maxColor - minColor;
    float projMin = dot(minColor, axis);
    float projMax = dot(maxColor, axis);
    float proj    = dot(color, axis);

    // 0 means "at minColor", the highest level means "at maxColor".
    float position  = clamp((proj - projMin) / (projMax - projMin), 0.0, 1.0);
    float lastLevel = transparent ? 2.0 : 3.0;
    int level       = int(round(position * lastLevel));

    // BC1 index mapping
    //  color0: maxColor
    //  color1: minColor
    //  color2: (2/3)*maxColor + (1/3)*minColor
    //  color3: (1/3)*maxColor + (2/3)*minColor
    // Level -> selector:
    //  0 -> 1
    //  1 -> 3
    //  2 -> 2
    //  3 -> 0
    // Transparent case
    //  color0: minColor
    //  color1: maxColor
    //  color2: (1/2)*maxColor + (1/2)*minColor
    //  color3: 0
    // Level -> selector:
    //  0 -> 0
    //  1 -> 2
    //  2 -> 1
    // Each table is packed two bits per level, level 0 in the lowest bits.
    uint selectorTable = transparent ? 0x18u : 0x2du;
    return bitfieldExtract(selectorTable, level * 2, 2);
}
131
// Selects the two endpoint colors of a block using PCA.
// All reductions run over clusters of 16 subgroup invocations; only the
// invocations that actually execute this function contribute to them.
//   rgbColor : color of the texel owned by this invocation.
//   minColor : receives the texel color with the smallest projection on the
//              principal axis.
//   maxColor : receives the texel color with the largest projection.
void ComputeMaxMinColor(uvec3 rgbColor, inout uvec3 minColor, inout uvec3 maxColor) {
    // Deviation of this texel from the rounded mean color of the cluster.
    ivec3 delta;
    if (alphaBits == 1) {
        // Some texels may be excluded here, so divide by the number of
        // contributing invocations instead of a fixed 16.
        int contributors = subgroupClusteredAdd(1, 16);
        ivec3 mean =
            ivec3((subgroupClusteredAdd(rgbColor, 16) * 2 + contributors) / (2 * contributors));
        delta = ivec3(rgbColor) - mean;
    }
    else {
        ivec3 mean = ivec3((subgroupClusteredAdd(rgbColor, 16) + 8) >> 4);
        delta      = ivec3(rgbColor) - mean;
    }

    // Sums of products of the deviations (covariance up to a constant factor).
    vec3 covRow0  = vec3(subgroupClusteredAdd(delta.r * delta, 16));        // rr, rg, rb
    vec3 covOther = vec3(subgroupClusteredAdd(delta.ggb * delta.gbb, 16));  // gg, gb, bb
    // Initial guess for the principal axis: the per-channel value range.
    vec3 axis = vec3(subgroupClusteredMax(rgbColor, 16) - subgroupClusteredMin(rgbColor, 16));

    // The symmetric covariance matrix.
    mat3 covariance = mat3(covRow0,                         // rr, rg, rb
                           vec3(covRow0.y, covOther.xy),    // rg, gg, gb
                           vec3(covRow0.z, covOther.yz));   // rb, gb, bb

    // Normalized power iteration.
    // Power iteration may give a wrong answer in some special cases.
    float eigenvalue = 0.0f;
    for (int step = 0; step < 4; ++step) {
        axis       = covariance * axis;
        eigenvalue = sqrt(dot(axis, axis));
        if (eigenvalue > 0.0f) {
            axis *= 1.0f / eigenvalue;
        }
    }

    const float kDefaultLuminanceThreshold = 4.0f * 255;
    const float kQuantizeRange             = 0.512f;

    if (eigenvalue < kDefaultLuminanceThreshold) {
        // Too little variation for a meaningful axis: project on fixed
        // luminance weights instead.
        axis = vec3(0.299f, 0.587f, 0.114f);
    }
    else {
        // Rescale so that the largest component has magnitude kQuantizeRange.
        float largest = max(max(abs(axis.r), abs(axis.g)), abs(axis.b));
        axis *= kQuantizeRange / largest;
    }

    // Find which invocations own the extreme projections, then fetch their colors.
    float proj    = dot(vec3(rgbColor), axis);
    float projMin = subgroupClusteredMin(proj, 16);
    float projMax = subgroupClusteredMax(proj, 16);
    uint selfId   = gl_SubgroupInvocationID;
    // On ties the highest invocation ID wins.
    uvec2 candidates = uvec2(proj == projMin ? selfId : 0u,
                             proj == projMax ? selfId : 0u);
    uvec2 owners = subgroupClusteredMax(candidates, 16);
    minColor     = subgroupShuffle(rgbColor, owners.x);
    maxColor     = subgroupShuffle(rgbColor, owners.y);
}
181
// Returns the 3-bit BC4 selector for `alpha`, given the block's endpoints.
// minAlpha and maxAlpha must differ, otherwise the scaling divides by zero.
uint GetIndicesAlpha(int alpha, int minAlpha, int maxAlpha)
{
    float range  = float(maxAlpha - minAlpha);
    // Position inside [minAlpha, maxAlpha] quantized to 8 levels (0..7).
    float scaled = (alpha - minAlpha) / range * 7.0f;
    int level    = int(round(clamp(scaled, 0.0, 7.0)));

    // BC4 palette:
    // 0 : maxAlpha
    // 1 : minAlpha
    // 2 : 6/7*maxAlpha + 1/7*minAlpha;
    // 3 : 5/7*maxAlpha + 2/7*minAlpha;
    // 4 : 4/7*maxAlpha + 3/7*minAlpha;
    // 5 : 3/7*maxAlpha + 4/7*minAlpha;
    // 6 : 2/7*maxAlpha + 5/7*minAlpha;
    // 7 : 1/7*maxAlpha + 6/7*minAlpha;
    // Level -> selector:
    // 0 -> 1
    // 1 -> 7
    // 2 -> 6
    // 3 -> 5
    // 4 -> 4
    // 5 -> 3
    // 6 -> 2
    // 7 -> 0
    // The table is packed one nibble per level, level 0 in the lowest nibble.
    const uint kSelectorTable = 0x2345671u;
    return bitfieldExtract(kSelectorTable, level * 4, 4);
}
205
// Writes the smallest and largest `alpha` found in the 16-invocation cluster
// of the caller into minAlpha and maxAlpha.
void ComputeMaxMin(int alpha, inout int minAlpha, inout int maxAlpha)
{
    int clusterMin = subgroupClusteredMin(alpha, 16);
    int clusterMax = subgroupClusteredMax(alpha, 16);
    minAlpha = clusterMin;
    maxAlpha = clusterMax;
}
// Encodes one BC4 block cooperatively across a 16-invocation cluster.
//   value : the channel value of the texel owned by this invocation.
//   pid   : index (0..15) of that texel inside the block.
// Returns the 64-bit block as two words: x holds the two endpoints in its low
// 16 bits followed by the first selector bits, y holds the remaining ones.
// Every invocation of the cluster receives the same result.
uvec2 EncodeBC4(int value, uint pid) {
    // ComputeMaxMin takes these as inout, so give them defined values before
    // they are copied in.
    int minValue = 0;
    int maxValue = 0;
    ComputeMaxMin(value, minValue, maxValue);

    // With a flat block every selector stays 0.
    uint indices = 0;
    if( minValue != maxValue )
        indices = GetIndicesAlpha(value, minValue, maxValue);

    // Texel `pid` owns the 3 selector bits that start at bit 16 + 3 * pid of
    // the 64-bit block. Texel 5 straddles the two words: its lowest bit is bit
    // 31 of x, its upper two bits are bits 0..1 of y.
    uvec2 mask = uvec2( pid <= 5 ? indices << ( 16 + 3 * pid ) : 0x0,
                        pid >= 5 ? ( indices << 29 ) >> ( 45 - 3 * pid ) : 0x0 );

    // Merge the selector bits of all 16 texels.
    mask = subgroupClusteredOr( mask, 16);
    return  uvec2((maxValue & 0xff) | ((minValue & 0xff) << 8) |  mask.x, mask.y);
}
223#endif
224
225
// Quantizes an 8-bit-per-channel color to 5/6/5 bits per channel, rounding to
// nearest.
uvec3 scaleColorToRGB565(uvec3 color) {
    const vec3 kScale = vec3(31.0/255.0, 63.0/255.0, 31.0/255.0);
    vec3 scaled = vec3(color) * kScale;
    return uvec3(round(scaled));
}
229
// Expands a 5/6/5 color to 8 bits per channel by replicating the top bits into
// the low bits. This simulates hardware behavior; only a few values differ
// from the golden reference.
uvec3 convertRGB565ToRGB888(uvec3 color) {
    const uvec3 kShiftUp   = uvec3(3u, 2u, 3u);
    const uvec3 kShiftDown = uvec3(2u, 4u, 2u);
    return (color << kShiftUp) | (color >> kShiftDown);
}

237
// Packs 5/6/5 channel values into one 16-bit word: red in bits 11..15, green
// in bits 5..10, blue in bits 0..4.
uint packRGB565(uvec3 color565) {
    uint red   = color565.r << 11;
    uint green = color565.g << 5;
    uint blue  = color565.b;
    return red | green | blue;
}
241
// Converts both endpoints to RGB565 in place. When they collapse to the same
// 565 value, one endpoint is moved to an adjacent 565 value (not optimal) so
// that all the colors can be interpolated from the two endpoints.
void modifyMinMax(inout uvec3 minColor, inout uvec3 maxColor) {
    uvec3 low565  = scaleColorToRGB565(minColor);
    uvec3 high565 = scaleColorToRGB565(maxColor);

    if (low565 == high565) {
        // What the shared 565 value looks like once expanded back to 8 bits.
        ivec3 expanded = ivec3(convertRGB565ToRGB888(low565));
        ivec3 highSide = sign(ivec3(maxColor) - expanded);
        ivec3 lowSide  = sign(ivec3(minColor) - expanded);

        // A channel needs correcting when both original endpoints lie strictly
        // on the same side of the expanded value.
        uvec3 sameSide = uvec3(greaterThan(highSide * lowSide, ivec3(0)));
        bvec3 above    = greaterThan(lowSide, ivec3(0));

        // Both above: raise the max endpoint. Both below: lower the min one.
        high565 += sameSide * uvec3(above);
        low565  -= sameSide * uvec3(not(above));
    }

    minColor = low565;
    maxColor = high565;
}
263
// Exchanges the values of a and b.
void swap(inout uint a, inout uint b)
{
    uvec2 exchanged = uvec2(b, a);
    a = exchanged.x;
    b = exchanged.y;
}
269
// Entry point. Each invocation decodes one texel of an ETC2/EAC block. In the
// transcoding variants the 16 invocations of a block then cooperate through
// clustered subgroup operations to build one BC block, which the invocation
// with pid == 0 stores. In the decode-only variants every invocation stores
// its own texel.
void main()
{
    ivec2 coord = build_coord();
    // Nothing to do for texels outside the image.
    if( any(greaterThanEqual(coord, ivec2(width, height)) ))
        return;

    ivec2 tile_coord = coord >> 2;   // position of the 4x4 block
    ivec2 pixel_coord = coord & 3;   // position of the texel inside the block
    // Column-major texel index, handed to the decoder.
    int linear_pixel = 4 * pixel_coord.x + pixel_coord.y;
    // Row-major texel index, used for the BC selector bit positions.
    int pid = 4 * pixel_coord.y + pixel_coord.x;
    // One uvec4 per block; rows are ceil(width / 4) blocks long.
    uvec4 payload = texelFetch(uInputBuffer, tile_coord.y * int((width+3)>>2) + tile_coord.x + texelOffset);

    // Zero-initialized so that channels no decode path writes (for example g,
    // b and a in the R11 variants) hold defined values when stored below.
    ivec4 result = ivec4(0);
#if DECODE_RGBA
    // With 8-bit alpha the color payload is the second half of the block.
    uvec2 color_payload = flip_endian(alphaBits == 8 ? payload.zw : payload.xy);
    bool nonOpaque = alphaBits == 1 && (color_payload.y & 2u) == 0u;
    bool punchthrough = nonOpaque;
    result = DecodeRGB(pixel_coord, color_payload, linear_pixel, punchthrough);
    if( alphaBits == 8 ) {
        uvec2 alpha_payload = flip_endian(payload.xy);
        result.a = decode_etc2_alpha(alpha_payload, linear_pixel);
    }
#endif

#if DECODE_R11
    result.r = decode_etc2_alpha(flip_endian(payload.xy), linear_pixel);
    if( isEacRg != 0 ) {
        result.g = decode_etc2_alpha(flip_endian(payload.zw), linear_pixel);
    }
#endif

    // Zero-initialized so that halves no encode path writes (for example ba
    // when only one BC4 block is produced) hold defined values when stored.
    uvec4 finalResult = uvec4(0u);
#if ENCODE_RGBA
    // Passed as inout to ComputeMaxMinColor, hence given defined values.
    uvec3 minColor = uvec3(0u);
    uvec3 maxColor = uvec3(0u);
    uint indices = 0;
    uint color565 = 0;

    // Encode the alpha component first. On some AMD GPUs, we see a very
    // strange issue where doing this later produces incorrect results in the
    // subgroup operations. See b/300672851 for details.
    finalResult.ba = alphaBits == 8 ? EncodeBC4(result.a, pid) : uvec2(0,0);

    // Encode the RGB component. With 1-bit alpha, fully transparent texels do
    // not take part in the endpoint selection.
    bool controlFlag = alphaBits != 1 || result.a > 0;
    if( controlFlag )
    {
        ComputeMaxMinColor(uvec3(result.rgb), minColor, maxColor);
        modifyMinMax(minColor, maxColor);
        uint minColor565 = packRGB565(minColor);
        uint maxColor565 = packRGB565(maxColor);

        if( minColor565 != maxColor565 ) {
            indices = GetIndicesRGB(vec3(result.rgb),
                                    vec3(convertRGB565ToRGB888(minColor)),
                                    vec3(convertRGB565ToRGB888(maxColor)),
                                    nonOpaque);
        }
        // The packed endpoints may compare in the opposite order to the one
        // the selectors were computed for.
        bool flip = maxColor565 < minColor565;
        if( flip ) {
            indices ^= 1;
            // nonOpaque only need flip 0-->1, 1-->0. fix 2-->3.
            if( nonOpaque && indices == 3 ){
                indices = 2;
            }
        }
        if( flip != nonOpaque ) {
            swap(maxColor565, minColor565);
        }
        // maxColor565 lands in the low 16 bits, minColor565 in the high 16.
        color565 = maxColor565 | (minColor565<<16);
    }
    if( alphaBits == 1 ) {
        // Texels that skipped the branch above have no endpoints; copy them
        // from any invocation of the block that computed them.
        int active_lane_index = subgroupClusteredMax(controlFlag ? int(gl_SubgroupInvocationID) : -1, 16);
        if(active_lane_index != -1) {
            color565 = subgroupShuffle(color565, active_lane_index);
        }
        // Selector 3 marks the transparent texels.
        if( punchthrough && result.a == 0 ) {
            indices = 3;
        }
    }
    // Two selector bits per texel, merged across the block.
    uint mask = subgroupClusteredOr(indices << (2*pid), 16);

    finalResult.rg = uvec2(color565, mask);
    if( alphaBits == 8 ) {
        // Move the alpha block in front of the color block.
        finalResult = finalResult.barg;
    }
#endif

#if ENCODE_R11
    finalResult.rg = EncodeBC4(result.r, pid);
    if( isEacRg != 0 )
        finalResult.ba = EncodeBC4(result.g, pid);
#endif



#if TRANSCODE
    // One store per block; the destination is addressed in blocks.
    if( pid == 0 ) {
        tile_coord += ivec2(offsetX/4, offsetY/4);
        imageStore(uOutput, tile_coord, finalResult);
    }
#else
    // Decode only: every invocation stores its own texel.
    coord +=  ivec2(offsetX, offsetY);
    imageStore(uOutput, coord, uvec4(result));
#endif
}
375