// // Copyright 2020 The ANGLE Project. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // #include "common.h" using namespace rx::mtl_shader; #define kThreadGroupXYZ \ (kGenerateMipThreadGroupSizePerDim * kGenerateMipThreadGroupSizePerDim * \ kGenerateMipThreadGroupSizePerDim) #define kThreadGroupXY (kGenerateMipThreadGroupSizePerDim * kGenerateMipThreadGroupSizePerDim) #define kThreadGroupX kGenerateMipThreadGroupSizePerDim #define TEXEL_STORE(index, texel) \ sR[index] = texel.r; \ sG[index] = texel.g; \ sB[index] = texel.b; \ sA[index] = texel.a; #define TEXEL_LOAD(index) float4(sR[index], sG[index], sB[index], sA[index]) #define TO_LINEAR(texel) (options.sRGB ? sRGBtoLinear(texel) : texel) #define OUT_OF_BOUND_CHECK(edgeValue, targetValue, condition) \ (condition) ? (edgeValue) : (targetValue) struct GenMipParams { uint srcLevel; uint numMipLevelsToGen; bool sRGB; }; // NOTE(hqle): For numMipLevelsToGen > 1, this function assumes the texture is power of two. If it // is not, quality will not be good. kernel void generate3DMipmaps(uint lIndex [[thread_index_in_threadgroup]], ushort3 gIndices [[thread_position_in_grid]], texture3d srcTexture [[texture(0)]], texture3d dstMip1 [[texture(1)]], texture3d dstMip2 [[texture(2)]], texture3d dstMip3 [[texture(3)]], texture3d dstMip4 [[texture(4)]], constant GenMipParams &options [[buffer(0)]]) { ushort3 mipSize = ushort3(dstMip1.get_width(), dstMip1.get_height(), dstMip1.get_depth()); bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y && gIndices.z < mipSize.z; constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear); // NOTE(hqle): Use simd_group function whenever available. That could avoid barrier use. // Use struct of array style to avoid bank conflict. threadgroup float sR[kThreadGroupXYZ]; threadgroup float sG[kThreadGroupXYZ]; threadgroup float sB[kThreadGroupXYZ]; threadgroup float sA[kThreadGroupXYZ]; // ----- First mip level ------- float4 texel1; if (validThread) { float3 texCoords = (float3(gIndices) + float3(0.5, 0.5, 0.5)) / float3(mipSize); texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel)); // Write to texture dstMip1.write(texel1, gIndices); } else { // This will invalidate all subsequent checks lIndex = 0xffffffff; } if (options.numMipLevelsToGen == 1) { return; } // ---- Second mip level -------- // Write to shared memory if (options.sRGB) { texel1 = linearToSRGB(texel1); } TEXEL_STORE(lIndex, texel1); threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be even if ((lIndex & 0x49) == 0) // (lIndex & b1001001) == 0 { bool3 atEdge = gIndices == (mipSize - ushort3(1)); // (x+1, y, z) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 1), atEdge.x); // (x, y+1, z) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupX), atEdge.y); // (x, y, z+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupXY), atEdge.z); // (x+1, y+1, z) float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupX + 1)), atEdge.x | atEdge.y); // (x+1, y, z+1) float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupXY + 1)), atEdge.x | atEdge.z); // (x, y+1, z+1) float4 texel7 = OUT_OF_BOUND_CHECK( texel3, TEXEL_LOAD(lIndex + (kThreadGroupXY + kThreadGroupX)), atEdge.y | atEdge.z); // (x+1, y+1, z+1) float4 texel8 = OUT_OF_BOUND_CHECK(texel5, TEXEL_LOAD(lIndex + (kThreadGroupXY + kThreadGroupX + 1)), atEdge.x | atEdge.y | atEdge.z); texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0; dstMip2.write(TO_LINEAR(texel1), gIndices >> 1); // Write to shared memory TEXEL_STORE(lIndex, texel1); } if (options.numMipLevelsToGen == 2) { return; } // ---- 3rd mip level -------- threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be multiple of 4 if ((lIndex & 0xdb) == 0) // (lIndex & b11011011) == 0 { mipSize = max(mipSize >> 1, ushort3(1)); bool3 atEdge = (gIndices >> 1) == (mipSize - ushort3(1)); // (x+1, y, z) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2), atEdge.x); // (x, y+1, z) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (2 * kThreadGroupX)), atEdge.y); // (x, y, z+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY)), atEdge.z); // (x+1, y+1, z) float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupX + 2)), atEdge.x | atEdge.y); // (x+1, y, z+1) float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2)), atEdge.x | atEdge.z); // (x, y+1, z+1) float4 texel7 = OUT_OF_BOUND_CHECK( texel3, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2 * kThreadGroupX)), atEdge.y | atEdge.z); // (x+1, y+1, z+1) float4 texel8 = OUT_OF_BOUND_CHECK( texel5, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2 * kThreadGroupX + 2)), atEdge.x | atEdge.y | atEdge.z); texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0; dstMip3.write(TO_LINEAR(texel1), gIndices >> 2); // Write to shared memory TEXEL_STORE(lIndex, texel1); } if (options.numMipLevelsToGen == 3) { return; } // ---- 4th mip level -------- threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be multiple of 8 if ((lIndex & 0x1ff) == 0) // (lIndex & b111111111) == 0 { mipSize = max(mipSize >> 1, ushort3(1)); bool3 atEdge = (gIndices >> 2) == (mipSize - ushort3(1)); // (x+1, y, z) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4), atEdge.x); // (x, y+1, z) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (4 * kThreadGroupX)), atEdge.y); // (x, y, z+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY)), atEdge.z); // (x+1, y+1, z) float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupX + 4)), atEdge.x | atEdge.y); // (x+1, y, z+1) float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4)), atEdge.x | atEdge.z); // (x, y+1, z+1) float4 texel7 = OUT_OF_BOUND_CHECK( texel3, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4 * kThreadGroupX)), atEdge.y | atEdge.z); // (x+1, y+1, z+1) float4 texel8 = OUT_OF_BOUND_CHECK( texel5, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4 * kThreadGroupX + 4)), atEdge.x | atEdge.y | atEdge.z); texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0; dstMip4.write(TO_LINEAR(texel1), gIndices >> 3); } } kernel void generate2DMipmaps(uint lIndex [[thread_index_in_threadgroup]], ushort2 gIndices [[thread_position_in_grid]], texture2d srcTexture [[texture(0)]], texture2d dstMip1 [[texture(1)]], texture2d dstMip2 [[texture(2)]], texture2d dstMip3 [[texture(3)]], texture2d dstMip4 [[texture(4)]], constant GenMipParams &options [[buffer(0)]]) { uint firstMipLevel = options.srcLevel + 1; ushort2 mipSize = ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel)); bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y; constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear); // NOTE(hqle): Use simd_group function whenever available. That could avoid barrier use. // Use struct of array style to avoid bank conflict. threadgroup float sR[kThreadGroupXY]; threadgroup float sG[kThreadGroupXY]; threadgroup float sB[kThreadGroupXY]; threadgroup float sA[kThreadGroupXY]; // ----- First mip level ------- float4 texel1; if (validThread) { float2 texCoords = (float2(gIndices) + float2(0.5, 0.5)) / float2(mipSize); texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel)); // Write to texture dstMip1.write(TO_LINEAR(texel1), gIndices); } else { // This will invalidate all subsequent checks lIndex = 0xffffffff; } if (options.numMipLevelsToGen == 1) { return; } // ---- Second mip level -------- // Write to shared memory TEXEL_STORE(lIndex, texel1); threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be even if ((lIndex & 0x09) == 0) // (lIndex & b001001) == 0 { bool2 atEdge = gIndices == (mipSize - ushort2(1)); // (x+1, y) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 1), atEdge.x); // (x, y+1) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupX), atEdge.y); // (x+1, y+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupX + 1)), atEdge.x | atEdge.y); texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0; dstMip2.write(TO_LINEAR(texel1), gIndices >> 1); // Write to shared memory TEXEL_STORE(lIndex, texel1); } if (options.numMipLevelsToGen == 2) { return; } // ---- 3rd mip level -------- threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be multiple of 4 if ((lIndex & 0x1b) == 0) // (lIndex & b011011) == 0 { mipSize = max(mipSize >> 1, ushort2(1)); bool2 atEdge = (gIndices >> 1) == (mipSize - ushort2(1)); // (x+1, y) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2), atEdge.x); // (x, y+1) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2 * kThreadGroupX), atEdge.y); // (x+1, y+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupX + 2)), atEdge.x | atEdge.y); texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0; dstMip3.write(TO_LINEAR(texel1), gIndices >> 2); // Write to shared memory TEXEL_STORE(lIndex, texel1); } if (options.numMipLevelsToGen == 3) { return; } // ---- 4th mip level -------- threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be multiple of 8 if ((lIndex & 0x3f) == 0) // (lIndex & b111111) == 0 { mipSize = max(mipSize >> 1, ushort2(1)); bool2 atEdge = (gIndices >> 2) == (mipSize - ushort2(1)); // (x+1, y) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4), atEdge.x); // (x, y+1) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4 * kThreadGroupX), atEdge.y); // (x+1, y+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupX + 4)), atEdge.x | atEdge.y); texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0; dstMip4.write(TO_LINEAR(texel1), gIndices >> 3); } } template static __attribute__((always_inline)) void generateCubeOr2DArray2ndAndMoreMipmaps( uint lIndex, ushort3 gIndices, TextureTypeR srcTexture, TextureTypeW dstMip2, TextureTypeW dstMip3, TextureTypeW dstMip4, ushort2 mip1Size, float4 mip1Texel, threadgroup float *sR, threadgroup float *sG, threadgroup float *sB, threadgroup float *sA, constant GenMipParams &options) { ushort2 mipSize = mip1Size; float4 texel1 = mip1Texel; // ---- Second mip level -------- // Write to shared memory TEXEL_STORE(lIndex, texel1); threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be even if ((lIndex & 0x09) == 0) // (lIndex & b001001) == 0 { bool2 atEdge = gIndices.xy == (mipSize - ushort2(1)); // (x+1, y) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 1), atEdge.x); // (x, y+1) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupX), atEdge.y); // (x+1, y+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupX + 1)), atEdge.x | atEdge.y); texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0; dstMip2.write(TO_LINEAR(texel1), gIndices.xy >> 1, gIndices.z); // Write to shared memory TEXEL_STORE(lIndex, texel1); } if (options.numMipLevelsToGen == 2) { return; } // ---- 3rd mip level -------- threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be multiple of 4 if ((lIndex & 0x1b) == 0) // (lIndex & b011011) == 0 { mipSize = max(mipSize >> 1, ushort2(1)); bool2 atEdge = (gIndices.xy >> 1) == (mipSize - ushort2(1)); // (x+1, y) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2), atEdge.x); // (x, y+1) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2 * kThreadGroupX), atEdge.y); // (x+1, y+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupX + 2)), atEdge.x | atEdge.y); texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0; dstMip3.write(TO_LINEAR(texel1), gIndices.xy >> 2, gIndices.z); // Write to shared memory TEXEL_STORE(lIndex, texel1); } if (options.numMipLevelsToGen == 3) { return; } // ---- 4th mip level -------- threadgroup_barrier(mem_flags::mem_threadgroup); // Index must be multiple of 8 if ((lIndex & 0x3f) == 0) // (lIndex & b111111) == 0 { mipSize = max(mipSize >> 1, ushort2(1)); bool2 atEdge = (gIndices.xy >> 2) == (mipSize - ushort2(1)); // (x+1, y) // If the width of mip is 1, texel2 will equal to texel1: float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4), atEdge.x); // (x, y+1) float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4 * kThreadGroupX), atEdge.y); // (x+1, y+1) float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupX + 4)), atEdge.x | atEdge.y); texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0; dstMip4.write(TO_LINEAR(texel1), gIndices.xy >> 3, gIndices.z); } } kernel void generateCubeMipmaps(uint lIndex [[thread_index_in_threadgroup]], ushort3 gIndices [[thread_position_in_grid]], texturecube srcTexture [[texture(0)]], texturecube dstMip1 [[texture(1)]], texturecube dstMip2 [[texture(2)]], texturecube dstMip3 [[texture(3)]], texturecube dstMip4 [[texture(4)]], constant GenMipParams &options [[buffer(0)]]) { uint firstMipLevel = options.srcLevel + 1; ushort2 mip1Size = ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel)); bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y; constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear); // ----- First mip level ------- float4 mip1Texel; if (validThread) { float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size); mip1Texel = srcTexture.sample(textureSampler, cubeTexcoords(texCoords, int(gIndices.z)), level(options.srcLevel)); // Write to texture dstMip1.write(TO_LINEAR(mip1Texel), gIndices.xy, gIndices.z); } else { // This will invalidate all subsequent checks lIndex = 0xffffffff; } if (options.numMipLevelsToGen == 1) { return; } // Use struct of array style to avoid bank conflict. threadgroup float sR[kThreadGroupXY]; threadgroup float sG[kThreadGroupXY]; threadgroup float sB[kThreadGroupXY]; threadgroup float sA[kThreadGroupXY]; generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4, mip1Size, mip1Texel, sR, sG, sB, sA, options); } kernel void generate2DArrayMipmaps(uint lIndex [[thread_index_in_threadgroup]], ushort3 gIndices [[thread_position_in_grid]], texture2d_array srcTexture [[texture(0)]], texture2d_array dstMip1 [[texture(1)]], texture2d_array dstMip2 [[texture(2)]], texture2d_array dstMip3 [[texture(3)]], texture2d_array dstMip4 [[texture(4)]], constant GenMipParams &options [[buffer(0)]]) { uint firstMipLevel = options.srcLevel + 1; ushort2 mip1Size = ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel)); bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y; constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear); // ----- First mip level ------- float4 mip1Texel; if (validThread) { float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size); mip1Texel = srcTexture.sample(textureSampler, texCoords, gIndices.z, level(options.srcLevel)); // Write to texture dstMip1.write(TO_LINEAR(mip1Texel), gIndices.xy, gIndices.z); } else { // This will invalidate all subsequent checks lIndex = 0xffffffff; } if (options.numMipLevelsToGen == 1) { return; } // Use struct of array style to avoid bank conflict. threadgroup float sR[kThreadGroupXY]; threadgroup float sG[kThreadGroupXY]; threadgroup float sB[kThreadGroupXY]; threadgroup float sA[kThreadGroupXY]; generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4, mip1Size, mip1Texel, sR, sG, sB, sA, options); }