1/* 2 * Copyright 2024 Valve Corporation 3 * SPDX-License-Identifier: MIT 4 */ 5#include "compiler/libcl/libcl.h" 6#include "compiler/nir/nir_defines.h" 7#include "compiler/shader_enums.h" 8#include "agx_pack.h" 9#include "compression.h" 10#include "libagx_intrinsics.h" 11 12/* 13 * Decompress in place. The metadata is updated, so other processes can read the 14 * image with a compressed texture descriptor. 15 * 16 * Each workgroup processes one 16x16 tile, avoiding races. We use 32x1 17 * workgroups, matching the warp size, meaning each work-item must process 18 * (16*16)/(32*1) = 8 sampels. Matching the warp size eliminates cross-warp 19 * barriers. It also minimizes launched threads, accelerating the early exit. 20 */ 21 22/* Our compiler represents a bindless handle as a uint2 of a uniform base and an 23 * offset in bytes. Since the descriptors are all in the u0_u1 push, the former 24 * is hardcoded and the latter is an offsetof. 25 */ 26#define HANDLE(field) \ 27 (uint2)(0, offsetof(struct libagx_decompress_images, field)) 28 29/* 30 * The metadata buffer is fully twiddled, so interleave the X/Y coordinate bits. 31 * While dimensions are padded to powers-of-two, they are not padded to a 32 * square. If the width is more than 2x the height or vice versa, the additional 33 * bits are linear. So we interleave as much as possible, and then add what's 34 * remaining. Finally, layers are strided linear and added at the end. 35 */ 36static uint 37index_metadata(uint3 c, uint width, uint height, uint layer_stride) 38{ 39 uint major_coord = width > height ? c.x : c.y; 40 uint minor_dim = min(width, height); 41 42 uint intl_bits = util_logbase2_ceil(minor_dim); 43 uint intl_mask = (1 << intl_bits) - 1; 44 uint2 intl_coords = c.xy & intl_mask; 45 46 return nir_interleave_agx(intl_coords.x, intl_coords.y) + 47 ((major_coord & ~intl_mask) << intl_bits) + (layer_stride * c.z); 48} 49 50/* 51 * For multisampled images, a 2x2 or 1x2 group of samples form a single pixel. 52 * The following two helpers convert a coordinate in samples into a coordinate 53 * in pixels and a sample ID, respectively. They each assume that samples > 1. 54 */ 55static int4 56decompose_px(int4 c, uint samples) 57{ 58 if (samples == 4) 59 c.xy >>= 1; 60 else 61 c.y >>= 1; 62 63 return c; 64} 65 66static uint 67sample_id(int4 c, uint samples) 68{ 69 if (samples == 4) 70 return (c.x & 1) | ((c.y & 1) << 1); 71 else 72 return c.y & 1; 73} 74 75KERNEL(32) 76libagx_decompress(constant struct libagx_decompress_images *images, 77 global uint64_t *metadata, uint64_t tile_uncompressed, 78 uint32_t metadata_layer_stride_tl, uint16_t metadata_width_tl, 79 uint16_t metadata_height_tl, 80 uint log2_samples__3 /* 1x, 2x, 4x */) 81{ 82 uint3 coord_tl = (uint3)(get_group_id(0), get_group_id(1), get_group_id(2)); 83 uint local_id = get_local_id(0); 84 uint samples = 1 << log2_samples__3; 85 86 /* Index into the metadata buffer */ 87 uint index_tl = index_metadata(coord_tl, metadata_width_tl, 88 metadata_height_tl, metadata_layer_stride_tl); 89 90 /* If the tile is already uncompressed, there's nothing to do. */ 91 if (metadata[index_tl] == tile_uncompressed) 92 return; 93 94 /* Tiles are 16x16 */ 95 uint2 coord_sa = (coord_tl.xy * 16); 96 uint layer = coord_tl.z; 97 98 /* Since we use a 32x1 workgroup, each work-item handles half of a row. */ 99 uint offs_y_sa = local_id >> 1; 100 uint offs_x_sa = (local_id & 1) ? 8 : 0; 101 102 int2 img_coord_sa_2d = convert_int2(coord_sa) + (int2)(offs_x_sa, offs_y_sa); 103 int4 img_coord_sa = (int4)(img_coord_sa_2d.x, img_coord_sa_2d.y, layer, 0); 104 105 /* Read our half-row into registers. */ 106 uint4 texels[8]; 107 for (uint i = 0; i < 8; ++i) { 108 int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0); 109 if (samples == 1) { 110 texels[i] = nir_bindless_image_load( 111 HANDLE(compressed), c_sa, 0, 0, GLSL_SAMPLER_DIM_2D, true, 0, 112 ACCESS_IN_BOUNDS_AGX, nir_type_uint32); 113 } else { 114 int4 dec_px = decompose_px(c_sa, samples); 115 texels[i] = nir_bindless_image_load( 116 HANDLE(compressed), dec_px, sample_id(c_sa, samples), 0, 117 GLSL_SAMPLER_DIM_MS, true, 0, ACCESS_IN_BOUNDS_AGX, 118 nir_type_uint32); 119 } 120 } 121 122 sub_group_barrier(CLK_LOCAL_MEM_FENCE); 123 124 /* Now that the whole tile is read, we write without racing. */ 125 for (uint i = 0; i < 8; ++i) { 126 int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0); 127 if (samples == 1) { 128 nir_bindless_image_store(HANDLE(uncompressed), c_sa, 0, texels[i], 0, 129 GLSL_SAMPLER_DIM_2D, true, 0, 130 ACCESS_NON_READABLE, nir_type_uint32); 131 } else { 132 int4 dec_px = decompose_px(c_sa, samples); 133 134 nir_bindless_image_store(HANDLE(uncompressed), dec_px, 135 sample_id(c_sa, samples), texels[i], 0, 136 GLSL_SAMPLER_DIM_MS, true, 0, 137 ACCESS_NON_READABLE, nir_type_uint32); 138 } 139 } 140 141 /* We've replaced the body buffer. Mark the tile as uncompressed. */ 142 if (local_id == 0) { 143 metadata[index_tl] = tile_uncompressed; 144 } 145} 146