• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright 2024 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 */
5#include "compiler/libcl/libcl.h"
6#include "compiler/nir/nir_defines.h"
7#include "compiler/shader_enums.h"
8#include "agx_pack.h"
9#include "compression.h"
10#include "libagx_intrinsics.h"
11
12/*
13 * Decompress in place. The metadata is updated, so other processes can read the
14 * image with a compressed texture descriptor.
15 *
16 * Each workgroup processes one 16x16 tile, avoiding races. We use 32x1
17 * workgroups, matching the warp size, meaning each work-item must process
18 * (16*16)/(32*1) = 8 sampels. Matching the warp size eliminates cross-warp
19 * barriers. It also minimizes launched threads, accelerating the early exit.
20 */
21
22/* Our compiler represents a bindless handle as a uint2 of a uniform base and an
23 * offset in bytes. Since the descriptors are all in the u0_u1 push, the former
24 * is hardcoded and the latter is an offsetof.
25 */
26#define HANDLE(field)                                                          \
27   (uint2)(0, offsetof(struct libagx_decompress_images, field))
28
29/*
30 * The metadata buffer is fully twiddled, so interleave the X/Y coordinate bits.
31 * While dimensions are padded to powers-of-two, they are not padded to a
32 * square. If the width is more than 2x the height or vice versa, the additional
33 * bits are linear. So we interleave as much as possible, and then add what's
34 * remaining. Finally, layers are strided linear and added at the end.
35 */
36static uint
37index_metadata(uint3 c, uint width, uint height, uint layer_stride)
38{
39   uint major_coord = width > height ? c.x : c.y;
40   uint minor_dim = min(width, height);
41
42   uint intl_bits = util_logbase2_ceil(minor_dim);
43   uint intl_mask = (1 << intl_bits) - 1;
44   uint2 intl_coords = c.xy & intl_mask;
45
46   return nir_interleave_agx(intl_coords.x, intl_coords.y) +
47          ((major_coord & ~intl_mask) << intl_bits) + (layer_stride * c.z);
48}
49
50/*
51 * For multisampled images, a 2x2 or 1x2 group of samples form a single pixel.
52 * The following two helpers convert a coordinate in samples into a coordinate
53 * in pixels and a sample ID, respectively. They each assume that samples > 1.
54 */
55static int4
56decompose_px(int4 c, uint samples)
57{
58   if (samples == 4)
59      c.xy >>= 1;
60   else
61      c.y >>= 1;
62
63   return c;
64}
65
66static uint
67sample_id(int4 c, uint samples)
68{
69   if (samples == 4)
70      return (c.x & 1) | ((c.y & 1) << 1);
71   else
72      return c.y & 1;
73}
74
75KERNEL(32)
76libagx_decompress(constant struct libagx_decompress_images *images,
77                  global uint64_t *metadata, uint64_t tile_uncompressed,
78                  uint32_t metadata_layer_stride_tl, uint16_t metadata_width_tl,
79                  uint16_t metadata_height_tl,
80                  uint log2_samples__3 /* 1x, 2x, 4x */)
81{
82   uint3 coord_tl = (uint3)(get_group_id(0), get_group_id(1), get_group_id(2));
83   uint local_id = get_local_id(0);
84   uint samples = 1 << log2_samples__3;
85
86   /* Index into the metadata buffer */
87   uint index_tl = index_metadata(coord_tl, metadata_width_tl,
88                                  metadata_height_tl, metadata_layer_stride_tl);
89
90   /* If the tile is already uncompressed, there's nothing to do. */
91   if (metadata[index_tl] == tile_uncompressed)
92      return;
93
94   /* Tiles are 16x16 */
95   uint2 coord_sa = (coord_tl.xy * 16);
96   uint layer = coord_tl.z;
97
98   /* Since we use a 32x1 workgroup, each work-item handles half of a row. */
99   uint offs_y_sa = local_id >> 1;
100   uint offs_x_sa = (local_id & 1) ? 8 : 0;
101
102   int2 img_coord_sa_2d = convert_int2(coord_sa) + (int2)(offs_x_sa, offs_y_sa);
103   int4 img_coord_sa = (int4)(img_coord_sa_2d.x, img_coord_sa_2d.y, layer, 0);
104
105   /* Read our half-row into registers. */
106   uint4 texels[8];
107   for (uint i = 0; i < 8; ++i) {
108      int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0);
109      if (samples == 1) {
110         texels[i] = nir_bindless_image_load(
111            HANDLE(compressed), c_sa, 0, 0, GLSL_SAMPLER_DIM_2D, true, 0,
112            ACCESS_IN_BOUNDS_AGX, nir_type_uint32);
113      } else {
114         int4 dec_px = decompose_px(c_sa, samples);
115         texels[i] = nir_bindless_image_load(
116            HANDLE(compressed), dec_px, sample_id(c_sa, samples), 0,
117            GLSL_SAMPLER_DIM_MS, true, 0, ACCESS_IN_BOUNDS_AGX,
118            nir_type_uint32);
119      }
120   }
121
122   sub_group_barrier(CLK_LOCAL_MEM_FENCE);
123
124   /* Now that the whole tile is read, we write without racing. */
125   for (uint i = 0; i < 8; ++i) {
126      int4 c_sa = img_coord_sa + (int4)(i, 0, 0, 0);
127      if (samples == 1) {
128         nir_bindless_image_store(HANDLE(uncompressed), c_sa, 0, texels[i], 0,
129                                  GLSL_SAMPLER_DIM_2D, true, 0,
130                                  ACCESS_NON_READABLE, nir_type_uint32);
131      } else {
132         int4 dec_px = decompose_px(c_sa, samples);
133
134         nir_bindless_image_store(HANDLE(uncompressed), dec_px,
135                                  sample_id(c_sa, samples), texels[i], 0,
136                                  GLSL_SAMPLER_DIM_MS, true, 0,
137                                  ACCESS_NON_READABLE, nir_type_uint32);
138      }
139   }
140
141   /* We've replaced the body buffer. Mark the tile as uncompressed. */
142   if (local_id == 0) {
143      metadata[index_tl] = tile_uncompressed;
144   }
145}
146