/*
 * Copyright © 2023 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include "nak_private.h"
#include "nir_builder.h"

static nir_def *
tess_ctrl_output_vtx(nir_builder *b, nir_def *vtx)
{
   /* This is the pattern we see emitted by the blob driver:
    *
    *    S2R R0, SR_LANEID
    *    S2R R6, SR_INVOCATION_ID
    *    # R3 is our vertex index
    *    SULD.P.2D.CTA.R.IGN R3, [R2], 0x1d, 0x0
    *    IMAD.IADD R5, R0, 0x1, -R6
    *    IMAD.SHL.U32 R0, R3, 0x4, RZ
    *    LEA.HI.SX32 R4, R0, R5, 0x1e
    *    ALD.O R4, a[0x88], R4
    *
    * Translating the MADs and re-naming registers, this is
    *
    *    %r0 = iadd %lane -%invoc
    *    %r1 = imul %vtx 0x4
    *    %r2 = lea.hi.sx32 %r1 %r0 0x1e
    *    %out = ald.o a[%r2][0x88]
    *
    * But `lea.hi.sx32 %r1 %r0 0x1e` is just `(%r1 >> (32 - 0x1e)) + %r0`.
    * Since %r1 is just `%vtx * 4` and 0x1e is 30, the whole bit on the left
    * is `(%vtx * 4) >> 2 = %vtx`, assuming no overflow.  So, this means
    *
    *    %r0 = iadd %lane -%invoc
    *    %r2 = iadd %vtx %r0
    *    %out = ald.o a[%r2][0x88]
    *
    * In other words, the hardware actually indexes TCS outputs by lane, with
    * all of the invocations for a given TCS dispatch occupying a sequential
    * range of lanes.  We have to compute the lane index of the requested
    * invocation from the invocation index.
    */
   nir_def *lane = nir_load_sysval_nv(b, 32, .base = NAK_SV_LANE_ID,
                                      .access = ACCESS_CAN_REORDER);
   nir_def *invoc = nir_load_sysval_nv(b, 32, .base = NAK_SV_INVOCATION_ID,
                                       .access = ACCESS_CAN_REORDER);

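   /* lane - invoc is the lane running invocation 0 of this patch, so adding
    * the requested vertex index selects the lane whose outputs we want.
    */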
   return nir_iadd(b, lane, nir_iadd(b, vtx, nir_ineg(b, invoc)));
}

static bool
lower_vtg_io_intrin(nir_builder *b,
                    nir_intrinsic_instr *intrin,
                    void *cb_data)
{
   const struct nak_compiler *nak = cb_data;
   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *vtx = NULL, *offset = NULL, *data = NULL;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
      offset = intrin->src[0].ssa;
      break;

   case nir_intrinsic_load_per_vertex_input:
      vtx = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_load_per_vertex_output:
      if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
         vtx = tess_ctrl_output_vtx(b, intrin->src[0].ssa);
      else
         vtx = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_output:
      data = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_per_vertex_output:
      data = intrin->src[0].ssa;
      vtx = intrin->src[1].ssa;
      offset = intrin->src[2].ssa;
      break;

   default:
      return false;
   }
   const bool offset_is_const = nir_src_is_const(nir_src_for_ssa(offset));

   const bool is_store = data != NULL;

   bool is_output;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      is_output = false;
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      is_output = true;
      break;

   default:
      unreachable("Unknown NIR I/O intrinsic");
   }

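   /* Per-patch I/O is the case with no vertex index: an output with no
    * vertex source in the TCS, an input with no vertex source in the TES.
    */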
   bool is_patch;
   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_GEOMETRY:
      is_patch = false;
      break;

   case MESA_SHADER_TESS_CTRL:
      is_patch = is_output && vtx == NULL;
      break;

   case MESA_SHADER_TESS_EVAL:
      is_patch = !is_output && vtx == NULL;
      break;

   default:
      unreachable("Unknown shader stage");
   }

   nir_component_mask_t mask;
   if (is_store)
      mask = nir_intrinsic_write_mask(intrin);
   else
      mask = nir_component_mask(intrin->num_components);

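   /* Per-vertex inputs in the TES and GS can't use the API vertex index
    * directly.  Two byte-sized fields of SR_INVOCATION_INFO multiply to
    * give (presumably) this invocation's base entry in the internal stage
    * buffer; adding the vertex index and running the result through ISBERD
    * yields the vertex handle that ALD takes below.
    */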
   if (vtx != NULL && !is_output) {
      nir_def *info = nir_load_sysval_nv(b, 32,
                                         .base = NAK_SV_INVOCATION_INFO,
                                         .access = ACCESS_CAN_REORDER);
      nir_def *lo = nir_extract_u8_imm(b, info, 0);
      nir_def *hi = nir_extract_u8_imm(b, info, 2);
      nir_def *idx = nir_iadd(b, nir_imul(b, lo, hi), vtx);
      vtx = nir_isberd_nv(b, idx);
   }

   if (vtx == NULL)
      vtx = nir_imm_int(b, 0);

   const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   unsigned component = nir_intrinsic_component(intrin);

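   /* Vertex shader inputs live in the attribute address space; everything
    * else, including vertex shader outputs, uses the varying address space.
    * The NIR component index selects a dword within the vec4 slot.
    */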
   uint32_t base_addr;
   if (b->shader->info.stage == MESA_SHADER_VERTEX && !is_output)
      base_addr = nak_attribute_attr_addr(nak, sem.location);
   else
      base_addr = nak_varying_attr_addr(nak, sem.location);
   base_addr += 4 * component;

   uint32_t range;
   if (offset_is_const) {
      unsigned const_offset = nir_src_as_uint(nir_src_for_ssa(offset));

      /* Tighten the range */
      base_addr += const_offset * 16;
      range = 4 * intrin->num_components;

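      /* The constant offset is now folded into base_addr, so the remaining
       * indirect offset must be zero.
       */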
      if (const_offset != 0)
         offset = nir_imm_int(b, 0);
   } else {
      /* Offsets from NIR are in vec4's */
      offset = nir_imul_imm(b, offset, 16);
      range = (sem.num_slots - 1) * 16 + intrin->num_components * 4;
   }

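   /* Only non-constant offsets into non-patch attributes need physical
    * addressing; in that mode, AL2P computes a physical address below and
    * the ALD/AST accesses have to be scalar.
    */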
   const struct nak_nir_attr_io_flags flags = {
      .output = is_output,
      .patch = is_patch,
      .phys = !offset_is_const && !is_patch,
   };

   uint32_t flags_u32;
   STATIC_ASSERT(sizeof(flags_u32) == sizeof(flags));
   memcpy(&flags_u32, &flags, sizeof(flags_u32));

   nir_def *dst_comps[NIR_MAX_VEC_COMPONENTS];
   while (mask) {
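      /* Peel off the next contiguous run of enabled components: c is the
       * first set bit in the mask and comps is the length of the run.
       */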
      const unsigned c = ffs(mask) - 1;
      unsigned comps = ffs(~(mask >> c)) - 1;
      assert(comps > 0);

      unsigned c_addr = base_addr + 4 * c;

      /* vec2 has to be vec2 aligned, vec3/4 have to be vec4 aligned.  We
       * don't have actual alignment information on these intrinsics but we
       * can assume that the indirect offset (if any) is a multiple of 16 so
       * we don't need to worry about that and can just look at c_addr.
       */
      comps = MIN2(comps, 4);
      if (c_addr & 0xf)
         comps = MIN2(comps, 2);
      if (c_addr & 0x7)
         comps = 1;
      assert(!(c_addr & 0x3));

      nir_def *c_offset = offset;
      if (flags.phys) {
         /* Physical addressing has to be scalar */
         comps = 1;

         /* Use al2p to compute a physical address */
         c_offset = nir_al2p_nv(b, offset, .base = c_addr,
                                .flags = flags_u32);
         c_addr = 0;
      }

      if (is_store) {
         nir_def *c_data = nir_channels(b, data, BITFIELD_RANGE(c, comps));
         nir_ast_nv(b, c_data, vtx, c_offset,
                    .base = c_addr,
                    .flags = flags_u32,
                    .range_base = base_addr,
                    .range = range);
      } else {
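         /* Inputs can't change underneath the shader, so input loads may be
          * freely re-ordered; output loads have to stay ordered with respect
          * to output stores.
          */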
         uint32_t access = flags.output ? 0 : ACCESS_CAN_REORDER;
         nir_def *c_data = nir_ald_nv(b, comps, vtx, c_offset,
                                      .base = c_addr,
                                      .flags = flags_u32,
                                      .range_base = base_addr,
                                      .range = range,
                                      .access = access);
         for (unsigned i = 0; i < comps; i++)
            dst_comps[c + i] = nir_channel(b, c_data, i);
      }

      mask &= ~BITFIELD_RANGE(c, comps);
   }

   if (!is_store) {
      nir_def *dst = nir_vec(b, dst_comps, intrin->num_components);
      nir_def_rewrite_uses(&intrin->def, dst);
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

bool
nak_nir_lower_vtg_io(nir_shader *nir, const struct nak_compiler *nak)
{
   return nir_shader_intrinsics_pass(nir, lower_vtg_io_intrin,
                                     nir_metadata_control_flow,
                                     (void *)nak);
}