/*
 * Copyright © 2023 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include "nak_private.h"
#include "nir_builder.h"

static nir_def *
tess_ctrl_output_vtx(nir_builder *b, nir_def *vtx)
{
   /* This is the pattern we see emitted by the blob driver:
    *
    *    S2R R0, SR_LANEID
    *    S2R R6, SR_INVOCATION_ID
    *    # R3 is our vertex index
    *    SULD.P.2D.CTA.R.IGN R3, [R2], 0x1d, 0x0
    *    IMAD.IADD R5, R0, 0x1, -R6
    *    IMAD.SHL.U32 R0, R3, 0x4, RZ
    *    LEA.HI.SX32 R4, R0, R5, 0x1e
    *    ALD.O R4, a[0x88], R4
    *
    * Translating the MADs and re-naming registers, this is
    *
    *    %r0 = iadd %lane -%invoc
    *    %r1 = imul %vtx 0x4
    *    %r2 = lea.hi.sx32 %r1 %r0 0x1e
    *    %out = ald.o a[%r2][0x88]
    *
    * But `lea.hi.sx32 %r1 %r0 0x1e` is just `(%r1 >> (32 - 0x1e)) + %r0`.
    * Since %r1 is just `%vtx * 4` and 0x1e is 30, the whole bit on the left
    * is `(%vtx * 4) >> 2 = %vtx`, assuming no overflow.  So, this means
    *
    *    %r0 = iadd %lane -%invoc
    *    %r2 = iadd %vtx %r0
    *    %out = ald.o a[%r2][0x88]
    *
    * In other words, the hardware actually indexes them by lane index with
    * all of the invocations for a given TCS dispatch going in a sequential
    * range of lanes.  We have to compute the lane index of the requested
    * invocation from the invocation index.
    */
   nir_def *lane = nir_load_sysval_nv(b, 32, .base = NAK_SV_LANE_ID,
                                      .access = ACCESS_CAN_REORDER);
   nir_def *invoc = nir_load_sysval_nv(b, 32, .base = NAK_SV_INVOCATION_ID,
                                       .access = ACCESS_CAN_REORDER);

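   /* lane - invoc is the lane holding invocation 0 of the current patch, so
    * invocation vtx's outputs live at lane (lane - invoc) + vtx.
    */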
   return nir_iadd(b, lane, nir_iadd(b, vtx, nir_ineg(b, invoc)));
}

static bool
lower_vtg_io_intrin(nir_builder *b,
                    nir_intrinsic_instr *intrin,
                    void *cb_data)
{
   b->cursor = nir_before_instr(&intrin->instr);

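   /* Pull the vertex index, offset, and (for stores) the data out of the
    * intrinsic.  Which source is which depends on the intrinsic.
    */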
   nir_def *vtx = NULL, *offset = NULL, *data = NULL;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
      offset = intrin->src[0].ssa;
      break;

   case nir_intrinsic_load_per_vertex_input:
      vtx = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_load_per_vertex_output:
      if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
         vtx = tess_ctrl_output_vtx(b, intrin->src[0].ssa);
      else
         vtx = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_output:
      data = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_per_vertex_output:
      data = intrin->src[0].ssa;
      vtx = intrin->src[1].ssa;
      offset = intrin->src[2].ssa;
      break;

   default:
      return false;
   }

   const bool is_store = data != NULL;

   unsigned base = nir_intrinsic_base(intrin);
   unsigned range = nir_intrinsic_range(intrin);
   unsigned component = nir_intrinsic_component(intrin);

   bool is_output;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      is_output = false;
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      is_output = true;
      break;

   default:
      unreachable("Unknown NIR I/O intrinsic");
   }

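   /* Per-patch attributes are the ones addressed without a vertex index:
    * TCS patch outputs and TES patch inputs.
    */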
   bool is_patch;
   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_GEOMETRY:
      is_patch = false;
      break;

   case MESA_SHADER_TESS_CTRL:
      is_patch = is_output && vtx == NULL;
      break;

   case MESA_SHADER_TESS_EVAL:
      is_patch = !is_output && vtx == NULL;
      break;

   default:
      unreachable("Unknown shader stage");
   }

   nir_component_mask_t mask;
   if (is_store)
      mask = nir_intrinsic_write_mask(intrin);
   else
      mask = nir_component_mask(intrin->num_components);

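   /* Per-vertex inputs in TCS and GS are not addressed by a raw vertex
    * index.  The index first has to be mapped through ISBERD, starting from
    * a base derived from SR_INVOCATION_INFO.
    */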
   if (vtx != NULL && !is_output) {
      nir_def *info = nir_load_sysval_nv(b, 32,
                                         .base = NAK_SV_INVOCATION_INFO,
                                         .access = ACCESS_CAN_REORDER);
      nir_def *lo = nir_extract_u8_imm(b, info, 0);
      nir_def *hi = nir_extract_u8_imm(b, info, 2);
      nir_def *idx = nir_iadd(b, nir_imul(b, lo, hi), vtx);
      vtx = nir_isberd_nv(b, idx);
   }

   if (vtx == NULL)
      vtx = nir_imm_int(b, 0);

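   /* Compute the byte address of the attribute.  If the indirect offset is
    * an immediate, fold it into the address so the access can use direct
    * addressing below.
    */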
   unsigned addr = base + 4 * component;
   const bool offset_is_const = nir_src_is_const(nir_src_for_ssa(offset));
   if (offset_is_const) {
      unsigned const_offset = nir_src_as_uint(nir_src_for_ssa(offset));
      assert(const_offset % 16 == 0);
      addr += const_offset;

      /* Tighten the range */
      base = addr;
      range = 4 * intrin->num_components;

      if (const_offset != 0)
         offset = nir_imm_int(b, 0);
   }

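   /* These flags get baked into the NAK attribute I/O intrinsics.  Indirect
    * (non-constant) offsets to non-patch attributes use physical addressing
    * via AL2P; everything else is addressed logically.
    */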
   const struct nak_nir_attr_io_flags flags = {
      .output = is_output,
      .patch = is_patch,
      .phys = !offset_is_const && !is_patch,
   };

   uint32_t flags_u32;
   STATIC_ASSERT(sizeof(flags_u32) == sizeof(flags));
   memcpy(&flags_u32, &flags, sizeof(flags_u32));

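   /* Emit one ALD/AST per contiguous run of enabled components, splitting
    * runs further as required by alignment and physical addressing.
    */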
   nir_def *dst_comps[NIR_MAX_VEC_COMPONENTS];
   while (mask) {
      const unsigned c = ffs(mask) - 1;
      unsigned comps = ffs(~(mask >> c)) - 1;
      assert(comps > 0);

      unsigned c_addr = addr + 4 * c;

      /* vec2 has to be vec2 aligned, vec3/4 have to be vec4 aligned.  We
       * don't have actual alignment information on these intrinsics but we
       * can assume that the indirect offset (if any) is a multiple of 16 so
       * we don't need to worry about that and can just look at c_addr.
       */
      comps = MIN2(comps, 4);
      if (c_addr & 0xf)
         comps = MIN2(comps, 2);
      if (c_addr & 0x7)
         comps = 1;
      assert(!(c_addr & 0x3));

      nir_def *c_offset = offset;
      if (flags.phys) {
         /* Physical addressing has to be scalar */
         comps = 1;

         /* Use al2p to compute a physical address */
         c_offset = nir_al2p_nv(b, offset, .base = c_addr,
                                .flags = flags_u32);
         c_addr = 0;
      }

      if (is_store) {
         nir_def *c_data = nir_channels(b, data, BITFIELD_RANGE(c, comps));
         nir_ast_nv(b, c_data, vtx, c_offset,
                    .base = c_addr,
                    .flags = flags_u32,
                    .range_base = base,
                    .range = range);
      } else {
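         /* Input attributes never change while the shader runs, so input
          * loads can be freely reordered.  TCS output loads may observe
          * writes from other invocations, so they cannot.
          */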
         uint32_t access = flags.output ? 0 : ACCESS_CAN_REORDER;
         nir_def *c_data = nir_ald_nv(b, comps, vtx, c_offset,
                                      .base = c_addr,
                                      .flags = flags_u32,
                                      .range_base = base,
                                      .range = range,
                                      .access = access);
         for (unsigned i = 0; i < comps; i++)
            dst_comps[c + i] = nir_channel(b, c_data, i);
      }

      mask &= ~BITFIELD_RANGE(c, comps);
   }

   if (!is_store) {
      nir_def *dst = nir_vec(b, dst_comps, intrin->num_components);
      nir_def_rewrite_uses(&intrin->def, dst);
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

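/* Lowers NIR VTG (vertex, tessellation, geometry) I/O intrinsics to the
 * NAK-specific ald/ast/al2p intrinsics, which map directly to the hardware
 * attribute instructions.
 */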
bool
nak_nir_lower_vtg_io(nir_shader *nir, const struct nak_compiler *nak)
{
   return nir_shader_intrinsics_pass(nir, lower_vtg_io_intrin,
                                     nir_metadata_block_index |
                                     nir_metadata_dominance,
                                     NULL);
}