/*
 * Copyright © 2023 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include "nak_private.h"
#include "nir_builder.h"

static nir_def *
tess_ctrl_output_vtx(nir_builder *b, nir_def *vtx)
{
   /* This is the pattern we see emitted by the blob driver:
    *
    *    S2R R0, SR_LANEID
    *    S2R R6, SR_INVOCATION_ID
    *    # R3 is our vertex index
    *    SULD.P.2D.CTA.R.IGN R3, [R2], 0x1d, 0x0
    *    IMAD.IADD R5, R0, 0x1, -R6
    *    IMAD.SHL.U32 R0, R3, 0x4, RZ
    *    LEA.HI.SX32 R4, R0, R5, 0x1e
    *    ALD.O R4, a[0x88], R4
    *
    * Translating the MADs and re-naming registers, this is
    *
    *    %r0 = iadd %lane -%invoc
    *    %r1 = imul %vtx 0x4
    *    %r2 = lea.hi.sx32 %r1 %r0 0x1e
    *    %out = ald.o a[%r2][0x88]
    *
    * But `lea.hi.sx32 %r1 %r0 0x1e` is just `(%r1 >> (32 - 0x1e)) + %r0`.
    * Since %r1 is just `%vtx * 4` and 0x1e is 30, the shifted term is
    * `(%vtx * 4) >> 2 = %vtx`, assuming no overflow.  So this means
    *
    *    %r0 = iadd %lane -%invoc
    *    %r2 = iadd %vtx %r0
    *    %out = ald.o a[%r2][0x88]
    *
    * In other words, the hardware actually indexes these outputs by lane
    * index, with all of the invocations for a given TCS dispatch occupying
    * a sequential range of lanes.  We have to compute the lane index of the
    * requested invocation from the invocation index.
    */
   nir_def *lane = nir_load_sysval_nv(b, 32, .base = NAK_SV_LANE_ID,
                                      .access = ACCESS_CAN_REORDER);
   nir_def *invoc = nir_load_sysval_nv(b, 32, .base = NAK_SV_INVOCATION_ID,
                                       .access = ACCESS_CAN_REORDER);

   return nir_iadd(b, lane, nir_iadd(b, vtx, nir_ineg(b, invoc)));
}

static bool
lower_vtg_io_intrin(nir_builder *b,
                    nir_intrinsic_instr *intrin,
                    void *cb_data)
{
   const struct nak_compiler *nak = cb_data;
   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *vtx = NULL, *offset = NULL, *data = NULL;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_output:
      offset = intrin->src[0].ssa;
      break;

   case nir_intrinsic_load_per_vertex_input:
      vtx = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_load_per_vertex_output:
      if (b->shader->info.stage == MESA_SHADER_TESS_CTRL)
         vtx = tess_ctrl_output_vtx(b, intrin->src[0].ssa);
      else
         vtx = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_output:
      data = intrin->src[0].ssa;
      offset = intrin->src[1].ssa;
      break;

   case nir_intrinsic_store_per_vertex_output:
      data = intrin->src[0].ssa;
      vtx = intrin->src[1].ssa;
      offset = intrin->src[2].ssa;
      break;

   default:
      return false;
   }
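   /* Every variant above provides a vec4-unit offset source.  Whether it's
    * a compile-time constant decides (together with is_patch) between
    * direct attribute addressing and AL2P-based physical addressing below.
    */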
   const bool offset_is_const = nir_src_is_const(nir_src_for_ssa(offset));

   const bool is_store = data != NULL;

   bool is_output;
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      is_output = false;
      break;

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output:
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output:
      is_output = true;
      break;

   default:
      unreachable("Unknown NIR I/O intrinsic");
   }

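   /* TCS outputs and TES inputs come in per-vertex and per-patch flavors;
    * the per-patch ones are the ones with no vertex index.
    */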
   bool is_patch;
   switch (b->shader->info.stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_GEOMETRY:
      is_patch = false;
      break;

   case MESA_SHADER_TESS_CTRL:
      is_patch = is_output && vtx == NULL;
      break;

   case MESA_SHADER_TESS_EVAL:
      is_patch = !is_output && vtx == NULL;
      break;

   default:
      unreachable("Unknown shader stage");
   }

   nir_component_mask_t mask;
   if (is_store)
      mask = nir_intrinsic_write_mask(intrin);
   else
      mask = nir_component_mask(intrin->num_components);

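   /* Per-vertex inputs can't use the logical vertex index directly; it
    * first has to be turned into an internal stage buffer entry (ISBE)
    * address with ISBERD.  SR_INVOCATION_INFO appears to pack the values
    * needed to locate this invocation's entries: multiplying byte 0 by
    * byte 2 gives the base slot, to which we add the requested vertex.
    */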
   if (vtx != NULL && !is_output) {
      nir_def *info = nir_load_sysval_nv(b, 32,
                                         .base = NAK_SV_INVOCATION_INFO,
                                         .access = ACCESS_CAN_REORDER);
      nir_def *lo = nir_extract_u8_imm(b, info, 0);
      nir_def *hi = nir_extract_u8_imm(b, info, 2);
      nir_def *idx = nir_iadd(b, nir_imul(b, lo, hi), vtx);
      vtx = nir_isberd_nv(b, idx);
   }

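   /* ALD/AST always take a vertex source.  Use vertex 0 when there is no
    * per-vertex index (plain inputs/outputs and per-patch I/O).
    */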
   if (vtx == NULL)
      vtx = nir_imm_int(b, 0);

   const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   unsigned component = nir_intrinsic_component(intrin);

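   /* Vertex shader inputs live in the generic attribute space; everything
    * else uses the varying map.  Either way, the component index adds
    * 4 bytes per component to the slot's base address.
    */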
   uint32_t base_addr;
   if (b->shader->info.stage == MESA_SHADER_VERTEX && !is_output)
      base_addr = nak_attribute_attr_addr(nak, sem.location);
   else
      base_addr = nak_varying_attr_addr(nak, sem.location);
   base_addr += 4 * component;

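   /* base_addr and range describe, in bytes, the window of attribute space
    * this access may touch.  With a constant offset we can fold it into
    * base_addr and shrink the range to just the accessed components.
    */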
   uint32_t range;
   if (offset_is_const) {
      unsigned const_offset = nir_src_as_uint(nir_src_for_ssa(offset));

      /* Tighten the range */
      base_addr += const_offset * 16;
      range = 4 * intrin->num_components;

      if (const_offset != 0)
         offset = nir_imm_int(b, 0);
   } else {
      /* Offsets from NIR are in vec4's */
      offset = nir_imul_imm(b, offset, 16);
      range = (sem.num_slots - 1) * 16 + intrin->num_components * 4;
   }

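   /* We fall back to AL2P-based physical addressing whenever the offset is
    * indirect; patch attributes are the exception since ALD/AST can index
    * them directly.
    */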
   const struct nak_nir_attr_io_flags flags = {
      .output = is_output,
      .patch = is_patch,
      .phys = !offset_is_const && !is_patch,
   };

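   /* The flags struct is carried on the NAK I/O intrinsics as a single
    * 32-bit index, so pack it bit-for-bit.
    */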
   uint32_t flags_u32;
   STATIC_ASSERT(sizeof(flags_u32) == sizeof(flags));
   memcpy(&flags_u32, &flags, sizeof(flags_u32));

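   /* Emit one ALD/AST per contiguous run of enabled components, splitting
    * runs as needed to satisfy alignment rules and the scalar-only
    * restriction of physical addressing.
    */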
   nir_def *dst_comps[NIR_MAX_VEC_COMPONENTS];
   while (mask) {
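      /* c is the first enabled component and comps is the length of the
       * contiguous run of enabled components starting at c.
       */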
      const unsigned c = ffs(mask) - 1;
      unsigned comps = ffs(~(mask >> c)) - 1;
      assert(comps > 0);

      unsigned c_addr = base_addr + 4 * c;

      /* A vec2 has to be vec2-aligned and a vec3/4 has to be vec4-aligned.
       * We don't have actual alignment information on these intrinsics but
       * we can assume that any indirect offset is a multiple of 16, so we
       * only need to look at c_addr.
       */
      comps = MIN2(comps, 4);
      if (c_addr & 0xf)
         comps = MIN2(comps, 2);
      if (c_addr & 0x7)
         comps = 1;
      assert(!(c_addr & 0x3));

      nir_def *c_offset = offset;
      if (flags.phys) {
         /* Physical addressing has to be scalar */
         comps = 1;

         /* Use al2p to compute a physical address */
         c_offset = nir_al2p_nv(b, offset, .base = c_addr,
                                .flags = flags_u32);
         c_addr = 0;
      }

      if (is_store) {
         nir_def *c_data = nir_channels(b, data, BITFIELD_RANGE(c, comps));
         nir_ast_nv(b, c_data, vtx, c_offset,
                    .base = c_addr,
                    .flags = flags_u32,
                    .range_base = base_addr,
                    .range = range);
      } else {
         uint32_t access = flags.output ? 0 : ACCESS_CAN_REORDER;
         nir_def *c_data = nir_ald_nv(b, comps, vtx, c_offset,
                                      .base = c_addr,
                                      .flags = flags_u32,
                                      .range_base = base_addr,
                                      .range = range,
                                      .access = access);
         for (unsigned i = 0; i < comps; i++)
            dst_comps[c + i] = nir_channel(b, c_data, i);
      }

      mask &= ~BITFIELD_RANGE(c, comps);
   }

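   /* For loads, gather the per-run channels back into a vector of the
    * original shape and rewrite all users of the old intrinsic.
    */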
   if (!is_store) {
      nir_def *dst = nir_vec(b, dst_comps, intrin->num_components);
      nir_def_rewrite_uses(&intrin->def, dst);
   }

   nir_instr_remove(&intrin->instr);

   return true;
}

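/* Lowers VS/TCS/TES/GS (VTG) I/O intrinsics to NAK's ald/ast/al2p
 * intrinsics.
 */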
bool
nak_nir_lower_vtg_io(nir_shader *nir, const struct nak_compiler *nak)
{
   return nir_shader_intrinsics_pass(nir, lower_vtg_io_intrin,
                                     nir_metadata_control_flow,
                                     (void *)nak);
}