/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_pm4.h"
#include "si_pipe.h"
#include "si_build_pm4.h"
#include "sid.h"
#include "util/u_memory.h"
#include "ac_debug.h"

static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
                                  unsigned opcode, unsigned idx);

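/* Layout of a packed SET_*_PAIRS_PACKED packet, as implied by the accessors below:
 *   [0] PKT3 header
 *   [1] total register count (always even; si_pm4_cmd_end pads an odd count by
 *       duplicating the first register)
 *   followed by groups of 3 dwords, each encoding 2 register writes:
 *   [2] reg_dw_offset0 | (reg_dw_offset1 << 16)
 *   [3] reg_value0
 *   [4] reg_value1
 */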
static bool opcode_is_pairs_packed(unsigned opcode)
{
   return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
          opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
          opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
}

static unsigned pairs_packed_opcode_to_regular(unsigned opcode)
{
   switch (opcode) {
   case PKT3_SET_CONTEXT_REG_PAIRS_PACKED:
      return PKT3_SET_CONTEXT_REG;
   case PKT3_SET_SH_REG_PAIRS_PACKED:
      return PKT3_SET_SH_REG;
   default:
      unreachable("invalid packed opcode");
   }
}

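/* Return the packed-pairs variant of a SET opcode if the GPU supports it,
 * otherwise return the opcode unchanged.
 */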
static unsigned regular_opcode_to_pairs(struct si_pm4_state *state, unsigned opcode)
{
   const struct radeon_info *info = &state->screen->info;

   switch (opcode) {
   case PKT3_SET_CONTEXT_REG:
      return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED : opcode;
   case PKT3_SET_SH_REG:
      return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED : opcode;
   }

   return opcode;
}

static bool packed_next_is_reg_offset_pair(struct si_pm4_state *state)
{
   return (state->ndw - state->last_pm4) % 3 == 2;
}

static bool packed_next_is_reg_value1(struct si_pm4_state *state)
{
   return (state->ndw - state->last_pm4) % 3 == 1;
}

static bool packed_prev_is_reg_value0(struct si_pm4_state *state)
{
   return packed_next_is_reg_value1(state);
}

static unsigned get_packed_reg_dw_offsetN(struct si_pm4_state *state, unsigned index)
{
   unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
   assert(i < state->ndw);
   return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
}

static unsigned get_packed_reg_valueN_idx(struct si_pm4_state *state, unsigned index)
{
   unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
   assert(i < state->ndw);
   return i;
}

static unsigned get_packed_reg_valueN(struct si_pm4_state *state, unsigned index)
{
   return state->pm4[get_packed_reg_valueN_idx(state, index)];
}

static unsigned get_packed_reg_count(struct si_pm4_state *state)
{
   int body_size = state->ndw - state->last_pm4 - 2;
   assert(body_size > 0 && body_size % 3 == 0);
   return (body_size / 3) * 2;
}

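/* Finalize the last packet. If it's a packed SET packet that only writes
 * consecutive registers, rewrite it in place as a shorter regular SET packet.
 * With SQTT enabled, also record which register offset holds the shader
 * address (SPI_SHADER_PGM_LO_*).
 */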
void si_pm4_finalize(struct si_pm4_state *state)
{
   if (opcode_is_pairs_packed(state->last_opcode)) {
      unsigned reg_count = get_packed_reg_count(state);
      unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);

      if (state->packed_is_padded)
         reg_count--;

      bool all_consecutive = true;

      /* If the whole packed SET packet only sets consecutive registers, rewrite the packet
       * to be unpacked to make it shorter.
       *
       * This also eliminates the invalid scenario when the packed SET packet sets only
       * 2 registers and the register offsets are equal due to padding.
       */
      for (unsigned i = 1; i < reg_count; i++) {
         if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
            all_consecutive = false;
            break;
         }
      }

      if (all_consecutive) {
         assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
         state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
                                            reg_count, 0);
         state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
         for (unsigned i = 0; i < reg_count; i++)
            state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
         state->ndw = state->last_pm4 + 2 + reg_count;
         state->last_opcode = PKT3_SET_SH_REG;
      } else {
         /* Record in spi_shader_pgm_lo_reg where the shader address is stored in the pm4 state. */
         if (state->screen->debug_flags & DBG(SQTT) &&
             (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
              state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
            if (state->packed_is_padded)
               reg_count++; /* Add this back because we only need to record the last write. */

            for (int i = reg_count - 1; i >= 0; i--) {
               unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;

               if (strstr(ac_get_register_name(state->screen->info.gfx_level,
                                               state->screen->info.family, reg_offset),
                          "SPI_SHADER_PGM_LO_")) {
                  state->spi_shader_pgm_lo_reg = reg_offset;
                  break;
               }
            }
         }

         /* If it's a packed SET_SH packet, use the *_N variant when possible. */
         if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
            state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
            state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
         }
      }
   }

   if (state->screen->debug_flags & DBG(SQTT) && state->last_opcode == PKT3_SET_SH_REG) {
      /* Record in spi_shader_pgm_lo_reg where the shader address is stored in the pm4 state. */
      unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
      unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;

      for (unsigned i = 0; i < reg_count; i++) {
         if (strstr(ac_get_register_name(state->screen->info.gfx_level,
                                         state->screen->info.family, reg_base_offset + i * 4),
                    "SPI_SHADER_PGM_LO_")) {
            state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;
            break;
         }
      }
   }
}

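/* Start a new packet: finalize the previous one, then reserve one dword for the
 * PKT3 header, which si_pm4_cmd_end() fills in once the body size is known.
 */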
static void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
{
   si_pm4_finalize(state);

   assert(state->max_dw);
   assert(state->ndw < state->max_dw);
   assert(opcode <= 254);
   state->last_opcode = opcode;
   state->last_pm4 = state->ndw++;
   state->packed_is_padded = false;
}

void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
{
   assert(state->max_dw);
   assert(state->ndw < state->max_dw);
   state->pm4[state->ndw++] = dw;
   state->last_opcode = 255; /* invalid opcode */
}

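/* Finish the current packet: write the PKT3 header with the final body size,
 * and pad packed packets to an even number of registers.
 */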
static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
   unsigned count;
   count = state->ndw - state->last_pm4 - 2;
   /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
   bool reset_filter_cam = !state->is_compute_queue &&
                           opcode_is_pairs_packed(state->last_opcode);

   state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
                                 PKT3_RESET_FILTER_CAM_S(reset_filter_cam);

   if (opcode_is_pairs_packed(state->last_opcode)) {
      if (packed_prev_is_reg_value0(state)) {
         /* Duplicate the first register at the end to make the number of registers aligned to 2. */
         si_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
                               get_packed_reg_valueN(state, 0),
                               state->last_opcode, 0);
         state->packed_is_padded = true;
      }

      state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
   }
}

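/* Append one register write. If the opcode matches the last packet, the write
 * is merged into it: regular SET packets additionally require the register
 * offset to directly follow the previous one with the same index, while packed
 * packets accept registers in any order.
 */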
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
                                  unsigned opcode, unsigned idx)
{
   bool is_packed = opcode_is_pairs_packed(opcode);
   reg >>= 2;

   assert(state->max_dw);
   assert(state->ndw + 2 <= state->max_dw);

   if (is_packed) {
      assert(idx == 0);

      if (opcode != state->last_opcode) {
         si_pm4_cmd_begin(state, opcode); /* reserve space for the header */
         state->ndw++; /* reserve space for the register count, it will be set at the end */
      }
   } else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
              idx != state->last_idx) {
      si_pm4_cmd_begin(state, opcode);
      state->pm4[state->ndw++] = reg | (idx << 28);
   }

   assert(reg <= UINT16_MAX);
   state->last_reg = reg;
   state->last_idx = idx;

   if (is_packed) {
      if (state->packed_is_padded) {
         /* The packet is padded, which means the first register is written redundantly again
          * at the end. Remove it, so that we can replace it with this register.
          */
         state->packed_is_padded = false;
         state->ndw--;
      }

      if (packed_next_is_reg_offset_pair(state)) {
         state->pm4[state->ndw++] = reg;
      } else if (packed_next_is_reg_value1(state)) {
         /* Set the second register offset in the high 16 bits. */
         state->pm4[state->ndw - 2] &= 0x0000ffff;
         state->pm4[state->ndw - 2] |= reg << 16;
      }
   }

   state->pm4[state->ndw++] = val;
   si_pm4_cmd_end(state, false);
}

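/* Set a register. The SET_* packet type is selected from the range that the
 * register offset falls into.
 */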
void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
   unsigned opcode;

   if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
      opcode = PKT3_SET_CONFIG_REG;
      reg -= SI_CONFIG_REG_OFFSET;

   } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
      opcode = PKT3_SET_SH_REG;
      reg -= SI_SH_REG_OFFSET;

   } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
      opcode = PKT3_SET_CONTEXT_REG;
      reg -= SI_CONTEXT_REG_OFFSET;

   } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
      opcode = PKT3_SET_UCONFIG_REG;
      reg -= CIK_UCONFIG_REG_OFFSET;

   } else {
      PRINT_ERR("Invalid register offset %08x!\n", reg);
      return;
   }

   opcode = regular_opcode_to_pairs(state, opcode);

   si_pm4_set_reg_custom(state, reg, val, opcode, 0);
}
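
/* Example usage (hypothetical values): back-to-back writes to consecutive SH
 * registers such as
 *
 *    si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, rsrc1);
 *    si_pm4_set_reg(pm4, R_00B84C_COMPUTE_PGM_RSRC2, rsrc2);
 *
 * are merged into a single SET packet by the last_reg/last_idx tracking in
 * si_pm4_set_reg_custom().
 */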

void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
   if (state->screen->info.uses_kernel_cu_mask) {
      assert(state->screen->info.gfx_level >= GFX10);
      si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
   } else {
      si_pm4_set_reg(state, reg, val);
   }
}

void si_pm4_clear_state(struct si_pm4_state *state, struct si_screen *sscreen,
                        bool is_compute_queue)
{
   state->screen = sscreen;
   state->ndw = 0;
   state->is_compute_queue = is_compute_queue;

   if (!state->max_dw)
      state->max_dw = ARRAY_SIZE(state->pm4);
}

void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
{
   if (!state)
      return;

   if (idx != ~0) {
      if (sctx->emitted.array[idx] == state)
         sctx->emitted.array[idx] = NULL;

      if (sctx->queued.array[idx] == state) {
         sctx->queued.array[idx] = NULL;
         sctx->dirty_atoms &= ~BITFIELD64_BIT(idx);
      }
   }

   FREE(state);
}

void si_pm4_emit_commands(struct si_context *sctx, struct si_pm4_state *state)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   radeon_begin(cs);
   radeon_emit_array(state->pm4, state->ndw);
   radeon_end();
}

void si_pm4_emit_state(struct si_context *sctx, unsigned index)
{
   struct si_pm4_state *state = sctx->queued.array[index];
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   /* All places should unset dirty_states if this doesn't pass. */
   assert(state && state != sctx->emitted.array[index]);

   radeon_begin(cs);
   radeon_emit_array(state->pm4, state->ndw);
   radeon_end();

   sctx->emitted.array[index] = state;
}

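/* Note: the cast below relies on the si_pm4_state being the first member of
 * struct si_shader.
 */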
void si_pm4_emit_shader(struct si_context *sctx, unsigned index)
{
   struct si_pm4_state *state = sctx->queued.array[index];

   si_pm4_emit_state(sctx, index);

   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo,
                             RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
   if (state->atom.emit)
      state->atom.emit(sctx, -1);
}

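/* Forget all states that were emitted so far and mark every queued state dirty,
 * so that everything is re-emitted into a new command buffer.
 */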
void si_pm4_reset_emitted(struct si_context *sctx)
{
   memset(&sctx->emitted, 0, sizeof(sctx->emitted));

   for (unsigned i = 0; i < SI_NUM_STATES; i++) {
      if (sctx->queued.array[i])
         sctx->dirty_atoms |= BITFIELD64_BIT(i);
   }
}

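/* Allocate a state whose trailing pm4[] array is sized for max_dw dwords
 * instead of its declared size.
 */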
struct si_pm4_state *si_pm4_create_sized(struct si_screen *sscreen, unsigned max_dw,
                                         bool is_compute_queue)
{
   struct si_pm4_state *pm4;
   unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));

   pm4 = (struct si_pm4_state *)calloc(1, size);
   if (pm4) {
      pm4->max_dw = max_dw;
      si_pm4_clear_state(pm4, sscreen, is_compute_queue);
   }
   return pm4;
}

struct si_pm4_state *si_pm4_clone(struct si_pm4_state *orig)
{
   struct si_pm4_state *pm4 = si_pm4_create_sized(orig->screen, orig->max_dw,
                                                  orig->is_compute_queue);
   if (pm4)
      memcpy(pm4, orig, sizeof(*pm4) + 4 * (pm4->max_dw - ARRAY_SIZE(pm4->pm4)));
   return pm4;
}