• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_pm4.h"
8 #include "si_pipe.h"
9 #include "si_build_pm4.h"
10 #include "sid.h"
11 #include "util/u_memory.h"
12 #include "ac_debug.h"
13 
14 static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
15                                   unsigned opcode, unsigned idx);
16 
opcode_is_pairs_packed(unsigned opcode)17 static bool opcode_is_pairs_packed(unsigned opcode)
18 {
19    return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
20           opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
21           opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
22 }
23 
pairs_packed_opcode_to_regular(unsigned opcode)24 static unsigned pairs_packed_opcode_to_regular(unsigned opcode)
25 {
26    switch (opcode) {
27    case PKT3_SET_CONTEXT_REG_PAIRS_PACKED:
28       return PKT3_SET_CONTEXT_REG;
29    case PKT3_SET_SH_REG_PAIRS_PACKED:
30       return PKT3_SET_SH_REG;
31    default:
32       unreachable("invalid packed opcode");
33    }
34 }
35 
regular_opcode_to_pairs(struct si_pm4_state * state,unsigned opcode)36 static unsigned regular_opcode_to_pairs(struct si_pm4_state *state, unsigned opcode)
37 {
38    const struct radeon_info *info = &state->screen->info;
39 
40    switch (opcode) {
41    case PKT3_SET_CONTEXT_REG:
42       return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED : opcode;
43    case PKT3_SET_SH_REG:
44       return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED : opcode;
45    }
46 
47    return opcode;
48 }
49 
packed_next_is_reg_offset_pair(struct si_pm4_state * state)50 static bool packed_next_is_reg_offset_pair(struct si_pm4_state *state)
51 {
52    return (state->ndw - state->last_pm4) % 3 == 2;
53 }
54 
packed_next_is_reg_value1(struct si_pm4_state * state)55 static bool packed_next_is_reg_value1(struct si_pm4_state *state)
56 {
57    return (state->ndw - state->last_pm4) % 3 == 1;
58 }
59 
/* Return true if the last dword written was value0 of a 3-dword group,
 * i.e. the packet currently holds an odd number of register writes.
 * (Writing value0 leaves the write pointer at the value1 slot.)
 */
static bool packed_prev_is_reg_value0(struct si_pm4_state *state)
{
   return packed_next_is_reg_value1(state);
}
64 
get_packed_reg_dw_offsetN(struct si_pm4_state * state,unsigned index)65 static unsigned get_packed_reg_dw_offsetN(struct si_pm4_state *state, unsigned index)
66 {
67    unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
68    assert(i < state->ndw);
69    return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
70 }
71 
get_packed_reg_valueN_idx(struct si_pm4_state * state,unsigned index)72 static unsigned get_packed_reg_valueN_idx(struct si_pm4_state *state, unsigned index)
73 {
74    unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
75    assert(i < state->ndw);
76    return i;
77 }
78 
get_packed_reg_valueN(struct si_pm4_state * state,unsigned index)79 static unsigned get_packed_reg_valueN(struct si_pm4_state *state, unsigned index)
80 {
81    return state->pm4[get_packed_reg_valueN_idx(state, index)];
82 }
83 
get_packed_reg_count(struct si_pm4_state * state)84 static unsigned get_packed_reg_count(struct si_pm4_state *state)
85 {
86    int body_size = state->ndw - state->last_pm4 - 2;
87    assert(body_size > 0 && body_size % 3 == 0);
88    return (body_size / 3) * 2;
89 }
90 
/* Patch up the last packet in the PM4 stream before new commands are added
 * or the stream is emitted.
 *
 * For packed SET_*_PAIRS_PACKED packets: if all written registers turn out to
 * be consecutive, the packet is rewritten in place as a shorter regular SET
 * packet; otherwise SET_SH_REG_PAIRS_PACKED is upgraded to the *_N variant
 * when the register count allows it. Under the SQTT debug flag, this also
 * records the offset of the last SPI_SHADER_PGM_LO_* write so the shader
 * address location in the stream is known.
 */
void si_pm4_finalize(struct si_pm4_state *state)
{
   if (opcode_is_pairs_packed(state->last_opcode)) {
      unsigned reg_count = get_packed_reg_count(state);
      unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);

      /* Padding duplicated the first register at the end; don't treat the
       * duplicate as a real write here.
       */
      if (state->packed_is_padded)
         reg_count--;

      bool all_consecutive = true;

      /* If the whole packed SET packet only sets consecutive registers, rewrite the packet
       * to be unpacked to make it shorter.
       *
       * This also eliminates the invalid scenario when the packed SET packet sets only
       * 2 registers and the register offsets are equal due to padding.
       */
      for (unsigned i = 1; i < reg_count; i++) {
         if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
            all_consecutive = false;
            break;
         }
      }

      if (all_consecutive) {
         assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
         /* Rewrite the header as a regular SET packet and compact the values
          * in place; the packet shrinks, so ndw moves backwards.
          */
         state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
                                            reg_count, 0);
         state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
         for (unsigned i = 0; i < reg_count; i++)
            state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
         state->ndw = state->last_pm4 + 2 + reg_count;
         /* NOTE(review): this is set even when the rewritten packet was
          * SET_CONTEXT_REG_PAIRS_PACKED, presumably just to mark the stream
          * "no longer packed" — but it also makes the SQTT scan at the bottom
          * of this function inspect a context-reg packet. Confirm intended.
          */
         state->last_opcode = PKT3_SET_SH_REG;
      } else {
         /* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
         if (state->screen->debug_flags & DBG(SQTT) &&
             (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
              state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
            if (state->packed_is_padded)
               reg_count++; /* Add this back because we only need to record the last write. */

            /* Scan from the end so the last SPI_SHADER_PGM_LO_* write wins. */
            for (int i = reg_count - 1; i >= 0; i--) {
               unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;

               if (strstr(ac_get_register_name(state->screen->info.gfx_level,
                                               state->screen->info.family, reg_offset),
                          "SPI_SHADER_PGM_LO_")) {
                  state->spi_shader_pgm_lo_reg = reg_offset;
                  break;
               }
            }
         }

         /* If it's a packed SET_SH packet, use the *_N variant when possible. */
         if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
            state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
            state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
         }
      }
   }

   if (state->screen->debug_flags & DBG(SQTT) && state->last_opcode == PKT3_SET_SH_REG) {
      /* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
      unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
      unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;

      for (unsigned i = 0; i < reg_count; i++) {
         if (strstr(ac_get_register_name(state->screen->info.gfx_level,
                                         state->screen->info.family, reg_base_offset + i * 4),
                    "SPI_SHADER_PGM_LO_")) {
            state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;

            break;
         }
      }
   }
}
168 
si_pm4_cmd_begin(struct si_pm4_state * state,unsigned opcode)169 static void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
170 {
171    si_pm4_finalize(state);
172 
173    assert(state->max_dw);
174    assert(state->ndw < state->max_dw);
175    assert(opcode <= 254);
176    state->last_opcode = opcode;
177    state->last_pm4 = state->ndw++;
178    state->packed_is_padded = false;
179 }
180 
si_pm4_cmd_add(struct si_pm4_state * state,uint32_t dw)181 void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
182 {
183    assert(state->max_dw);
184    assert(state->ndw < state->max_dw);
185    state->pm4[state->ndw++] = dw;
186    state->last_opcode = 255; /* invalid opcode */
187 }
188 
/* Finish the packet started by si_pm4_cmd_begin: write the PKT3 header with
 * the final dword count, and for packed register-pair packets, pad the
 * register count to an even number and store it in the dword after the
 * header.
 *
 * predicate: value for the PKT3 predicate bit.
 */
static void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
   unsigned count;
   count = state->ndw - state->last_pm4 - 2;
   /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
   bool reset_filter_cam = !state->is_compute_queue &&
                           opcode_is_pairs_packed(state->last_opcode);

   state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
                                 PKT3_RESET_FILTER_CAM_S(reset_filter_cam);

   if (opcode_is_pairs_packed(state->last_opcode)) {
      if (packed_prev_is_reg_value0(state)) {
         /* Duplicate the first register at the end to make the number of registers aligned to 2. */
         /* Note: this recurses back through si_pm4_set_reg_custom, which calls
          * si_pm4_cmd_end again; the recursion terminates because the extra
          * write makes packed_prev_is_reg_value0() false.
          */
         si_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
                               get_packed_reg_valueN(state, 0),
                               state->last_opcode, 0);
         state->packed_is_padded = true;
      }

      /* The dword after the header holds the number of register writes. */
      state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
   }
}
212 
/* Append one register write with the given SET opcode, merging it into the
 * packet currently being built when possible.
 *
 * reg:    byte offset of the register relative to the opcode's register space
 * val:    value to write
 * opcode: a regular or packed SET opcode
 * idx:    index field for *_INDEX opcodes (must be 0 for packed opcodes)
 */
static void si_pm4_set_reg_custom(struct si_pm4_state *state, unsigned reg, uint32_t val,
                                  unsigned opcode, unsigned idx)
{
   bool is_packed = opcode_is_pairs_packed(opcode);
   reg >>= 2;   /* byte offset -> dword offset */

   assert(state->max_dw);
   assert(state->ndw + 2 <= state->max_dw);

   if (is_packed) {
      assert(idx == 0);

      /* Packed packets merge any same-opcode writes; only start a new packet
       * on an opcode change.
       */
      if (opcode != state->last_opcode) {
         si_pm4_cmd_begin(state, opcode); /* reserve space for the header */
         state->ndw++; /* reserve space for the register count, it will be set at the end */
      }
   } else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
              idx != state->last_idx) {
      /* Regular packets can only continue with consecutive registers. */
      si_pm4_cmd_begin(state, opcode);
      state->pm4[state->ndw++] = reg | (idx << 28);
   }

   assert(reg <= UINT16_MAX);
   state->last_reg = reg;
   state->last_idx = idx;

   if (is_packed) {
      if (state->packed_is_padded) {
         /* The packet is padded, which means the first register is written redundantly again
          * at the end. Remove it, so that we can replace it with this register.
          */
         state->packed_is_padded = false;
         state->ndw--;
      }

      if (packed_next_is_reg_offset_pair(state)) {
         state->pm4[state->ndw++] = reg;
      } else if (packed_next_is_reg_value1(state)) {
         /* Set the second register offset in the high 16 bits. */
         state->pm4[state->ndw - 2] &= 0x0000ffff;
         state->pm4[state->ndw - 2] |= reg << 16;
      }
   }

   state->pm4[state->ndw++] = val;
   si_pm4_cmd_end(state, false);
}
260 
si_pm4_set_reg(struct si_pm4_state * state,unsigned reg,uint32_t val)261 void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
262 {
263    unsigned opcode;
264 
265    if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
266       opcode = PKT3_SET_CONFIG_REG;
267       reg -= SI_CONFIG_REG_OFFSET;
268 
269    } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
270       opcode = PKT3_SET_SH_REG;
271       reg -= SI_SH_REG_OFFSET;
272 
273    } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
274       opcode = PKT3_SET_CONTEXT_REG;
275       reg -= SI_CONTEXT_REG_OFFSET;
276 
277    } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
278       opcode = PKT3_SET_UCONFIG_REG;
279       reg -= CIK_UCONFIG_REG_OFFSET;
280 
281    } else {
282       PRINT_ERR("Invalid register offset %08x!\n", reg);
283       return;
284    }
285 
286    opcode = regular_opcode_to_pairs(state, opcode);
287 
288    si_pm4_set_reg_custom(state, reg, val, opcode, 0);
289 }
290 
si_pm4_set_reg_idx3(struct si_pm4_state * state,unsigned reg,uint32_t val)291 void si_pm4_set_reg_idx3(struct si_pm4_state *state, unsigned reg, uint32_t val)
292 {
293    if (state->screen->info.uses_kernel_cu_mask) {
294       assert(state->screen->info.gfx_level >= GFX10);
295       si_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
296    } else {
297       si_pm4_set_reg(state, reg, val);
298    }
299 }
300 
si_pm4_clear_state(struct si_pm4_state * state,struct si_screen * sscreen,bool is_compute_queue)301 void si_pm4_clear_state(struct si_pm4_state *state, struct si_screen *sscreen,
302                         bool is_compute_queue)
303 {
304    state->screen = sscreen;
305    state->ndw = 0;
306    state->is_compute_queue = is_compute_queue;
307 
308    if (!state->max_dw)
309       state->max_dw = ARRAY_SIZE(state->pm4);
310 }
311 
si_pm4_free_state(struct si_context * sctx,struct si_pm4_state * state,unsigned idx)312 void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
313 {
314    if (!state)
315       return;
316 
317    if (idx != ~0) {
318       if (sctx->emitted.array[idx] == state)
319          sctx->emitted.array[idx] = NULL;
320 
321       if (sctx->queued.array[idx] == state) {
322          sctx->queued.array[idx] = NULL;
323          sctx->dirty_atoms &= ~BITFIELD64_BIT(idx);
324       }
325    }
326 
327    FREE(state);
328 }
329 
si_pm4_emit_commands(struct si_context * sctx,struct si_pm4_state * state)330 void si_pm4_emit_commands(struct si_context *sctx, struct si_pm4_state *state)
331 {
332    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
333 
334    radeon_begin(cs);
335    radeon_emit_array(state->pm4, state->ndw);
336    radeon_end();
337 }
338 
si_pm4_emit_state(struct si_context * sctx,unsigned index)339 void si_pm4_emit_state(struct si_context *sctx, unsigned index)
340 {
341    struct si_pm4_state *state = sctx->queued.array[index];
342    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
343 
344    /* All places should unset dirty_states if this doesn't pass. */
345    assert(state && state != sctx->emitted.array[index]);
346 
347    radeon_begin(cs);
348    radeon_emit_array(state->pm4, state->ndw);
349    radeon_end();
350 
351    sctx->emitted.array[index] = state;
352 }
353 
si_pm4_emit_shader(struct si_context * sctx,unsigned index)354 void si_pm4_emit_shader(struct si_context *sctx, unsigned index)
355 {
356    struct si_pm4_state *state = sctx->queued.array[index];
357 
358    si_pm4_emit_state(sctx, index);
359 
360    radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, ((struct si_shader*)state)->bo,
361                              RADEON_USAGE_READ | RADEON_PRIO_SHADER_BINARY);
362    if (state->atom.emit)
363       state->atom.emit(sctx, -1);
364 }
365 
si_pm4_reset_emitted(struct si_context * sctx)366 void si_pm4_reset_emitted(struct si_context *sctx)
367 {
368    memset(&sctx->emitted, 0, sizeof(sctx->emitted));
369 
370    for (unsigned i = 0; i < SI_NUM_STATES; i++) {
371       if (sctx->queued.array[i])
372          sctx->dirty_atoms |= BITFIELD64_BIT(i);
373    }
374 }
375 
si_pm4_create_sized(struct si_screen * sscreen,unsigned max_dw,bool is_compute_queue)376 struct si_pm4_state *si_pm4_create_sized(struct si_screen *sscreen, unsigned max_dw,
377                                          bool is_compute_queue)
378 {
379    struct si_pm4_state *pm4;
380    unsigned size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));
381 
382    pm4 = (struct si_pm4_state *)calloc(1, size);
383    if (pm4) {
384       pm4->max_dw = max_dw;
385       si_pm4_clear_state(pm4, sscreen, is_compute_queue);
386    }
387    return pm4;
388 }
389 
si_pm4_clone(struct si_pm4_state * orig)390 struct si_pm4_state *si_pm4_clone(struct si_pm4_state *orig)
391 {
392    struct si_pm4_state *pm4 = si_pm4_create_sized(orig->screen, orig->max_dw,
393                                                   orig->is_compute_queue);
394    if (pm4)
395       memcpy(pm4, orig, sizeof(*pm4) + 4 * (pm4->max_dw - ARRAY_SIZE(pm4->pm4)));
396    return pm4;
397 }
398