/*
 * Copyright © 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

#define SPM_RING_BASE_ALIGN 32

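/*
 * Streaming Performance Monitor (SPM) support.
 *
 * With SPM, the RLC samples the selected performance counters at a fixed
 * interval and streams the results into a ring buffer in memory without CPU
 * involvement. Which counter values get streamed is described by "muxsel"
 * lines that are uploaded to the RLC below.
 */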
static bool
radv_spm_init_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;
   uint64_t size = 32 * 1024 * 1024; /* Default to 32MB. */
   uint16_t sample_interval = 4096; /* Default to 4096 clk. */
   VkResult result;

   device->spm.buffer_size = size;
   device->spm.sample_interval = sample_interval;

   struct radeon_winsys_bo *bo = NULL;
   result = ws->buffer_create(ws, size, 4096, RADEON_DOMAIN_VRAM,
                              RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
                              RADV_BO_PRIORITY_SCRATCH, 0, &bo);
   device->spm.bo = bo;
   if (result != VK_SUCCESS)
      return false;

   result = ws->buffer_make_resident(ws, device->spm.bo, true);
   if (result != VK_SUCCESS)
      return false;

   device->spm.ptr = ws->buffer_map(device->spm.bo);
   if (!device->spm.ptr)
      return false;

   return true;
}

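/*
 * Select the SPM counters by programming the perf counter select registers
 * of each enabled block instance. GRBM_GFX_INDEX is used to target specific
 * SEs/instances and is restored to broadcast mode at the end.
 */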
static void
radv_emit_spm_counters(struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
   struct ac_spm *spm = &device->spm;

   if (gfx_level >= GFX11) {
      /* On GFX11, the SQ counters are programmed per WGP instance. */
      for (uint32_t instance = 0; instance < ARRAY_SIZE(spm->sq_wgp); instance++) {
         uint32_t num_counters = spm->sq_wgp[instance].num_counters;

         if (!num_counters)
            continue;

         radeon_check_space(device->ws, cs, 3 + num_counters * 3);

         radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, spm->sq_wgp[instance].grbm_gfx_index);

         for (uint32_t b = 0; b < num_counters; b++) {
            const struct ac_spm_counter_select *cntr_sel = &spm->sq_wgp[instance].counters[b];
            uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

            radeon_set_uconfig_reg_seq_perfctr(gfx_level, qf, cs, reg_base + b * 4, 1);
            radeon_emit(cs, cntr_sel->sel0);
         }
      }
   }

   /* Select the SQG counters, broadcast to all SHs/instances of each SE. */
   for (uint32_t instance = 0; instance < ARRAY_SIZE(spm->sqg); instance++) {
      uint32_t num_counters = spm->sqg[instance].num_counters;

      if (!num_counters)
         continue;

      radeon_check_space(device->ws, cs, 3 + num_counters * 3);

      radeon_set_uconfig_reg(
         cs, R_030800_GRBM_GFX_INDEX,
         S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1) | S_030800_SE_INDEX(instance));

      for (uint32_t b = 0; b < num_counters; b++) {
         const struct ac_spm_counter_select *cntr_sel = &spm->sqg[instance].counters[b];
         uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT;

         radeon_set_uconfig_reg_seq_perfctr(gfx_level, qf, cs, reg_base + b * 4, 1);
         radeon_emit(cs, cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK is GFX10 only. */
      }
   }

   /* Select the counters of all other blocks, per instance. */
   for (uint32_t b = 0; b < spm->num_block_sel; b++) {
      struct ac_spm_block_select *block_sel = &spm->block_sel[b];
      struct ac_pc_block_base *regs = block_sel->b->b->b;

      for (unsigned i = 0; i < block_sel->num_instances; i++) {
         struct ac_spm_block_instance *block_instance = &block_sel->instances[i];

         radeon_check_space(device->ws, cs, 3 + (AC_SPM_MAX_COUNTER_PER_BLOCK * 6));

         radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, block_instance->grbm_gfx_index);

         for (unsigned c = 0; c < block_instance->num_counters; c++) {
            const struct ac_spm_counter_select *cntr_sel = &block_instance->counters[c];

            if (!cntr_sel->active)
               continue;

            radeon_set_uconfig_reg_seq_perfctr(gfx_level, qf, cs, regs->select0[c], 1);
            radeon_emit(cs, cntr_sel->sel0);

            radeon_set_uconfig_reg_seq_perfctr(gfx_level, qf, cs, regs->select1[c], 1);
            radeon_emit(cs, cntr_sel->sel1);
         }
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(
      cs, R_030800_GRBM_GFX_INDEX,
      S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));
}

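/*
 * Emit the full SPM configuration: the ring buffer (base, size and sample
 * interval), the segment sizes, the muxsel lines that describe which counter
 * values the RLC should stream, and finally the counter selects themselves.
 */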
void
radv_emit_spm_setup(struct radv_device *device, struct radeon_cmdbuf *cs, enum radv_queue_family qf)
{
   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
   struct ac_spm *spm = &device->spm;
   uint64_t va = radv_buffer_get_va(spm->bo);
   uint64_t ring_size = spm->buffer_size;

   /* It's required that the ring VA and the size are correctly aligned. */
   assert(!(va & (SPM_RING_BASE_ALIGN - 1)));
   assert(!(ring_size & (SPM_RING_BASE_ALIGN - 1)));
   assert(spm->sample_interval >= 32);

   radeon_check_space(device->ws, cs, 27);

   /* Configure the SPM ring buffer. */
   radeon_set_uconfig_reg(cs, R_037200_RLC_SPM_PERFMON_CNTL,
                          S_037200_PERFMON_RING_MODE(0) | /* no stall and no interrupt on overflow */
                             S_037200_PERFMON_SAMPLE_INTERVAL(spm->sample_interval)); /* in sclk */
   radeon_set_uconfig_reg(cs, R_037204_RLC_SPM_PERFMON_RING_BASE_LO, va);
   radeon_set_uconfig_reg(cs, R_037208_RLC_SPM_PERFMON_RING_BASE_HI, S_037208_RING_BASE_HI(va >> 32));
   radeon_set_uconfig_reg(cs, R_03720C_RLC_SPM_PERFMON_RING_SIZE, ring_size);

   /* Configure the muxsel. */
   uint32_t total_muxsel_lines = 0;
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      total_muxsel_lines += spm->num_muxsel_lines[s];
   }

   radeon_set_uconfig_reg(cs, R_03726C_RLC_SPM_ACCUM_MODE, 0);

   if (device->physical_device->rad_info.gfx_level >= GFX11) {
      radeon_set_uconfig_reg(cs, R_03721C_RLC_SPM_PERFMON_SEGMENT_SIZE,
                             S_03721C_TOTAL_NUM_SEGMENT(total_muxsel_lines) |
                                S_03721C_GLOBAL_NUM_SEGMENT(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL]) |
                                S_03721C_SE_NUM_SEGMENT(spm->max_se_muxsel_lines));

      radeon_set_uconfig_reg(cs, R_037210_RLC_SPM_RING_WRPTR, 0);
   } else {
      radeon_set_uconfig_reg(cs, R_037210_RLC_SPM_PERFMON_SEGMENT_SIZE, 0);
      radeon_set_uconfig_reg(cs, R_03727C_RLC_SPM_PERFMON_SE3TO0_SEGMENT_SIZE,
                             S_03727C_SE0_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE0]) |
                                S_03727C_SE1_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE1]) |
                                S_03727C_SE2_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE2]) |
                                S_03727C_SE3_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_SE3]));
      radeon_set_uconfig_reg(cs, R_037280_RLC_SPM_PERFMON_GLB_SEGMENT_SIZE,
                             S_037280_PERFMON_SEGMENT_SIZE(total_muxsel_lines) |
                                S_037280_GLOBAL_NUM_LINE(spm->num_muxsel_lines[AC_SPM_SEGMENT_TYPE_GLOBAL]));
   }

   /* Upload each muxsel ram to the RLC. */
   for (unsigned s = 0; s < AC_SPM_SEGMENT_TYPE_COUNT; s++) {
      unsigned rlc_muxsel_addr, rlc_muxsel_data;
      unsigned grbm_gfx_index = S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1);

      if (!spm->num_muxsel_lines[s])
         continue;

      if (s == AC_SPM_SEGMENT_TYPE_GLOBAL) {
         grbm_gfx_index |= S_030800_SE_BROADCAST_WRITES(1);

         rlc_muxsel_addr =
            gfx_level >= GFX11 ? R_037220_RLC_SPM_GLOBAL_MUXSEL_ADDR : R_037224_RLC_SPM_GLOBAL_MUXSEL_ADDR;
         rlc_muxsel_data =
            gfx_level >= GFX11 ? R_037224_RLC_SPM_GLOBAL_MUXSEL_DATA : R_037228_RLC_SPM_GLOBAL_MUXSEL_DATA;
      } else {
         grbm_gfx_index |= S_030800_SE_INDEX(s);

         rlc_muxsel_addr = gfx_level >= GFX11 ? R_037228_RLC_SPM_SE_MUXSEL_ADDR : R_03721C_RLC_SPM_SE_MUXSEL_ADDR;
         rlc_muxsel_data = gfx_level >= GFX11 ? R_03722C_RLC_SPM_SE_MUXSEL_DATA : R_037220_RLC_SPM_SE_MUXSEL_DATA;
      }

      radeon_check_space(device->ws, cs, 3 + spm->num_muxsel_lines[s] * (7 + AC_SPM_MUXSEL_LINE_SIZE));

      radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, grbm_gfx_index);

      for (unsigned l = 0; l < spm->num_muxsel_lines[s]; l++) {
         uint32_t *data = (uint32_t *)spm->muxsel_lines[s][l].muxsel;

         /* Select MUXSEL_ADDR to point to the next muxsel. */
         radeon_set_uconfig_reg_perfctr(gfx_level, qf, cs, rlc_muxsel_addr, l * AC_SPM_MUXSEL_LINE_SIZE);

         /* Write the muxsel line configuration with MUXSEL_DATA. */
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + AC_SPM_MUXSEL_LINE_SIZE, 0));
         radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME) |
                            S_370_WR_ONE_ADDR(1));
         radeon_emit(cs, rlc_muxsel_data >> 2);
         radeon_emit(cs, 0);
         radeon_emit_array(cs, data, AC_SPM_MUXSEL_LINE_SIZE);
      }
   }

   /* Select SPM counters. */
   radv_emit_spm_counters(device, cs, qf);
}

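/* Initialize per-device SPM state and allocate the SPM ring buffer. */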
bool
radv_spm_init(struct radv_device *device)
{
   const struct radeon_info *info = &device->physical_device->rad_info;
   struct ac_perfcounters *pc = &device->physical_device->ac_perfcounters;

   /* We failed to initialize the performance counters. */
   if (!pc->blocks)
      return false;

   if (!ac_init_spm(info, pc, &device->spm))
      return false;

   if (!radv_spm_init_bo(device))
      return false;

   return true;
}

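/* Release the SPM ring buffer and per-device SPM state. */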
void
radv_spm_finish(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (device->spm.bo) {
      ws->buffer_make_resident(ws, device->spm.bo, false);
      ws->buffer_destroy(ws, device->spm.bo);
   }

   ac_destroy_spm(&device->spm);
}