1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ac_shader_util.h"
8 #include "ac_gpu_info.h"
9
10 #include "sid.h"
11 #include "util/u_math.h"
12
13 #include <assert.h>
14 #include <stdlib.h>
15 #include <string.h>
16
17 /* Set NIR options shared by ACO, LLVM, RADV, and radeonsi. */
18 void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
19 nir_shader_compiler_options *options)
20 {
21 /* |---------------------------------- Performance & Availability --------------------------------|
22 * |MAD/MAC/MADAK/MADMK|MAD_LEGACY|MAC_LEGACY| FMA |FMAC/FMAAK/FMAMK|FMA_LEGACY|PK_FMA_F16,|Best choice
23 * Arch | F32,F16,F64 | F32,F16 | F32,F16 |F32,F16,F64 | F32,F16 | F32 |PK_FMAC_F16|F16,F32,F64
24 * ------------------------------------------------------------------------------------------------------------------
25 * gfx6,7 | 1 , - , - | 1 , - | 1 , - |1/4, - ,1/16| - , - | - | - , - | - ,MAD,FMA
26 * gfx8 | 1 , 1 , - | 1 , - | - , - |1/4, 1 ,1/16| - , - | - | - , - |MAD,MAD,FMA
27 * gfx9 | 1 ,1|0, - | 1 , - | - , - | 1 , 1 ,1/16| 0|1, - | - | 2 , - |FMA,MAD,FMA
28 * gfx10 | 1 , - , - | 1 , - | 1 , - | 1 , 1 ,1/16| 1 , 1 | - | 2 , 2 |FMA,MAD,FMA
29 * gfx10.3| - , - , - | - , - | - , - | 1 , 1 ,1/16| 1 , 1 | 1 | 2 , 2 | all FMA
30 * gfx11 | - , - , - | - , - | - , - | 2 , 2 ,1/16| 2 , 2 | 2 | 2 , 2 | all FMA
31 *
32 * Tahiti, Hawaii, Carrizo, Vega20: FMA_F32 is full rate, FMA_F64 is 1/4
33 * gfx9 supports MAD_F16 only on Vega10, Raven, Raven2, Renoir.
34 * gfx9 supports FMAC_F32 only on Vega20, but doesn't support FMAAK and FMAMK.
35 *
36 * gfx8 prefers MAD for F16 because of MAC/MADAK/MADMK.
37 * gfx9 and newer prefer FMA for F16 because of the packed instruction.
38 * gfx10 and older prefer MAD for F32 because of the legacy instruction.
39 */
40
41 memset(options, 0, sizeof(*options));
42 options->vertex_id_zero_based = true;
43 options->lower_scmp = true;
44 options->lower_flrp16 = true;
45 options->lower_flrp32 = true;
46 options->lower_flrp64 = true;
47 options->lower_device_index_to_zero = true;
48 options->lower_fdiv = true;
49 options->lower_fmod = true;
50 options->lower_ineg = true;
51 options->lower_bitfield_insert = true;
52 options->lower_bitfield_extract = true;
53 options->lower_pack_snorm_4x8 = true;
54 options->lower_pack_unorm_4x8 = true;
55 options->lower_pack_half_2x16 = true;
56 options->lower_pack_64_2x32 = true;
57 options->lower_pack_64_4x16 = true;
58 options->lower_pack_32_2x16 = true;
59 options->lower_unpack_snorm_2x16 = true;
60 options->lower_unpack_snorm_4x8 = true;
61 options->lower_unpack_unorm_2x16 = true;
62 options->lower_unpack_unorm_4x8 = true;
63 options->lower_unpack_half_2x16 = true;
64 options->lower_fpow = true;
65 options->lower_mul_2x32_64 = true;
66 options->lower_iadd_sat = info->gfx_level <= GFX8;
67 options->lower_hadd = true;
68 options->lower_mul_32x16 = true;
69 options->has_bfe = true;
70 options->has_bfm = true;
71 options->has_bitfield_select = true;
72 options->has_fsub = true;
73 options->has_isub = true;
74 options->has_sdot_4x8 = info->has_accelerated_dot_product;
75 options->has_sudot_4x8 = info->has_accelerated_dot_product && info->gfx_level >= GFX11;
76 options->has_udot_4x8 = info->has_accelerated_dot_product;
77 options->has_sdot_4x8_sat = info->has_accelerated_dot_product;
78 options->has_sudot_4x8_sat = info->has_accelerated_dot_product && info->gfx_level >= GFX11;
79 options->has_udot_4x8_sat = info->has_accelerated_dot_product;
80 options->has_dot_2x16 = info->has_accelerated_dot_product && info->gfx_level < GFX11;
81 options->has_find_msb_rev = true;
82 options->has_pack_half_2x16_rtz = true;
83 options->has_bit_test = !use_llvm;
84 options->has_fmulz = true;
85 options->has_msad = true;
86 options->use_interpolated_input_intrinsics = true;
87 options->lower_int64_options = nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 | nir_lower_divmod64 |
88 nir_lower_minmax64 | nir_lower_iabs64 | nir_lower_iadd_sat64 | nir_lower_conv64;
89 options->divergence_analysis_options = nir_divergence_view_index_uniform;
90 options->optimize_quad_vote_to_reduce = true;
91 options->lower_fisnormal = true;
92 options->support_16bit_alu = info->gfx_level >= GFX8;
93 options->vectorize_vec2_16bit = info->has_packed_math_16bit;
94 }
95
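/* Select SPI_SHADER_Z_FORMAT from the outputs the pixel shader exports:
 * Z and MRT0 alpha need 32-bit channels, while stencil and the sample mask
 * fit in the 16-bit UINT export.
 */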
96 unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
97 bool writes_mrt0_alpha)
98 {
99 /* If writes_mrt0_alpha is true, one other flag must be true too. */
100 assert(!writes_mrt0_alpha || writes_z || writes_stencil || writes_samplemask);
101
102 if (writes_z || writes_mrt0_alpha) {
103 /* Z needs 32 bits. */
104 if (writes_samplemask || writes_mrt0_alpha)
105 return V_028710_SPI_SHADER_32_ABGR;
106 else if (writes_stencil)
107 return V_028710_SPI_SHADER_32_GR;
108 else
109 return V_028710_SPI_SHADER_32_R;
110 } else if (writes_stencil || writes_samplemask) {
111 /* Both stencil and sample mask need only 16 bits. */
112 return V_028710_SPI_SHADER_UINT16_ABGR;
113 } else {
114 return V_028710_SPI_SHADER_ZERO;
115 }
116 }
117
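/* Derive CB_SHADER_MASK from SPI_SHADER_COL_FORMAT: each MRT occupies one
 * 4-bit nibble, and the nibble enables only the components that the export
 * format actually writes (e.g. 32_GR -> 0x3, 32_AR -> 0x9, ABGR formats -> 0xf).
 */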
118 unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
119 {
120 unsigned i, cb_shader_mask = 0;
121
122 for (i = 0; i < 8; i++) {
123 switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
124 case V_028714_SPI_SHADER_ZERO:
125 break;
126 case V_028714_SPI_SHADER_32_R:
127 cb_shader_mask |= 0x1 << (i * 4);
128 break;
129 case V_028714_SPI_SHADER_32_GR:
130 cb_shader_mask |= 0x3 << (i * 4);
131 break;
132 case V_028714_SPI_SHADER_32_AR:
133 cb_shader_mask |= 0x9u << (i * 4);
134 break;
135 case V_028714_SPI_SHADER_FP16_ABGR:
136 case V_028714_SPI_SHADER_UNORM16_ABGR:
137 case V_028714_SPI_SHADER_SNORM16_ABGR:
138 case V_028714_SPI_SHADER_UINT16_ABGR:
139 case V_028714_SPI_SHADER_SINT16_ABGR:
140 case V_028714_SPI_SHADER_32_ABGR:
141 cb_shader_mask |= 0xfu << (i * 4);
142 break;
143 default:
144 assert(0);
145 }
146 }
147 return cb_shader_mask;
148 }
149
150 /**
151 * Calculate the appropriate VGT_GS_MODE setting for a geometry shader with
152 * the given maximum number of output vertices.
153 */
154 uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum amd_gfx_level gfx_level)
155 {
156 unsigned cut_mode;
157
158 assert (gfx_level < GFX11);
159
160 if (gs_max_vert_out <= 128) {
161 cut_mode = V_028A40_GS_CUT_128;
162 } else if (gs_max_vert_out <= 256) {
163 cut_mode = V_028A40_GS_CUT_256;
164 } else if (gs_max_vert_out <= 512) {
165 cut_mode = V_028A40_GS_CUT_512;
166 } else {
167 assert(gs_max_vert_out <= 1024);
168 cut_mode = V_028A40_GS_CUT_1024;
169 }
170
171 return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
172 S_028A40_ES_WRITE_OPTIMIZE(gfx_level <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
173 S_028A40_ONCHIP(gfx_level >= GFX9 ? 1 : 0);
174 }
175
176 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
177 /// value for LLVM8+ tbuffer intrinsics.
178 unsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsigned nfmt)
179 {
180 // Some games try to access vertex buffers without a valid format.
181 // This is a game bug, but we should still handle it gracefully.
182 if (dfmt == V_008F0C_GFX10_FORMAT_INVALID)
183 return V_008F0C_GFX10_FORMAT_INVALID;
184
185 if (gfx_level >= GFX11) {
186 switch (dfmt) {
187 default:
188 unreachable("bad dfmt");
189 case V_008F0C_BUF_DATA_FORMAT_INVALID:
190 return V_008F0C_GFX11_FORMAT_INVALID;
191
192 case V_008F0C_BUF_DATA_FORMAT_8:
193 switch (nfmt) {
194 case V_008F0C_BUF_NUM_FORMAT_UNORM:
195 return V_008F0C_GFX11_FORMAT_8_UNORM;
196 case V_008F0C_BUF_NUM_FORMAT_SNORM:
197 return V_008F0C_GFX11_FORMAT_8_SNORM;
198 case V_008F0C_BUF_NUM_FORMAT_USCALED:
199 return V_008F0C_GFX11_FORMAT_8_USCALED;
200 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
201 return V_008F0C_GFX11_FORMAT_8_SSCALED;
202 default:
203 unreachable("bad nfmt");
204 case V_008F0C_BUF_NUM_FORMAT_UINT:
205 return V_008F0C_GFX11_FORMAT_8_UINT;
206 case V_008F0C_BUF_NUM_FORMAT_SINT:
207 return V_008F0C_GFX11_FORMAT_8_SINT;
208 }
209
210 case V_008F0C_BUF_DATA_FORMAT_8_8:
211 switch (nfmt) {
212 case V_008F0C_BUF_NUM_FORMAT_UNORM:
213 return V_008F0C_GFX11_FORMAT_8_8_UNORM;
214 case V_008F0C_BUF_NUM_FORMAT_SNORM:
215 return V_008F0C_GFX11_FORMAT_8_8_SNORM;
216 case V_008F0C_BUF_NUM_FORMAT_USCALED:
217 return V_008F0C_GFX11_FORMAT_8_8_USCALED;
218 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
219 return V_008F0C_GFX11_FORMAT_8_8_SSCALED;
220 default:
221 unreachable("bad nfmt");
222 case V_008F0C_BUF_NUM_FORMAT_UINT:
223 return V_008F0C_GFX11_FORMAT_8_8_UINT;
224 case V_008F0C_BUF_NUM_FORMAT_SINT:
225 return V_008F0C_GFX11_FORMAT_8_8_SINT;
226 }
227
228 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
229 switch (nfmt) {
230 case V_008F0C_BUF_NUM_FORMAT_UNORM:
231 return V_008F0C_GFX11_FORMAT_8_8_8_8_UNORM;
232 case V_008F0C_BUF_NUM_FORMAT_SNORM:
233 return V_008F0C_GFX11_FORMAT_8_8_8_8_SNORM;
234 case V_008F0C_BUF_NUM_FORMAT_USCALED:
235 return V_008F0C_GFX11_FORMAT_8_8_8_8_USCALED;
236 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
237 return V_008F0C_GFX11_FORMAT_8_8_8_8_SSCALED;
238 default:
239 unreachable("bad nfmt");
240 case V_008F0C_BUF_NUM_FORMAT_UINT:
241 return V_008F0C_GFX11_FORMAT_8_8_8_8_UINT;
242 case V_008F0C_BUF_NUM_FORMAT_SINT:
243 return V_008F0C_GFX11_FORMAT_8_8_8_8_SINT;
244 }
245
246 case V_008F0C_BUF_DATA_FORMAT_16:
247 switch (nfmt) {
248 case V_008F0C_BUF_NUM_FORMAT_UNORM:
249 return V_008F0C_GFX11_FORMAT_16_UNORM;
250 case V_008F0C_BUF_NUM_FORMAT_SNORM:
251 return V_008F0C_GFX11_FORMAT_16_SNORM;
252 case V_008F0C_BUF_NUM_FORMAT_USCALED:
253 return V_008F0C_GFX11_FORMAT_16_USCALED;
254 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
255 return V_008F0C_GFX11_FORMAT_16_SSCALED;
256 default:
257 unreachable("bad nfmt");
258 case V_008F0C_BUF_NUM_FORMAT_UINT:
259 return V_008F0C_GFX11_FORMAT_16_UINT;
260 case V_008F0C_BUF_NUM_FORMAT_SINT:
261 return V_008F0C_GFX11_FORMAT_16_SINT;
262 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
263 return V_008F0C_GFX11_FORMAT_16_FLOAT;
264 }
265
266 case V_008F0C_BUF_DATA_FORMAT_16_16:
267 switch (nfmt) {
268 case V_008F0C_BUF_NUM_FORMAT_UNORM:
269 return V_008F0C_GFX11_FORMAT_16_16_UNORM;
270 case V_008F0C_BUF_NUM_FORMAT_SNORM:
271 return V_008F0C_GFX11_FORMAT_16_16_SNORM;
272 case V_008F0C_BUF_NUM_FORMAT_USCALED:
273 return V_008F0C_GFX11_FORMAT_16_16_USCALED;
274 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
275 return V_008F0C_GFX11_FORMAT_16_16_SSCALED;
276 default:
277 unreachable("bad nfmt");
278 case V_008F0C_BUF_NUM_FORMAT_UINT:
279 return V_008F0C_GFX11_FORMAT_16_16_UINT;
280 case V_008F0C_BUF_NUM_FORMAT_SINT:
281 return V_008F0C_GFX11_FORMAT_16_16_SINT;
282 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
283 return V_008F0C_GFX11_FORMAT_16_16_FLOAT;
284 }
285
286 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
287 switch (nfmt) {
288 case V_008F0C_BUF_NUM_FORMAT_UNORM:
289 return V_008F0C_GFX11_FORMAT_16_16_16_16_UNORM;
290 case V_008F0C_BUF_NUM_FORMAT_SNORM:
291 return V_008F0C_GFX11_FORMAT_16_16_16_16_SNORM;
292 case V_008F0C_BUF_NUM_FORMAT_USCALED:
293 return V_008F0C_GFX11_FORMAT_16_16_16_16_USCALED;
294 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
295 return V_008F0C_GFX11_FORMAT_16_16_16_16_SSCALED;
296 default:
297 unreachable("bad nfmt");
298 case V_008F0C_BUF_NUM_FORMAT_UINT:
299 return V_008F0C_GFX11_FORMAT_16_16_16_16_UINT;
300 case V_008F0C_BUF_NUM_FORMAT_SINT:
301 return V_008F0C_GFX11_FORMAT_16_16_16_16_SINT;
302 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
303 return V_008F0C_GFX11_FORMAT_16_16_16_16_FLOAT;
304 }
305
306 case V_008F0C_BUF_DATA_FORMAT_32:
307 switch (nfmt) {
308 default:
309 unreachable("bad nfmt");
310 case V_008F0C_BUF_NUM_FORMAT_UINT:
311 return V_008F0C_GFX11_FORMAT_32_UINT;
312 case V_008F0C_BUF_NUM_FORMAT_SINT:
313 return V_008F0C_GFX11_FORMAT_32_SINT;
314 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
315 return V_008F0C_GFX11_FORMAT_32_FLOAT;
316 }
317
318 case V_008F0C_BUF_DATA_FORMAT_32_32:
319 switch (nfmt) {
320 default:
321 unreachable("bad nfmt");
322 case V_008F0C_BUF_NUM_FORMAT_UINT:
323 return V_008F0C_GFX11_FORMAT_32_32_UINT;
324 case V_008F0C_BUF_NUM_FORMAT_SINT:
325 return V_008F0C_GFX11_FORMAT_32_32_SINT;
326 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
327 return V_008F0C_GFX11_FORMAT_32_32_FLOAT;
328 }
329
330 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
331 switch (nfmt) {
332 default:
333 unreachable("bad nfmt");
334 case V_008F0C_BUF_NUM_FORMAT_UINT:
335 return V_008F0C_GFX11_FORMAT_32_32_32_UINT;
336 case V_008F0C_BUF_NUM_FORMAT_SINT:
337 return V_008F0C_GFX11_FORMAT_32_32_32_SINT;
338 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
339 return V_008F0C_GFX11_FORMAT_32_32_32_FLOAT;
340 }
341
342 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
343 switch (nfmt) {
344 default:
345 unreachable("bad nfmt");
346 case V_008F0C_BUF_NUM_FORMAT_UINT:
347 return V_008F0C_GFX11_FORMAT_32_32_32_32_UINT;
348 case V_008F0C_BUF_NUM_FORMAT_SINT:
349 return V_008F0C_GFX11_FORMAT_32_32_32_32_SINT;
350 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
351 return V_008F0C_GFX11_FORMAT_32_32_32_32_FLOAT;
352 }
353
354 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
355 switch (nfmt) {
356 case V_008F0C_BUF_NUM_FORMAT_UNORM:
357 return V_008F0C_GFX11_FORMAT_2_10_10_10_UNORM;
358 case V_008F0C_BUF_NUM_FORMAT_SNORM:
359 return V_008F0C_GFX11_FORMAT_2_10_10_10_SNORM;
360 case V_008F0C_BUF_NUM_FORMAT_USCALED:
361 return V_008F0C_GFX11_FORMAT_2_10_10_10_USCALED;
362 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
363 return V_008F0C_GFX11_FORMAT_2_10_10_10_SSCALED;
364 default:
365 unreachable("bad nfmt");
366 case V_008F0C_BUF_NUM_FORMAT_UINT:
367 return V_008F0C_GFX11_FORMAT_2_10_10_10_UINT;
368 case V_008F0C_BUF_NUM_FORMAT_SINT:
369 return V_008F0C_GFX11_FORMAT_2_10_10_10_SINT;
370 }
371
372 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
373 switch (nfmt) {
374 default:
375 unreachable("bad nfmt");
376 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
377 return V_008F0C_GFX11_FORMAT_10_11_11_FLOAT;
378 }
379 }
380 } else if (gfx_level >= GFX10) {
381 unsigned format;
382 switch (dfmt) {
383 default:
384 unreachable("bad dfmt");
385 case V_008F0C_BUF_DATA_FORMAT_INVALID:
386 format = V_008F0C_GFX10_FORMAT_INVALID;
387 break;
388 case V_008F0C_BUF_DATA_FORMAT_8:
389 format = V_008F0C_GFX10_FORMAT_8_UINT;
390 break;
391 case V_008F0C_BUF_DATA_FORMAT_8_8:
392 format = V_008F0C_GFX10_FORMAT_8_8_UINT;
393 break;
394 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
395 format = V_008F0C_GFX10_FORMAT_8_8_8_8_UINT;
396 break;
397 case V_008F0C_BUF_DATA_FORMAT_16:
398 format = V_008F0C_GFX10_FORMAT_16_UINT;
399 break;
400 case V_008F0C_BUF_DATA_FORMAT_16_16:
401 format = V_008F0C_GFX10_FORMAT_16_16_UINT;
402 break;
403 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
404 format = V_008F0C_GFX10_FORMAT_16_16_16_16_UINT;
405 break;
406 case V_008F0C_BUF_DATA_FORMAT_32:
407 format = V_008F0C_GFX10_FORMAT_32_UINT;
408 break;
409 case V_008F0C_BUF_DATA_FORMAT_32_32:
410 format = V_008F0C_GFX10_FORMAT_32_32_UINT;
411 break;
412 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
413 format = V_008F0C_GFX10_FORMAT_32_32_32_UINT;
414 break;
415 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
416 format = V_008F0C_GFX10_FORMAT_32_32_32_32_UINT;
417 break;
418 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
419 format = V_008F0C_GFX10_FORMAT_2_10_10_10_UINT;
420 break;
421 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
422 format = V_008F0C_GFX10_FORMAT_10_11_11_UINT;
423 break;
424 }
425
426 // Use the regularity properties of the combined format enum.
427 //
428 // Note: float is incompatible with 8-bit data formats,
429 // [us]{norm,scaled} are incompatible with 32-bit data formats.
430 // [us]scaled are not writable.
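// For example, assuming the combined-format enum keeps the UNORM..FLOAT
// variants of each data format adjacent (which the offsets below rely on):
// 16_16_UINT - 4 == 16_16_UNORM, and 16_16_UINT + 2 == 16_16_FLOAT.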
431 switch (nfmt) {
432 case V_008F0C_BUF_NUM_FORMAT_UNORM:
433 format -= 4;
434 break;
435 case V_008F0C_BUF_NUM_FORMAT_SNORM:
436 format -= 3;
437 break;
438 case V_008F0C_BUF_NUM_FORMAT_USCALED:
439 format -= 2;
440 break;
441 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
442 format -= 1;
443 break;
444 default:
445 unreachable("bad nfmt");
446 case V_008F0C_BUF_NUM_FORMAT_UINT:
447 break;
448 case V_008F0C_BUF_NUM_FORMAT_SINT:
449 format += 1;
450 break;
451 case V_008F0C_BUF_NUM_FORMAT_FLOAT:
452 format += 2;
453 break;
454 }
455
456 return format;
457 } else {
458 return dfmt | (nfmt << 4);
459 }
460 }
461
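/* Each entry is {element size in bytes, channel count, per-channel byte size
 * (0 for packed formats that can't be split), per-channel data format that a
 * multi-channel fetch can be split into}.
 */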
462 static const struct ac_data_format_info data_format_table[] = {
463 [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
464 [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
465 [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
466 [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
467 [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
468 [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
469 [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
470 [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
471 [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
472 [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
473 [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
474 [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
475 [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
476 [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
477 [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
478 };
479
480 const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
481 {
482 assert(dfmt < ARRAY_SIZE(data_format_table));
483 return &data_format_table[dfmt];
484 }
485
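/* Vertex-buffer format tables. VB_FORMATS is an X-macro list that is expanded
 * several times below with different HW_FMT/HW_FMT_INVALID/AA definitions to
 * build the GFX6 tables (with and without alpha adjust), the GFX10 table and
 * the GFX11 table.
 */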
486 #define DUP2(v) v, v
487 #define DUP3(v) v, v, v
488 #define DUP4(v) v, v, v, v
489
490 #define FMT(dfmt, nfmt) 0xb, {HW_FMT(dfmt, nfmt), HW_FMT(dfmt##_##dfmt, nfmt), HW_FMT_INVALID, HW_FMT(dfmt##_##dfmt##_##dfmt##_##dfmt, nfmt)}
491 #define FMT_32(nfmt) 0xf, {HW_FMT(32, nfmt), HW_FMT(32_32, nfmt), HW_FMT(32_32_32, nfmt), HW_FMT(32_32_32_32, nfmt)}
492 #define FMT_64(nfmt) 0x3, {HW_FMT(32_32, nfmt), HW_FMT(32_32_32_32, nfmt), DUP2(HW_FMT_INVALID)}
493 #define FMTP(dfmt, nfmt) 0xf, {DUP4(HW_FMT(dfmt, nfmt))}
494
495 #define DST_SEL(x, y, z, w) \
496 (S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_##x) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_##y) | \
497 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_##z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_##w))
498
499 #define LIST_NFMT_8_16(nfmt) \
500 [(int)PIPE_FORMAT_R8_##nfmt] = {DST_SEL(X,0,0,1), 1, 1, 1, FMT(8, nfmt)}, \
501 [(int)PIPE_FORMAT_R8G8_##nfmt] = {DST_SEL(X,Y,0,1), 2, 2, 1, FMT(8, nfmt)}, \
502 [(int)PIPE_FORMAT_R8G8B8_##nfmt] = {DST_SEL(X,Y,Z,1), 3, 3, 1, FMT(8, nfmt)}, \
503 [(int)PIPE_FORMAT_B8G8R8_##nfmt] = {DST_SEL(Z,Y,X,1), 3, 3, 1, FMT(8, nfmt)}, \
504 [(int)PIPE_FORMAT_R8G8B8A8_##nfmt] = {DST_SEL(X,Y,Z,W), 4, 4, 1, FMT(8, nfmt)}, \
505 [(int)PIPE_FORMAT_B8G8R8A8_##nfmt] = {DST_SEL(Z,Y,X,W), 4, 4, 1, FMT(8, nfmt)}, \
506 [(int)PIPE_FORMAT_R16_##nfmt] = {DST_SEL(X,0,0,1), 2, 1, 2, FMT(16, nfmt)}, \
507 [(int)PIPE_FORMAT_R16G16_##nfmt] = {DST_SEL(X,Y,0,1), 4, 2, 2, FMT(16, nfmt)}, \
508 [(int)PIPE_FORMAT_R16G16B16_##nfmt] = {DST_SEL(X,Y,Z,1), 6, 3, 2, FMT(16, nfmt)}, \
509 [(int)PIPE_FORMAT_R16G16B16A16_##nfmt] = {DST_SEL(X,Y,Z,W), 8, 4, 2, FMT(16, nfmt)},
510
511 #define LIST_NFMT_32_64(nfmt) \
512 [(int)PIPE_FORMAT_R32_##nfmt] = {DST_SEL(X,0,0,1), 4, 1, 4, FMT_32(nfmt)}, \
513 [(int)PIPE_FORMAT_R32G32_##nfmt] = {DST_SEL(X,Y,0,1), 8, 2, 4, FMT_32(nfmt)}, \
514 [(int)PIPE_FORMAT_R32G32B32_##nfmt] = {DST_SEL(X,Y,Z,1), 12, 3, 4, FMT_32(nfmt)}, \
515 [(int)PIPE_FORMAT_R32G32B32A32_##nfmt] = {DST_SEL(X,Y,Z,W), 16, 4, 4, FMT_32(nfmt)}, \
516 [(int)PIPE_FORMAT_R64_##nfmt] = {DST_SEL(X,Y,0,0), 8, 1, 8, FMT_64(nfmt)}, \
517 [(int)PIPE_FORMAT_R64G64_##nfmt] = {DST_SEL(X,Y,Z,W), 16, 2, 8, FMT_64(nfmt)}, \
518 [(int)PIPE_FORMAT_R64G64B64_##nfmt] = {DST_SEL(X,Y,Z,W), 24, 3, 8, FMT_64(nfmt)}, \
519 [(int)PIPE_FORMAT_R64G64B64A64_##nfmt] = {DST_SEL(X,Y,Z,W), 32, 4, 8, FMT_64(nfmt)}, \
520
521 #define VB_FORMATS \
522 [(int)PIPE_FORMAT_NONE] = {DST_SEL(0,0,0,1), 0, 4, 0, 0xf, {DUP4(HW_FMT_INVALID)}}, \
523 LIST_NFMT_8_16(UNORM) \
524 LIST_NFMT_8_16(SNORM) \
525 LIST_NFMT_8_16(USCALED) \
526 LIST_NFMT_8_16(SSCALED) \
527 LIST_NFMT_8_16(UINT) \
528 LIST_NFMT_8_16(SINT) \
529 LIST_NFMT_32_64(UINT) \
530 LIST_NFMT_32_64(SINT) \
531 LIST_NFMT_32_64(FLOAT) \
532 [(int)PIPE_FORMAT_R16_FLOAT] = {DST_SEL(X,0,0,1), 2, 1, 2, FMT(16, FLOAT)}, \
533 [(int)PIPE_FORMAT_R16G16_FLOAT] = {DST_SEL(X,Y,0,1), 4, 2, 2, FMT(16, FLOAT)}, \
534 [(int)PIPE_FORMAT_R16G16B16_FLOAT] = {DST_SEL(X,Y,Z,1), 6, 3, 2, FMT(16, FLOAT)}, \
535 [(int)PIPE_FORMAT_R16G16B16A16_FLOAT] = {DST_SEL(X,Y,Z,W), 8, 4, 2, FMT(16, FLOAT)}, \
536 [(int)PIPE_FORMAT_B10G10R10A2_UNORM] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, UNORM)}, \
537 [(int)PIPE_FORMAT_B10G10R10A2_SNORM] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, SNORM), \
538 AA(AC_ALPHA_ADJUST_SNORM)}, \
539 [(int)PIPE_FORMAT_B10G10R10A2_USCALED] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, USCALED)}, \
540 [(int)PIPE_FORMAT_B10G10R10A2_SSCALED] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, SSCALED), \
541 AA(AC_ALPHA_ADJUST_SSCALED)}, \
542 [(int)PIPE_FORMAT_B10G10R10A2_UINT] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, UINT)}, \
543 [(int)PIPE_FORMAT_B10G10R10A2_SINT] = {DST_SEL(Z,Y,X,W), 4, 4, 0, FMTP(2_10_10_10, SINT), \
544 AA(AC_ALPHA_ADJUST_SINT)}, \
545 [(int)PIPE_FORMAT_R10G10B10A2_UNORM] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, UNORM)}, \
546 [(int)PIPE_FORMAT_R10G10B10A2_SNORM] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, SNORM), \
547 AA(AC_ALPHA_ADJUST_SNORM)}, \
548 [(int)PIPE_FORMAT_R10G10B10A2_USCALED] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, USCALED)}, \
549 [(int)PIPE_FORMAT_R10G10B10A2_SSCALED] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, SSCALED), \
550 AA(AC_ALPHA_ADJUST_SSCALED)}, \
551 [(int)PIPE_FORMAT_R10G10B10A2_UINT] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, UINT)}, \
552 [(int)PIPE_FORMAT_R10G10B10A2_SINT] = {DST_SEL(X,Y,Z,W), 4, 4, 0, FMTP(2_10_10_10, SINT), \
553 AA(AC_ALPHA_ADJUST_SINT)}, \
554 [(int)PIPE_FORMAT_R11G11B10_FLOAT] = {DST_SEL(X,Y,Z,1), 4, 3, 0, FMTP(10_11_11, FLOAT)}, \
555
556 #define HW_FMT(dfmt, nfmt) (V_008F0C_BUF_DATA_FORMAT_##dfmt | (V_008F0C_BUF_NUM_FORMAT_##nfmt << 4))
557 #define HW_FMT_INVALID (V_008F0C_BUF_DATA_FORMAT_INVALID | (V_008F0C_BUF_NUM_FORMAT_UNORM << 4))
558 #define AA(v) v
559 static const struct ac_vtx_format_info vb_formats_gfx6_alpha_adjust[] = {VB_FORMATS};
560 #undef AA
561
562 #define AA(v) AC_ALPHA_ADJUST_NONE
563 static const struct ac_vtx_format_info vb_formats_gfx6[] = {VB_FORMATS};
564 #undef HW_FMT_INVALID
565 #undef HW_FMT
566
567 #define HW_FMT(dfmt, nfmt) V_008F0C_GFX10_FORMAT_##dfmt##_##nfmt
568 #define HW_FMT_INVALID V_008F0C_GFX10_FORMAT_INVALID
569 static const struct ac_vtx_format_info vb_formats_gfx10[] = {VB_FORMATS};
570 #undef HW_FMT_INVALID
571 #undef HW_FMT
572
573 #define HW_FMT(dfmt, nfmt) V_008F0C_GFX11_FORMAT_##dfmt##_##nfmt
574 #define HW_FMT_INVALID V_008F0C_GFX11_FORMAT_INVALID
575 static const struct ac_vtx_format_info vb_formats_gfx11[] = {VB_FORMATS};
576
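/* Return the vertex format table for the given chip. GFX6-8 (except Stoney)
 * use the variant whose 2_10_10_10 signed formats carry AC_ALPHA_ADJUST_*
 * values, so callers know to apply the alpha-channel fixup for those formats.
 */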
577 const struct ac_vtx_format_info *
578 ac_get_vtx_format_info_table(enum amd_gfx_level level, enum radeon_family family)
579 {
580 if (level >= GFX11)
581 return vb_formats_gfx11;
582 else if (level >= GFX10)
583 return vb_formats_gfx10;
584 bool alpha_adjust = level <= GFX8 && family != CHIP_STONEY;
585 return alpha_adjust ? vb_formats_gfx6_alpha_adjust : vb_formats_gfx6;
586 }
587
588 const struct ac_vtx_format_info *
589 ac_get_vtx_format_info(enum amd_gfx_level level, enum radeon_family family, enum pipe_format fmt)
590 {
591 return &ac_get_vtx_format_info_table(level, family)[fmt];
592 }
593
594 /**
595 * Check whether the specified fetch size is safe to use with MTBUF.
596 *
597 * Split typed vertex buffer loads when necessary to avoid any
598 * alignment issues that trigger memory violations and eventually a GPU
599 * hang. This can happen if the stride (static or dynamic) is unaligned and
600 * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO
601 * offset is 2 for R16G16B16A16_SNORM).
602 */
603 static bool
604 is_fetch_size_safe(const enum amd_gfx_level gfx_level, const struct ac_vtx_format_info* vtx_info,
605 const unsigned offset, const unsigned alignment, const unsigned channels)
606 {
607 if (!(vtx_info->has_hw_format & BITFIELD_BIT(channels - 1)))
608 return false;
609
610 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
611 return (gfx_level >= GFX7 && gfx_level <= GFX9) ||
612 (offset % vertex_byte_size == 0 && MAX2(alignment, 1) % vertex_byte_size == 0);
613 }
614
615 /**
616 * Gets the number of channels that can be safely fetched by MTBUF (typed buffer load)
617 * instructions without triggering alignment-related issues.
618 */
619 unsigned
620 ac_get_safe_fetch_size(const enum amd_gfx_level gfx_level, const struct ac_vtx_format_info* vtx_info,
621 const unsigned offset, const unsigned max_channels, const unsigned alignment,
622 const unsigned num_channels)
623 {
624 /* Packed formats can't be split. */
625 if (!vtx_info->chan_byte_size)
626 return vtx_info->num_channels;
627
628 /* Early exit if the specified number of channels is fine. */
629 if (is_fetch_size_safe(gfx_level, vtx_info, offset, alignment, num_channels))
630 return num_channels;
631
632 /* First, assume that more load instructions are worse and try using a larger data format. */
633 unsigned new_channels = num_channels + 1;
634 while (new_channels <= max_channels &&
635 !is_fetch_size_safe(gfx_level, vtx_info, offset, alignment, new_channels)) {
636 new_channels++;
637 }
638
639 /* Found a feasible load size. */
640 if (new_channels <= max_channels)
641 return new_channels;
642
643 /* Try decreasing load size (at the cost of more load instructions). */
644 new_channels = num_channels;
645 while (new_channels > 1 &&
646 !is_fetch_size_safe(gfx_level, vtx_info, offset, alignment, new_channels)) {
647 new_channels--;
648 }
649
650 return new_channels;
651 }
652
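/* Map a GLSL sampler dimension to the hw image dimension used for sampling.
 * GFX9 handles 1D textures as 2D, so 1D (array) is promoted to 2D (array) there.
 */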
653 enum ac_image_dim ac_get_sampler_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim dim,
654 bool is_array)
655 {
656 switch (dim) {
657 case GLSL_SAMPLER_DIM_1D:
658 if (gfx_level == GFX9)
659 return is_array ? ac_image_2darray : ac_image_2d;
660 return is_array ? ac_image_1darray : ac_image_1d;
661 case GLSL_SAMPLER_DIM_2D:
662 case GLSL_SAMPLER_DIM_RECT:
663 case GLSL_SAMPLER_DIM_EXTERNAL:
664 return is_array ? ac_image_2darray : ac_image_2d;
665 case GLSL_SAMPLER_DIM_3D:
666 return ac_image_3d;
667 case GLSL_SAMPLER_DIM_CUBE:
668 return ac_image_cube;
669 case GLSL_SAMPLER_DIM_MS:
670 return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
671 case GLSL_SAMPLER_DIM_SUBPASS:
672 return ac_image_2darray;
673 case GLSL_SAMPLER_DIM_SUBPASS_MS:
674 return ac_image_2darraymsaa;
675 default:
676 unreachable("bad sampler dim");
677 }
678 }
679
680 enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim sdim,
681 bool is_array)
682 {
683 enum ac_image_dim dim = ac_get_sampler_dim(gfx_level, sdim, is_array);
684
685 /* Match the resource type set in the descriptor. */
686 if (dim == ac_image_cube || (gfx_level <= GFX8 && dim == ac_image_3d))
687 dim = ac_image_2darray;
688 else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && gfx_level == GFX9) {
689 /* When a single layer of a 3D texture is bound, the shader
690 * will refer to a 2D target, but the descriptor has a 3D type.
691 * Since the HW ignores BASE_ARRAY in this case, we need to
692 * send 3 coordinates. This doesn't hurt when the underlying
693 * texture is non-3D.
694 */
695 dim = ac_image_3d;
696 }
697
698 return dim;
699 }
700
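/* Count the PS input VGPRs enabled in SPI_PS_INPUT_ADDR and optionally report
 * how many of them are gl_FragCoord (POS_*_FLOAT) components.
 */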
701 unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
702 uint8_t *num_fragcoord_components)
703 {
704 unsigned num_input_vgprs = 0;
705 unsigned fragcoord_components = 0;
706
707 if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
708 num_input_vgprs += 2;
709 if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
710 num_input_vgprs += 2;
711 if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
712 num_input_vgprs += 2;
713 if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
714 num_input_vgprs += 3;
715 if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
716 num_input_vgprs += 2;
717 if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
718 num_input_vgprs += 2;
719 if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
720 num_input_vgprs += 2;
721 if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
722 num_input_vgprs += 1;
723 if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr)) {
724 num_input_vgprs += 1;
725 fragcoord_components++;
726 }
727 if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr)) {
728 num_input_vgprs += 1;
729 fragcoord_components++;
730 }
731 if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr)) {
732 num_input_vgprs += 1;
733 fragcoord_components++;
734 }
735 if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr)) {
736 num_input_vgprs += 1;
737 fragcoord_components++;
738 }
739 if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr))
740 num_input_vgprs += 1;
741 if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr))
742 num_input_vgprs += 1;
743 if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
744 num_input_vgprs += 1;
745 if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
746 num_input_vgprs += 1;
747
748 if (num_fragcoord_components)
749 *num_fragcoord_components = fragcoord_components;
750
751 return num_input_vgprs;
752 }
753
754 uint16_t ac_get_ps_iter_mask(unsigned ps_iter_samples)
755 {
756 /* The bit pattern matches that used by fixed function fragment
757 * processing.
758 */
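/* E.g. ps_iter_samples == 4 yields 0x1111: every 4th sample bit is set. */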
759 switch (ps_iter_samples) {
760 case 1: return 0xffff;
761 case 2: return 0x5555;
762 case 4: return 0x1111;
763 case 8: return 0x0101;
764 case 16: return 0x0001;
765 default:
766 unreachable("invalid sample count");
767 }
768 }
769
770 void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
771 bool is_depth, bool use_rbplus,
772 struct ac_spi_color_formats *formats)
773 {
774 /* Alpha is needed for alpha-to-coverage.
775 * Blending may be with or without alpha.
776 */
777 unsigned normal = 0; /* most optimal, may not support blending or export alpha */
778 unsigned alpha = 0; /* exports alpha, but may not support blending */
779 unsigned blend = 0; /* supports blending, but may not export alpha */
780 unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
781
782 /* Choose the SPI color formats. These are required values for RB+.
783 * Other chips have multiple choices, though they are not necessarily better.
784 */
785 switch (format) {
786 case V_028C70_COLOR_5_6_5:
787 case V_028C70_COLOR_1_5_5_5:
788 case V_028C70_COLOR_5_5_5_1:
789 case V_028C70_COLOR_4_4_4_4:
790 case V_028C70_COLOR_10_11_11:
791 case V_028C70_COLOR_11_11_10:
792 case V_028C70_COLOR_5_9_9_9:
793 case V_028C70_COLOR_8:
794 case V_028C70_COLOR_8_8:
795 case V_028C70_COLOR_8_8_8_8:
796 case V_028C70_COLOR_10_10_10_2:
797 case V_028C70_COLOR_2_10_10_10:
798 if (ntype == V_028C70_NUMBER_UINT)
799 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
800 else if (ntype == V_028C70_NUMBER_SINT)
801 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
802 else
803 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
804
805 if (!use_rbplus && format == V_028C70_COLOR_8 &&
806 ntype != V_028C70_NUMBER_SRGB && swap == V_028C70_SWAP_STD) /* R */ {
807 /* When RB+ is enabled, R8_UNORM should use FP16_ABGR for 2x
808 * exporting performance. Otherwise, use 32_R to remove useless
809 * instructions needed for 16-bit compressed exports.
810 */
811 blend = normal = V_028714_SPI_SHADER_32_R;
812 }
813 break;
814
815 case V_028C70_COLOR_16:
816 case V_028C70_COLOR_16_16:
817 case V_028C70_COLOR_16_16_16_16:
818 if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
819 /* UNORM16 and SNORM16 don't support blending */
820 if (ntype == V_028C70_NUMBER_UNORM)
821 normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
822 else
823 normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
824
825 /* Use 32 bits per channel for blending. */
826 if (format == V_028C70_COLOR_16) {
827 if (swap == V_028C70_SWAP_STD) { /* R */
828 blend = V_028714_SPI_SHADER_32_R;
829 blend_alpha = V_028714_SPI_SHADER_32_AR;
830 } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
831 blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
832 else
833 assert(0);
834 } else if (format == V_028C70_COLOR_16_16) {
835 if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */
836 blend = V_028714_SPI_SHADER_32_GR;
837 blend_alpha = V_028714_SPI_SHADER_32_ABGR;
838 } else if (swap == V_028C70_SWAP_ALT) /* RA */
839 blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
840 else
841 assert(0);
842 } else /* 16_16_16_16 */
843 blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
844 } else if (ntype == V_028C70_NUMBER_UINT)
845 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
846 else if (ntype == V_028C70_NUMBER_SINT)
847 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
848 else if (ntype == V_028C70_NUMBER_FLOAT)
849 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
850 else
851 assert(0);
852 break;
853
854 case V_028C70_COLOR_32:
855 if (swap == V_028C70_SWAP_STD) { /* R */
856 blend = normal = V_028714_SPI_SHADER_32_R;
857 alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
858 } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
859 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
860 else
861 assert(0);
862 break;
863
864 case V_028C70_COLOR_32_32:
865 if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV) { /* RG or GR */
866 blend = normal = V_028714_SPI_SHADER_32_GR;
867 alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
868 } else if (swap == V_028C70_SWAP_ALT) /* RA */
869 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
870 else
871 assert(0);
872 break;
873
874 case V_028C70_COLOR_32_32_32_32:
875 case V_028C70_COLOR_8_24:
876 case V_028C70_COLOR_24_8:
877 case V_028C70_COLOR_X24_8_32_FLOAT:
878 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
879 break;
880
881 default:
882 assert(0);
883 return;
884 }
885
886 /* The DB->CB copy needs 32_ABGR. */
887 if (is_depth)
888 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
889
890 formats->normal = normal;
891 formats->alpha = alpha;
892 formats->blend = blend;
893 formats->blend_alpha = blend_alpha;
894 }
895
896 void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
897 bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask)
898 {
899 *late_alloc_wave64 = 0; /* The limit is per SA. */
900 *cu_mask = 0xffff;
901
902 /* CU masking can decrease performance and cause a hang with <= 2 CUs per SA. */
903 if (info->min_good_cu_per_sa <= 2)
904 return;
905
906 /* If scratch is used with late alloc, the GPU could deadlock if PS uses scratch too. A more
907 * complicated computation is needed to enable late alloc with scratch (see PAL).
908 */
909 if (uses_scratch)
910 return;
911
912 /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
913 if (ngg && info->family == CHIP_NAVI14)
914 return;
915
916 if (info->gfx_level >= GFX10) {
917 /* For Wave32, the hw will launch twice the number of late alloc waves, so 1 == 2x wave32.
918 * These limits are estimated because they are all safe but they vary in performance.
919 */
920 if (ngg_culling)
921 *late_alloc_wave64 = info->min_good_cu_per_sa * 10;
922 else if (info->gfx_level >= GFX11)
923 *late_alloc_wave64 = 63;
924 else
925 *late_alloc_wave64 = info->min_good_cu_per_sa * 4;
926
927 /* Limit LATE_ALLOC_GS to prevent a hang (hw bug) on gfx10. */
928 if (info->gfx_level == GFX10 && ngg)
929 *late_alloc_wave64 = MIN2(*late_alloc_wave64, 64);
930
931 /* Gfx10: CU2 & CU3 must be disabled to prevent a hw deadlock.
932 * Others: CU1 must be disabled to prevent a hw deadlock.
933 *
934 * The deadlock is caused by late alloc, which usually increases performance.
935 */
936 *cu_mask &= info->gfx_level == GFX10 ? ~BITFIELD_RANGE(2, 2) :
937 ~BITFIELD_RANGE(1, 1);
938 } else {
939 if (info->min_good_cu_per_sa <= 4) {
940 /* Too few available compute units per SA. Disallowing VS to run on one CU could hurt us
941 * more than late VS allocation would help.
942 *
943 * 2 is the highest safe number that allows us to keep all CUs enabled.
944 */
945 *late_alloc_wave64 = 2;
946 } else {
947 /* This is a good initial value, allowing 1 late_alloc wave per SIMD on num_cu - 2.
948 */
949 *late_alloc_wave64 = (info->min_good_cu_per_sa - 2) * 4;
950 }
951
952 /* VS can't execute on one CU if the limit is > 2. */
953 if (*late_alloc_wave64 > 2)
954 *cu_mask = 0xfffe; /* 1 CU disabled */
955 }
956
957 /* Max number that fits into the register field. */
958 if (ngg) /* GS */
959 *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(~0u));
960 else /* VS */
961 *late_alloc_wave64 = MIN2(*late_alloc_wave64, G_00B11C_LIMIT(~0u));
962 }
963
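/* Flat compute workgroup size; variable-size workgroups use the API maximum. */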
964 unsigned ac_compute_cs_workgroup_size(const uint16_t sizes[3], bool variable, unsigned max)
965 {
966 if (variable)
967 return max;
968
969 return sizes[0] * sizes[1] * sizes[2];
970 }
971
972 unsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, gl_shader_stage stage,
973 unsigned tess_num_patches,
974 unsigned tess_patch_in_vtx,
975 unsigned tess_patch_out_vtx)
976 {
977 /* When tessellation is used, API VS runs on HW LS, API TCS runs on HW HS.
978 * These two HW stages are merged on GFX9+.
979 */
980
981 bool merged_shaders = gfx_level >= GFX9;
982 unsigned ls_workgroup_size = tess_num_patches * tess_patch_in_vtx;
983 unsigned hs_workgroup_size = tess_num_patches * tess_patch_out_vtx;
984
985 if (merged_shaders)
986 return MAX2(ls_workgroup_size, hs_workgroup_size);
987 else if (stage == MESA_SHADER_VERTEX)
988 return ls_workgroup_size;
989 else if (stage == MESA_SHADER_TESS_CTRL)
990 return hs_workgroup_size;
991 else
992 unreachable("invalid LSHS shader stage");
993 }
994
995 unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned wave_size,
996 unsigned es_verts, unsigned gs_inst_prims)
997 {
998 /* ESGS may operate in workgroups if on-chip GS (LDS rings) are enabled.
999 *
1000 * GFX6: Not possible in the HW.
1001 * GFX7-8 (unmerged): possible in the HW, but not implemented in Mesa.
1002 * GFX9+ (merged): implemented in Mesa.
1003 */
1004
1005 if (gfx_level <= GFX8)
1006 return wave_size;
1007
1008 unsigned workgroup_size = MAX2(es_verts, gs_inst_prims);
1009 return CLAMP(workgroup_size, 1, 256);
1010 }
1011
1012 unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
1013 unsigned max_vtx_out, unsigned prim_amp_factor)
1014 {
1015 /* NGG always operates in workgroups.
1016 *
1017 * For API VS/TES/GS:
1018 * - 1 invocation per input vertex
1019 * - 1 invocation per input primitive
1020 *
1021 * The same invocation can process both an input vertex and primitive,
1022 * however 1 invocation can only output up to 1 vertex and 1 primitive.
1023 */
1024
1025 unsigned max_vtx_in = es_verts < 256 ? es_verts : 3 * gs_inst_prims;
1026 unsigned max_prim_in = gs_inst_prims;
1027 unsigned max_prim_out = gs_inst_prims * prim_amp_factor;
1028 unsigned workgroup_size = MAX4(max_vtx_in, max_vtx_out, max_prim_in, max_prim_out);
1029
1030 return CLAMP(workgroup_size, 1, 256);
1031 }
1032
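/* AND the CU_EN field of a register value with info->spi_cu_en. The field is
 * located by ~clear_mask; spi_cu_en is shifted right by value_shift first,
 * presumably for register fields that hold the upper bits of the CU mask.
 */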
1033 uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
1034 const struct radeon_info *info)
1035 {
1036 /* Register field position and mask. */
1037 uint32_t cu_en_mask = ~clear_mask;
1038 unsigned cu_en_shift = ffs(cu_en_mask) - 1;
1039 /* The value being set. */
1040 uint32_t cu_en = (value & cu_en_mask) >> cu_en_shift;
1041
1042 /* AND the field by spi_cu_en. */
1043 uint32_t spi_cu_en = info->spi_cu_en >> value_shift;
1044 return (value & ~cu_en_mask) |
1045 (((cu_en & spi_cu_en) << cu_en_shift) & cu_en_mask);
1046 }
1047
1048 /* Return the register value and tune bytes_per_wave to increase scratch performance. */
1049 void ac_get_scratch_tmpring_size(const struct radeon_info *info,
1050 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
1051 uint32_t *tmpring_size)
1052 {
1053 /* SPI_TMPRING_SIZE and COMPUTE_TMPRING_SIZE are essentially scratch buffer descriptors.
1054 * WAVES means NUM_RECORDS. WAVESIZE is the size of each element, meaning STRIDE.
1055 * Thus, WAVESIZE must be constant while the scratch buffer is being used by the GPU.
1056 *
1057 * If you want to increase WAVESIZE without waiting for idle, you need to allocate a new
1058 * scratch buffer and use it instead. This will result in multiple scratch buffers being
1059 * used at the same time, each with a different WAVESIZE.
1060 *
1061 * If you want to decrease WAVESIZE, you don't have to. There is no advantage in decreasing
1062 * WAVESIZE after it's been increased.
1063 *
1064 * Shaders with SCRATCH_EN=0 don't allocate scratch space.
1065 */
1066 const unsigned size_shift = info->gfx_level >= GFX11 ? 8 : 10;
1067 const unsigned min_size_per_wave = BITFIELD_BIT(size_shift);
1068
1069 /* The LLVM shader backend should be reporting aligned scratch_sizes. */
1070 assert((bytes_per_wave & BITFIELD_MASK(size_shift)) == 0 &&
1071 "scratch size per wave should be aligned");
1072
1073 /* Add 1 scratch item to make the number of items odd. This should improve scratch
1074 * performance by more randomly distributing scratch waves among memory channels.
1075 */
1076 if (bytes_per_wave)
1077 bytes_per_wave |= min_size_per_wave;
1078
1079 *max_seen_bytes_per_wave = MAX2(*max_seen_bytes_per_wave, bytes_per_wave);
1080
1081 unsigned max_scratch_waves = info->max_scratch_waves;
1082 if (info->gfx_level >= GFX11)
1083 max_scratch_waves /= info->num_se; /* WAVES is per SE */
1084
1085 /* TODO: We could decrease WAVES to make the whole buffer fit into the infinity cache. */
1086 *tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
1087 S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
1088 }
1089
1090 /* Get chip-agnostic memory instruction access flags (as opposed to chip-specific GLC/DLC/SLC)
1091 * from a NIR memory intrinsic.
1092 */
1093 enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr)
1094 {
1095 enum gl_access_qualifier access =
1096 nir_intrinsic_has_access(instr) ? nir_intrinsic_access(instr) : 0;
1097
1098 /* Determine ACCESS_MAY_STORE_SUBDWORD. (for the GFX6 TC L1 bug workaround) */
1099 if (!nir_intrinsic_infos[instr->intrinsic].has_dest) {
1100 switch (instr->intrinsic) {
1101 case nir_intrinsic_bindless_image_store:
1102 access |= ACCESS_MAY_STORE_SUBDWORD;
1103 break;
1104
1105 case nir_intrinsic_store_ssbo:
1106 case nir_intrinsic_store_buffer_amd:
1107 case nir_intrinsic_store_global:
1108 case nir_intrinsic_store_global_amd:
1109 if (access & ACCESS_USES_FORMAT_AMD ||
1110 (nir_intrinsic_has_align_offset(instr) && nir_intrinsic_align(instr) % 4 != 0) ||
1111 ((instr->src[0].ssa->bit_size / 8) * instr->src[0].ssa->num_components) % 4 != 0)
1112 access |= ACCESS_MAY_STORE_SUBDWORD;
1113 break;
1114
1115 default:
1116 unreachable("unexpected store instruction");
1117 }
1118 }
1119
1120 return access;
1121 }
1122
1123 /* Convert chip-agnostic memory access flags into hw-specific cache flags.
1124 *
1125 * "access" must be a result of ac_get_mem_access_flags() with the appropriate ACCESS_TYPE_*
1126 * flags set.
1127 */
1128 union ac_hw_cache_flags ac_get_hw_cache_flags(const struct radeon_info *info,
1129 enum gl_access_qualifier access)
1130 {
1131 union ac_hw_cache_flags result;
1132 result.value = 0;
1133
1134 assert(util_bitcount(access & (ACCESS_TYPE_LOAD | ACCESS_TYPE_STORE |
1135 ACCESS_TYPE_ATOMIC)) == 1);
1136 assert(!(access & ACCESS_TYPE_SMEM) || access & ACCESS_TYPE_LOAD);
1137 assert(!(access & ACCESS_IS_SWIZZLED_AMD) || !(access & ACCESS_TYPE_SMEM));
1138 assert(!(access & ACCESS_MAY_STORE_SUBDWORD) || access & ACCESS_TYPE_STORE);
1139
1140 bool scope_is_device = access & (ACCESS_COHERENT | ACCESS_VOLATILE);
1141
1142 if (info->gfx_level >= GFX11) {
1143 /* GFX11 simplified it and exposes what is actually useful.
1144 *
1145 * GLC means device scope for loads only. (stores and atomics are always device scope)
1146 * SLC means non-temporal for GL1 and GL2 caches. (GL1 = hit-evict, GL2 = stream, unavailable in SMEM)
1147 * DLC means non-temporal for MALL. (noalloc, i.e. coherent bypass)
1148 *
1149 * GL0 doesn't have a non-temporal flag, so you always get LRU caching in CU scope.
1150 */
1151 if (access & ACCESS_TYPE_LOAD && scope_is_device)
1152 result.value |= ac_glc;
1153
1154 if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
1155 result.value |= ac_slc;
1156 } else if (info->gfx_level >= GFX10) {
1157 /* GFX10-10.3:
1158 *
1159 * VMEM and SMEM loads (SMEM only supports the first four):
1160 * !GLC && !DLC && !SLC means CU scope <== use for normal loads with CU scope
1161 * GLC && !DLC && !SLC means SA scope
1162 * !GLC && DLC && !SLC means CU scope, GL1 bypass
1163 * GLC && DLC && !SLC means device scope <== use for normal loads with device scope
1164 * !GLC && !DLC && SLC means CU scope, non-temporal (GL0 = GL1 = hit-evict, GL2 = stream) <== use for non-temporal loads with CU scope
1165 * GLC && !DLC && SLC means SA scope, non-temporal (GL1 = hit-evict, GL2 = stream)
1166 * !GLC && DLC && SLC means CU scope, GL0 non-temporal, GL1-GL2 coherent bypass (GL0 = hit-evict, GL1 = bypass, GL2 = noalloc)
1167 * GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc) <== use for non-temporal loads with device scope
1168 *
1169 * VMEM stores/atomics (stores are CU scope only if they overwrite the whole cache line,
1170 * atomics are always device scope, GL1 is always bypassed):
1171 * !GLC && !DLC && !SLC means CU scope <== use for normal stores with CU scope
1172 * GLC && !DLC && !SLC means device scope <== use for normal stores with device scope
1173 * !GLC && DLC && !SLC means CU scope, GL2 non-coherent bypass
1174 * GLC && DLC && !SLC means device scope, GL2 non-coherent bypass
1175 * !GLC && !DLC && SLC means CU scope, GL2 non-temporal (stream) <== use for non-temporal stores with CU scope
1176 * GLC && !DLC && SLC means device scope, GL2 non-temporal (stream) <== use for non-temporal stores with device scope
1177 * !GLC && DLC && SLC means CU scope, GL2 coherent bypass (noalloc)
1178 * GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc)
1179 *
1180 * "stream" allows write combining in GL2. "coherent bypass" doesn't.
1181 * "non-coherent bypass" doesn't guarantee ordering with any coherent stores.
1182 */
1183 if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC))
1184 result.value |= ac_glc | (access & ACCESS_TYPE_LOAD ? ac_dlc : 0);
1185
1186 if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
1187 result.value |= ac_slc;
1188 } else {
1189 /* GFX6-GFX9:
1190 *
1191 * VMEM loads:
1192 * !GLC && !SLC means CU scope
1193 * GLC && !SLC means (GFX6: device scope, GFX7-9: device scope [*])
1194 * !GLC && SLC means (GFX6: CU scope, GFX7: device scope, GFX8-9: CU scope), GL2 non-temporal (stream)
1195 * GLC && SLC means device scope, GL2 non-temporal (stream)
1196 *
1197 * VMEM stores (atomics don't have [*]):
1198 * !GLC && !SLC means (GFX6: CU scope, GFX7-9: device scope [*])
1199 * GLC && !SLC means (GFX6-7: device scope, GFX8-9: device scope [*])
1200 * !GLC && SLC means (GFX6: CU scope, GFX7-9: device scope [*]), GL2 non-temporal (stream)
1201 * GLC && SLC means device scope, GL2 non-temporal (stream)
1202 *
1203 * [*] data can be cached in GL1 for future CU scope
1204 *
1205 * SMEM loads:
1206 * GLC means device scope (available on GFX8+)
1207 */
1208 if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC)) {
1209 /* SMEM doesn't support the device scope on GFX6-7. */
1210 assert(info->gfx_level >= GFX8 || !(access & ACCESS_TYPE_SMEM));
1211 result.value |= ac_glc;
1212 }
1213
1214 if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
1215 result.value |= ac_slc;
1216
1217 /* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All store opcodes not
1218 * aligned to a dword are affected.
1219 */
1220 if (info->gfx_level == GFX6 && access & ACCESS_MAY_STORE_SUBDWORD)
1221 result.value |= ac_glc;
1222 }
1223
1224 if (access & ACCESS_IS_SWIZZLED_AMD)
1225 result.value |= ac_swizzled;
1226
1227 return result;
1228 }
1229
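/* Bit positions of the per-vertex edge flags in the packed NGG primitive
 * export payload (presumably one flag per triangle vertex, placed above each
 * vertex-index field).
 */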
1230 unsigned ac_get_all_edge_flag_bits(void)
1231 {
1232 /* This will be extended in the future. */
1233 return (1u << 9) | (1u << 19) | (1u << 29);
1234 }
1235
1236 /**
1237 * Returns a unique index for a per-patch semantic name and index. The index
1238 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
1239 * can be calculated.
1240 */
1241 unsigned
1242 ac_shader_io_get_unique_index_patch(unsigned semantic)
1243 {
1244 switch (semantic) {
1245 case VARYING_SLOT_TESS_LEVEL_OUTER:
1246 return 0;
1247 case VARYING_SLOT_TESS_LEVEL_INNER:
1248 return 1;
1249 default:
1250 if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
1251 return 2 + (semantic - VARYING_SLOT_PATCH0);
1252
1253 assert(!"invalid semantic");
1254 return 0;
1255 }
1256 }
1257