/*
 * Copyright © 2017 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_gpu_info.h"
#include "ac_shader_util.h"
#include "ac_debug.h"
#include "ac_surface.h"
#include "ac_fake_hw_db.h"
#include "ac_linux_drm.h"

#include "addrlib/src/amdgpu_asic_addr.h"
#include "sid.h"
#include "util/macros.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "util/os_misc.h"
#include "util/bitset.h"

#include <stdio.h>
#include <ctype.h>
#include <inttypes.h>

#define AMDGPU_MI100_RANGE 0x32, 0x3C
#define AMDGPU_MI200_RANGE 0x3C, 0x46
#define AMDGPU_GFX940_RANGE 0x46, 0xFF

#define ASICREV_IS_MI100(r) ASICREV_IS(r, MI100)
#define ASICREV_IS_MI200(r) ASICREV_IS(r, MI200)
#define ASICREV_IS_GFX940(r) ASICREV_IS(r, GFX940)

#ifdef _WIN32
#define DRM_CAP_ADDFB2_MODIFIERS 0x10
#define DRM_CAP_SYNCOBJ 0x13
#define DRM_CAP_SYNCOBJ_TIMELINE 0x14
#define AMDGPU_GEM_DOMAIN_GTT 0x2
#define AMDGPU_GEM_DOMAIN_VRAM 0x4
#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0)
#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10)
#define AMDGPU_HW_IP_GFX 0
#define AMDGPU_HW_IP_COMPUTE 1
#define AMDGPU_HW_IP_DMA 2
#define AMDGPU_HW_IP_UVD 3
#define AMDGPU_HW_IP_VCE 4
#define AMDGPU_HW_IP_UVD_ENC 5
#define AMDGPU_HW_IP_VCN_DEC 6
#define AMDGPU_HW_IP_VCN_ENC 7
#define AMDGPU_HW_IP_VCN_JPEG 8
#define AMDGPU_HW_IP_VPE 9
#define AMDGPU_IDS_FLAGS_FUSION 0x1
#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2
#define AMDGPU_IDS_FLAGS_TMZ 0x4
#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8
#define AMDGPU_INFO_FW_VCE 0x1
#define AMDGPU_INFO_FW_UVD 0x2
#define AMDGPU_INFO_FW_GFX_ME 0x04
#define AMDGPU_INFO_FW_GFX_PFP 0x05
#define AMDGPU_INFO_FW_GFX_CE 0x06
#define AMDGPU_INFO_FW_VCN 0x0e
#define AMDGPU_INFO_DEV_INFO 0x16
#define AMDGPU_INFO_MEMORY 0x19
#define AMDGPU_INFO_VIDEO_CAPS_DECODE 0
#define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1
#define AMDGPU_INFO_FW_GFX_MEC 0x08
#define AMDGPU_INFO_MAX_IBS 0x22

#define AMDGPU_VRAM_TYPE_UNKNOWN 0
#define AMDGPU_VRAM_TYPE_GDDR1 1
#define AMDGPU_VRAM_TYPE_DDR2 2
#define AMDGPU_VRAM_TYPE_GDDR3 3
#define AMDGPU_VRAM_TYPE_GDDR4 4
#define AMDGPU_VRAM_TYPE_GDDR5 5
#define AMDGPU_VRAM_TYPE_HBM 6
#define AMDGPU_VRAM_TYPE_DDR3 7
#define AMDGPU_VRAM_TYPE_DDR4 8
#define AMDGPU_VRAM_TYPE_GDDR6 9
#define AMDGPU_VRAM_TYPE_DDR5 10
#define AMDGPU_VRAM_TYPE_LPDDR4 11
#define AMDGPU_VRAM_TYPE_LPDDR5 12

#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2 0
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4 1
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1 2
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC 3
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC 4
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG 5
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9 6
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1 7
#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT 8

struct drm_amdgpu_heap_info {
   uint64_t total_heap_size;
};
struct drm_amdgpu_memory_info {
   struct drm_amdgpu_heap_info vram;
   struct drm_amdgpu_heap_info cpu_accessible_vram;
   struct drm_amdgpu_heap_info gtt;
};
struct drm_amdgpu_info_device {
   /** PCI Device ID */
   uint32_t device_id;
   /** Internal chip revision: A0, A1, etc. */
   uint32_t chip_rev;
   uint32_t external_rev;
   /** Revision id in PCI Config space */
   uint32_t pci_rev;
   uint32_t family;
   uint32_t num_shader_engines;
   uint32_t num_shader_arrays_per_engine;
   /* in KHz */
   uint32_t gpu_counter_freq;
   uint64_t max_engine_clock;
   uint64_t max_memory_clock;
   /* cu information */
   uint32_t cu_active_number;
   /* NOTE: cu_ao_mask is INVALID, DON'T use it */
   uint32_t cu_ao_mask;
   uint32_t cu_bitmap[4][4];
   /** Render backend pipe mask. One render backend is CB+DB. */
   uint32_t enabled_rb_pipes_mask;
   uint32_t num_rb_pipes;
   uint32_t num_hw_gfx_contexts;
   /* PCIe version (the smaller of the GPU and the CPU/motherboard) */
   uint32_t pcie_gen;
   uint64_t ids_flags;
   /** Starting virtual address for UMDs. */
   uint64_t virtual_address_offset;
   /** The maximum virtual address */
   uint64_t virtual_address_max;
   /** Required alignment of virtual addresses. */
   uint32_t virtual_address_alignment;
   /** Page table entry - fragment size */
   uint32_t pte_fragment_size;
   uint32_t gart_page_size;
   /** constant engine ram size*/
   uint32_t ce_ram_size;
   /** video memory type info*/
   uint32_t vram_type;
   /** video memory bit width*/
   uint32_t vram_bit_width;
   /* vce harvesting instance */
   uint32_t vce_harvest_config;
   /* gfx double offchip LDS buffers */
   uint32_t gc_double_offchip_lds_buf;
   /* NGG Primitive Buffer */
   uint64_t prim_buf_gpu_addr;
   /* NGG Position Buffer */
   uint64_t pos_buf_gpu_addr;
   /* NGG Control Sideband */
   uint64_t cntl_sb_buf_gpu_addr;
   /* NGG Parameter Cache */
   uint64_t param_buf_gpu_addr;
   uint32_t prim_buf_size;
   uint32_t pos_buf_size;
   uint32_t cntl_sb_buf_size;
   uint32_t param_buf_size;
   /* wavefront size*/
   uint32_t wave_front_size;
   /* shader visible vgprs*/
   uint32_t num_shader_visible_vgprs;
   /* CU per shader array*/
   uint32_t num_cu_per_sh;
   /* number of tcc blocks*/
   uint32_t num_tcc_blocks;
   /* gs vgt table depth*/
   uint32_t gs_vgt_table_depth;
   /* gs primitive buffer depth*/
   uint32_t gs_prim_buffer_depth;
   /* max gs wavefront per vgt*/
   uint32_t max_gs_waves_per_vgt;
   /* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */
   uint32_t pcie_num_lanes;
   /* always on cu bitmap */
   uint32_t cu_ao_bitmap[4][4];
   /** Starting high virtual address for UMDs. */
   uint64_t high_va_offset;
   /** The maximum high virtual address */
   uint64_t high_va_max;
   /* gfx10 pa_sc_tile_steering_override */
   uint32_t pa_sc_tile_steering_override;
   /* disabled TCCs */
   uint64_t tcc_disabled_mask;
   uint64_t min_engine_clock;
   uint64_t min_memory_clock;
   /* The following fields are only set on gfx11+, older chips set 0. */
   uint32_t tcp_cache_size; /* AKA GL0, VMEM cache */
   uint32_t num_sqc_per_wgp;
   uint32_t sqc_data_cache_size; /* AKA SMEM cache */
   uint32_t sqc_inst_cache_size;
   uint32_t gl1c_cache_size;
   uint32_t gl2c_cache_size;
   uint64_t mall_size; /* AKA infinity cache */
   /* high 32 bits of the rb pipes mask */
   uint32_t enabled_rb_pipes_mask_hi;
   /* shadow area size for gfx11 */
   uint32_t shadow_size;
   /* shadow area base virtual alignment for gfx11 */
   uint32_t shadow_alignment;
   /* context save area size for gfx11 */
   uint32_t csa_size;
   /* context save area base virtual alignment for gfx11 */
   uint32_t csa_alignment;
};
struct drm_amdgpu_info_hw_ip {
   uint32_t hw_ip_version_major;
   uint32_t hw_ip_version_minor;
   uint32_t ib_start_alignment;
   uint32_t ib_size_alignment;
   uint32_t available_rings;
   uint32_t ip_discovery_version;
};

struct drm_amdgpu_info_uq_fw_areas_gfx {
   uint32_t shadow_size;
   uint32_t shadow_alignment;
   uint32_t csa_size;
   uint32_t csa_alignment;
};

struct drm_amdgpu_info_uq_fw_areas {
   union {
      struct drm_amdgpu_info_uq_fw_areas_gfx gfx;
   };
};

typedef struct _drmPciBusInfo {
   uint16_t domain;
   uint8_t bus;
   uint8_t dev;
   uint8_t func;
} drmPciBusInfo, *drmPciBusInfoPtr;
typedef struct _drmDevice {
   union {
      drmPciBusInfoPtr pci;
   } businfo;
} drmDevice, *drmDevicePtr;
enum amdgpu_sw_info {
   amdgpu_sw_info_address32_hi = 0,
};
struct amdgpu_bo_alloc_request {
   uint64_t alloc_size;
   uint64_t phys_alignment;
   uint32_t preferred_heap;
   uint64_t flags;
};

struct amdgpu_gpu_info {
   uint32_t asic_id;
   uint32_t chip_external_rev;
   uint32_t family_id;
   uint64_t ids_flags;
   uint64_t max_engine_clk;
   uint64_t max_memory_clk;
   uint32_t num_shader_engines;
   uint32_t num_shader_arrays_per_engine;
   uint32_t rb_pipes;
   uint32_t enabled_rb_pipes_mask;
   uint32_t gpu_counter_freq;
   uint32_t mc_arb_ramcfg;
   uint32_t gb_addr_cfg;
   uint32_t gb_tile_mode[32];
   uint32_t gb_macro_tile_mode[16];
   uint32_t cu_bitmap[4][4];
   uint32_t vram_type;
   uint32_t vram_bit_width;
   uint32_t ce_ram_size;
   uint32_t vce_harvest_config;
   uint32_t pci_rev_id;
};
static int drmGetCap(int fd, uint64_t capability, uint64_t *value)
{
   return -EINVAL;
}
static void drmFreeDevice(drmDevicePtr *device)
{
}
static int drmGetDevice2(int fd, uint32_t flags, drmDevicePtr *device)
{
   return -ENODEV;
}
static intptr_t readlink(const char *path, char *buf, size_t bufsiz)
{
   return -1;
}
static char *
drmGetFormatModifierName(uint64_t modifier)
{
   return NULL;
}
#else
#include "drm-uapi/amdgpu_drm.h"
#include <amdgpu.h>
#include <xf86drm.h>
#include <unistd.h>
#endif

#define CIK_TILE_MODE_COLOR_2D 14

static bool has_timeline_syncobj(int fd)
{
   uint64_t value;
   if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
      return false;
   return value ? true : false;
}

static bool has_modifiers(int fd)
{
   uint64_t value;
   if (drmGetCap(fd, DRM_CAP_ADDFB2_MODIFIERS, &value))
      return false;
   return value ? true : false;
}

static uint64_t fix_vram_size(uint64_t size)
{
   /* The VRAM size is underreported, so we need to fix it, because
    * it's used to compute the number of memory modules for harvesting.
    */
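   /* Worked example, for illustration only: a board that reports 16368 MB of
    * VRAM is rounded up to align64(16368 MB, 256 MB) = 16384 MB.
    */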
   return align64(size, 256 * 1024 * 1024);
}

static bool
has_tmz_support(ac_drm_device *dev, struct radeon_info *info, uint32_t ids_flags)
{
   struct amdgpu_bo_alloc_request request = {0};
   int r;
   ac_drm_bo bo;

   if (ids_flags & AMDGPU_IDS_FLAGS_TMZ)
      return true;

   /* AMDGPU_IDS_FLAGS_TMZ is supported starting from drm_minor 40 */
   if (info->drm_minor >= 40)
      return false;

   /* Find out ourselves if TMZ is enabled */
   if (info->gfx_level < GFX9)
      return false;

   if (info->drm_minor < 36)
      return false;

   request.alloc_size = 256;
   request.phys_alignment = 1024;
   request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
   request.flags = AMDGPU_GEM_CREATE_ENCRYPTED;
   r = ac_drm_bo_alloc(dev, &request, &bo);
   if (r)
      return false;
   ac_drm_bo_free(dev, bo);
   return true;
}

static void set_custom_cu_en_mask(struct radeon_info *info)
{
   info->spi_cu_en = ~0;

   const char *cu_env_var = os_get_option("AMD_CU_MASK");
   if (!cu_env_var)
      return;

   int size = strlen(cu_env_var);
   char *str = alloca(size + 1);
   memset(str, 0, size + 1);

   size = 0;

   /* Strip whitespace. */
   for (unsigned src = 0; cu_env_var[src]; src++) {
      if (cu_env_var[src] != ' ' && cu_env_var[src] != '\t' &&
          cu_env_var[src] != '\n' && cu_env_var[src] != '\r') {
         str[size++] = cu_env_var[src];
      }
   }

   /* The following syntax is used, all whitespace is ignored:
    *   ID = [0-9][0-9]*                          ex. base 10 numbers
    *   ID_list = (ID | ID-ID)[, (ID | ID-ID)]*   ex. 0,2-4,7
    *   CU_list = 0x[0-F]* | ID_list              ex. 0x337F OR 0,2-4,7
    *   AMD_CU_MASK = CU_list
    *
    * It's a CU mask within a shader array. It's applied to all shader arrays.
    */
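   /* Worked examples, for illustration only:
    *   AMD_CU_MASK=0x337F   -> spi_cu_en = 0x337F (CUs 0-6, 8-9, 12-13)
    *   AMD_CU_MASK=0,2-4,7  -> spi_cu_en = 0x9D   (CUs 0, 2, 3, 4, 7)
    */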
   bool is_good_form = true;
   uint32_t spi_cu_en = 0;

   if (size > 2 && str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) {
      str += 2;
      size -= 2;

      for (unsigned i = 0; i < size; i++)
         is_good_form &= isxdigit(str[i]) != 0;

      if (!is_good_form) {
         fprintf(stderr, "amd: invalid AMD_CU_MASK: ill-formed hex value\n");
      } else {
         spi_cu_en = strtol(str, NULL, 16);
      }
   } else {
      /* Parse ID_list. */
      long first = 0, last = -1;

      if (!isdigit(*str)) {
         is_good_form = false;
      } else {
         while (*str) {
            bool comma = false;

            if (isdigit(*str)) {
               first = last = strtol(str, &str, 10);
            } else if (*str == '-') {
               str++;
               /* Parse a digit after a dash. */
               if (isdigit(*str)) {
                  last = strtol(str, &str, 10);
               } else {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: expected a digit after -\n");
                  is_good_form = false;
                  break;
               }
            } else if (*str == ',') {
               comma = true;
               str++;
               if (!isdigit(*str)) {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: expected a digit after ,\n");
                  is_good_form = false;
                  break;
               }
            }

            if (comma || !*str) {
               if (first > last) {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: range not increasing (%li, %li)\n", first, last);
                  is_good_form = false;
                  break;
               }
               if (last > 31) {
                  fprintf(stderr, "amd: invalid AMD_CU_MASK: index too large (%li)\n", last);
                  is_good_form = false;
                  break;
               }

               spi_cu_en |= BITFIELD_RANGE(first, last - first + 1);
               last = -1;
            }
         }
      }
   }

   /* The mask is parsed. Now assign bits to CUs. */
   if (is_good_form) {
      bool error = false;

      /* Clear bits that have no effect. */
      spi_cu_en &= BITFIELD_MASK(info->max_good_cu_per_sa);

      if (!spi_cu_en) {
         fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU in each SA must be enabled\n");
         error = true;
      }

      if (info->has_graphics) {
         uint32_t min_full_cu_mask = BITFIELD_MASK(info->min_good_cu_per_sa);

         /* The hw ignores all non-compute CU masks if any of them is 0. Disallow that. */
         if ((spi_cu_en & min_full_cu_mask) == 0) {
            fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be "
                            "enabled (SPI limitation)\n", min_full_cu_mask);
            error = true;
         }

         /* We usually disable 1 or 2 CUs for VS and GS, which means at least 1 other CU
          * must be enabled.
          */
         uint32_t cu_mask_ge, unused;
         ac_compute_late_alloc(info, false, false, false, &unused, &cu_mask_ge);
         cu_mask_ge &= min_full_cu_mask;

         if ((spi_cu_en & cu_mask_ge) == 0) {
            fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be "
                            "enabled (late alloc constraint for GE)\n", cu_mask_ge);
            error = true;
         }

         if ((min_full_cu_mask & spi_cu_en & ~cu_mask_ge) == 0) {
            fprintf(stderr, "amd: invalid AMD_CU_MASK: at least 1 CU from 0x%x per SA must be "
                            "enabled (late alloc constraint for PS)\n",
                    min_full_cu_mask & ~cu_mask_ge);
            error = true;
         }
      }

      if (!error) {
         info->spi_cu_en = spi_cu_en;
         info->spi_cu_en_has_effect = spi_cu_en & BITFIELD_MASK(info->max_good_cu_per_sa);
      }
   }
}

static bool ac_query_pci_bus_info(int fd, struct radeon_info *info)
{
   drmDevicePtr devinfo;

   /* Get PCI info. */
   int r = drmGetDevice2(fd, 0, &devinfo);
   if (r) {
      fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
      info->pci.valid = false;
      return false;
   }
   info->pci.domain = devinfo->businfo.pci->domain;
   info->pci.bus = devinfo->businfo.pci->bus;
   info->pci.dev = devinfo->businfo.pci->dev;
   info->pci.func = devinfo->businfo.pci->func;
   info->pci.valid = true;

   drmFreeDevice(&devinfo);
   return true;
}

static void handle_env_var_force_family(struct radeon_info *info)
{
   const char *family = debug_get_option("AMD_FORCE_FAMILY", NULL);

   if (!family)
      return;

   for (size_t i = 0; i < ARRAY_SIZE(ac_fake_hw_db); i++) {
      if (!strcmp(family, ac_fake_hw_db[i].name)) {
         get_radeon_info(info, &ac_fake_hw_db[i]);
         info->name = "NOOP";
         info->family_overridden = true;
         info->chip_rev = 1;
         return;
      }
   }

   fprintf(stderr, "radeonsi: Unknown family: %s\n", family);
   exit(1);
}

bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
                       bool require_pci_bus_info)
{
   struct amdgpu_gpu_info amdinfo;
   struct drm_amdgpu_info_device device_info = {0};
   uint32_t vidip_fw_version = 0, vidip_fw_feature = 0;
   uint32_t num_instances = 0;
   int r, i, j;
   ac_drm_device *dev = dev_p;

   STATIC_ASSERT(AMDGPU_HW_IP_GFX == AMD_IP_GFX);
   STATIC_ASSERT(AMDGPU_HW_IP_COMPUTE == AMD_IP_COMPUTE);
   STATIC_ASSERT(AMDGPU_HW_IP_DMA == AMD_IP_SDMA);
   STATIC_ASSERT(AMDGPU_HW_IP_UVD == AMD_IP_UVD);
   STATIC_ASSERT(AMDGPU_HW_IP_VCE == AMD_IP_VCE);
   STATIC_ASSERT(AMDGPU_HW_IP_UVD_ENC == AMD_IP_UVD_ENC);
   STATIC_ASSERT(AMDGPU_HW_IP_VCN_DEC == AMD_IP_VCN_DEC);
   STATIC_ASSERT(AMDGPU_HW_IP_VCN_ENC == AMD_IP_VCN_ENC);
   STATIC_ASSERT(AMDGPU_HW_IP_VCN_JPEG == AMD_IP_VCN_JPEG);
   STATIC_ASSERT(AMDGPU_HW_IP_VPE == AMD_IP_VPE);

   handle_env_var_force_family(info);

   if (!ac_query_pci_bus_info(fd, info)) {
      if (require_pci_bus_info)
         return false;
   }

   assert(info->drm_major == 3);
   info->is_amdgpu = true;

   if (info->drm_minor < 27) {
      fprintf(stderr, "amdgpu: DRM version is %u.%u.%u, but this driver is "
                      "only compatible with 3.27.0 (kernel 4.20+) or later.\n",
              info->drm_major, info->drm_minor, info->drm_patchlevel);
      return false;
   }

   uint64_t cap;
   r = drmGetCap(fd, DRM_CAP_SYNCOBJ, &cap);
   if (r != 0 || cap == 0) {
      fprintf(stderr, "amdgpu: syncobj support is missing but is required.\n");
      return false;
   }

   /* Query hardware and driver information. */
   r = ac_drm_query_gpu_info(dev, &amdinfo);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_gpu_info failed.\n");
      return false;
   }

   r = ac_drm_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_info(dev_info) failed.\n");
      return false;
   }

   for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++) {
      struct drm_amdgpu_info_hw_ip ip_info = {0};

      r = ac_drm_query_hw_ip_info(dev, ip_type, 0, &ip_info);
      if (r || !ip_info.available_rings)
         continue;

      /* Gfx6-8 don't set ip_discovery_version. */
      if (info->drm_minor >= 48 && ip_info.ip_discovery_version) {
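         /* For illustration: ip_discovery_version 0x0a0103 decodes to
          * IP version 10.1.3 (major 0x0a, minor 0x01, revision 0x03).
          */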
         info->ip[ip_type].ver_major = (ip_info.ip_discovery_version >> 16) & 0xff;
         info->ip[ip_type].ver_minor = (ip_info.ip_discovery_version >> 8) & 0xff;
         info->ip[ip_type].ver_rev = ip_info.ip_discovery_version & 0xff;
      } else {
         info->ip[ip_type].ver_major = ip_info.hw_ip_version_major;
         info->ip[ip_type].ver_minor = ip_info.hw_ip_version_minor;

         /* Fix incorrect IP versions reported by the kernel. */
         if (device_info.family == FAMILY_NV &&
             (ASICREV_IS(device_info.external_rev, NAVI10) ||
              ASICREV_IS(device_info.external_rev, NAVI12) ||
              ASICREV_IS(device_info.external_rev, NAVI14)))
            info->ip[AMD_IP_GFX].ver_minor = info->ip[AMD_IP_COMPUTE].ver_minor = 1;
         else if (device_info.family == FAMILY_NV ||
                  device_info.family == FAMILY_VGH ||
                  device_info.family == FAMILY_RMB ||
                  device_info.family == FAMILY_RPL ||
                  device_info.family == FAMILY_MDN)
            info->ip[AMD_IP_GFX].ver_minor = info->ip[AMD_IP_COMPUTE].ver_minor = 3;
      }
      info->ip[ip_type].num_queues = util_bitcount(ip_info.available_rings);

      /* query ip count */
      r = ac_drm_query_hw_ip_count(dev, ip_type, &num_instances);
      if (!r)
         info->ip[ip_type].num_instances = num_instances;

      /* According to the kernel, only SDMA and VPE require 256B alignment, but use it
       * for all queues because the kernel reports wrong limits for some of the queues.
       * This is only space allocation alignment, so it's OK to keep it like this even
       * when it's greater than what the queues require.
       */
      info->ip[ip_type].ib_alignment = MAX3(ip_info.ib_start_alignment,
                                            ip_info.ib_size_alignment, 256);
   }

   /* Set the IB padding masks (dword alignment minus 1). */
   info->ip[AMD_IP_GFX].ib_pad_dw_mask = 0x7;
   info->ip[AMD_IP_COMPUTE].ib_pad_dw_mask = 0x7;
   info->ip[AMD_IP_SDMA].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_UVD].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_VCE].ib_pad_dw_mask = 0x3f;
   info->ip[AMD_IP_UVD_ENC].ib_pad_dw_mask = 0x3f;
   info->ip[AMD_IP_VCN_DEC].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_VCN_ENC].ib_pad_dw_mask = 0x3f;
   info->ip[AMD_IP_VCN_JPEG].ib_pad_dw_mask = 0xf;
   info->ip[AMD_IP_VPE].ib_pad_dw_mask = 0xf;
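   /* For illustration (an assumed winsys usage, not code in this file): a
    * 30-dword gfx IB with mask 0x7 would need (-30) & 0x7 = 2 padding dwords
    * to reach the next 8-dword multiple, 32.
    */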

   /* Only require gfx or compute. */
   if (!info->ip[AMD_IP_GFX].num_queues && !info->ip[AMD_IP_COMPUTE].num_queues) {
      fprintf(stderr, "amdgpu: failed to find gfx or compute.\n");
      return false;
   }

   r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version,
                                     &info->me_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(me) failed.\n");
      return false;
   }

   r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_MEC, 0, 0, &info->mec_fw_version,
                                     &info->mec_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(mec) failed.\n");
      return false;
   }

   r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version,
                                     &info->pfp_fw_feature);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(pfp) failed.\n");
      return false;
   }

   if (info->ip[AMD_IP_VCN_DEC].num_queues || info->ip[AMD_IP_VCN_UNIFIED].num_queues) {
      r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_VCN, 0, 0, &vidip_fw_version, &vidip_fw_feature);
      if (r) {
         fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(vcn) failed.\n");
         return false;
      } else {
         info->vcn_dec_version = (vidip_fw_version & 0x0F000000) >> 24;
         info->vcn_enc_major_version = (vidip_fw_version & 0x00F00000) >> 20;
         info->vcn_enc_minor_version = (vidip_fw_version & 0x000FF000) >> 12;
      }
   } else {
      if (info->ip[AMD_IP_VCE].num_queues) {
         r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vidip_fw_version, &vidip_fw_feature);
         if (r) {
            fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(vce) failed.\n");
            return false;
         } else
            info->vce_fw_version = vidip_fw_version;
      }

      if (info->ip[AMD_IP_UVD].num_queues) {
         r = ac_drm_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &vidip_fw_version, &vidip_fw_feature);
         if (r) {
            fprintf(stderr, "amdgpu: ac_drm_query_firmware_version(uvd) failed.\n");
            return false;
         } else
            info->uvd_fw_version = vidip_fw_version;
      }
   }

   r = ac_drm_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_sw_info(address32_hi) failed.\n");
      return false;
   }

   struct drm_amdgpu_memory_info meminfo = {0};

   r = ac_drm_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
   if (r) {
      fprintf(stderr, "amdgpu: ac_drm_query_info(memory) failed.\n");
      return false;
   }

   /* Note: usable_heap_size values can be random and can't be relied on. */
   info->gart_size_kb = DIV_ROUND_UP(meminfo.gtt.total_heap_size, 1024);
   info->vram_size_kb = DIV_ROUND_UP(fix_vram_size(meminfo.vram.total_heap_size), 1024);
   info->vram_vis_size_kb = DIV_ROUND_UP(meminfo.cpu_accessible_vram.total_heap_size, 1024);

   if (info->drm_minor >= 41) {
      ac_drm_query_video_caps_info(dev, AMDGPU_INFO_VIDEO_CAPS_DECODE,
                                   sizeof(info->dec_caps), &(info->dec_caps));
      ac_drm_query_video_caps_info(dev, AMDGPU_INFO_VIDEO_CAPS_ENCODE,
                                   sizeof(info->enc_caps), &(info->enc_caps));
   }

   /* Add some margin of error, though this shouldn't be needed in theory. */
   info->all_vram_visible = info->vram_size_kb * 0.9 < info->vram_vis_size_kb;

   /* Set chip identification. */
   info->pci_id = device_info.device_id;
   info->pci_rev_id = device_info.pci_rev;
   info->vce_harvest_config = device_info.vce_harvest_config;

#define identify_chip2(asic, chipname) \
   if (ASICREV_IS(device_info.external_rev, asic)) { \
      info->family = CHIP_##chipname; \
      info->name = #chipname; \
   }
#define identify_chip(chipname) identify_chip2(chipname, chipname)

   if (!info->family_overridden) {
      switch (device_info.family) {
      case FAMILY_SI:
         identify_chip(TAHITI);
         identify_chip(PITCAIRN);
         identify_chip2(CAPEVERDE, VERDE);
         identify_chip(OLAND);
         identify_chip(HAINAN);
         break;
      case FAMILY_CI:
         identify_chip(BONAIRE);
         identify_chip(HAWAII);
         break;
      case FAMILY_KV:
         identify_chip2(SPECTRE, KAVERI);
         identify_chip2(SPOOKY, KAVERI);
         identify_chip2(KALINDI, KABINI);
         identify_chip2(GODAVARI, KABINI);
         break;
      case FAMILY_VI:
         identify_chip(ICELAND);
         identify_chip(TONGA);
         identify_chip(FIJI);
         identify_chip(POLARIS10);
         identify_chip(POLARIS11);
         identify_chip(POLARIS12);
         identify_chip(VEGAM);
         break;
      case FAMILY_CZ:
         identify_chip(CARRIZO);
         identify_chip(STONEY);
         break;
      case FAMILY_AI:
         identify_chip(VEGA10);
         identify_chip(VEGA12);
         identify_chip(VEGA20);
         identify_chip(MI100);
         identify_chip(MI200);
         identify_chip(GFX940);
         break;
      case FAMILY_RV:
         identify_chip(RAVEN);
         identify_chip(RAVEN2);
         identify_chip(RENOIR);
         break;
      case FAMILY_NV:
         identify_chip(NAVI10);
         identify_chip(NAVI12);
         identify_chip(NAVI14);
         identify_chip(NAVI21);
         identify_chip(NAVI22);
         identify_chip(NAVI23);
         identify_chip(NAVI24);
         break;
      case FAMILY_VGH:
         identify_chip(VANGOGH);
         break;
      case FAMILY_RMB:
         identify_chip(REMBRANDT);
         break;
      case FAMILY_RPL:
         identify_chip2(RAPHAEL, RAPHAEL_MENDOCINO);
         break;
      case FAMILY_MDN:
         identify_chip2(MENDOCINO, RAPHAEL_MENDOCINO);
         break;
      case FAMILY_NV3:
         identify_chip(NAVI31);
         identify_chip(NAVI32);
         identify_chip(NAVI33);
         break;
      case FAMILY_PHX:
         identify_chip2(PHOENIX1, PHOENIX);
         identify_chip(PHOENIX2);
         identify_chip2(HAWK_POINT1, PHOENIX);
         identify_chip2(HAWK_POINT2, PHOENIX2);
         break;
      case FAMILY_GFX1150:
         identify_chip(GFX1150);
         identify_chip(GFX1151);
         identify_chip(GFX1152);
         identify_chip(GFX1153);
         break;
      case FAMILY_GFX12:
         identify_chip(GFX1200);
         identify_chip(GFX1201);
         break;
      }

      if (info->ip[AMD_IP_GFX].ver_major == 12 && info->ip[AMD_IP_GFX].ver_minor == 0)
         info->gfx_level = GFX12;
      else if (info->ip[AMD_IP_GFX].ver_major == 11 && info->ip[AMD_IP_GFX].ver_minor == 5)
         info->gfx_level = GFX11_5;
      else if (info->ip[AMD_IP_GFX].ver_major == 11 && info->ip[AMD_IP_GFX].ver_minor == 0)
         info->gfx_level = GFX11;
      else if (info->ip[AMD_IP_GFX].ver_major == 10 && info->ip[AMD_IP_GFX].ver_minor == 3)
         info->gfx_level = GFX10_3;
      else if (info->ip[AMD_IP_GFX].ver_major == 10 && info->ip[AMD_IP_GFX].ver_minor == 1)
         info->gfx_level = GFX10;
      else if (info->ip[AMD_IP_GFX].ver_major == 9 || info->ip[AMD_IP_COMPUTE].ver_major == 9)
         info->gfx_level = GFX9;
      else if (info->ip[AMD_IP_GFX].ver_major == 8)
         info->gfx_level = GFX8;
      else if (info->ip[AMD_IP_GFX].ver_major == 7)
         info->gfx_level = GFX7;
      else if (info->ip[AMD_IP_GFX].ver_major == 6)
         info->gfx_level = GFX6;
      else {
         fprintf(stderr, "amdgpu: Unknown gfx version: %u.%u\n",
                 info->ip[AMD_IP_GFX].ver_major, info->ip[AMD_IP_GFX].ver_minor);
         return false;
      }

      info->family_id = device_info.family;
      info->chip_external_rev = device_info.external_rev;
      info->chip_rev = device_info.chip_rev;
      info->marketing_name = ac_drm_get_marketing_name(dev);
      info->is_pro_graphics = info->marketing_name && (strstr(info->marketing_name, "Pro") ||
                                                       strstr(info->marketing_name, "PRO") ||
                                                       strstr(info->marketing_name, "Frontier"));
   }

   if (!info->name) {
      fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n",
              device_info.family, device_info.external_rev);
      return false;
   }

   memset(info->lowercase_name, 0, sizeof(info->lowercase_name));
   for (unsigned i = 0; info->name[i] && i < ARRAY_SIZE(info->lowercase_name) - 1; i++)
      info->lowercase_name[i] = tolower(info->name[i]);

   char proc_fd[64];
   snprintf(proc_fd, sizeof(proc_fd), "/proc/self/fd/%u", fd);
   UNUSED int _result = readlink(proc_fd, info->dev_filename, sizeof(info->dev_filename));

#define VCN_IP_VERSION(mj, mn, rv) (((mj) << 16) | ((mn) << 8) | (rv))
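/* For illustration: VCN_IP_VERSION(3, 0, 33) packs to (3 << 16) | (0 << 8) | 33
 * = 0x030021, which is what the switch cases below match against.
 */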

   for (unsigned i = AMD_IP_VCN_DEC; i <= AMD_IP_VCN_JPEG; ++i) {
      if (!info->ip[i].num_queues)
         continue;

      switch (VCN_IP_VERSION(info->ip[i].ver_major,
                             info->ip[i].ver_minor,
                             info->ip[i].ver_rev)) {
      case VCN_IP_VERSION(1, 0, 0):
         info->vcn_ip_version = VCN_1_0_0;
         break;
      case VCN_IP_VERSION(1, 0, 1):
         info->vcn_ip_version = VCN_1_0_1;
         break;
      case VCN_IP_VERSION(2, 0, 0):
         info->vcn_ip_version = VCN_2_0_0;
         break;
      case VCN_IP_VERSION(2, 0, 2):
         info->vcn_ip_version = VCN_2_0_2;
         break;
      case VCN_IP_VERSION(2, 0, 3):
         info->vcn_ip_version = VCN_2_0_3;
         break;
      case VCN_IP_VERSION(2, 2, 0):
         info->vcn_ip_version = VCN_2_2_0;
         break;
      case VCN_IP_VERSION(2, 5, 0):
         info->vcn_ip_version = VCN_2_5_0;
         break;
      case VCN_IP_VERSION(2, 6, 0):
         info->vcn_ip_version = VCN_2_6_0;
         break;
      case VCN_IP_VERSION(3, 0, 0):
         /* The Navi24 version needs to be revised if the kernel falls back to
          * the older path with the default version 3.0.0, since Navi24 has a
          * different feature set from the rest of the VCN3 family. */
         info->vcn_ip_version = (info->family != CHIP_NAVI24) ? VCN_3_0_0 : VCN_3_0_33;
         break;
      case VCN_IP_VERSION(3, 0, 2):
         info->vcn_ip_version = VCN_3_0_2;
         break;
      case VCN_IP_VERSION(3, 0, 16):
         info->vcn_ip_version = VCN_3_0_16;
         break;
      case VCN_IP_VERSION(3, 0, 33):
         info->vcn_ip_version = VCN_3_0_33;
         break;
      case VCN_IP_VERSION(3, 1, 1):
         info->vcn_ip_version = VCN_3_1_1;
         break;
      case VCN_IP_VERSION(3, 1, 2):
         info->vcn_ip_version = VCN_3_1_2;
         break;
      case VCN_IP_VERSION(4, 0, 0):
         info->vcn_ip_version = VCN_4_0_0;
         break;
      case VCN_IP_VERSION(4, 0, 2):
         info->vcn_ip_version = VCN_4_0_2;
         break;
      case VCN_IP_VERSION(4, 0, 3):
         info->vcn_ip_version = VCN_4_0_3;
         break;
      case VCN_IP_VERSION(4, 0, 4):
         info->vcn_ip_version = VCN_4_0_4;
         break;
      case VCN_IP_VERSION(4, 0, 5):
         info->vcn_ip_version = VCN_4_0_5;
         break;
      case VCN_IP_VERSION(4, 0, 6):
         info->vcn_ip_version = VCN_4_0_6;
         break;
      case VCN_IP_VERSION(5, 0, 0):
         info->vcn_ip_version = VCN_5_0_0;
         break;
      case VCN_IP_VERSION(5, 0, 1):
         info->vcn_ip_version = VCN_5_0_1;
         break;
      default:
         info->vcn_ip_version = VCN_UNKNOWN;
      }
      break;
   }

   /* Set which chips have dedicated VRAM. */
   info->has_dedicated_vram = !(device_info.ids_flags & AMDGPU_IDS_FLAGS_FUSION);

   /* The kernel can split large buffers in VRAM but not in GTT, so large
    * allocations can fail or cause buffer movement failures in the kernel.
    */
   if (info->has_dedicated_vram)
      info->max_heap_size_kb = info->vram_size_kb;
   else
      info->max_heap_size_kb = info->gart_size_kb;

   info->vram_type = device_info.vram_type;
   info->memory_bus_width = device_info.vram_bit_width;

   /* Set which chips have uncached device memory. */
   info->has_l2_uncached = info->gfx_level >= GFX9;

   /* Set hardware information. */
   /* convert the shader/memory clocks from KHz to MHz */
   info->max_gpu_freq_mhz = device_info.max_engine_clock / 1000;
   info->memory_freq_mhz_effective = info->memory_freq_mhz = device_info.max_memory_clock / 1000;
   info->max_tcc_blocks = device_info.num_tcc_blocks;
   info->max_se = device_info.num_shader_engines;
   info->max_sa_per_se = device_info.num_shader_arrays_per_engine;
   info->num_cu_per_sh = device_info.num_cu_per_sh;
   info->enabled_rb_mask = device_info.enabled_rb_pipes_mask;
   if (info->drm_minor >= 52)
      info->enabled_rb_mask |= (uint64_t)device_info.enabled_rb_pipes_mask_hi << 32;

   info->memory_freq_mhz_effective *= ac_memory_ops_per_clock(info->vram_type);

   info->has_userptr = !info->is_virtio;
   info->has_syncobj = true;
   info->has_timeline_syncobj = !info->is_virtio && has_timeline_syncobj(fd);
   info->has_fence_to_handle = true;
   info->has_local_buffers = !info->is_virtio;
   info->has_bo_metadata = true;
   info->has_eqaa_surface_allocator = info->gfx_level < GFX11;
   /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once
    * these faults are mitigated in software.
    */
   info->has_sparse_vm_mappings = info->gfx_level >= GFX7;
   info->has_scheduled_fence_dependency = info->drm_minor >= 28;
   info->has_gang_submit = info->drm_minor >= 49;
   info->has_gpuvm_fault_query = info->drm_minor >= 55;
   info->has_tmz_support = has_tmz_support(dev, info, device_info.ids_flags);
   info->kernel_has_modifiers = has_modifiers(fd);
   info->uses_kernel_cu_mask = false; /* Not implemented in the kernel. */
   info->has_graphics = info->ip[AMD_IP_GFX].num_queues > 0;

   /* On GFX8, the TBA/TMA registers can be configured from the userspace.
    * On GFX9+, they are privileged registers and they need to be configured
    * from the kernel but it's not supported yet.
    */
   info->has_trap_handler_support = info->gfx_level == GFX8;

   info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override;
   info->max_render_backends = device_info.num_rb_pipes;
   /* The value returned by the kernel driver was wrong. */
   if (info->family == CHIP_KAVERI)
      info->max_render_backends = 2;

   info->clock_crystal_freq = device_info.gpu_counter_freq;
   if (!info->clock_crystal_freq) {
      fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
      info->clock_crystal_freq = 1;
   }

   if (info->gfx_level >= GFX10) {
      info->tcc_cache_line_size = info->gfx_level >= GFX12 ? 256 : 128;

      if (info->drm_minor >= 35) {
         info->num_tcc_blocks = info->max_tcc_blocks - util_bitcount64(device_info.tcc_disabled_mask);
      } else {
         /* This is a hack, but it's all we can do without a kernel upgrade. */
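         /* For illustration: a 16 GB card gives 16777216 KB / 524288 = 32
          * blocks, halved to 16 if that exceeds max_tcc_blocks.
          */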
         info->num_tcc_blocks = info->vram_size_kb / (512 * 1024);
         if (info->num_tcc_blocks > info->max_tcc_blocks)
            info->num_tcc_blocks /= 2;
      }
   } else {
      if (!info->has_graphics && info->family >= CHIP_MI200)
         info->tcc_cache_line_size = 128;
      else
         info->tcc_cache_line_size = 64;

      info->num_tcc_blocks = info->max_tcc_blocks;
   }

   info->tcc_rb_non_coherent = info->gfx_level < GFX12 &&
                               !util_is_power_of_two_or_zero(info->num_tcc_blocks) &&
                               info->num_rb != info->num_tcc_blocks;
   info->cp_sdma_ge_use_system_memory_scope = info->gfx_level == GFX12;
   info->cp_dma_use_L2 = info->gfx_level >= GFX7 && !info->cp_sdma_ge_use_system_memory_scope;

   if (info->drm_minor >= 52) {
      info->sqc_inst_cache_size = device_info.sqc_inst_cache_size * 1024;
      info->sqc_scalar_cache_size = device_info.sqc_data_cache_size * 1024;
      info->num_sqc_per_wgp = device_info.num_sqc_per_wgp;
   }

   /* Firmware wrongly reports 0 bytes of MALL being present on Navi33.
    * Work around this by manually computing cache sizes. */
   if (info->gfx_level >= GFX11 && info->drm_minor >= 52 && info->family != CHIP_NAVI33) {
      info->tcp_cache_size = device_info.tcp_cache_size * 1024;
      info->l1_cache_size = device_info.gl1c_cache_size * 1024;
      info->l2_cache_size = device_info.gl2c_cache_size * 1024;
      info->l3_cache_size_mb = DIV_ROUND_UP(device_info.mall_size, 1024 * 1024);
   } else {
      if (info->gfx_level >= GFX11) {
         info->tcp_cache_size = 32768;
         info->l1_cache_size = 256 * 1024;
      } else {
         info->tcp_cache_size = 16384;
         info->l1_cache_size = 128 * 1024;
      }

      if (info->gfx_level >= GFX10_3 && info->has_dedicated_vram) {
         info->l3_cache_size_mb = info->num_tcc_blocks *
                                  (info->family == CHIP_NAVI21 ||
                                   info->family == CHIP_NAVI22 ? 8 : 4);
      }

      switch (info->family) {
      case CHIP_TAHITI:
      case CHIP_PITCAIRN:
      case CHIP_OLAND:
      case CHIP_HAWAII:
      case CHIP_KABINI:
      case CHIP_TONGA:
      case CHIP_STONEY:
      case CHIP_RAVEN2:
         info->l2_cache_size = info->num_tcc_blocks * 64 * 1024;
         break;
      case CHIP_VERDE:
      case CHIP_HAINAN:
      case CHIP_BONAIRE:
      case CHIP_KAVERI:
      case CHIP_ICELAND:
      case CHIP_CARRIZO:
      case CHIP_FIJI:
      case CHIP_POLARIS12:
      case CHIP_VEGAM:
      case CHIP_RAPHAEL_MENDOCINO:
         info->l2_cache_size = info->num_tcc_blocks * 128 * 1024;
         break;
      default:
         info->l2_cache_size = info->num_tcc_blocks * 256 * 1024;
         break;
      case CHIP_REMBRANDT:
      case CHIP_PHOENIX:
         info->l2_cache_size = info->num_tcc_blocks * 512 * 1024;
         break;
      }
   }

   info->mc_arb_ramcfg = amdinfo.mc_arb_ramcfg;
   if (!info->family_overridden)
      info->gb_addr_config = amdinfo.gb_addr_cfg;
   if (info->gfx_level >= GFX9) {
      if (!info->has_graphics && info->family >= CHIP_GFX940)
         info->gb_addr_config = 0;

      info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(info->gb_addr_config);
      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config);
   } else {
      unsigned pipe_config = G_009910_PIPE_CONFIG(amdinfo.gb_tile_mode[CIK_TILE_MODE_COLOR_2D]);
      info->num_tile_pipes = ac_pipe_config_to_num_pipes(pipe_config);
      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config);
   }
   info->r600_has_virtual_memory = true;

   /* LDS is 64KB per CU (4 SIMDs on GFX6-9), which is 16KB per SIMD (usage above
    * 16KB makes some SIMDs unoccupied).
    *
    * GFX10+: LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
    * GFX7+: Workgroups can use up to 64KB.
    * GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
    */
   info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024
                                  : info->gfx_level >= GFX7 ? 64 * 1024
                                                            : 32 * 1024;

   /* lds_encode_granularity is the block size used for encoding registers.
    * lds_alloc_granularity is what the hardware will align the LDS size to.
    */
   info->lds_encode_granularity = info->gfx_level >= GFX7 ? 128 * 4 : 64 * 4;
   info->lds_alloc_granularity = info->gfx_level >= GFX10_3 ? 256 * 4 : info->lds_encode_granularity;
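   /* For illustration: on GFX10_3+, a shader declaring 1 byte of LDS is
    * allocated align(1, 1024) = 1024 bytes, which the register encoding
    * expresses as DIV_ROUND_UP(1024, 512) = 2 blocks of 512 bytes.
    */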

   /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs on
    * GFX6. Some CLEAR_STATE packets also hang the ASIC on the radeon kernel
    * driver, e.g. due to SPI_VS_OUT_CONFIG, so only enable GFX7 CLEAR_STATE
    * on the amdgpu kernel driver.
    */
   info->has_clear_state = info->gfx_level >= GFX7 && info->gfx_level < GFX12;

   info->has_distributed_tess =
      info->gfx_level >= GFX10 || (info->gfx_level >= GFX8 && info->max_se >= 2);

   info->has_dcc_constant_encode =
      info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->gfx_level >= GFX10;

   /* TC-compat HTILE is only available on GFX8-GFX11.5.
    *
    * There are issues with TC-compatible HTILE on Tonga (and Iceland is the same design), and
    * documented bug workarounds don't help. For example, this fails:
    *   piglit/bin/tex-miplevel-selection 'texture()' 2DShadow -auto
    */
   info->has_tc_compatible_htile = info->gfx_level >= GFX8 && info->gfx_level < GFX12 &&
                                   info->family != CHIP_TONGA && info->family != CHIP_ICELAND;

   info->has_etc_support = info->family == CHIP_STONEY || info->family == CHIP_VEGA10 ||
                           info->family == CHIP_RAVEN || info->family == CHIP_RAVEN2;

   info->has_rbplus = info->family == CHIP_STONEY || info->gfx_level >= GFX9;

   /* Some chips have RB+ registers, but don't support RB+. Those must
    * always disable it.
    */
   info->rbplus_allowed =
      info->has_rbplus &&
      (info->family == CHIP_STONEY || info->family == CHIP_VEGA12 || info->family == CHIP_RAVEN ||
       info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->gfx_level >= GFX10_3);

   info->has_out_of_order_rast =
      info->gfx_level >= GFX8 && info->gfx_level <= GFX9 && info->max_se >= 2;

   /* Whether chips support double rate packed math instructions. */
   info->has_packed_math_16bit = info->gfx_level >= GFX9;

   /* Whether chips support dot product instructions. A subset of these support a smaller
    * instruction encoding which accumulates with the destination.
    */
   info->has_accelerated_dot_product =
      info->family == CHIP_VEGA20 ||
      (info->family >= CHIP_MI100 && info->family != CHIP_NAVI10);

   /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */
   info->has_load_ctx_reg_pkt =
      info->gfx_level >= GFX9 || (info->gfx_level >= GFX8 && info->me_fw_feature >= 41);

   info->cpdma_prefetch_writes_memory = info->gfx_level <= GFX8;

   info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   info->has_tc_compat_zrange_bug = info->gfx_level >= GFX8 && info->gfx_level <= GFX9;

   info->has_small_prim_filter_sample_loc_bug =
      (info->family >= CHIP_POLARIS10 && info->family <= CHIP_POLARIS12) ||
      info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   /* DB_DFSM_CONTROL.POPS_DRAIN_PS_ON_OVERLAP must be enabled for 8 or more coverage or
    * depth/stencil samples with POPS (PAL waMiscPopsMissedOverlap).
    */
   info->has_pops_missed_overlap_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;

   /* GFX6 hw bug when the IBO addr is 0 which causes invalid clamping (underflow).
    * Setting the IB addr to 2 or higher solves this issue.
    */
   info->has_null_index_buffer_clamping_bug = info->gfx_level == GFX6;

   /* Drawing from 0-sized index buffers causes hangs on gfx10. */
   info->has_zero_index_buffer_bug = info->gfx_level == GFX10;

   /* Whether chips are affected by the image load/sample/gather hw bug when
    * DCC is enabled (ie. WRITE_COMPRESS_ENABLE should be 0).
    */
   info->has_image_load_dcc_bug = info->family == CHIP_NAVI23 ||
                                  info->family == CHIP_VANGOGH ||
                                  info->family == CHIP_REMBRANDT;

   /* DB has a bug when ITERATE_256 is set to 1 that can cause a hang. The
    * workaround is to set DECOMPRESS_ON_Z_PLANES to 2 for 4X MSAA D/S images.
    */
   info->has_two_planes_iterate256_bug = info->gfx_level == GFX10;

   /* GFX10+Navi21: NGG->legacy transitions require VGT_FLUSH. */
   info->has_vgt_flush_ngg_legacy_bug = info->gfx_level == GFX10 ||
                                        info->family == CHIP_NAVI21;

   /* First Navi2x chips have a hw bug that doesn't allow to write
    * depth/stencil from a FS for multi-pixel fragments.
    */
   info->has_vrs_ds_export_bug = info->family == CHIP_NAVI21 ||
                                 info->family == CHIP_NAVI22 ||
                                 info->family == CHIP_VANGOGH;

   /* HW bug workaround when CS threadgroups > 256 threads and async compute
    * isn't used, i.e. only one compute job can run at a time. If async
    * compute is possible, the threadgroup size must be limited to 256 threads
    * on all queues to avoid the bug.
    * Only GFX6 and certain GFX7 chips are affected.
    *
    * FIXME: RADV doesn't limit the number of threads for async compute.
    */
   info->has_cs_regalloc_hang_bug = info->gfx_level == GFX6 ||
                                    info->family == CHIP_BONAIRE ||
                                    info->family == CHIP_KABINI;

   /* HW bug workaround with async compute dispatches when threadgroup > 4096.
    * The workaround is to change the "threadgroup" dimension mode to "thread"
    * dimension mode.
    */
   info->has_async_compute_threadgroup_bug = info->family == CHIP_ICELAND ||
                                             info->family == CHIP_TONGA;

   /* GFX7 CP requires 32 bytes alignment for the indirect buffer arguments on
    * the compute queue.
    */
   info->has_async_compute_align32_bug = info->gfx_level == GFX7;

   /* Support for GFX10.3 was added with F32_ME_FEATURE_VERSION_31 but the
    * feature version wasn't bumped.
    */
   info->has_32bit_predication = (info->gfx_level >= GFX10 &&
                                  info->me_fw_feature >= 32) ||
                                 (info->gfx_level == GFX9 &&
                                  info->me_fw_feature >= 52);

   /* Firmware bug with DISPATCH_TASKMESH_INDIRECT_MULTI_ACE packets.
    * On old MEC FW versions, it hangs the GPU when indirect count is zero.
    */
   info->has_taskmesh_indirect0_bug = info->gfx_level == GFX10_3 &&
                                      info->mec_fw_version < 100;

   /* Some GFX10 chips can hang when NGG exports zero vertices and primitives.
    * The workaround is to always export a single degenerate triangle.
    */
   info->has_ngg_fully_culled_bug = info->gfx_level == GFX10;

   /* On newer chips, it is not necessary for NGG shaders to request
    * the allocation of GS space in passthrough mode, when they set
    * PRIMGEN_PASSTHRU_NO_MSG.
    */
   info->has_ngg_passthru_no_msg = info->family >= CHIP_NAVI23;

   info->has_export_conflict_bug = info->gfx_level == GFX11;

   /* The hw starts culling after all exports are finished,
    * not when all waves in an NGG workgroup are finished,
    * and if all primitives are culled, the hw deallocates the attribute ring
    * for the NGG workgroup and reuses it for next one while the previous NGG
    * workgroup might still be issuing attribute stores.
    * When there are 2 NGG workgroups in the system with the same attribute ring address,
    * attributes may be corrupted.
    * The workaround is to issue and wait for attribute stores before the last export.
    */
   info->has_attr_ring_wait_bug = info->gfx_level == GFX11 || info->gfx_level == GFX11_5;

   /* When LLVM is fixed to handle multi-part shaders, this value will depend
    * on the known good versions of LLVM. Until then, enable the equivalent WA
    * in the nir -> llvm backend.
    */
   info->needs_llvm_wait_wa = info->gfx_level == GFX11;

   /* Convert the SDMA version in the current GPU to an enum. */
   info->sdma_ip_version =
      (enum sdma_version)SDMA_VERSION_VALUE(info->ip[AMD_IP_SDMA].ver_major,
                                            info->ip[AMD_IP_SDMA].ver_minor);

   /* SDMA v1.0-3.x (GFX6-8) can't ignore page faults on unmapped sparse resources. */
   info->sdma_supports_sparse = info->sdma_ip_version >= SDMA_4_0;

   /* SDMA v5.0+ (GFX10+) supports DCC and HTILE, but Navi 10 has issues with it according to PAL. */
   info->sdma_supports_compression = info->sdma_ip_version >= SDMA_5_0 && info->family != CHIP_NAVI10;

   /* Get the number of good compute units. */
   info->num_cu = 0;
   for (i = 0; i < info->max_se; i++) {
      for (j = 0; j < info->max_sa_per_se; j++) {
         if (info->gfx_level >= GFX11) {
            assert(info->max_sa_per_se <= 2);
            info->cu_mask[i][j] = device_info.cu_bitmap[i % 4][(i / 4) * 2 + j];
         } else if (info->family == CHIP_MI100) {
            /* The CU bitmap in the GPU info structure is a 4x4 array,
             * which usually suits Vega ASICs with their 4*2 SE/SA layout.
             * MI100 changes the layout to 8*1. To minimize the impact,
             * remap the extra SEs into the second column of the bitmap:
             *   SE4 --> cu_bitmap[0][1]
             *   SE5 --> cu_bitmap[1][1]
             *   SE6 --> cu_bitmap[2][1]
             *   SE7 --> cu_bitmap[3][1]
             */
            assert(info->max_sa_per_se == 1);
            info->cu_mask[i][0] = device_info.cu_bitmap[i % 4][i / 4];
         } else {
            info->cu_mask[i][j] = device_info.cu_bitmap[i][j];
         }
         info->num_cu += util_bitcount(info->cu_mask[i][j]);
      }
   }

   if (info->gfx_level >= GFX10_3 && info->max_se > 1) {
      uint32_t enabled_se_mask = 0;

      /* Derive the enabled SE mask from the CU mask. */
      for (unsigned se = 0; se < info->max_se; se++) {
         for (unsigned sa = 0; sa < info->max_sa_per_se; sa++) {
            if (info->cu_mask[se][sa]) {
               enabled_se_mask |= BITFIELD_BIT(se);
               break;
            }
         }
      }
      info->num_se = util_bitcount(enabled_se_mask);

      /* Trim the number of enabled RBs based on the number of enabled SEs because the RB mask
       * might include disabled SEs.
       */
      if (info->gfx_level >= GFX12) {
         unsigned num_rb_per_se = info->max_render_backends / info->max_se;

         for (unsigned se = 0; se < info->max_se; se++) {
            if (!(BITFIELD_BIT(se) & enabled_se_mask))
               info->enabled_rb_mask &= ~(BITFIELD_MASK(num_rb_per_se) << (se * num_rb_per_se));
         }
      }
   } else {
      /* GFX10 and older always enable all SEs because they don't support SE harvesting. */
      info->num_se = info->max_se;
   }

   info->num_rb = util_bitcount64(info->enabled_rb_mask);

   /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
    * and max - min <= 2.
    */
   unsigned cu_group = info->gfx_level >= GFX10 ? 2 : 1;
   info->max_good_cu_per_sa =
      DIV_ROUND_UP(info->num_cu, (info->num_se * info->max_sa_per_se * cu_group)) *
      cu_group;
   info->min_good_cu_per_sa =
      (info->num_cu / (info->num_se * info->max_sa_per_se * cu_group)) * cu_group;
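   /* For illustration: num_cu = 36, num_se = 2, max_sa_per_se = 2 and
    * cu_group = 2 give max_good_cu_per_sa = DIV_ROUND_UP(36, 8) * 2 = 10
    * and min_good_cu_per_sa = (36 / 8) * 2 = 8.
    */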

   if (!info->family_overridden)
      memcpy(info->si_tile_mode_array, amdinfo.gb_tile_mode, sizeof(amdinfo.gb_tile_mode));

   memcpy(info->cik_macrotile_mode_array, amdinfo.gb_macro_tile_mode,
          sizeof(amdinfo.gb_macro_tile_mode));

   info->pte_fragment_size = device_info.pte_fragment_size;
   info->gart_page_size = device_info.gart_page_size;

   info->gfx_ib_pad_with_type2 = info->gfx_level == GFX6;
   /* CDNA starting with GFX940 shouldn't use CP DMA. */
   info->has_cp_dma = info->has_graphics || info->family < CHIP_GFX940;

   if (info->gfx_level >= GFX11 && info->gfx_level < GFX12) {
      /* On gfx11 with num_cu = 4, we measured power during idle and video
       * playback and observed power savings, hence enable DCC with retiling
       * for gfx11 with num_cu >= 4.
       */
      info->use_display_dcc_with_retile_blit = info->num_cu >= 4;
   } else if (info->gfx_level == GFX10_3) {
      /* Displayable DCC with retiling is known to increase power consumption on Raphael
       * and Mendocino, so disable it on the smallest APUs. We need a proof that
       * displayable DCC doesn't regress bigger chips in the same way.
       */
      info->use_display_dcc_with_retile_blit = info->num_cu > 4;
   } else if (info->gfx_level == GFX9 && !info->has_dedicated_vram &&
              info->drm_minor >= 31) {
      if (info->max_render_backends == 1) {
         info->use_display_dcc_unaligned = true;
      } else {
         /* There may be a power increase on small APUs with fewer CUs. */
1444 info->use_display_dcc_with_retile_blit = info->num_cu > 4;
1445 }
1446 }
1447
1448 /* The kernel code translating tiling flags into a modifier was wrong
1449 * until .58.
1450 */
1451 info->gfx12_supports_display_dcc = info->gfx_level >= GFX12 && info->drm_minor >= 58;
1452
1453 /* AMDGPU always enables DCC compressed writes when a BO is moved back to
1454 * VRAM until .60.
1455 */
1456 info->gfx12_supports_dcc_write_compress_disable = info->gfx_level >= GFX12 && info->drm_minor >= 60;
1457
1458 info->has_stable_pstate = info->drm_minor >= 45;
1459
1460 if (info->gfx_level >= GFX12) {
1461 /* Gfx12 doesn't use pc_lines and pbb_max_alloc_count. */
1462 } else if (info->gfx_level >= GFX11) {
1463 info->pc_lines = 1024;
1464 info->pbb_max_alloc_count = 16; /* minimum is 2, maximum is 256 */
1465 } else if (info->gfx_level >= GFX9 && info->has_graphics) {
1466 unsigned pc_lines = 0;
1467
1468 switch (info->family) {
1469 case CHIP_VEGA10:
1470 case CHIP_VEGA12:
1471 case CHIP_VEGA20:
1472 pc_lines = 2048;
1473 break;
1474 case CHIP_RAVEN:
1475 case CHIP_RAVEN2:
1476 case CHIP_RENOIR:
1477 case CHIP_NAVI10:
1478 case CHIP_NAVI12:
1479 case CHIP_NAVI21:
1480 case CHIP_NAVI22:
1481 case CHIP_NAVI23:
1482 pc_lines = 1024;
1483 break;
1484 case CHIP_NAVI14:
1485 case CHIP_NAVI24:
1486 pc_lines = 512;
1487 break;
1488 case CHIP_VANGOGH:
1489 case CHIP_REMBRANDT:
1490 case CHIP_RAPHAEL_MENDOCINO:
1491 pc_lines = 256;
1492 break;
1493 default:
1494 assert(0);
1495 }
1496
1497 info->pc_lines = pc_lines;
1498
1499 if (info->gfx_level >= GFX10) {
1500 info->pbb_max_alloc_count = pc_lines / 3;
1501 } else {
1502 info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se));
1503 }
1504 }
1505
1506 if (info->gfx_level >= GFX10_3)
1507 info->max_waves_per_simd = 16;
1508 else if (info->gfx_level == GFX10)
1509 info->max_waves_per_simd = 20;
1510 else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM)
1511 info->max_waves_per_simd = 8;
1512 else
1513 info->max_waves_per_simd = 10;
1514
1515 if (info->gfx_level >= GFX10) {
1516 info->num_physical_sgprs_per_simd = 128 * info->max_waves_per_simd;
1517 info->min_sgpr_alloc = 128;
1518 info->sgpr_alloc_granularity = 128;
1519 } else if (info->gfx_level >= GFX8) {
1520 info->num_physical_sgprs_per_simd = 800;
1521 info->min_sgpr_alloc = 16;
1522 info->sgpr_alloc_granularity = 16;
1523 } else {
1524 info->num_physical_sgprs_per_simd = 512;
1525 info->min_sgpr_alloc = 8;
1526 info->sgpr_alloc_granularity = 8;
1527 }
1528
1529 info->has_3d_cube_border_color_mipmap = info->has_graphics || info->family == CHIP_MI100;
1530 info->has_image_opcodes = debug_get_bool_option("AMD_IMAGE_OPCODES",
1531 info->has_graphics || info->family < CHIP_GFX940);
1532 info->never_stop_sq_perf_counters = info->gfx_level == GFX10 ||
1533 info->gfx_level == GFX10_3;
1534 info->never_send_perfcounter_stop = info->gfx_level == GFX11;
1535 info->has_sqtt_rb_harvest_bug = (info->family == CHIP_NAVI23 ||
1536 info->family == CHIP_NAVI24 ||
1537 info->family == CHIP_REMBRANDT ||
1538 info->family == CHIP_VANGOGH) &&
1539 util_bitcount64(info->enabled_rb_mask) !=
1540 info->max_render_backends;
1541
1542 /* On GFX10.3, the polarity of AUTO_FLUSH_MODE is inverted. */
1543 info->has_sqtt_auto_flush_mode_bug = info->gfx_level == GFX10_3;
1544
1545 info->max_sgpr_alloc = info->family == CHIP_TONGA || info->family == CHIP_ICELAND ? 96 : 104;
1546
1547 if (!info->has_graphics && info->family >= CHIP_MI200) {
1548 info->min_wave64_vgpr_alloc = 8;
1549 info->max_vgpr_alloc = 512;
1550 info->wave64_vgpr_alloc_granularity = 8;
1551 } else {
1552 info->min_wave64_vgpr_alloc = 4;
1553 info->max_vgpr_alloc = 256;
1554 info->wave64_vgpr_alloc_granularity = 4;
1555 }
1556
1557 /* Some GPU info was broken before DRM 3.45.0. */
1558 if (info->drm_minor >= 45 && device_info.num_shader_visible_vgprs) {
1559 /* The Gfx10 VGPR count is in Wave32, so divide it by 2 for Wave64.
1560 * Gfx6-9 numbers are in Wave64.
1561 */
1562 if (info->gfx_level >= GFX10)
1563 info->num_physical_wave64_vgprs_per_simd = device_info.num_shader_visible_vgprs / 2;
1564 else
1565 info->num_physical_wave64_vgprs_per_simd = device_info.num_shader_visible_vgprs;
1566 } else if (info->gfx_level >= GFX10) {
1567 info->num_physical_wave64_vgprs_per_simd = 512;
1568 } else {
1569 info->num_physical_wave64_vgprs_per_simd = 256;
1570 }

   info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4;

   /* BIG_PAGE is supported since gfx10.3 and requires VRAM. VRAM is only guaranteed
    * with AMDGPU_GEM_CREATE_DISCARDABLE. DISCARDABLE was added in DRM 3.47.0.
    */
   info->discardable_allows_big_page = info->gfx_level >= GFX10_3 && info->gfx_level < GFX12 &&
                                       info->has_dedicated_vram &&
                                       info->drm_minor >= 47;

   /* The maximum number of scratch waves. The number is only a function of the number of CUs.
    * It should be large enough to hold at least 1 threadgroup. Use the minimum per-SA CU count.
    *
    * We can decrease the number to make it fit into the infinity cache.
    */
   const unsigned max_waves_per_tg = 32; /* 1024 threads in Wave32 */
   info->max_scratch_waves = MAX2(32 * info->min_good_cu_per_sa * info->max_sa_per_se * info->num_se,
                                  max_waves_per_tg);
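
   /* Worked example with hypothetical values: min_good_cu_per_sa = 9,
    * max_sa_per_se = 2, num_se = 4 gives 32 * 9 * 2 * 4 = 2304 scratch
    * waves, comfortably above the 32-wave floor needed for one
    * 1024-thread Wave32 threadgroup.
    */
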
   info->has_scratch_base_registers = info->gfx_level >= GFX11 ||
                                      (!info->has_graphics && info->family >= CHIP_GFX940);
   info->max_gflops = (info->gfx_level >= GFX11 ? 256 : 128) * info->num_cu * info->max_gpu_freq_mhz / 1000;
   info->memory_bandwidth_gbps = DIV_ROUND_UP(info->memory_freq_mhz_effective * info->memory_bus_width / 8, 1000);
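
   /* Sanity check with hypothetical numbers: a 96-CU GFX11 chip at 2500 MHz
    * gives 256 * 96 * 2500 / 1000 = 61440 GFLOPS (the 256 vs. 128 factor is
    * FP32 ops per CU per clock; GFX11 can dual-issue FP32), and a 384-bit
    * bus at 20000 MHz effective gives DIV_ROUND_UP(20000 * 384 / 8, 1000) =
    * 960 GB/s.
    */
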
   info->has_pcie_bandwidth_info = info->drm_minor >= 51;

   if (info->has_pcie_bandwidth_info) {
      info->pcie_gen = device_info.pcie_gen;
      info->pcie_num_lanes = device_info.pcie_num_lanes;

      /* Source: https://en.wikipedia.org/wiki/PCI_Express#History_and_revisions */
      switch (info->pcie_gen) {
      case 1:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 0.25 * 1024;
         break;
      case 2:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 0.5 * 1024;
         break;
      case 3:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 0.985 * 1024;
         break;
      case 4:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 1.969 * 1024;
         break;
      case 5:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 3.938 * 1024;
         break;
      case 6:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 7.563 * 1024;
         break;
      case 7:
         info->pcie_bandwidth_mbps = info->pcie_num_lanes * 15.125 * 1024;
         break;
      }
   }
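
   /* Example: a Gen4 x16 link yields 16 * 1.969 * 1024 ~= 32260 Mbps
    * (~31.5 GB/s), the per-direction payload rate after 128b/130b encoding.
    */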

   /* The number of IBs per submit isn't infinite, it depends on the IP type
    * (i.e. some initial setup needed for a submit) and the packet size.
    * It can be calculated according to the kernel source code as:
    * (ring->max_dw - emit_frame_size) / emit_ib_size
    */
   r = ac_drm_query_info(dev, AMDGPU_INFO_MAX_IBS,
                         sizeof(info->max_submitted_ibs), info->max_submitted_ibs);
   if (r) {
      /* When the number of IBs can't be queried from the kernel, we choose a
       * rough estimate that should work well (as of kernel 6.3).
       */
      for (unsigned i = 0; i < AMD_NUM_IP_TYPES; ++i)
         info->max_submitted_ibs[i] = 50;

      info->max_submitted_ibs[AMD_IP_GFX] = info->gfx_level >= GFX7 ? 192 : 144;
      info->max_submitted_ibs[AMD_IP_COMPUTE] = 124;
      info->max_submitted_ibs[AMD_IP_VCN_JPEG] = 16;
      for (unsigned i = 0; i < AMD_NUM_IP_TYPES; ++i) {
         /* Clear out max submitted IB count for IPs that have no queues. */
         if (!info->ip[i].num_queues)
            info->max_submitted_ibs[i] = 0;
      }
   }
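
   /* Illustration of the kernel formula with made-up numbers: a ring with
    * max_dw = 1024, emit_frame_size = 64 and emit_ib_size = 5 would allow
    * (1024 - 64) / 5 = 192 IBs per submit; the fallback table above encodes
    * rough estimates of this shape.
    */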

   if (info->gfx_level >= GFX11) {
      unsigned num_prim_exports = 0, num_pos_exports = 0;

      if (info->gfx_level >= GFX12) {
         info->attribute_ring_size_per_se = 1024 * 1024;
         num_prim_exports = 16368; /* also includes gs_alloc_req */
         num_pos_exports = 16384;
      } else if (info->l3_cache_size_mb || info->family_overridden) {
         info->attribute_ring_size_per_se = 1400 * 1024;
      } else {
         assert(info->num_se == 1);

         if (info->l2_cache_size >= 2 * 1024 * 1024)
            info->attribute_ring_size_per_se = 768 * 1024;
         else
            info->attribute_ring_size_per_se = info->l2_cache_size / 2;
      }

      /* The size must be aligned to 64K per SE and must be at most 16M in total. */
      info->attribute_ring_size_per_se = align(info->attribute_ring_size_per_se, 64 * 1024);
      assert(info->attribute_ring_size_per_se * info->max_se <= 16 * 1024 * 1024);

      /* Compute the pos and prim ring sizes and offsets. */
      info->pos_ring_size_per_se = align(num_pos_exports * 16, 32);
      info->prim_ring_size_per_se = align(num_prim_exports * 4, 32);
      assert(info->gfx_level >= GFX12 ||
             (!info->pos_ring_size_per_se && !info->prim_ring_size_per_se));

      uint32_t max_se_squared = info->max_se * info->max_se;
      uint32_t attribute_ring_size = info->attribute_ring_size_per_se * info->max_se;
      uint32_t pos_ring_size = align(info->pos_ring_size_per_se * max_se_squared, 64 * 1024);
      uint32_t prim_ring_size = align(info->prim_ring_size_per_se * max_se_squared, 64 * 1024);

      info->pos_ring_offset = attribute_ring_size;
      info->prim_ring_offset = info->pos_ring_offset + pos_ring_size;
      info->total_attribute_pos_prim_ring_size = info->prim_ring_offset + prim_ring_size;
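
      /* Resulting layout, illustrated for a hypothetical 2-SE GFX12 part:
       * attribute ring 2 * 1 MB at offset 0; pos ring
       * align(16384 * 16 * 4, 64K) = 1 MB at pos_ring_offset = 2 MB;
       * prim ring align(16368 * 4 * 4, 64K) = 256 KB at prim_ring_offset =
       * 3 MB; total_attribute_pos_prim_ring_size = 3.25 MB.
       */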

      info->conformant_trunc_coord =
         info->drm_minor >= 52 &&
         device_info.ids_flags & AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD;

      info->has_attr_ring = info->attribute_ring_size_per_se > 0;
   }

   if (info->gfx_level >= GFX11 && debug_get_bool_option("AMD_USERQ", false)) {
      struct drm_amdgpu_info_uq_fw_areas fw_info;

      r = ac_drm_query_uq_fw_area_info(dev, AMDGPU_HW_IP_GFX, 0, &fw_info);
      if (r) {
         fprintf(stderr, "amdgpu: amdgpu_query_uq_fw_area_info() failed.\n");
         return false;
      }

      info->has_fw_based_shadowing = true;
      info->fw_based_mcbp.shadow_size = fw_info.gfx.shadow_size;
      info->fw_based_mcbp.shadow_alignment = fw_info.gfx.shadow_alignment;
      info->fw_based_mcbp.csa_size = fw_info.gfx.csa_size;
      info->fw_based_mcbp.csa_alignment = fw_info.gfx.csa_alignment;
   } else if (info->gfx_level >= GFX11 && device_info.shadow_size > 0) {
      info->has_fw_based_shadowing = true;
      info->fw_based_mcbp.shadow_size = device_info.shadow_size;
      info->fw_based_mcbp.shadow_alignment = device_info.shadow_alignment;
      info->fw_based_mcbp.csa_size = device_info.csa_size;
      info->fw_based_mcbp.csa_alignment = device_info.csa_alignment;
   }

   /* WARNING: Register shadowing decreases performance by up to 50% on GFX11 with current FW. */
   info->register_shadowing_required = device_info.ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION &&
                                       info->gfx_level < GFX11;

   if (info->gfx_level >= GFX12) {
      info->has_set_context_pairs = true;
      info->has_set_sh_pairs = true;
      info->has_set_uconfig_pairs = true;
   } else if (info->gfx_level >= GFX11 && info->has_dedicated_vram) {
      info->has_set_context_pairs_packed = true;
      info->has_set_sh_pairs_packed = info->register_shadowing_required;
   }

   info->has_image_bvh_intersect_ray = info->gfx_level >= GFX10_3;

   set_custom_cu_en_mask(info);

   const char *ib_filename = debug_get_option("AMD_PARSE_IB", NULL);
   if (ib_filename) {
      FILE *f = fopen(ib_filename, "r");
      if (f) {
         fseek(f, 0, SEEK_END);
         size_t size = ftell(f);
         uint32_t *ib = (uint32_t *)malloc(size);
         fseek(f, 0, SEEK_SET);
         size_t n_read = fread(ib, 1, size, f);
         fclose(f);

         if (n_read != size) {
            fprintf(stderr, "failed to read %zu bytes from '%s'\n", size, ib_filename);
            exit(1);
         }

         struct ac_ib_parser ib_parser = {
            .f = stdout,
            .ib = ib,
            .num_dw = size / 4,
            .gfx_level = info->gfx_level,
            .family = info->family,
            .ip_type = AMD_IP_GFX,
         };

         ac_parse_ib(&ib_parser, "IB");
         free(ib);
         exit(0);
      }
   }
   return true;
}

void ac_compute_driver_uuid(char *uuid, size_t size)
{
   char amd_uuid[] = "AMD-MESA-DRV";

   assert(size >= sizeof(amd_uuid));

   memset(uuid, 0, size);
   strncpy(uuid, amd_uuid, size);
}

void ac_compute_device_uuid(const struct radeon_info *info, char *uuid, size_t size)
{
   uint32_t *uint_uuid = (uint32_t *)uuid;

   assert(size >= sizeof(uint32_t) * 4);

   /* Use the device info directly instead of using a sha1. GL/VK UUIDs
    * are 16 byte vs 20 byte for sha1, and the truncation that would be
    * required would get rid of part of the little entropy we have.
    */
   memset(uuid, 0, size);
   if (!info->pci.valid) {
      fprintf(stderr,
              "ac_compute_device_uuid's output is based on invalid pci bus info.\n");
   }
   uint_uuid[0] = info->pci.domain;
   uint_uuid[1] = info->pci.bus;
   uint_uuid[2] = info->pci.dev;
   uint_uuid[3] = info->pci.func;
}

void ac_print_gpu_info(const struct radeon_info *info, FILE *f)
{
   fprintf(f, "Device info:\n");
   fprintf(f, "    name = %s\n", info->name);
   fprintf(f, "    marketing_name = %s\n", info->marketing_name);
   fprintf(f, "    dev_filename = %s\n", info->dev_filename);
   fprintf(f, "    num_se = %i\n", info->num_se);
   fprintf(f, "    num_rb = %i\n", info->num_rb);
   fprintf(f, "    num_cu = %i\n", info->num_cu);
   fprintf(f, "    max_gpu_freq = %i MHz\n", info->max_gpu_freq_mhz);
   fprintf(f, "    max_gflops = %u GFLOPS\n", info->max_gflops);

   if (info->sqc_inst_cache_size) {
      fprintf(f, "    sqc_inst_cache_size = %i KB (%u per WGP)\n",
              DIV_ROUND_UP(info->sqc_inst_cache_size, 1024), info->num_sqc_per_wgp);
   }
   if (info->sqc_scalar_cache_size) {
      fprintf(f, "    sqc_scalar_cache_size = %i KB (%u per WGP)\n",
              DIV_ROUND_UP(info->sqc_scalar_cache_size, 1024), info->num_sqc_per_wgp);
   }

   fprintf(f, "    tcp_cache_size = %i KB\n", DIV_ROUND_UP(info->tcp_cache_size, 1024));

   if (info->gfx_level >= GFX10 && info->gfx_level < GFX12)
      fprintf(f, "    l1_cache_size = %i KB\n", DIV_ROUND_UP(info->l1_cache_size, 1024));

   fprintf(f, "    l2_cache_size = %i KB\n", DIV_ROUND_UP(info->l2_cache_size, 1024));

   if (info->l3_cache_size_mb)
      fprintf(f, "    l3_cache_size = %i MB\n", info->l3_cache_size_mb);

   fprintf(f, "    memory_channels = %u (TCC blocks)\n", info->num_tcc_blocks);
   fprintf(f, "    memory_size = %u GB (%u MB)\n",
           DIV_ROUND_UP(info->vram_size_kb, (1024 * 1024)),
           DIV_ROUND_UP(info->vram_size_kb, 1024));
   fprintf(f, "    memory_freq = %u GHz\n", DIV_ROUND_UP(info->memory_freq_mhz_effective, 1000));
   fprintf(f, "    memory_bus_width = %u bits\n", info->memory_bus_width);
   fprintf(f, "    memory_bandwidth = %u GB/s\n", info->memory_bandwidth_gbps);
   fprintf(f, "    pcie_gen = %u\n", info->pcie_gen);
   fprintf(f, "    pcie_num_lanes = %u\n", info->pcie_num_lanes);
   fprintf(f, "    pcie_bandwidth = %1.1f GB/s\n", info->pcie_bandwidth_mbps / 1024.0);
   fprintf(f, "    clock_crystal_freq = %i KHz\n", info->clock_crystal_freq);

   for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
      if (info->ip[i].num_queues) {
         fprintf(f, "    IP %-7s %2u.%u \tqueues:%u \talign:%u \tpad_dw:0x%x\n",
                 ac_get_ip_type_string(info, i),
                 info->ip[i].ver_major, info->ip[i].ver_minor, info->ip[i].num_queues,
                 info->ip[i].ib_alignment, info->ip[i].ib_pad_dw_mask);
      }
   }

   fprintf(f, "Identification:\n");
   if (info->pci.valid)
      fprintf(f, "    pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", info->pci.domain, info->pci.bus,
              info->pci.dev, info->pci.func);
   else
      fprintf(f, "    pci (domain:bus:dev.func): unknown\n");
   fprintf(f, "    pci_id = 0x%x\n", info->pci_id);
   fprintf(f, "    pci_rev_id = 0x%x\n", info->pci_rev_id);
   fprintf(f, "    family = %i\n", info->family);
   fprintf(f, "    gfx_level = %i\n", info->gfx_level);
   fprintf(f, "    family_id = %i\n", info->family_id);
   fprintf(f, "    chip_external_rev = %i\n", info->chip_external_rev);
   fprintf(f, "    chip_rev = %i\n", info->chip_rev);

   fprintf(f, "Flags:\n");
   fprintf(f, "    family_overridden = %u\n", info->family_overridden);
   fprintf(f, "    is_pro_graphics = %u\n", info->is_pro_graphics);
   fprintf(f, "    has_graphics = %i\n", info->has_graphics);
   fprintf(f, "    has_clear_state = %u\n", info->has_clear_state);
   fprintf(f, "    has_distributed_tess = %u\n", info->has_distributed_tess);
   fprintf(f, "    has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode);
   fprintf(f, "    has_rbplus = %u\n", info->has_rbplus);
   fprintf(f, "    rbplus_allowed = %u\n", info->rbplus_allowed);
   fprintf(f, "    has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt);
   fprintf(f, "    has_out_of_order_rast = %u\n", info->has_out_of_order_rast);
   fprintf(f, "    cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory);
   fprintf(f, "    has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug);
   fprintf(f, "    has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug);
   fprintf(f, "    has_small_prim_filter_sample_loc_bug = %i\n", info->has_small_prim_filter_sample_loc_bug);
   fprintf(f, "    has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug);
   fprintf(f, "    has_pops_missed_overlap_bug = %i\n", info->has_pops_missed_overlap_bug);
   fprintf(f, "    has_32bit_predication = %i\n", info->has_32bit_predication);
   fprintf(f, "    has_3d_cube_border_color_mipmap = %i\n", info->has_3d_cube_border_color_mipmap);
   fprintf(f, "    has_image_opcodes = %i\n", info->has_image_opcodes);
   fprintf(f, "    never_stop_sq_perf_counters = %i\n", info->never_stop_sq_perf_counters);
   fprintf(f, "    has_sqtt_rb_harvest_bug = %i\n", info->has_sqtt_rb_harvest_bug);
   fprintf(f, "    has_sqtt_auto_flush_mode_bug = %i\n", info->has_sqtt_auto_flush_mode_bug);
   fprintf(f, "    never_send_perfcounter_stop = %i\n", info->never_send_perfcounter_stop);
   fprintf(f, "    discardable_allows_big_page = %i\n", info->discardable_allows_big_page);
   fprintf(f, "    has_taskmesh_indirect0_bug = %i\n", info->has_taskmesh_indirect0_bug);
   fprintf(f, "    has_set_context_pairs = %i\n", info->has_set_context_pairs);
   fprintf(f, "    has_set_context_pairs_packed = %i\n", info->has_set_context_pairs_packed);
   fprintf(f, "    has_set_sh_pairs = %i\n", info->has_set_sh_pairs);
   fprintf(f, "    has_set_sh_pairs_packed = %i\n", info->has_set_sh_pairs_packed);
   fprintf(f, "    has_set_uconfig_pairs = %i\n", info->has_set_uconfig_pairs);
   fprintf(f, "    conformant_trunc_coord = %i\n", info->conformant_trunc_coord);

   if (info->gfx_level < GFX12) {
      fprintf(f, "Display features:\n");
      fprintf(f, "    use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
      fprintf(f, "    use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);
   }

   fprintf(f, "Memory info:\n");
   fprintf(f, "    pte_fragment_size = %u\n", info->pte_fragment_size);
   fprintf(f, "    gart_page_size = %u\n", info->gart_page_size);
   fprintf(f, "    gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size_kb, 1024));
   fprintf(f, "    vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size_kb, 1024));
   fprintf(f, "    vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size_kb, 1024));
   fprintf(f, "    vram_type = %i\n", info->vram_type);
   fprintf(f, "    max_heap_size_kb = %i MB\n", (int)DIV_ROUND_UP(info->max_heap_size_kb, 1024));
   fprintf(f, "    min_alloc_size = %u\n", info->min_alloc_size);
   fprintf(f, "    address32_hi = 0x%x\n", info->address32_hi);
   fprintf(f, "    has_dedicated_vram = %u\n", info->has_dedicated_vram);
   fprintf(f, "    all_vram_visible = %u\n", info->all_vram_visible);
   fprintf(f, "    max_tcc_blocks = %i\n", info->max_tcc_blocks);
   fprintf(f, "    tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
   fprintf(f, "    tcc_rb_non_coherent = %u\n", info->tcc_rb_non_coherent);
   fprintf(f, "    cp_sdma_ge_use_system_memory_scope = %u\n", info->cp_sdma_ge_use_system_memory_scope);
   fprintf(f, "    pc_lines = %u\n", info->pc_lines);
   fprintf(f, "    lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup);
   fprintf(f, "    lds_alloc_granularity = %i\n", info->lds_alloc_granularity);
   fprintf(f, "    lds_encode_granularity = %i\n", info->lds_encode_granularity);
   fprintf(f, "    max_memory_clock = %i MHz\n", info->memory_freq_mhz);

   fprintf(f, "CP info:\n");
   fprintf(f, "    gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
   fprintf(f, "    has_cp_dma = %i\n", info->has_cp_dma);
   fprintf(f, "    me_fw_version = %i\n", info->me_fw_version);
   fprintf(f, "    me_fw_feature = %i\n", info->me_fw_feature);
   fprintf(f, "    mec_fw_version = %i\n", info->mec_fw_version);
   fprintf(f, "    mec_fw_feature = %i\n", info->mec_fw_feature);
   fprintf(f, "    pfp_fw_version = %i\n", info->pfp_fw_version);
   fprintf(f, "    pfp_fw_feature = %i\n", info->pfp_fw_feature);

   fprintf(f, "Multimedia info:\n");
   if (info->ip[AMD_IP_VCN_DEC].num_queues || info->ip[AMD_IP_VCN_UNIFIED].num_queues) {
      if (info->family >= CHIP_NAVI31 || info->family == CHIP_GFX940)
         fprintf(f, "    vcn_unified = %u\n", info->ip[AMD_IP_VCN_UNIFIED].num_instances);
      else {
         fprintf(f, "    vcn_decode = %u\n", info->ip[AMD_IP_VCN_DEC].num_instances);
         fprintf(f, "    vcn_encode = %u\n", info->ip[AMD_IP_VCN_ENC].num_instances);
      }
      fprintf(f, "    vcn_enc_major_version = %u\n", info->vcn_enc_major_version);
      fprintf(f, "    vcn_enc_minor_version = %u\n", info->vcn_enc_minor_version);
      fprintf(f, "    vcn_dec_version = %u\n", info->vcn_dec_version);
   } else if (info->ip[AMD_IP_VCE].num_queues) {
      fprintf(f, "    vce_encode = %u\n", info->ip[AMD_IP_VCE].num_queues);
      fprintf(f, "    vce_fw_version = %u\n", info->vce_fw_version);
      fprintf(f, "    vce_harvest_config = %i\n", info->vce_harvest_config);
   } else if (info->ip[AMD_IP_UVD].num_queues)
      fprintf(f, "    uvd_fw_version = %u\n", info->uvd_fw_version);

   if (info->ip[AMD_IP_VCN_JPEG].num_queues)
      fprintf(f, "    jpeg_decode = %u\n", info->ip[AMD_IP_VCN_JPEG].num_instances);

   if ((info->drm_minor >= 41) &&
       (info->ip[AMD_IP_VCN_DEC].num_queues || info->ip[AMD_IP_VCN_UNIFIED].num_queues
        || info->ip[AMD_IP_VCE].num_queues || info->ip[AMD_IP_UVD].num_queues)) {
      char max_res_dec[64] = {0}, max_res_enc[64] = {0};
      char codec_str[][8] = {
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2] = "mpeg2",
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4] = "mpeg4",
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1] = "vc1",
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC] = "h264",
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC] = "hevc",
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG] = "jpeg",
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9] = "vp9",
         [AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1] = "av1",
      };
      fprintf(f, "    %-8s %-4s %-16s %-4s %-16s\n",
              "codec", "dec", "max_resolution", "enc", "max_resolution");
      for (unsigned i = 0; i < AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT; i++) {
         if (info->dec_caps.codec_info[i].valid)
            sprintf(max_res_dec, "%ux%u", info->dec_caps.codec_info[i].max_width,
                    info->dec_caps.codec_info[i].max_height);
         else
            sprintf(max_res_dec, "%s", "-");
         if (info->enc_caps.codec_info[i].valid)
            sprintf(max_res_enc, "%ux%u", info->enc_caps.codec_info[i].max_width,
                    info->enc_caps.codec_info[i].max_height);
         else
            sprintf(max_res_enc, "%s", "-");
         fprintf(f, "    %-8s %-4s %-16s %-4s %-16s\n", codec_str[i],
                 info->dec_caps.codec_info[i].valid ? "*" : "-", max_res_dec,
                 info->enc_caps.codec_info[i].valid ? "*" : "-", max_res_enc);
      }
   }

   fprintf(f, "Kernel & winsys capabilities:\n");
   fprintf(f, "    drm = %i.%i.%i\n", info->drm_major, info->drm_minor, info->drm_patchlevel);
   fprintf(f, "    has_userptr = %i\n", info->has_userptr);
   fprintf(f, "    has_timeline_syncobj = %u\n", info->has_timeline_syncobj);
   fprintf(f, "    has_local_buffers = %u\n", info->has_local_buffers);
   fprintf(f, "    has_bo_metadata = %u\n", info->has_bo_metadata);
   fprintf(f, "    has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
   fprintf(f, "    has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
   fprintf(f, "    has_stable_pstate = %u\n", info->has_stable_pstate);
   fprintf(f, "    has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency);
   fprintf(f, "    has_gang_submit = %u\n", info->has_gang_submit);
   fprintf(f, "    has_gpuvm_fault_query = %u\n", info->has_gpuvm_fault_query);
   fprintf(f, "    register_shadowing_required = %u\n", info->register_shadowing_required);
   fprintf(f, "    has_fw_based_shadowing = %u\n", info->has_fw_based_shadowing);
   if (info->has_fw_based_shadowing) {
      fprintf(f, "      * shadow size: %u (alignment: %u)\n",
              info->fw_based_mcbp.shadow_size,
              info->fw_based_mcbp.shadow_alignment);
      fprintf(f, "      * csa size: %u (alignment: %u)\n",
              info->fw_based_mcbp.csa_size,
              info->fw_based_mcbp.csa_alignment);
   }

   fprintf(f, "    has_tmz_support = %u\n", info->has_tmz_support);
   fprintf(f, "    has_trap_handler_support = %u\n", info->has_trap_handler_support);
   for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
      if (info->max_submitted_ibs[i]) {
         fprintf(f, "    IP %-7s max_submitted_ibs = %u\n", ac_get_ip_type_string(info, i),
                 info->max_submitted_ibs[i]);
      }
   }
   fprintf(f, "    kernel_has_modifiers = %u\n", info->kernel_has_modifiers);
   fprintf(f, "    uses_kernel_cu_mask = %u\n", info->uses_kernel_cu_mask);

   fprintf(f, "Shader core info:\n");
   for (unsigned i = 0; i < info->max_se; i++) {
      for (unsigned j = 0; j < info->max_sa_per_se; j++) {
         fprintf(f, "    cu_mask[SE%u][SA%u] = 0x%x \t(%u)\tCU_EN = 0x%x\n", i, j,
                 info->cu_mask[i][j], util_bitcount(info->cu_mask[i][j]),
                 info->spi_cu_en & BITFIELD_MASK(util_bitcount(info->cu_mask[i][j])));
      }
   }
   fprintf(f, "    spi_cu_en_has_effect = %i\n", info->spi_cu_en_has_effect);
   fprintf(f, "    max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa);
   fprintf(f, "    min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa);
   fprintf(f, "    max_se = %i\n", info->max_se);
   fprintf(f, "    max_sa_per_se = %i\n", info->max_sa_per_se);
   fprintf(f, "    num_cu_per_sh = %i\n", info->num_cu_per_sh);
   fprintf(f, "    max_waves_per_simd = %i\n", info->max_waves_per_simd);
   fprintf(f, "    num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd);
   fprintf(f, "    num_physical_wave64_vgprs_per_simd = %i\n",
           info->num_physical_wave64_vgprs_per_simd);
   fprintf(f, "    num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit);
   fprintf(f, "    min_sgpr_alloc = %i\n", info->min_sgpr_alloc);
   fprintf(f, "    max_sgpr_alloc = %i\n", info->max_sgpr_alloc);
   fprintf(f, "    sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity);
   fprintf(f, "    min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
   fprintf(f, "    max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
   fprintf(f, "    wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
   fprintf(f, "    max_scratch_waves = %i\n", info->max_scratch_waves);
   fprintf(f, "    has_scratch_base_registers = %i\n", info->has_scratch_base_registers);
   fprintf(f, "Ring info:\n");
   fprintf(f, "    attribute_ring_size_per_se = %u KB\n",
           DIV_ROUND_UP(info->attribute_ring_size_per_se, 1024));
   if (info->gfx_level >= GFX12) {
      fprintf(f, "    pos_ring_size_per_se = %u KB\n", DIV_ROUND_UP(info->pos_ring_size_per_se, 1024));
      fprintf(f, "    prim_ring_size_per_se = %u KB\n", DIV_ROUND_UP(info->prim_ring_size_per_se, 1024));
   }
   fprintf(f, "    total_attribute_pos_prim_ring_size = %u KB\n",
           DIV_ROUND_UP(info->total_attribute_pos_prim_ring_size, 1024));

   fprintf(f, "Render backend info:\n");
   fprintf(f, "    pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
   fprintf(f, "    max_render_backends = %i\n", info->max_render_backends);
   fprintf(f, "    num_tile_pipes = %i\n", info->num_tile_pipes);
   fprintf(f, "    pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes);
   fprintf(f, "    enabled_rb_mask = 0x%" PRIx64 "\n", info->enabled_rb_mask);
   fprintf(f, "    max_alignment = %u\n", (unsigned)info->max_alignment);
   fprintf(f, "    pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count);

   fprintf(f, "GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config);
   if (info->gfx_level >= GFX12) {
      fprintf(f, "    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
      fprintf(f, "    pipe_interleave_size = %u\n",
              256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
      fprintf(f, "    num_pkrs = %u\n", 1 << G_0098F8_NUM_PKRS(info->gb_addr_config));
   } else if (info->gfx_level >= GFX10) {
      fprintf(f, "    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
      fprintf(f, "    pipe_interleave_size = %u\n",
              256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
      fprintf(f, "    max_compressed_frags = %u\n",
              1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
      if (info->gfx_level >= GFX10_3)
         fprintf(f, "    num_pkrs = %u\n", 1 << G_0098F8_NUM_PKRS(info->gb_addr_config));
   } else if (info->gfx_level == GFX9) {
      fprintf(f, "    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
      fprintf(f, "    pipe_interleave_size = %u\n",
              256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
      fprintf(f, "    max_compressed_frags = %u\n",
              1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
      fprintf(f, "    bank_interleave_size = %u\n",
              1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
      fprintf(f, "    num_banks = %u\n", 1 << G_0098F8_NUM_BANKS(info->gb_addr_config));
      fprintf(f, "    shader_engine_tile_size = %u\n",
              16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
      fprintf(f, "    num_shader_engines = %u\n",
              1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config));
      fprintf(f, "    num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config));
      fprintf(f, "    multi_gpu_tile_size = %u (raw)\n",
              G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
      fprintf(f, "    num_rb_per_se = %u\n", 1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config));
      fprintf(f, "    row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
      fprintf(f, "    num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
      fprintf(f, "    se_enable = %u (raw)\n", G_0098F8_SE_ENABLE(info->gb_addr_config));
   } else {
      fprintf(f, "    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
      fprintf(f, "    pipe_interleave_size = %u\n",
              256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config));
      fprintf(f, "    bank_interleave_size = %u\n",
              1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
      fprintf(f, "    num_shader_engines = %u\n",
              1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config));
      fprintf(f, "    shader_engine_tile_size = %u\n",
              16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
      fprintf(f, "    num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config));
      fprintf(f, "    multi_gpu_tile_size = %u (raw)\n",
              G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
      fprintf(f, "    row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
      fprintf(f, "    num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
   }

   struct ac_modifier_options modifier_options = {
      .dcc = true,
      .dcc_retile = true,
   };
   uint64_t modifiers[256];
   unsigned modifier_count = ARRAY_SIZE(modifiers);

   /* Get the list of supported modifiers. */
   if (ac_get_supported_modifiers(info, &modifier_options, PIPE_FORMAT_R8G8B8A8_UNORM,
                                  &modifier_count, modifiers)) {
      if (modifier_count)
         fprintf(f, "Modifiers (32bpp):\n");

      for (unsigned i = 0; i < modifier_count; i++) {
         char *name = drmGetFormatModifierName(modifiers[i]);

         fprintf(f, "    %s\n", name);
         free(name);
      }
   }
}

int ac_get_gs_table_depth(enum amd_gfx_level gfx_level, enum radeon_family family)
{
   if (gfx_level >= GFX9)
      return -1;

   switch (family) {
   case CHIP_OLAND:
   case CHIP_HAINAN:
   case CHIP_KAVERI:
   case CHIP_KABINI:
   case CHIP_ICELAND:
   case CHIP_CARRIZO:
   case CHIP_STONEY:
      return 16;
   case CHIP_TAHITI:
   case CHIP_PITCAIRN:
   case CHIP_VERDE:
   case CHIP_BONAIRE:
   case CHIP_HAWAII:
   case CHIP_TONGA:
   case CHIP_FIJI:
   case CHIP_POLARIS10:
   case CHIP_POLARIS11:
   case CHIP_POLARIS12:
   case CHIP_VEGAM:
      return 32;
   default:
      unreachable("Unknown GPU");
   }
}

void ac_get_raster_config(const struct radeon_info *info, uint32_t *raster_config_p,
                          uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p)
{
   unsigned raster_config, raster_config_1, se_tile_repeat;

   switch (info->family) {
   /* 1 SE / 1 RB */
   case CHIP_HAINAN:
   case CHIP_KABINI:
   case CHIP_STONEY:
      raster_config = 0x00000000;
      raster_config_1 = 0x00000000;
      break;
   /* 1 SE / 4 RBs */
   case CHIP_VERDE:
      raster_config = 0x0000124a;
      raster_config_1 = 0x00000000;
      break;
   /* 1 SE / 2 RBs (Oland is special) */
   case CHIP_OLAND:
      raster_config = 0x00000082;
      raster_config_1 = 0x00000000;
      break;
   /* 1 SE / 2 RBs */
   case CHIP_KAVERI:
   case CHIP_ICELAND:
   case CHIP_CARRIZO:
      raster_config = 0x00000002;
      raster_config_1 = 0x00000000;
      break;
   /* 2 SEs / 4 RBs */
   case CHIP_BONAIRE:
   case CHIP_POLARIS11:
   case CHIP_POLARIS12:
      raster_config = 0x16000012;
      raster_config_1 = 0x00000000;
      break;
   /* 2 SEs / 8 RBs */
   case CHIP_TAHITI:
   case CHIP_PITCAIRN:
      raster_config = 0x2a00126a;
      raster_config_1 = 0x00000000;
      break;
   /* 4 SEs / 8 RBs */
   case CHIP_TONGA:
   case CHIP_POLARIS10:
      raster_config = 0x16000012;
      raster_config_1 = 0x0000002a;
      break;
   /* 4 SEs / 16 RBs */
   case CHIP_HAWAII:
   case CHIP_FIJI:
   case CHIP_VEGAM:
      raster_config = 0x3a00161a;
      raster_config_1 = 0x0000002e;
      break;
   default:
      fprintf(stderr, "ac: Unknown GPU, using 0 for raster_config\n");
      raster_config = 0x00000000;
      raster_config_1 = 0x00000000;
      break;
   }

   /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
    * This decreases performance by up to 50% when the RB is the bottleneck.
    */
   if (info->family == CHIP_KAVERI && !info->is_amdgpu)
      raster_config = 0x00000000;

   /* Fiji: Old kernels have an incorrect tiling config. This decreases
    * RB performance by 25%. (it disables 1 RB in the second packer)
    */
   if (info->family == CHIP_FIJI && info->cik_macrotile_mode_array[0] == 0x000000e8) {
      raster_config = 0x16000012;
      raster_config_1 = 0x0000002a;
   }

   unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config);
   unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config);

   /* I don't know how to calculate this, though this is probably a good guess. */
   se_tile_repeat = MAX2(se_width, se_height) * info->max_se;

   *raster_config_p = raster_config;
   *raster_config_1_p = raster_config_1;
   if (se_tile_repeat_p)
      *se_tile_repeat_p = se_tile_repeat;
}

void ac_get_harvested_configs(const struct radeon_info *info, unsigned raster_config,
                              unsigned *cik_raster_config_1_p, unsigned *raster_config_se)
{
   unsigned sh_per_se = MAX2(info->max_sa_per_se, 1);
   unsigned num_se = MAX2(info->max_se, 1);
   unsigned rb_mask = info->enabled_rb_mask;
   unsigned num_rb = MIN2(info->max_render_backends, 16);
   unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
   unsigned rb_per_se = num_rb / num_se;
   unsigned se_mask[4];
   unsigned se;

   se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
   se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
   se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
   se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;

   assert(num_se == 1 || num_se == 2 || num_se == 4);
   assert(sh_per_se == 1 || sh_per_se == 2);
   assert(rb_per_pkr == 1 || rb_per_pkr == 2);

   if (info->gfx_level >= GFX7) {
      unsigned raster_config_1 = *cik_raster_config_1_p;
      if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || (!se_mask[2] && !se_mask[3]))) {
         raster_config_1 &= C_028354_SE_PAIR_MAP;

         if (!se_mask[0] && !se_mask[1]) {
            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
         } else {
            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
         }
         *cik_raster_config_1_p = raster_config_1;
      }
   }

   for (se = 0; se < num_se; se++) {
      unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
      unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
      int idx = (se / 2) * 2;

      raster_config_se[se] = raster_config;
      if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
         raster_config_se[se] &= C_028350_SE_MAP;

         if (!se_mask[idx]) {
            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
         } else {
            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
         }
      }

      pkr0_mask &= rb_mask;
      pkr1_mask &= rb_mask;
      if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
         raster_config_se[se] &= C_028350_PKR_MAP;

         if (!pkr0_mask) {
            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
         } else {
            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
         }
      }

      if (rb_per_se >= 2) {
         unsigned rb0_mask = 1 << (se * rb_per_se);
         unsigned rb1_mask = rb0_mask << 1;

         rb0_mask &= rb_mask;
         rb1_mask &= rb_mask;
         if (!rb0_mask || !rb1_mask) {
            raster_config_se[se] &= C_028350_RB_MAP_PKR0;

            if (!rb0_mask) {
               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
            } else {
               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
            }
         }

         if (rb_per_se > 2) {
            rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
            rb1_mask = rb0_mask << 1;
            rb0_mask &= rb_mask;
            rb1_mask &= rb_mask;
            if (!rb0_mask || !rb1_mask) {
               raster_config_se[se] &= C_028350_RB_MAP_PKR1;

               if (!rb0_mask) {
                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
               } else {
                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
               }
            }
         }
      }
   }
}

unsigned
ac_get_compute_resource_limits(const struct radeon_info *info, unsigned waves_per_threadgroup,
                               unsigned max_waves_per_sh, unsigned threadgroups_per_cu)
{
   unsigned compute_resource_limits = S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);

   if (info->gfx_level >= GFX7) {
      unsigned num_cu_per_se = info->num_cu / info->num_se;

      /* Gfx9 should set the limit to max instead of 0 to fix high priority compute. */
      if (info->gfx_level == GFX9 && !max_waves_per_sh) {
         max_waves_per_sh = info->max_good_cu_per_sa * info->num_simd_per_compute_unit *
                            info->max_waves_per_simd;
      }

      /* On GFX12+, WAVES_PER_SH means waves per SE. */
      if (info->gfx_level >= GFX12)
         max_waves_per_sh *= info->max_sa_per_se;

      /* Force even distribution on all SIMDs in CU if the workgroup
       * size is 64. This has shown some good improvements if # of CUs
       * per SE is not a multiple of 4.
       */
      if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
         compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);

      assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8);
      compute_resource_limits |=
         S_00B854_WAVES_PER_SH(max_waves_per_sh) | S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1);
   } else {
      /* GFX6 */
      if (max_waves_per_sh) {
         unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16);
         compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16);
      }
   }
   return compute_resource_limits;
}
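
/* Hedged usage sketch (block_size, wave_size, cs and radeon_set_sh_reg are
 * illustrative assumptions, not part of this file): a driver emitting a
 * compute dispatch might do
 *
 *    uint32_t limits =
 *       ac_get_compute_resource_limits(info,
 *                                      DIV_ROUND_UP(block_size, wave_size),
 *                                      0,  // no extra per-SH wave limit
 *                                      1); // 1 threadgroup per CU
 *    radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, limits);
 */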

void ac_get_hs_info(const struct radeon_info *info,
                    struct ac_hs_info *hs)
{
   bool double_offchip_buffers = info->gfx_level >= GFX7 &&
                                 info->family != CHIP_CARRIZO &&
                                 info->family != CHIP_STONEY;
   unsigned max_offchip_buffers_per_se;
   unsigned max_offchip_buffers;
   unsigned offchip_granularity;
   unsigned hs_offchip_param;

   hs->tess_offchip_block_dw_size =
      info->family == CHIP_HAWAII ? 4096 : 8192;

   /*
    * Per RadeonSI:
    * This must be one less than the maximum number due to a hw limitation.
    * Various hardware bugs need this.
    *
    * Per AMDVLK:
    * Vega10 should limit max_offchip_buffers to 508 (4 * 127).
    * Gfx7 should limit max_offchip_buffers to 508.
    * Gfx6 should limit max_offchip_buffers to 126 (2 * 63).
    *
    * Follow AMDVLK here.
    */
   if (info->gfx_level >= GFX11) {
      max_offchip_buffers_per_se = 256; /* TODO: we could decrease this to reduce memory/cache usage */
   } else if (info->gfx_level >= GFX10) {
      max_offchip_buffers_per_se = 128;
   } else if (info->family == CHIP_VEGA12 || info->family == CHIP_VEGA20) {
      /* Only certain chips can use the maximum value. */
      max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
   } else {
      max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
   }

   max_offchip_buffers = max_offchip_buffers_per_se * info->max_se;

   /* Hawaii has a bug with offchip buffers > 256 that can be worked
    * around by setting 4K granularity.
    */
   if (hs->tess_offchip_block_dw_size == 4096) {
      assert(info->family == CHIP_HAWAII);
      offchip_granularity = V_03093C_X_4K_DWORDS;
   } else {
      assert(hs->tess_offchip_block_dw_size == 8192);
      offchip_granularity = V_03093C_X_8K_DWORDS;
   }

   switch (info->gfx_level) {
   case GFX6:
      max_offchip_buffers = MIN2(max_offchip_buffers, 126);
      break;
   case GFX7:
   case GFX8:
   case GFX9:
      max_offchip_buffers = MIN2(max_offchip_buffers, 508);
      break;
   case GFX10:
      break;
   default:
      break;
   }

   hs->max_offchip_buffers = max_offchip_buffers;

   if (info->gfx_level >= GFX11) {
      /* OFFCHIP_BUFFERING is per SE. */
      hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers_per_se - 1) |
                         S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity);
   } else if (info->gfx_level >= GFX10_3) {
      hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX103(max_offchip_buffers - 1) |
                         S_03093C_OFFCHIP_GRANULARITY_GFX103(offchip_granularity);
   } else if (info->gfx_level >= GFX7) {
      if (info->gfx_level >= GFX8)
         --max_offchip_buffers;
      hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(max_offchip_buffers) |
                         S_03093C_OFFCHIP_GRANULARITY_GFX7(offchip_granularity);
   } else {
      hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers);
   }

   hs->hs_offchip_param = hs_offchip_param;

   hs->tess_factor_ring_size = 48 * 1024 * info->max_se;
   hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024);
   hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4;
}
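
/* Worked example (hypothetical 4-SE GFX10.3 part): max_offchip_buffers_per_se
 * = 128, so max_offchip_buffers = 512; tess_factor_ring_size = 48K * 4 =
 * 192 KB; tess_offchip_ring_offset = align(192 KB, 64 KB) = 192 KB; and
 * tess_offchip_ring_size = 512 * 8192 * 4 = 16 MB.
 */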

static uint16_t get_task_num_entries(enum radeon_family fam)
{
   /* Number of task shader ring entries. Needs to be a power of two.
    * Use a low number on smaller chips so we don't waste space,
    * but keep it high on bigger chips so it doesn't inhibit parallelism.
    *
    * This number is compiled into task/mesh shaders as a constant.
    * In order to ensure this works fine with the shader cache, we must
    * base this decision on the chip family, not the number of CUs in
    * the current GPU. (So, the cache remains consistent for all
    * chips in the same family.)
    */
   switch (fam) {
   case CHIP_VANGOGH:
   case CHIP_NAVI24:
   case CHIP_REMBRANDT:
      return 256;
   case CHIP_NAVI21:
   case CHIP_NAVI22:
   case CHIP_NAVI23:
   default:
      return 1024;
   }
}

void ac_get_task_info(const struct radeon_info *info,
                      struct ac_task_info *task_info)
{
   const uint16_t num_entries = get_task_num_entries(info->family);
   const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES;
   const uint32_t payload_ring_bytes = num_entries * AC_TASK_PAYLOAD_ENTRY_BYTES;

   /* Ensure that the addresses of each ring are 256 byte aligned. */
   task_info->num_entries = num_entries;
   task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256);
   task_info->payload_ring_offset = ALIGN(task_info->draw_ring_offset + draw_ring_bytes, 256);
   task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_bytes;
}
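
/* Sketch of the resulting BO layout (symbolic, since the AC_TASK_* sizes are
 * defined in the header): [ctrlbuf | pad to 256 | draw ring | pad to 256 |
 * payload ring]. Both ring offsets are 256-byte aligned by construction, so
 * one buffer of bo_size_bytes satisfies the alignment noted above.
 */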

uint32_t ac_memory_ops_per_clock(uint32_t vram_type)
{
   /* Based on MemoryOpsPerClockTable from PAL. */
   switch (vram_type) {
   case AMDGPU_VRAM_TYPE_GDDR1:
   case AMDGPU_VRAM_TYPE_GDDR3: /* last in low-end Evergreen */
   case AMDGPU_VRAM_TYPE_GDDR4: /* last in R7xx, not used much */
   case AMDGPU_VRAM_TYPE_UNKNOWN:
   default:
      return 0;
   case AMDGPU_VRAM_TYPE_DDR2:
   case AMDGPU_VRAM_TYPE_DDR3:
   case AMDGPU_VRAM_TYPE_DDR4:
   case AMDGPU_VRAM_TYPE_LPDDR4:
   case AMDGPU_VRAM_TYPE_HBM: /* same for HBM2 and HBM3 */
      return 2;
   case AMDGPU_VRAM_TYPE_DDR5:
   case AMDGPU_VRAM_TYPE_LPDDR5:
   case AMDGPU_VRAM_TYPE_GDDR5: /* last in Polaris and low-end Navi14 */
      return 4;
   case AMDGPU_VRAM_TYPE_GDDR6:
      return 16;
   }
}
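
/* Example of how this feeds the effective memory frequency (hypothetical
 * numbers): GDDR6 at a 1250 MHz memory clock gives 1250 * 16 = 20000 MHz
 * effective, the kind of value memory_freq_mhz_effective holds in the
 * bandwidth formula earlier in this file.
 */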

uint32_t ac_gfx103_get_cu_mask_ps(const struct radeon_info *info)
{
   /* It's wasteful to enable all CUs for PS if shader arrays have a different
    * number of CUs. The reason is that the hardware sends the same number of PS
    * waves to each shader array, so the slowest shader array limits the performance.
    * Disable the extra CUs for PS in other shader arrays to save power and thus
    * increase clocks for busy CUs. In the future, we might disable or enable this
    * tweak only for certain apps.
    */
   return u_bit_consecutive(0, info->min_good_cu_per_sa);
}

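/* Example: with min_good_cu_per_sa = 9, u_bit_consecutive(0, 9) = 0x1ff
 * enables only the 9 lowest CUs of each shader array for PS waves, so every
 * array carries the same PS load.
 */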