/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef AC_SHADER_UTIL_H
#define AC_SHADER_UTIL_H

#include "ac_binary.h"
#include "amd_family.h"
#include "compiler/shader_enums.h"
#include "util/format/u_format.h"

#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

#define AC_SENDMSG_HS_TESSFACTOR 2

#define AC_SENDMSG_GS 2
#define AC_SENDMSG_GS_DONE 3
#define AC_SENDMSG_GS_ALLOC_REQ 9

#define AC_SENDMSG_GS_OP_NOP (0 << 4)
#define AC_SENDMSG_GS_OP_CUT (1 << 4)
#define AC_SENDMSG_GS_OP_EMIT (2 << 4)
#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4)

/* Reserve this size at the beginning of LDS for the tf0/1 shader message group vote. */
#define AC_HS_MSG_VOTE_LDS_BYTES 16

/* An extension of gl_access_qualifier describing other aspects of memory operations
 * for code generation.
 */
enum {
   /* Only one of LOAD/STORE/ATOMIC can be set. */
   ACCESS_TYPE_LOAD = BITFIELD_BIT(27),
   ACCESS_TYPE_STORE = BITFIELD_BIT(28),
   ACCESS_TYPE_ATOMIC = BITFIELD_BIT(29),

   /* This access is expected to use an SMEM instruction if source operands are non-divergent.
    * Only loads can set this.
    */
   ACCESS_TYPE_SMEM = BITFIELD_BIT(30),

   /* Whether a store offset or size alignment is less than 4. */
   ACCESS_MAY_STORE_SUBDWORD = BITFIELD_BIT(31),
};

/* GFX6-11. The meaning of these enums is different between chips. They match LLVM definitions,
 * but they can also be used by ACO. Use ac_get_hw_cache_flags to get these.
 */
enum ac_cache_flags
{
   ac_glc = BITFIELD_BIT(0),
   ac_slc = BITFIELD_BIT(1),
   ac_dlc = BITFIELD_BIT(2),
   ac_swizzled = BITFIELD_BIT(3),
};
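/* A minimal sketch (hypothetical helper, not part of this header) of composing the access mask
 * that ac_get_hw_cache_flags() below consumes: the ACCESS_TYPE_* bits above extend the
 * gl_access_qualifier flags from shader_enums.h (e.g. ACCESS_COHERENT) in the same mask.
 */
static inline enum gl_access_qualifier
ac_example_load_access(bool coherent, bool use_smem)
{
   unsigned access = ACCESS_TYPE_LOAD; /* only one of LOAD/STORE/ATOMIC may be set */
   if (use_smem)
      access |= ACCESS_TYPE_SMEM; /* only valid for loads */
   if (coherent)
      access |= ACCESS_COHERENT; /* from gl_access_qualifier in shader_enums.h */
   return (enum gl_access_qualifier)access;
}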
/* Cache-agnostic scope flags. */
enum gfx12_scope
{
   /* Memory access is coherent within a workgroup in CU mode.
    * There is no coherency between VMEM and SMEM.
    */
   gfx12_scope_cu,

   /* Memory access is coherent within an SE.
    * If there is no SE cache, this resolves to the device scope in the gfx domain.
    */
   gfx12_scope_se,

   /* Memory access is globally coherent within the device for all gfx blocks except CP and GE
    * depending on the chip (see below). This is referred to as the device scope. It's not coherent
    * with non-gfx blocks like DCN and VCN.
    *
    * If there is a single global GL2 cache:
    * - The device scope in the gfx domain resolves to GL2 scope in hw.
    * - Memory access is cached in GL2.
    * - radeon_info::cp_sdma_ge_use_system_memory_scope says whether CP, SDMA, and GE are
    *   not coherent. If true, some features need special handling. The list of the features
    *   and the suggested programming is:
    *   * tess factor ring for GE: use ACCESS_CP_GE_COHERENT_AMD (it selects the correct scope
    *     automatically)
    *   * query results read by shaders and SET_PREDICATION: use AMDGPU_VM_MTYPE_UC,
    *     but use VRAM for queries not read by the CPU for better performance
    *   * vertex indices for GE: flush GL2 after buffer stores, but don't invalidate
    *   * draw indirect for CP: flush GL2 after buffer stores, but don't invalidate
    *   * shader uploads via SDMA: invalidate GL2 at the beginning of IBs
    *   * PRIME buffer read by SDMA: the kernel flushes GL2 at the end of IBs
    *   * CP DMA clears/copies: use compute shaders or range-flush/invalidate GL2 around it
    *   * CP DMA prefetch: no change
    *   * COPY_DATA - FILLED_SIZE state for streamout: range-flush/invalidate GL2
    *   * WRITE_DATA - bindless descriptors: range-invalidate GL2
    *
    * If there is a separate GL2 cache per SE:
    * - The device scope resolves to memory scope in hw.
    * - Memory access is cached in MALL if MALL (infinity cache) is present.
    * - radeon_info::cp_sdma_ge_use_system_memory_scope is always false in this case.
    */
   gfx12_scope_device,

   /* Memory scope. It's cached if MALL is present. This is called "system scope" in the ISA
    * documentation.
    */
   gfx12_scope_memory,
};

enum gfx12_load_temporal_hint
{
   /* VMEM and SMEM */
   gfx12_load_regular_temporal,
   gfx12_load_non_temporal,
   gfx12_load_high_temporal,
   /* VMEM$ treats SCOPE=3 and TH=3 as MALL bypass on GFX12. Don't use this combination in shaders. */
   gfx12_load_last_use_discard,
   /* VMEM only, far means the last level cache, near means other caches. */
   gfx12_load_near_non_temporal_far_regular_temporal,
   gfx12_load_near_regular_temporal_far_non_temporal,
   gfx12_load_near_non_temporal_far_high_temporal,
   gfx12_load_reserved,
};

enum gfx12_store_temporal_hint
{
   gfx12_store_regular_temporal,
   gfx12_store_non_temporal,
   gfx12_store_high_temporal,
   gfx12_store_high_temporal_stay_dirty,
   gfx12_store_near_non_temporal_far_regular_temporal,
   gfx12_store_near_regular_temporal_far_non_temporal,
   gfx12_store_near_non_temporal_far_high_temporal,
   gfx12_store_near_non_temporal_far_writeback,
};

enum gfx12_atomic_temporal_hint
{
   gfx12_atomic_return = BITFIELD_BIT(0),
   gfx12_atomic_non_temporal = BITFIELD_BIT(1),
   gfx12_atomic_accum_deferred_scope = BITFIELD_BIT(2), /* requires no return */
};

enum gfx12_speculative_data_read
{
   gfx12_spec_read_auto,
   gfx12_spec_read_force_on,
   gfx12_spec_read_force_off,
};

union ac_hw_cache_flags
{
   struct {
      /* This matches LLVM, but it can also be used by ACO for translation of ac_memop_flags. */
      uint8_t temporal_hint:3; /* gfx12_{load,store,atomic}_temporal_hint */
      uint8_t scope:2;         /* gfx12_scope */
      uint8_t _reserved:1;
      uint8_t swizzled:1;      /* for swizzled buffer access (attribute ring) */
      uint8_t _pad:1;
   } gfx12;

   uint8_t value; /* ac_cache_flags (GFX6-11) or the gfx12 structure */
};
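/* A minimal sketch (hypothetical helper, not part of this header) of how the gfx12 bitfields
 * above pack together for a plain device-scope load. ac_get_hw_cache_flags() below is the real
 * entry point; this only illustrates the union layout.
 */
static inline union ac_hw_cache_flags
ac_example_gfx12_load_flags(void)
{
   union ac_hw_cache_flags cf;
   cf.value = 0;
   cf.gfx12.temporal_hint = gfx12_load_regular_temporal;
   cf.gfx12.scope = gfx12_scope_device;
   /* Note: gfx12_load_last_use_discard (TH=3) combined with gfx12_scope_memory (SCOPE=3) is
    * the MALL bypass encoding and must not be used in shaders (see the comment above).
    */
   return cf;
}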
enum ac_image_dim
{
   ac_image_1d,
   ac_image_2d,
   ac_image_3d,
   ac_image_cube, // includes cube arrays
   ac_image_1darray,
   ac_image_2darray,
   ac_image_2dmsaa,
   ac_image_2darraymsaa,
};

struct ac_data_format_info {
   uint8_t element_size;
   uint8_t num_channels;
   uint8_t chan_byte_size;
   uint8_t chan_format;
};

enum ac_vs_input_alpha_adjust {
   AC_ALPHA_ADJUST_NONE = 0,
   AC_ALPHA_ADJUST_SNORM = 1,
   AC_ALPHA_ADJUST_SSCALED = 2,
   AC_ALPHA_ADJUST_SINT = 3,
};

struct ac_vtx_format_info {
   uint16_t dst_sel;
   uint8_t element_size;
   uint8_t num_channels;
   uint8_t chan_byte_size; /* 0 for packed formats */

   /* These last three are dependent on the family. */

   uint8_t has_hw_format;
   /* Index is number of channels minus one. Use any index for packed formats.
    * GFX6-8 is dfmt[0:3], nfmt[4:7].
    */
   uint8_t hw_format[4];
   enum ac_vs_input_alpha_adjust alpha_adjust : 8;
};
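/* Illustrative decode of hw_format (hypothetical helpers, not part of this header): per the
 * comment above, on GFX6-8 each byte packs the dfmt in bits 0:3 and the nfmt in bits 4:7;
 * on other families the byte is a single hw format value and this split does not apply.
 */
static inline uint8_t
ac_example_gfx6_8_dfmt(const struct ac_vtx_format_info *info, unsigned num_channels)
{
   return info->hw_format[num_channels - 1] & 0xf;
}

static inline uint8_t
ac_example_gfx6_8_nfmt(const struct ac_vtx_format_info *info, unsigned num_channels)
{
   return info->hw_format[num_channels - 1] >> 4;
}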
struct ac_spi_color_formats {
   unsigned normal : 8;
   unsigned alpha : 8;
   unsigned blend : 8;
   unsigned blend_alpha : 8;
};

/* For ac_build_fetch_format.
 *
 * Note: FLOAT must be 0 (used for convenience of encoding in radeonsi).
 */
enum ac_fetch_format
{
   AC_FETCH_FORMAT_FLOAT = 0,
   AC_FETCH_FORMAT_FIXED,
   AC_FETCH_FORMAT_UNORM,
   AC_FETCH_FORMAT_SNORM,
   AC_FETCH_FORMAT_USCALED,
   AC_FETCH_FORMAT_SSCALED,
   AC_FETCH_FORMAT_UINT,
   AC_FETCH_FORMAT_SINT,
   AC_FETCH_FORMAT_NONE,
};

enum ac_descriptor_type
{
   AC_DESC_IMAGE,
   AC_DESC_FMASK,
   AC_DESC_SAMPLER,
   AC_DESC_BUFFER,
   AC_DESC_PLANE_0,
   AC_DESC_PLANE_1,
   AC_DESC_PLANE_2,
};

unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
                                    bool writes_mrt0_alpha);

unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format);

uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum amd_gfx_level gfx_level);

unsigned ac_get_tbuffer_format(enum amd_gfx_level gfx_level, unsigned dfmt, unsigned nfmt);

const struct ac_vtx_format_info *ac_get_vtx_format_info_table(enum amd_gfx_level level,
                                                              enum radeon_family family);

const struct ac_vtx_format_info *ac_get_vtx_format_info(enum amd_gfx_level level,
                                                        enum radeon_family family,
                                                        enum pipe_format fmt);

unsigned ac_get_safe_fetch_size(const enum amd_gfx_level gfx_level,
                                const struct ac_vtx_format_info* vtx_info,
                                const unsigned offset, const unsigned max_channels,
                                const unsigned alignment, const unsigned num_channels);

enum ac_image_dim ac_get_sampler_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim dim,
                                     bool is_array);

enum ac_image_dim ac_get_image_dim(enum amd_gfx_level gfx_level, enum glsl_sampler_dim sdim,
                                   bool is_array);

unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
                                  uint8_t *num_fragcoord_components);

uint16_t ac_get_ps_iter_mask(unsigned ps_iter_samples);

void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype,
                                 bool is_depth, bool use_rbplus,
                                 struct ac_spi_color_formats *formats);

void ac_compute_late_alloc(const struct radeon_info *info, bool ngg, bool ngg_culling,
                           bool uses_scratch, unsigned *late_alloc_wave64, unsigned *cu_mask);

unsigned ac_compute_cs_workgroup_size(const uint16_t sizes[3], bool variable, unsigned max);

unsigned ac_compute_lshs_workgroup_size(enum amd_gfx_level gfx_level, gl_shader_stage stage,
                                        unsigned tess_num_patches,
                                        unsigned tess_patch_in_vtx,
                                        unsigned tess_patch_out_vtx);

unsigned ac_compute_esgs_workgroup_size(enum amd_gfx_level gfx_level, unsigned wave_size,
                                        unsigned es_verts, unsigned gs_inst_prims);

unsigned ac_compute_ngg_workgroup_size(unsigned es_verts, unsigned gs_inst_prims,
                                       unsigned max_vtx_out, unsigned prim_amp_factor);

uint32_t ac_compute_num_tess_patches(const struct radeon_info *info, uint32_t num_tcs_input_cp,
                                     uint32_t num_tcs_output_cp, uint32_t vram_per_patch,
                                     uint32_t lds_per_patch, uint32_t wave_size,
                                     bool tess_uses_primid);

uint32_t ac_apply_cu_en(uint32_t value, uint32_t clear_mask, unsigned value_shift,
                        const struct radeon_info *info);

void ac_get_scratch_tmpring_size(const struct radeon_info *info,
                                 unsigned bytes_per_wave, unsigned *max_seen_bytes_per_wave,
                                 uint32_t *tmpring_size);

unsigned
ac_ngg_nogs_get_pervertex_lds_size(gl_shader_stage stage,
                                   unsigned shader_num_outputs,
                                   bool streamout_enabled,
                                   bool export_prim_id,
                                   bool has_user_edgeflags,
                                   bool can_cull,
                                   bool uses_instance_id,
                                   bool uses_primitive_id);

unsigned
ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
                            unsigned workgroup_size,
                            unsigned wave_size,
                            bool streamout_enabled,
                            bool can_cull,
                            bool compact_primitives);

union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
                                              enum gl_access_qualifier access);

unsigned ac_get_all_edge_flag_bits(enum amd_gfx_level gfx_level);

unsigned ac_shader_io_get_unique_index_patch(unsigned semantic);

#ifdef __cplusplus
}
#endif

#endif