1 /* 2 * Copyright 2018-2021 Alyssa Rosenzweig 3 * SPDX-License-Identifier: MIT 4 */ 5 6 #pragma once 7 8 #include "compiler/nir/nir.h" 9 #include "util/u_dynarray.h" 10 #include "shader_enums.h" 11 12 struct agx_varyings_vs { 13 /* The number of user varyings of each type. The varyings must be allocated 14 * in this order ({smooth, flat, linear} × {32, 16}), which may require 15 * remapping. 16 */ 17 unsigned num_32_smooth; 18 unsigned num_32_flat; 19 unsigned num_32_linear; 20 unsigned num_16_smooth; 21 unsigned num_16_flat; 22 unsigned num_16_linear; 23 24 /* The first index used for FP16 varyings. Indices less than this are treated 25 * as FP32. This may require remapping slots to guarantee. 26 */ 27 unsigned base_index_fp16; 28 29 /* The total number of vertex shader indices output. Must be at least 30 * base_index_fp16. 31 */ 32 unsigned nr_index; 33 34 /* If the slot is written, this is the base index that the first component 35 * of the slot is written to. The next components are found in the next 36 * indices. If less than base_index_fp16, this is a 32-bit slot (with 4 37 * indices for the 4 components), else this is a 16-bit slot (with 2 38 * indices for the 4 components). This must be less than nr_index. 39 * 40 * If the slot is not written, this must be ~0. 41 */ 42 unsigned slots[VARYING_SLOT_MAX]; 43 44 /* Slot for the combined layer/viewport 32-bit sysval output, or ~0 if none 45 * is written. What's at slots[VARYING_SLOT_LAYER] is the varying output. 46 */ 47 unsigned layer_viewport_slot; 48 49 /* Base slot for the clip distance sysval outputs, or ~0 if none is written. 50 * What's at slots[VARYING_SLOT_CLIP_DIST0] is the varying output. 51 */ 52 unsigned clip_dist_slot; 53 unsigned nr_clip_dists; 54 }; 55 56 /* Conservative bound, * 4 due to offsets (TODO: maybe worth eliminating 57 * coefficient register aliasing?) 58 */ 59 #define AGX_MAX_CF_BINDINGS (VARYING_SLOT_MAX * 4) 60 61 struct agx_varyings_fs { 62 /* Number of coefficient registers used */ 63 unsigned nr_cf; 64 65 /* Number of coefficient register bindings */ 66 unsigned nr_bindings; 67 68 /* Whether gl_FragCoord.z is read */ 69 bool reads_z; 70 71 /* Coefficient register bindings */ 72 struct { 73 /* Base coefficient register */ 74 unsigned cf_base; 75 76 /* Slot being bound */ 77 gl_varying_slot slot; 78 79 /* First component bound. 80 * 81 * Must be 2 (Z) or 3 (W) if slot == VARYING_SLOT_POS. 82 */ 83 unsigned offset : 2; 84 85 /* Number of components bound */ 86 unsigned count : 3; 87 88 /* Is smooth shading enabled? If false, flat shading is used */ 89 bool smooth : 1; 90 91 /* Perspective correct interpolation */ 92 bool perspective : 1; 93 } bindings[AGX_MAX_CF_BINDINGS]; 94 }; 95 96 union agx_varyings { 97 struct agx_varyings_vs vs; 98 struct agx_varyings_fs fs; 99 }; 100 101 struct agx_uncompiled_shader_info { 102 uint64_t inputs_flat_shaded; 103 uint64_t inputs_linear_shaded; 104 uint8_t cull_distance_size; 105 bool has_edgeflags; 106 107 /* Number of bindful textures, images used */ 108 unsigned nr_bindful_textures, nr_bindful_images; 109 }; 110 111 struct agx_shader_info { 112 union agx_varyings varyings; 113 114 /* Number of uniforms */ 115 unsigned push_count; 116 117 /* Local memory allocation in bytes */ 118 unsigned local_size; 119 120 /* Scratch memory allocation in bytes for main/preamble respectively */ 121 unsigned scratch_size, preamble_scratch_size; 122 123 /* Does the shader have a preamble? If so, it is at offset preamble_offset. 124 * The main shader is at offset main_offset. The preamble is executed first. 125 */ 126 bool has_preamble; 127 unsigned preamble_offset, main_offset; 128 129 /* Does the shader read the tilebuffer? */ 130 bool reads_tib; 131 132 /* Does the shader write point size? */ 133 bool writes_psiz; 134 135 /* Does the shader potentially draw to a nonzero viewport? */ 136 bool nonzero_viewport; 137 138 /* Does the shader write layer and/or viewport index? Written together */ 139 bool writes_layer_viewport; 140 141 /* Does the shader control the sample mask? */ 142 bool writes_sample_mask; 143 144 /* Depth layout, never equal to NONE */ 145 enum gl_frag_depth_layout depth_layout; 146 147 /* Based only the compiled shader, should tag writes be disabled? This is set 148 * based on what is outputted. Note if rasterizer discard is used, that needs 149 * to disable tag writes regardless of this flag. 150 */ 151 bool tag_write_disable; 152 153 /* Shader is incompatible with triangle merging */ 154 bool disable_tri_merging; 155 156 /* Reads draw ID system value */ 157 bool uses_draw_id; 158 159 /* Reads base vertex/instance */ 160 bool uses_base_param; 161 162 /* Shader uses txf, requiring a workaround sampler in the given location */ 163 bool uses_txf; 164 unsigned txf_sampler; 165 166 /* Number of 16-bit registers used by the main shader and preamble 167 * respectively. 168 */ 169 unsigned nr_gprs, nr_preamble_gprs; 170 171 /* Output mask set during driver lowering */ 172 uint64_t outputs; 173 }; 174 175 #define AGX_MAX_RTS (8) 176 177 enum agx_format { 178 AGX_FORMAT_I8 = 0, 179 AGX_FORMAT_I16 = 1, 180 AGX_FORMAT_I32 = 2, 181 AGX_FORMAT_F16 = 3, 182 AGX_FORMAT_U8NORM = 4, 183 AGX_FORMAT_S8NORM = 5, 184 AGX_FORMAT_U16NORM = 6, 185 AGX_FORMAT_S16NORM = 7, 186 AGX_FORMAT_RGB10A2 = 8, 187 AGX_FORMAT_SRGBA8 = 10, 188 AGX_FORMAT_RG11B10F = 12, 189 AGX_FORMAT_RGB9E5 = 13, 190 191 /* Keep last */ 192 AGX_NUM_FORMATS, 193 }; 194 195 struct agx_vs_shader_key { 196 /* The GPU ABI requires all smooth shaded varyings to come first, then all 197 * flat shaded varyings, then all linear shaded varyings, as written by the 198 * VS. In order to correctly remap the varyings into the right order in the 199 * VS, we need to propagate the mask of flat/linear shaded varyings into the 200 * compiler. 201 */ 202 uint64_t outputs_flat_shaded; 203 uint64_t outputs_linear_shaded; 204 }; 205 206 struct agx_fs_shader_key { 207 /* Normally, access to the tilebuffer must be guarded by appropriate fencing 208 * instructions to ensure correct results in the presence of out-of-order 209 * hardware optimizations. However, specially dispatched clear shaders are 210 * not subject to these conditions and can omit the wait instructions. 211 * 212 * Must (only) be set for special clear shaders. 213 * 214 * Must not be used with sample mask writes (including discards) or 215 * tilebuffer loads (including blending). 216 */ 217 bool ignore_tib_dependencies; 218 }; 219 220 struct agx_shader_key { 221 /* Number of reserved preamble slots at the start */ 222 unsigned reserved_preamble; 223 224 /* Does the target GPU need explicit cluster coherency for atomics? 225 * Only used on G13X. 226 */ 227 bool needs_g13x_coherency; 228 229 /* Library routines to link against */ 230 const nir_shader *libagx; 231 232 /* Whether scratch memory is available in the given shader stage */ 233 bool has_scratch; 234 235 /* Whether we're compiling the helper program used for scratch allocation. 236 * This has special register allocation requirements. 237 */ 238 bool is_helper; 239 240 union { 241 struct agx_vs_shader_key vs; 242 struct agx_fs_shader_key fs; 243 }; 244 }; 245 246 void agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx, 247 bool allow_mediump, 248 struct agx_uncompiled_shader_info *out); 249 250 bool agx_nir_lower_discard_zs_emit(nir_shader *s); 251 bool agx_nir_lower_sample_mask(nir_shader *s); 252 253 bool agx_nir_lower_cull_distance_fs(struct nir_shader *s, 254 unsigned nr_distances); 255 256 void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key, 257 struct util_debug_callback *debug, 258 struct util_dynarray *binary, 259 struct agx_shader_info *out); 260 261 struct agx_occupancy { 262 unsigned max_registers; 263 unsigned max_threads; 264 }; 265 266 struct agx_occupancy agx_occupancy_for_register_count(unsigned halfregs); 267 unsigned agx_max_registers_for_occupancy(unsigned occupancy); 268 269 static const nir_shader_compiler_options agx_nir_options = { 270 .lower_fdiv = true, 271 .fuse_ffma16 = true, 272 .fuse_ffma32 = true, 273 .lower_flrp16 = true, 274 .lower_flrp32 = true, 275 .lower_fpow = true, 276 .lower_fmod = true, 277 .lower_bitfield_insert = true, 278 .lower_ifind_msb = true, 279 .lower_find_lsb = true, 280 .lower_uadd_carry = true, 281 .lower_usub_borrow = true, 282 .lower_fisnormal = true, 283 .lower_scmp = true, 284 .lower_isign = true, 285 .lower_fsign = true, 286 .lower_iabs = true, 287 .lower_fdph = true, 288 .lower_ffract = true, 289 .lower_ldexp = true, 290 .lower_pack_half_2x16 = true, 291 .lower_pack_64_2x32 = true, 292 .lower_unpack_half_2x16 = true, 293 .lower_extract_byte = true, 294 .lower_insert_byte = true, 295 .lower_insert_word = true, 296 .has_cs_global_id = true, 297 .lower_hadd = true, 298 .vectorize_io = true, 299 .use_interpolated_input_intrinsics = true, 300 .has_isub = true, 301 .support_16bit_alu = true, 302 .max_unroll_iterations = 32, 303 .lower_uniforms_to_ubo = true, 304 .lower_int64_options = 305 (nir_lower_int64_options) ~(nir_lower_iadd64 | nir_lower_imul_2x32_64), 306 .lower_doubles_options = (nir_lower_doubles_options)(~0), 307 .lower_fquantize2f16 = true, 308 }; 309