/*
 * Copyright 2018-2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
#include "util/u_tristate.h"
#include "shader_enums.h"

struct agx_cf_binding {
   /* Base coefficient register */
   uint8_t cf_base;

   /* Slot being bound */
   gl_varying_slot slot : 8;

   /* First component bound.
    *
    * Must be 2 (Z) or 3 (W) if slot == VARYING_SLOT_POS.
    */
   unsigned offset : 2;

   /* Number of components bound */
   unsigned count : 3;

   /* Is smooth shading enabled? If false, flat shading is used */
   bool smooth : 1;

   /* Perspective correct interpolation */
   bool perspective : 1;

   uint8_t pad;
};

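/* Illustrative only: a hypothetical binding for a 4-component, smooth,
 * perspective-correct varying starting at coefficient register 4. The values
 * are made up for the example; real bindings are produced by the compiler.
 *
 *    struct agx_cf_binding example = {
 *       .cf_base = 4,
 *       .slot = VARYING_SLOT_VAR0,
 *       .offset = 0,
 *       .count = 4,
 *       .smooth = true,
 *       .perspective = true,
 *    };
 */
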
/* Conservative bound, * 4 due to offsets (TODO: maybe worth eliminating
 * coefficient register aliasing?)
 */
#define AGX_MAX_CF_BINDINGS (VARYING_SLOT_MAX * 4)

struct agx_varyings_fs {
   /* Number of coefficient registers used */
   unsigned nr_cf;

   /* Number of coefficient register bindings */
   unsigned nr_bindings;

   /* Whether gl_FragCoord.z is read */
   bool reads_z;

   /* Coefficient register bindings */
   struct agx_cf_binding bindings[AGX_MAX_CF_BINDINGS];
};

union agx_varyings {
   struct agx_varyings_fs fs;
};

struct agx_interp_info {
   /* Bit masks indexed by I/O location of flat and linear varyings */
   uint64_t flat;
   uint64_t linear;
};
static_assert(sizeof(struct agx_interp_info) == 16, "packed");

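/* Illustrative only: the masks are consumed by testing the bit for an I/O
 * location, e.g. with Mesa's BITFIELD64_BIT helper. A location set in neither
 * mask is presumably interpolated with the default smooth/perspective mode.
 *
 *    struct agx_interp_info interp = agx_gather_interp_info(nir);
 *    bool flat   = interp.flat   & BITFIELD64_BIT(loc);
 *    bool linear = interp.linear & BITFIELD64_BIT(loc);
 */
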
struct agx_rodata {
   /* Offset in the binary */
   uint32_t offset;

   /* Base uniform to map constants */
   uint16_t base_uniform;

   /* Number of 16-bit constants to map contiguously there */
   uint16_t size_16;
};

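/* Illustrative only: a minimal sketch of how a driver might honour this
 * mapping, where `info` and `binary` stand for the compiled shader's
 * agx_shader_info and code, and upload_uniforms(base, data, count) is a
 * hypothetical driver hook that writes `count` 16-bit words to consecutive
 * uniform registers starting at `base`.
 *
 *    const uint16_t *consts =
 *       (const uint16_t *)((const uint8_t *)binary + info->rodata.offset);
 *
 *    upload_uniforms(info->rodata.base_uniform, consts, info->rodata.size_16);
 */
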
struct agx_shader_info {
   enum pipe_shader_type stage;
   uint32_t binary_size;

   union agx_varyings varyings;

   /* Number of uniforms */
   uint16_t push_count;

   /* Local memory allocation in bytes */
   uint16_t local_size;

   /* Local imageblock allocation in bytes per thread */
   uint16_t imageblock_stride;

   /* Scratch memory allocation in bytes for main/preamble respectively */
   unsigned scratch_size, preamble_scratch_size;

   /* Size in bytes of the main shader */
   unsigned main_size;

   /* Does the shader have a preamble? If so, it is at offset preamble_offset.
    * The main shader is at offset main_offset. The preamble is executed first.
    */
   bool has_preamble;
   unsigned preamble_offset, main_offset;

   /* Does the shader read the tilebuffer? */
   bool reads_tib;

   /* Does the shader require early fragment tests? */
   bool early_fragment_tests;

   /* Does the shader potentially draw to a nonzero viewport? */
   bool nonzero_viewport;

   /* Does the shader write layer and/or viewport index? Written together */
   bool writes_layer_viewport;

   /* Does the shader control the sample mask? */
   bool writes_sample_mask;

   /* Depth layout, never equal to NONE */
   enum gl_frag_depth_layout depth_layout;

   /* Based only on the compiled shader, should tag writes be disabled? This is
    * set based on what is output. Note that if rasterizer discard is used, tag
    * writes must be disabled regardless of this flag.
    */
   bool tag_write_disable;

   /* Shader is incompatible with triangle merging */
   bool disable_tri_merging;

   /* Reads draw ID system value */
   bool uses_draw_id;

   /* Reads base vertex/instance */
   bool uses_base_param;

   /* Uses txf and hence needs a txf sampler mapped */
   bool uses_txf;

   /* Number of 16-bit registers used by the main shader and preamble
    * respectively.
    */
   uint16_t nr_gprs, nr_preamble_gprs;

   /* Output mask set during driver lowering */
   uint64_t outputs;

   /* Workgroup size */
   uint16_t workgroup_size[3];

   /* There may be constants in the binary. The driver must map these to uniform
    * registers as specified here.
    */
   struct agx_rodata rodata;
};

struct agx_precompiled_kernel_info {
   uint32_t preamble_offset, main_offset;
   uint32_t main_size, binary_size;
   struct agx_rodata rodata;
   uint16_t nr_gprs, nr_preamble_gprs;
   uint16_t push_count;
   uint16_t workgroup_size[3];
   uint16_t local_size;
   uint16_t imageblock_stride;
   bool uses_txf;
};

static inline struct agx_precompiled_kernel_info
agx_compact_kernel_info(struct agx_shader_info *info)
{
   assert(info->has_preamble == (info->nr_preamble_gprs > 0));

   return (struct agx_precompiled_kernel_info){
      .preamble_offset = info->preamble_offset,
      .main_offset = info->main_offset,
      .main_size = info->main_size,
      .binary_size = info->binary_size,
      .rodata = info->rodata,
      .nr_gprs = info->nr_gprs,
      .nr_preamble_gprs = info->nr_preamble_gprs,
      .push_count = info->push_count,
      .workgroup_size = {info->workgroup_size[0], info->workgroup_size[1],
                         info->workgroup_size[2]},
      .local_size = info->local_size,
      .imageblock_stride = info->imageblock_stride,
      .uses_txf = info->uses_txf,
   };
}

struct agx_shader_part {
   struct agx_shader_info info;
   void *binary;
};

#define AGX_MAX_RTS (8)

enum agx_format {
   AGX_FORMAT_I8 = 0,
   AGX_FORMAT_I16 = 1,
   AGX_FORMAT_I32 = 2,
   AGX_FORMAT_F16 = 3,
   AGX_FORMAT_U8NORM = 4,
   AGX_FORMAT_S8NORM = 5,
   AGX_FORMAT_U16NORM = 6,
   AGX_FORMAT_S16NORM = 7,
   AGX_FORMAT_RGB10A2 = 8,
   AGX_FORMAT_SRGBA8 = 10,
   AGX_FORMAT_RG11B10F = 12,
   AGX_FORMAT_RGB9E5 = 13,

   /* Keep last */
   AGX_NUM_FORMATS,
};

struct agx_fs_shader_key {
   /* Normally, access to the tilebuffer must be guarded by appropriate fencing
    * instructions to ensure correct results in the presence of out-of-order
    * hardware optimizations. However, specially dispatched clear shaders are
    * not subject to these conditions and can omit the wait instructions.
    *
    * Must (only) be set for special clear shaders.
    *
    * Must not be used with sample mask writes (including discards) or
    * tilebuffer loads (including blending).
    */
   bool ignore_tib_dependencies;

   /* When dynamic sample shading is used, the fragment shader is wrapped in a
    * loop external to the API shader. This bit indicates that we are compiling
    * inside the sample loop, meaning the execution nesting counter is already
    * zero and must be preserved.
    */
   bool inside_sample_loop;

   /* Base coefficient register. 0 for API shaders but nonzero for FS prolog */
   uint8_t cf_base;
};

struct agx_device_key {
   /* Does the target GPU need explicit cluster coherency for atomics?
    * Only used on G13X.
    */
   enum u_tristate needs_g13x_coherency;

   /* Is soft fault enabled? This is technically system-wide policy set by the
    * kernel, but that's functionally a hardware feature.
    */
   bool soft_fault;
};
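
/* Illustrative only: a sketch of how a driver might fill the device key,
 * assuming it already knows the GPU generation and the kernel's soft-fault
 * policy; `is_g13x` and `kernel_soft_fault` are made-up names, and the
 * U_TRISTATE_* values come from util/u_tristate.h.
 *
 *    struct agx_device_key dev_key = {
 *       .needs_g13x_coherency = is_g13x ? U_TRISTATE_YES : U_TRISTATE_NO,
 *       .soft_fault = kernel_soft_fault,
 *    };
 */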

struct agx_shader_key {
   /* Device info */
   struct agx_device_key dev;

   /* Number of reserved preamble slots at the start */
   unsigned reserved_preamble;

   /* Library routines to link against */
   const nir_shader *libagx;

   /* Whether scratch memory is available in the given shader stage */
   bool has_scratch;

   /* Whether we're compiling the helper program used for scratch allocation.
    * This has special register allocation requirements.
    */
   bool is_helper;

   /* Whether the driver supports uploading constants for this shader. If
    * false, constants will not be promoted to uniforms.
    */
   bool promote_constants;

   /* Set if this is a non-monolithic shader that must be linked with additional
    * shader parts before the program can be used. This suppresses emission of
    * `stop` instructions, which the linker must insert instead.
    */
   bool no_stop;

   /* Set if this is a secondary shader part (prolog or epilog). This prevents
    * the compiler from allocating uniform registers. For example, this turns
    * off preambles.
    */
   bool secondary;

   union {
      struct agx_fs_shader_key fs;
   };
};

struct agx_interp_info agx_gather_interp_info(nir_shader *nir);
uint64_t agx_gather_texcoords(nir_shader *nir);

void agx_link_libagx(nir_shader *nir, const nir_shader *libagx);
void agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx);
bool agx_nir_lower_discard_zs_emit(nir_shader *s);
bool agx_nir_lower_sample_mask(nir_shader *s);
bool agx_nir_lower_interpolation(nir_shader *s);

bool agx_nir_lower_cull_distance_vs(struct nir_shader *s);
bool agx_nir_lower_cull_distance_fs(struct nir_shader *s,
                                    unsigned nr_distances);
bool agx_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                          unsigned bit_size, unsigned num_components,
                          int64_t hole_size, nir_intrinsic_instr *low,
                          nir_intrinsic_instr *high, void *data);

void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
                            struct util_debug_callback *debug,
                            struct agx_shader_part *out);

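/* Illustrative only: a rough sketch of the compile flow a driver might use,
 * with made-up variable names. The NIR is preprocessed, compiled against a
 * key, and the resulting agx_shader_part holds both the metadata and the
 * binary.
 *
 *    agx_preprocess_nir(nir, libagx);
 *
 *    struct agx_shader_key key = {
 *       .dev = dev_key,
 *       .libagx = libagx,
 *       .promote_constants = true,
 *    };
 *
 *    struct agx_shader_part part;
 *    agx_compile_shader_nir(nir, &key, &debug_callback, &part);
 *
 *    upload(part.binary, part.info.binary_size);
 */
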
struct agx_occupancy {
   unsigned max_registers;
   unsigned max_threads;
};

struct agx_occupancy agx_occupancy_for_register_count(unsigned halfregs);
unsigned agx_max_registers_for_occupancy(unsigned occupancy);

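/* Illustrative only, with made-up variable names: query the occupancy implied
 * by a compiled shader's register usage, or go the other way and ask for the
 * register budget that a target occupancy allows.
 *
 *    struct agx_occupancy occ =
 *       agx_occupancy_for_register_count(part.info.nr_gprs);
 *
 *    unsigned budget = agx_max_registers_for_occupancy(target_occupancy);
 */
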
static const nir_shader_compiler_options agx_nir_options = {
   .lower_fdiv = true,
   .fuse_ffma16 = true,
   .fuse_ffma32 = true,
   .lower_flrp16 = true,
   .lower_flrp32 = true,
   .lower_fpow = true,
   .lower_fmod = true,
   .lower_bitfield_insert = true,
   .lower_ifind_msb = true,
   .lower_find_lsb = true,
   .lower_uadd_carry = true,
   .lower_usub_borrow = true,
   .lower_fisnormal = true,
   .lower_scmp = true,
   .lower_isign = true,
   .lower_fsign = true,
   .lower_iabs = true,
   .lower_fminmax_signed_zero = true,
   .lower_fdph = true,
   .lower_ffract = true,
   .lower_ldexp = true,
   .lower_pack_half_2x16 = true,
   .lower_pack_unorm_2x16 = true,
   .lower_pack_snorm_2x16 = true,
   .lower_pack_unorm_4x8 = true,
   .lower_pack_snorm_4x8 = true,
   .lower_pack_64_2x32 = true,
   .lower_unpack_half_2x16 = true,
   .lower_unpack_unorm_2x16 = true,
   .lower_unpack_snorm_2x16 = true,
   .lower_unpack_unorm_4x8 = true,
   .lower_unpack_snorm_4x8 = true,
   .lower_extract_byte = true,
   .lower_insert_byte = true,
   .lower_insert_word = true,
   .has_cs_global_id = true,
   .lower_device_index_to_zero = true,
   .lower_hadd = true,
   .vectorize_io = true,
   .has_amul = true,
   .has_isub = true,
   .support_16bit_alu = true,
   .max_unroll_iterations = 32,
   .lower_uniforms_to_ubo = true,
   .late_lower_int64 = true,
   .lower_int64_options =
      (nir_lower_int64_options) ~(nir_lower_iadd64 | nir_lower_imul_2x32_64),
   .lower_doubles_options = (nir_lower_doubles_options)(~0),
   .support_indirect_inputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES),
   .support_indirect_outputs = (uint8_t)BITFIELD_MASK(PIPE_SHADER_TYPES),
   .lower_fquantize2f16 = true,
   .compact_arrays = true,
   .discard_is_demote = true,
   .scalarize_ddx = true,
   .io_options = nir_io_always_interpolate_convergent_fs_inputs,
};
380