• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2018-2021 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #pragma once
7 
8 #include "compiler/nir/nir.h"
9 #include "util/u_dynarray.h"
10 #include "shader_enums.h"
11 
12 struct agx_varyings_vs {
13    /* The number of user varyings of each type. The varyings must be allocated
14     * in this order ({smooth, flat, linear} × {32, 16}), which may require
15     * remapping.
16     */
17    unsigned num_32_smooth;
18    unsigned num_32_flat;
19    unsigned num_32_linear;
20    unsigned num_16_smooth;
21    unsigned num_16_flat;
22    unsigned num_16_linear;
23 
24    /* The first index used for FP16 varyings. Indices less than this are treated
25     * as FP32. This may require remapping slots to guarantee.
26     */
27    unsigned base_index_fp16;
28 
29    /* The total number of vertex shader indices output. Must be at least
30     * base_index_fp16.
31     */
32    unsigned nr_index;
33 
34    /* If the slot is written, this is the base index that the first component
35     * of the slot is written to.  The next components are found in the next
36     * indices. If less than base_index_fp16, this is a 32-bit slot (with 4
37     * indices for the 4 components), else this is a 16-bit slot (with 2
38     * indices for the 4 components). This must be less than nr_index.
39     *
40     * If the slot is not written, this must be ~0.
41     */
42    unsigned slots[VARYING_SLOT_MAX];
43 
44    /* Slot for the combined layer/viewport 32-bit sysval output, or ~0 if none
45     * is written. What's at slots[VARYING_SLOT_LAYER] is the varying output.
46     */
47    unsigned layer_viewport_slot;
48 
49    /* Base slot for the clip distance sysval outputs, or ~0 if none is written.
50     * What's at slots[VARYING_SLOT_CLIP_DIST0] is the varying output.
51     */
52    unsigned clip_dist_slot;
53    unsigned nr_clip_dists;
54 };
55 
/* Upper bound on the number of coefficient register bindings. This is
 * conservative: the factor of 4 is due to offsets. (TODO: maybe worth
 * eliminating coefficient register aliasing?)
 */
#define AGX_MAX_CF_BINDINGS (4 * (VARYING_SLOT_MAX))
60 
61 struct agx_varyings_fs {
62    /* Number of coefficient registers used */
63    unsigned nr_cf;
64 
65    /* Number of coefficient register bindings */
66    unsigned nr_bindings;
67 
68    /* Whether gl_FragCoord.z is read */
69    bool reads_z;
70 
71    /* Coefficient register bindings */
72    struct {
73       /* Base coefficient register */
74       unsigned cf_base;
75 
76       /* Slot being bound */
77       gl_varying_slot slot;
78 
79       /* First component bound.
80        *
81        * Must be 2 (Z) or 3 (W) if slot == VARYING_SLOT_POS.
82        */
83       unsigned offset : 2;
84 
85       /* Number of components bound */
86       unsigned count : 3;
87 
88       /* Is smooth shading enabled? If false, flat shading is used */
89       bool smooth : 1;
90 
91       /* Perspective correct interpolation */
92       bool perspective : 1;
93    } bindings[AGX_MAX_CF_BINDINGS];
94 };
95 
96 union agx_varyings {
97    struct agx_varyings_vs vs;
98    struct agx_varyings_fs fs;
99 };
100 
/* Information about a shader that is filled in by agx_preprocess_nir(),
 * which takes a pointer to this structure as its output argument.
 */
struct agx_uncompiled_shader_info {
   uint64_t inputs_flat_shaded;
   uint64_t inputs_linear_shaded;
   uint8_t cull_distance_size;
   bool has_edgeflags;

   /* Number of bindful textures used */
   unsigned nr_bindful_textures;

   /* Number of bindful images used */
   unsigned nr_bindful_images;
};
110 
111 struct agx_shader_info {
112    union agx_varyings varyings;
113 
114    /* Number of uniforms */
115    unsigned push_count;
116 
117    /* Local memory allocation in bytes */
118    unsigned local_size;
119 
120    /* Scratch memory allocation in bytes for main/preamble respectively */
121    unsigned scratch_size, preamble_scratch_size;
122 
123    /* Does the shader have a preamble? If so, it is at offset preamble_offset.
124     * The main shader is at offset main_offset. The preamble is executed first.
125     */
126    bool has_preamble;
127    unsigned preamble_offset, main_offset;
128 
129    /* Does the shader read the tilebuffer? */
130    bool reads_tib;
131 
132    /* Does the shader write point size? */
133    bool writes_psiz;
134 
135    /* Does the shader potentially draw to a nonzero viewport? */
136    bool nonzero_viewport;
137 
138    /* Does the shader write layer and/or viewport index? Written together */
139    bool writes_layer_viewport;
140 
141    /* Does the shader control the sample mask? */
142    bool writes_sample_mask;
143 
144    /* Depth layout, never equal to NONE */
145    enum gl_frag_depth_layout depth_layout;
146 
147    /* Based only the compiled shader, should tag writes be disabled? This is set
148     * based on what is outputted. Note if rasterizer discard is used, that needs
149     * to disable tag writes regardless of this flag.
150     */
151    bool tag_write_disable;
152 
153    /* Shader is incompatible with triangle merging */
154    bool disable_tri_merging;
155 
156    /* Reads draw ID system value */
157    bool uses_draw_id;
158 
159    /* Reads base vertex/instance */
160    bool uses_base_param;
161 
162    /* Shader uses txf, requiring a workaround sampler in the given location */
163    bool uses_txf;
164    unsigned txf_sampler;
165 
166    /* Number of 16-bit registers used by the main shader and preamble
167     * respectively.
168     */
169    unsigned nr_gprs, nr_preamble_gprs;
170 
171    /* Output mask set during driver lowering */
172    uint64_t outputs;
173 };
174 
/* NOTE(review): presumably the maximum number of render targets -- confirm
 * against the users of this macro.
 */
#define AGX_MAX_RTS (8)
176 
/* Format identifiers. Every value is spelled out explicitly; note that 9 and
 * 11 are not assigned to any enumerator here.
 */
enum agx_format {
   AGX_FORMAT_I8       = 0,
   AGX_FORMAT_I16      = 1,
   AGX_FORMAT_I32      = 2,
   AGX_FORMAT_F16      = 3,
   AGX_FORMAT_U8NORM   = 4,
   AGX_FORMAT_S8NORM   = 5,
   AGX_FORMAT_U16NORM  = 6,
   AGX_FORMAT_S16NORM  = 7,
   AGX_FORMAT_RGB10A2  = 8,
   AGX_FORMAT_SRGBA8   = 10,
   AGX_FORMAT_RG11B10F = 12,
   AGX_FORMAT_RGB9E5   = 13,

   /* Must stay last: one more than the largest value above (14) */
   AGX_NUM_FORMATS,
};
194 
/* Vertex shader specific part of the shader key. */
struct agx_vs_shader_key {
   /* Per the GPU ABI, the varyings written by the VS must be ordered as: all
    * smooth shaded varyings, then all flat shaded varyings, then all linear
    * shaded varyings. Remapping the varyings into that order inside the VS
    * requires the compiler to know which ones are flat/linear shaded, so
    * those masks are passed in here.
    */
   uint64_t outputs_flat_shaded;
   uint64_t outputs_linear_shaded;
};
205 
/* Fragment shader specific part of the shader key. */
struct agx_fs_shader_key {
   /* Tilebuffer accesses normally have to be guarded by fencing
    * instructions so that results stay correct despite out-of-order hardware
    * optimizations. Specially dispatched clear shaders are exempt from those
    * conditions and may leave the wait instructions out.
    *
    * Set this for special clear shaders, and only for them.
    *
    * Incompatible with sample mask writes (including discards) and with
    * tilebuffer loads (including blending).
    */
   bool ignore_tib_dependencies;
};
219 
220 struct agx_shader_key {
221    /* Number of reserved preamble slots at the start */
222    unsigned reserved_preamble;
223 
224    /* Does the target GPU need explicit cluster coherency for atomics?
225     * Only used on G13X.
226     */
227    bool needs_g13x_coherency;
228 
229    /* Library routines to link against */
230    const nir_shader *libagx;
231 
232    /* Whether scratch memory is available in the given shader stage */
233    bool has_scratch;
234 
235    /* Whether we're compiling the helper program used for scratch allocation.
236     * This has special register allocation requirements.
237     */
238    bool is_helper;
239 
240    union {
241       struct agx_vs_shader_key vs;
242       struct agx_fs_shader_key fs;
243    };
244 };
245 
246 void agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx,
247                         bool allow_mediump,
248                         struct agx_uncompiled_shader_info *out);
249 
250 bool agx_nir_lower_discard_zs_emit(nir_shader *s);
251 bool agx_nir_lower_sample_mask(nir_shader *s);
252 
253 bool agx_nir_lower_cull_distance_fs(struct nir_shader *s,
254                                     unsigned nr_distances);
255 
256 void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
257                             struct util_debug_callback *debug,
258                             struct util_dynarray *binary,
259                             struct agx_shader_info *out);
260 
/* A pairing of a register limit with a thread limit, as returned by
 * agx_occupancy_for_register_count().
 */
struct agx_occupancy {
   unsigned max_registers; /* register limit */
   unsigned max_threads;   /* thread limit */
};
265 
/* Occupancy queries.
 * NOTE(review): halfregs is presumably a count of 16-bit registers, and the
 * second helper presumably maps a thread occupancy back to a register limit
 * -- confirm against the definitions.
 */
struct agx_occupancy
agx_occupancy_for_register_count(unsigned halfregs);

unsigned
agx_max_registers_for_occupancy(unsigned occupancy);
268 
269 static const nir_shader_compiler_options agx_nir_options = {
270    .lower_fdiv = true,
271    .fuse_ffma16 = true,
272    .fuse_ffma32 = true,
273    .lower_flrp16 = true,
274    .lower_flrp32 = true,
275    .lower_fpow = true,
276    .lower_fmod = true,
277    .lower_bitfield_insert = true,
278    .lower_ifind_msb = true,
279    .lower_find_lsb = true,
280    .lower_uadd_carry = true,
281    .lower_usub_borrow = true,
282    .lower_fisnormal = true,
283    .lower_scmp = true,
284    .lower_isign = true,
285    .lower_fsign = true,
286    .lower_iabs = true,
287    .lower_fdph = true,
288    .lower_ffract = true,
289    .lower_ldexp = true,
290    .lower_pack_half_2x16 = true,
291    .lower_pack_64_2x32 = true,
292    .lower_unpack_half_2x16 = true,
293    .lower_extract_byte = true,
294    .lower_insert_byte = true,
295    .lower_insert_word = true,
296    .has_cs_global_id = true,
297    .lower_hadd = true,
298    .vectorize_io = true,
299    .use_interpolated_input_intrinsics = true,
300    .has_isub = true,
301    .support_16bit_alu = true,
302    .max_unroll_iterations = 32,
303    .lower_uniforms_to_ubo = true,
304    .lower_int64_options =
305       (nir_lower_int64_options) ~(nir_lower_iadd64 | nir_lower_imul_2x32_64),
306    .lower_doubles_options = (nir_lower_doubles_options)(~0),
307    .lower_fquantize2f16 = true,
308 };
309