1 /*
2 * Copyright (C) 2020 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #ifndef __PAN_IR_H
25 #define __PAN_IR_H
26
27 #include <stdint.h>
28 #include "compiler/nir/nir.h"
29 #include "util/hash_table.h"
30 #include "util/u_dynarray.h"
31
/* Indices for named (non-XFB) varyings that are present. These are packed
 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
 * of a given special field given a shift S by:
 *
 *      idx = popcount(P & ((1 << S) - 1))
 *
 * That is, look at all of the varyings that come earlier and count them; that
 * count is the index of the field, since indices are zero-based. Likewise, the
 * total number of special buffers required is simply popcount(P).
 */
43
/* Kinds of special varying. The values are used as bit positions (1 << value)
 * in a "present" bitfield, so they must stay dense and PAN_VARY_MAX must
 * remain the final enumerator. */
enum pan_special_varying {
   PAN_VARY_GENERAL = 0,
   PAN_VARY_POSITION = 1,
   PAN_VARY_PSIZ = 2,
   PAN_VARY_PNTCOORD = 3,
   PAN_VARY_FACE = 4,
   PAN_VARY_FRAGCOORD = 5,

   /* Keep last */
   PAN_VARY_MAX,
};
55
/* Maximum number of attribute descriptors required for varyings. These include
 * up to MAX_VARYING source level varyings plus one descriptor for each
 * non-GENERAL special varying (hence PAN_VARY_MAX - 1). */
#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1)
60
/* Special attribute slots for vertex builtins. Sort of arbitrary but let's be
 * consistent with the blob so we can compare traces easier. */

enum {
   PAN_VERTEX_ID = 16,
   PAN_INSTANCE_ID = 17,

   /* Keep last: one past the highest slot listed above */
   PAN_MAX_ATTRIBUTE,
};
65
/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64-bits each.
 * In practice, the maximum number of FAU slots is limited by implementation.
 * All known Bifrost and Valhall devices limit to 64 FAU slots. Therefore the
 * maximum number of 32-bit words is 128, since there are 2 words per FAU slot.
 *
 * Midgard can push at most 92 words, so this bound suffices. The Midgard
 * compiler pushes less than this, as Midgard uses register-mapped uniforms
 * instead of FAU, preventing large numbers of uniforms from being pushed for
 * nontrivial programs.
 */
#define PAN_MAX_PUSH 128
77
/* Architectural invariants (Midgard and Bifrost): UBO must be <= 2^16 bytes so
 * an offset to a word must be < 2^16. There are fewer than 2^8 UBOs, so 16 bits
 * suffice for each field. */

struct panfrost_ubo_word {
   /* Index of the UBO the word lives in */
   uint16_t ubo;

   /* Offset of the word within that UBO */
   uint16_t offset;
};
85
86 struct panfrost_ubo_push {
87 unsigned count;
88 struct panfrost_ubo_word words[PAN_MAX_PUSH];
89 };
90
/* Helper for searching a panfrost_ubo_push for a given (ubo, offs) pair. Note
 * this is O(N) to the number of pushed constants, do not run in the draw call
 * hot path */

unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo,
                               unsigned offs);
96
/* Inputs describing how a shader should be compiled */
struct panfrost_compile_inputs {
   struct util_debug_callback *debug;

   unsigned gpu_id;
   bool is_blend, is_blit;

   /* Blend-related parameters (presumably meaningful when is_blend is set --
    * confirm against callers) */
   struct {
      unsigned nr_samples;
      uint64_t bifrost_blend_desc;
   } blend;

   bool no_idvs;
   bool no_ubo_to_push;
   uint32_t view_mask;

   /* Used on Valhall.
    *
    * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0)
    * written by the previous stage (fragment shader) or written by this
    * stage (vertex shader). Bits are slots from gl_varying_slot.
    *
    * For modern APIs (GLES or VK), this should be 0.
    */
   uint32_t fixed_varying_mask;

   /* Architecture-specific inputs */
   union {
      struct {
         uint32_t rt_conv[8];
      } bifrost;
   };
};
126
127 struct pan_shader_varying {
128 gl_varying_slot location;
129 enum pipe_format format;
130 };
131
132 struct bifrost_shader_blend_info {
133 nir_alu_type type;
134 uint32_t return_offset;
135
136 /* mali_bifrost_register_file_format corresponding to nir_alu_type */
137 unsigned format;
138 };
139
/*
 * Unpacked form of a v7 message preload descriptor, produced by the compiler's
 * message preload optimization. By splitting out this struct, the compiler does
 * not need to know about data structure packing, avoiding a dependency on
 * GenXML.
 */
struct bifrost_message_preload {
   /* Whether to preload this message */
   bool enabled;

   /* Varying to load from */
   unsigned varying_index;

   /* Register type: FP16 if set, FP32 otherwise */
   bool fp16;

   /* Number of components, ignored if texturing */
   unsigned num_components;

   /* If texture is set, performs a texture instruction according to
    * texture_index, skip, and zero_lod. If texture is unset, only the
    * varying load is performed.
    */
   bool texture, skip, zero_lod;
   unsigned texture_index;
};
166
167 struct bifrost_shader_info {
168 struct bifrost_shader_blend_info blend[8];
169 nir_alu_type blend_src1_type;
170 bool wait_6, wait_7;
171 struct bifrost_message_preload messages[2];
172
173 /* Whether any flat varyings are loaded. This may disable optimizations
174 * that change the provoking vertex, since that would load incorrect
175 * values for flat varyings.
176 */
177 bool uses_flat_shading;
178 };
179
/* Midgard-specific shader information */
struct midgard_shader_info {
   unsigned first_tag;

   /* Stage-specific information */
   union {
      struct {
         bool reads_raw_vertex_id;
      } vs;
   };
};
188
189 struct pan_shader_info {
190 gl_shader_stage stage;
191 unsigned work_reg_count;
192 unsigned tls_size;
193 unsigned wls_size;
194
195 /* Bit mask of preloaded registers */
196 uint64_t preload;
197
198 union {
199 struct {
200 bool reads_frag_coord;
201 bool reads_point_coord;
202 bool reads_face;
203 bool can_discard;
204 bool writes_depth;
205 bool writes_stencil;
206 bool writes_coverage;
207 bool sidefx;
208 bool sample_shading;
209 bool early_fragment_tests;
210 bool can_early_z, can_fpk;
211 bool untyped_color_outputs;
212 BITSET_WORD outputs_read;
213 BITSET_WORD outputs_written;
214 } fs;
215
216 struct {
217 bool writes_point_size;
218
219 /* If the primary shader writes point size, the Valhall
220 * driver may need a variant that does not write point
221 * size. Offset to such a shader in the program binary.
222 *
223 * Zero if no such variant is required.
224 *
225 * Only used with IDVS on Valhall.
226 */
227 unsigned no_psiz_offset;
228
229 /* Set if Index-Driven Vertex Shading is in use */
230 bool idvs;
231
232 /* If IDVS is used, whether a varying shader is used */
233 bool secondary_enable;
234
235 /* If a varying shader is used, the varying shader's
236 * offset in the program binary
237 */
238 unsigned secondary_offset;
239
240 /* If IDVS is in use, number of work registers used by
241 * the varying shader
242 */
243 unsigned secondary_work_reg_count;
244
245 /* If IDVS is in use, bit mask of preloaded registers
246 * used by the varying shader
247 */
248 uint64_t secondary_preload;
249 } vs;
250
251 struct {
252 /* Is it legal to merge workgroups? This is true if the
253 * shader uses neither barriers nor shared memory. This
254 * requires caution: if the API allows specifying shared
255 * memory at launch time (instead of compile time), that
256 * memory will not be accounted for by the compiler.
257 *
258 * Used by the Valhall hardware.
259 */
260 bool allow_merging_workgroups;
261 } cs;
262 };
263
264 /* Does the shader contains a barrier? or (for fragment shaders) does it
265 * require helper invocations, which demand the same ordering guarantees
266 * of the hardware? These notions are unified in the hardware, so we
267 * unify them here as well.
268 */
269 bool contains_barrier;
270 bool separable;
271 bool writes_global;
272 uint64_t outputs_written;
273
274 /* Floating point controls that the driver should try to honour */
275 bool ftz_fp16, ftz_fp32;
276
277 unsigned sampler_count;
278 unsigned texture_count;
279 unsigned ubo_count;
280 unsigned attributes_read_count;
281 unsigned attribute_count;
282 unsigned attributes_read;
283
284 struct {
285 unsigned input_count;
286 struct pan_shader_varying input[PAN_MAX_VARYINGS];
287 unsigned output_count;
288 struct pan_shader_varying output[PAN_MAX_VARYINGS];
289
290 /* Bitfield of noperspective varyings, starting at VARYING_SLOT_VAR0 */
291 uint32_t noperspective;
292 } varyings;
293
294 /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access
295 * Uniforms (Bifrost) */
296 struct panfrost_ubo_push push;
297
298 uint32_t ubo_mask;
299
300 /* Quirk for GPUs that does not support auto32 types. */
301 bool quirk_no_auto32;
302
303 union {
304 struct bifrost_shader_info bifrost;
305 struct midgard_shader_info midgard;
306 };
307 };
308
309 typedef struct pan_block {
310 /* Link to next block. Must be first for mir_get_block */
311 struct list_head link;
312
313 /* List of instructions emitted for the current block */
314 struct list_head instructions;
315
316 /* Index of the block in source order */
317 unsigned name;
318
319 /* Control flow graph */
320 struct pan_block *successors[2];
321 struct set *predecessors;
322 bool unconditional_jumps;
323
324 /* In liveness analysis, these are live masks (per-component) for
325 * indices for the block. Scalar compilers have the luxury of using
326 * simple bit fields, but for us, liveness is a vector idea. */
327 uint16_t *live_in;
328 uint16_t *live_out;
329 } pan_block;
330
331 struct pan_instruction {
332 struct list_head link;
333 };
334
/* Walk block->instructions with list_for_each_entry_rev, viewing each entry as
 * a struct pan_instruction named v. The block argument is parenthesized so any
 * pointer-valued expression may be passed. */
#define pan_foreach_instr_in_block_rev(block, v)                               \
   list_for_each_entry_rev(struct pan_instruction, v, &(block)->instructions,  \
                           link)
338
/* Iterate over the successors of a block in order, stopping at the first NULL
 * slot. Declares both v and the cursor _v in the enclosing scope, so it can be
 * used at most once per scope.
 *
 * The bounds check runs before the cursor is dereferenced, so the slot one past
 * the end of successors[] is never read, even when both successors are set.
 * The blk argument is parenthesized so any pointer-valued expression works.
 */
#define pan_foreach_successor(blk, v)                                          \
   pan_block *v;                                                               \
   pan_block **_v;                                                             \
   for (_v = (pan_block **)&(blk)->successors[0];                              \
        _v < (pan_block **)&(blk)->successors[2] && (v = *_v) != NULL; _v++)
344
/* Iterate over the entries of blk->predecessors using _mesa_set_next_entry,
 * exposing each entry's key as a struct pan_block pointer named v. Declares v
 * and the cursor _entry_<v> in the enclosing scope. The blk argument is
 * parenthesized so any pointer-valued expression may be passed. */
#define pan_foreach_predecessor(blk, v)                                        \
   struct set_entry *_entry_##v;                                               \
   struct pan_block *v;                                                        \
   for (_entry_##v = _mesa_set_next_entry((blk)->predecessors, NULL),          \
       v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL);          \
        _entry_##v != NULL;                                                    \
        _entry_##v = _mesa_set_next_entry((blk)->predecessors, _entry_##v),    \
       v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL))
353
354 static inline pan_block *
pan_exit_block(struct list_head * blocks)355 pan_exit_block(struct list_head *blocks)
356 {
357 pan_block *last = list_last_entry(blocks, pan_block, link);
358 assert(!last->successors[0] && !last->successors[1]);
359 return last;
360 }
361
362 typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max);
363
364 void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max,
365 uint16_t mask);
366 void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max,
367 uint16_t mask);
368 bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max);
369
370 void pan_compute_liveness(struct list_head *blocks, unsigned temp_count,
371 pan_liveness_update callback);
372
373 void pan_free_liveness(struct list_head *blocks);
374
375 uint16_t pan_to_bytemask(unsigned bytes, unsigned mask);
376
377 void pan_block_add_successor(pan_block *block, pan_block *successor);
378
/* IR indexing */
#define PAN_IS_REG (1)
381
382 /* IR printing helpers */
383 void pan_print_alu_type(nir_alu_type t, FILE *fp);
384
/* NIR passes to do some backend-specific lowering */

/* Writeout flags. Each value is a distinct bit, so they can be OR'd together.
 * Presumably C = colour, Z = depth, S = stencil and 2 = second (dual-source)
 * colour -- confirm against the users of these flags. */
#define PAN_WRITEOUT_C 1
#define PAN_WRITEOUT_Z 2
#define PAN_WRITEOUT_S 4
#define PAN_WRITEOUT_2 8
391
392 bool pan_nir_lower_zs_store(nir_shader *nir);
393 bool pan_nir_lower_store_component(nir_shader *shader);
394
395 bool pan_nir_lower_vertex_id(nir_shader *shader);
396
397 bool pan_nir_lower_image_ms(nir_shader *shader);
398
399 bool pan_nir_lower_frag_coord_zw(nir_shader *shader);
400 bool pan_nir_lower_noperspective_vs(nir_shader *shader);
401 bool pan_nir_lower_noperspective_fs(nir_shader *shader);
402 bool pan_nir_lower_static_noperspective(nir_shader *shader,
403 uint32_t noperspective_varyings);
404
405 bool pan_lower_helper_invocation(nir_shader *shader);
406 bool pan_lower_sample_pos(nir_shader *shader);
407 bool pan_lower_xfb(nir_shader *nir);
408
409 bool pan_lower_image_index(nir_shader *shader, unsigned vs_img_attrib_offset);
410
411 uint32_t pan_nir_collect_noperspective_varyings_fs(nir_shader *s);
412 void pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info);
413
/*
 * Helper returning the subgroup size. Generally, this is equal to the number of
 * threads in a warp. For Midgard (including warping models), this returns 1, as
 * subgroups are not supported.
 *
 * arch is the architecture major version: 9 and above yield 16, 7 and 8 yield
 * 8, 6 yields 4, and anything older yields 1.
 */
static inline unsigned
pan_subgroup_size(unsigned arch)
{
   if (arch >= 9)
      return 16;
   else if (arch >= 7)
      return 8;
   else if (arch >= 6)
      return 4;
   else
      return 1;
}
431
/*
 * Helper extracting the table from a given handle of Valhall descriptor model.
 * The table occupies the bits above the low 24 and is asserted to be below 64.
 */
static inline unsigned
pan_res_handle_get_table(unsigned handle)
{
   unsigned table = handle >> 24;

   assert(table < 64);
   return table;
}
443
/*
 * Helper returning the index from a given handle of Valhall descriptor model.
 * The index is the low 24 bits of the handle.
 */
static inline unsigned
pan_res_handle_get_index(unsigned handle)
{
   return handle & BITFIELD_MASK(24);
}
452
/*
 * Helper creating a handle for Valhall descriptor model. The table goes in the
 * bits above the low 24 and the index in the low 24 bits; both are asserted to
 * be in range (table < 64, index < 2^24).
 */
static inline unsigned
pan_res_handle(unsigned table, unsigned index)
{
   assert(table < 64);
   assert(index < (1u << 24));

   return (table << 24) | index;
}
464
465 #endif
466