/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef __PAN_IR_H
#define __PAN_IR_H

#include <stdint.h>
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
#include "util/hash_table.h"

/* On Valhall, the driver gives the hardware a table of resource tables.
 * Resources are addressed as the index of the table together with the index of
 * the resource within the table. For simplicity, we put one type of resource
 * in each table and fix the numbering of the tables.
 *
 * This numbering is arbitrary. It is a software ABI between the
 * Gallium driver and the Valhall compiler.
 */
enum pan_resource_table {
        PAN_TABLE_UBO = 0,
        PAN_TABLE_ATTRIBUTE,
        PAN_TABLE_ATTRIBUTE_BUFFER,
        PAN_TABLE_SAMPLER,
        PAN_TABLE_TEXTURE,
        PAN_TABLE_IMAGE,

        PAN_NUM_RESOURCE_TABLES
};
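
/* Illustrative sketch only: the two-level addressing described above pairs a
 * resource table with an index into that table. The packing below is a
 * hypothetical example, not the encoding actually used by the driver/compiler
 * ABI, which is defined elsewhere.
 */
static inline uint32_t
pan_example_resource_handle(enum pan_resource_table table, unsigned index)
{
        /* Hypothetical packing: table in the top byte, index in the low 24 bits */
        assert(index < (1u << 24));
        return ((uint32_t)table << 24) | index;
}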

/* Indices for named (non-XFB) varyings that are present. These are packed
 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
 * PAN_VARY_*). This has the nice property that you can look up the buffer
 * index of a given special field given a shift S by:
 *
 *      idx = popcount(P & ((1 << S) - 1))
 *
 * That is, look at all of the varyings that come earlier and count them; that
 * count is the buffer index of the given varying. Likewise, the total number
 * of special buffers required is simply popcount(P).
 */

enum pan_special_varying {
        PAN_VARY_GENERAL = 0,
        PAN_VARY_POSITION = 1,
        PAN_VARY_PSIZ = 2,
        PAN_VARY_PNTCOORD = 3,
        PAN_VARY_FACE = 4,
        PAN_VARY_FRAGCOORD = 5,

        /* Keep last */
        PAN_VARY_MAX,
};
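
/* Minimal sketch of the popcount lookup described above, under a hypothetical
 * helper name: given the bitfield of present special varyings P and a special
 * varying S, the buffer index is the number of present varyings that come
 * before it.
 */
static inline unsigned
pan_example_special_buffer_index(uint32_t present, enum pan_special_varying s)
{
        return __builtin_popcount(present & ((1u << s) - 1));
}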

/* Maximum number of attribute descriptors required for varyings. These include
 * up to MAX_VARYING source-level varyings plus a descriptor for each
 * non-GENERAL special varying */
#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1)

/* Define the general compiler entry point */

#define MAX_SYSVAL_COUNT 32

/* Allow 2D addressing of sysval IDs (type and number), while allowing
 * nonparametric sysvals to compare equal to their class */

#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type)
#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff)
#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16)

/* Define some common types. We start at one for easy indexing of hash
 * tables internal to the compiler */

enum {
        PAN_SYSVAL_VIEWPORT_SCALE = 1,
        PAN_SYSVAL_VIEWPORT_OFFSET = 2,
        PAN_SYSVAL_TEXTURE_SIZE = 3,
        PAN_SYSVAL_SSBO = 4,
        PAN_SYSVAL_NUM_WORK_GROUPS = 5,
        PAN_SYSVAL_SAMPLER = 7,
        PAN_SYSVAL_LOCAL_GROUP_SIZE = 8,
        PAN_SYSVAL_WORK_DIM = 9,
        PAN_SYSVAL_IMAGE_SIZE = 10,
        PAN_SYSVAL_SAMPLE_POSITIONS = 11,
        PAN_SYSVAL_MULTISAMPLED = 12,
        PAN_SYSVAL_RT_CONVERSION = 13,
        PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14,
        PAN_SYSVAL_DRAWID = 15,
        PAN_SYSVAL_BLEND_CONSTANTS = 16,
        PAN_SYSVAL_XFB = 17,
        PAN_SYSVAL_NUM_VERTICES = 18,
};

#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array)          \
	((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0))

#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id)        ((id) & 0x7f)
#define PAN_SYSVAL_ID_TO_TXS_DIM(id)            (((id) >> 7) & 0x3)
#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id)       !!((id) & (1 << 9))
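
/* Usage sketch (hypothetical helper, for illustration only): composing a
 * texture-size sysval from the macros above and decomposing it again.
 */
static inline unsigned
pan_example_txs_sysval(unsigned texidx, unsigned dim, bool is_array)
{
        assert(texidx < 0x80);

        unsigned id = PAN_TXS_SYSVAL_ID(texidx, dim, is_array);
        unsigned sysval = PAN_SYSVAL(TEXTURE_SIZE, id);

        assert(PAN_SYSVAL_TYPE(sysval) == PAN_SYSVAL_TEXTURE_SIZE);
        assert(PAN_SYSVAL_ID_TO_TXS_TEX_IDX(PAN_SYSVAL_ID(sysval)) == texidx);

        return sysval;
}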

/* Special attribute slots for vertex builtins. Sort of arbitrary, but let's be
 * consistent with the blob so we can compare traces more easily. */

enum {
        PAN_VERTEX_ID   = 16,
        PAN_INSTANCE_ID = 17,
        PAN_MAX_ATTRIBUTE
};

struct panfrost_sysvals {
        /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */
        unsigned sysvals[MAX_SYSVAL_COUNT];
        unsigned sysval_count;
};

/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64 bits each.
 * In practice, the maximum number of FAU slots is limited by the
 * implementation. All known Bifrost and Valhall devices limit it to 64 FAU
 * slots. Therefore the maximum number of 32-bit words is 128, since there are
 * 2 words per FAU slot.
 *
 * Midgard can push at most 92 words, so this bound suffices. The Midgard
 * compiler pushes less than this, as Midgard uses register-mapped uniforms
 * instead of FAU, preventing large numbers of uniforms from being pushed for
 * nontrivial programs.
 */
#define PAN_MAX_PUSH 128

/* Architectural invariants (Midgard and Bifrost): a UBO must be <= 2^16 bytes,
 * so an offset to a word must be < 2^16. There are fewer than 2^8 UBOs */

struct panfrost_ubo_word {
        uint16_t ubo;
        uint16_t offset;
};

struct panfrost_ubo_push {
        unsigned count;
        struct panfrost_ubo_word words[PAN_MAX_PUSH];
};

/* Helper for searching the above. Note this is O(N) in the number of pushed
 * constants; do not run it in the draw-call hot path */

unsigned
pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs);
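
/* For illustration, a minimal sketch of the O(N) search described above; the
 * real implementation lives in the compiler sources and may differ (e.g. in
 * how a missing entry is treated).
 */
static inline unsigned
pan_example_lookup_pushed_ubo(const struct panfrost_ubo_push *push,
                              unsigned ubo, unsigned offs)
{
        for (unsigned i = 0; i < push->count; ++i) {
                if (push->words[i].ubo == ubo && push->words[i].offset == offs)
                        return i;
        }

        /* Word was not pushed */
        return ~0u;
}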

struct hash_table_u64 *
panfrost_init_sysvals(struct panfrost_sysvals *sysvals,
                      struct panfrost_sysvals *fixed_sysvals,
                      void *memctx);

unsigned
pan_lookup_sysval(struct hash_table_u64 *sysval_to_id,
                  struct panfrost_sysvals *sysvals,
                  int sysval);

int
panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest);

struct panfrost_compile_inputs {
        unsigned gpu_id;
        bool is_blend, is_blit;
        struct {
                unsigned rt;
                unsigned nr_samples;
                uint64_t bifrost_blend_desc;
        } blend;
        int fixed_sysval_ubo;
        struct panfrost_sysvals *fixed_sysval_layout;
        bool shaderdb;
        bool no_idvs;
        bool no_ubo_to_push;

        enum pipe_format rt_formats[8];
        uint8_t raw_fmt_mask;
        unsigned nr_cbufs;

        /* Used on Valhall.
         *
         * Bit mask of special desktop-only varyings (e.g. VARYING_SLOT_TEX0)
         * written by the previous stage (when compiling a fragment shader) or
         * written by this stage (when compiling a vertex shader). Bits are
         * slots from gl_varying_slot.
         *
         * For modern APIs (GLES or VK), this should be 0.
         */
        uint32_t fixed_varying_mask;

        union {
                struct {
                        bool static_rt_conv;
                        uint32_t rt_conv[8];
                } bifrost;
        };
};

struct pan_shader_varying {
        gl_varying_slot location;
        enum pipe_format format;
};

struct bifrost_shader_blend_info {
        nir_alu_type type;
        uint32_t return_offset;

        /* mali_bifrost_register_file_format corresponding to nir_alu_type */
        unsigned format;
};

/*
 * Unpacked form of a v7 message preload descriptor, produced by the compiler's
 * message preload optimization. By splitting out this struct, the compiler does
 * not need to know about data structure packing, avoiding a dependency on
 * GenXML.
 */
struct bifrost_message_preload {
        /* Whether to preload this message */
        bool enabled;

        /* Varying to load from */
        unsigned varying_index;

        /* Register format: FP16 if set, FP32 otherwise */
        bool fp16;

        /* Number of components, ignored if texturing */
        unsigned num_components;

        /* If texture is set, performs a texture instruction according to
         * texture_index, skip, and zero_lod. If texture is unset, only the
         * varying load is performed.
         */
        bool texture, skip, zero_lod;
        unsigned texture_index;
};

struct bifrost_shader_info {
        struct bifrost_shader_blend_info blend[8];
        nir_alu_type blend_src1_type;
        bool wait_6, wait_7;
        struct bifrost_message_preload messages[2];

        /* Whether any flat varyings are loaded. This may disable optimizations
         * that change the provoking vertex, since that would load incorrect
         * values for flat varyings.
         */
        bool uses_flat_shading;
};

struct midgard_shader_info {
        unsigned first_tag;
};

struct pan_shader_info {
        gl_shader_stage stage;
        unsigned work_reg_count;
        unsigned tls_size;
        unsigned wls_size;

        /* Bit mask of preloaded registers */
        uint64_t preload;

        union {
                struct {
                        bool reads_frag_coord;
                        bool reads_point_coord;
                        bool reads_face;
                        bool can_discard;
                        bool writes_depth;
                        bool writes_stencil;
                        bool writes_coverage;
                        bool sidefx;
                        bool sample_shading;
                        bool early_fragment_tests;
                        bool can_early_z, can_fpk;
                        BITSET_WORD outputs_read;
                        BITSET_WORD outputs_written;
                } fs;

                struct {
                        bool writes_point_size;

                        /* If the primary shader writes point size, the Valhall
                         * driver may need a variant that does not write point
                         * size. Offset to such a shader in the program binary.
                         *
                         * Zero if no such variant is required.
                         *
                         * Only used with IDVS on Valhall.
                         */
                        unsigned no_psiz_offset;

                        /* Set if Index-Driven Vertex Shading is in use */
                        bool idvs;

                        /* If IDVS is used, whether a varying shader is used */
                        bool secondary_enable;

                        /* If a varying shader is used, the varying shader's
                         * offset in the program binary
                         */
                        unsigned secondary_offset;

                        /* If IDVS is in use, number of work registers used by
                         * the varying shader
                         */
                        unsigned secondary_work_reg_count;

                        /* If IDVS is in use, bit mask of preloaded registers
                         * used by the varying shader
                         */
                        uint64_t secondary_preload;
                } vs;

                struct {
                        /* Is it legal to merge workgroups? This is true if the
                         * shader uses neither barriers nor shared memory.
                         *
                         * Used by the Valhall hardware.
                         */
                        bool allow_merging_workgroups;
                } cs;
        };

        /* Does the shader contain a barrier, or (for fragment shaders) does it
         * require helper invocations, which demand the same ordering guarantees
         * from the hardware? These notions are unified in the hardware, so we
         * unify them here as well.
         */
        bool contains_barrier;
        bool separable;
        bool writes_global;
        uint64_t outputs_written;

        unsigned sampler_count;
        unsigned texture_count;
        unsigned ubo_count;
        unsigned attributes_read_count;
        unsigned attribute_count;
        unsigned attributes_read;

        struct {
                unsigned input_count;
                struct pan_shader_varying input[PAN_MAX_VARYINGS];
                unsigned output_count;
                struct pan_shader_varying output[PAN_MAX_VARYINGS];
        } varyings;

        struct panfrost_sysvals sysvals;

        /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access
         * Uniforms (Bifrost) */
        struct panfrost_ubo_push push;

        uint32_t ubo_mask;

        union {
                struct bifrost_shader_info bifrost;
                struct midgard_shader_info midgard;
        };
};

typedef struct pan_block {
        /* Link to next block. Must be first for mir_get_block */
        struct list_head link;

        /* List of instructions emitted for the current block */
        struct list_head instructions;

        /* Index of the block in source order */
        unsigned name;

        /* Control flow graph */
        struct pan_block *successors[2];
        struct set *predecessors;
        bool unconditional_jumps;

        /* In liveness analysis, these are per-component live masks for the
         * indices in the block. Scalar compilers have the luxury of using
         * simple bit fields, but for us, liveness is a vector idea. */
        uint16_t *live_in;
        uint16_t *live_out;
} pan_block;

struct pan_instruction {
        struct list_head link;
};

#define pan_foreach_instr_in_block_rev(block, v) \
        list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link)

#define pan_foreach_successor(blk, v) \
        pan_block *v; \
        pan_block **_v; \
        for (_v = (pan_block **) &blk->successors[0], \
                v = *_v; \
                v != NULL && _v < (pan_block **) &blk->successors[2]; \
                _v++, v = *_v) \

#define pan_foreach_predecessor(blk, v) \
        struct set_entry *_entry_##v; \
        struct pan_block *v; \
        for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \
                v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL);  \
                _entry_##v != NULL; \
                _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \
                v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL))

static inline pan_block *
pan_exit_block(struct list_head *blocks)
{
        pan_block *last = list_last_entry(blocks, pan_block, link);
        assert(!last->successors[0] && !last->successors[1]);
        return last;
}

typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max);

void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask);
void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask);
bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max);

void pan_compute_liveness(struct list_head *blocks,
                unsigned temp_count,
                pan_liveness_update callback);

void pan_free_liveness(struct list_head *blocks);

uint16_t
pan_to_bytemask(unsigned bytes, unsigned mask);

void pan_block_add_successor(pan_block *block, pan_block *successor);
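
/* Sketch of a pan_liveness_update callback as a backend might write it,
 * walking one instruction backwards: kill the definition, then gen the uses.
 * The instruction type and its fields are hypothetical; real backends
 * (Midgard/Bifrost) supply their own instruction representation.
 */
struct pan_example_ins {
        unsigned dest, src;
        uint16_t write_mask, read_mask;
};

static inline void
pan_example_liveness_update(uint16_t *live, void *ins_, unsigned max)
{
        struct pan_example_ins *ins = (struct pan_example_ins *)ins_;

        /* A write kills (the written components of) the destination... */
        if (ins->dest < max)
                pan_liveness_kill(live, ins->dest, max, ins->write_mask);

        /* ...and a read makes the source components live */
        if (ins->src < max)
                pan_liveness_gen(live, ins->src, max, ins->read_mask);
}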

/* IR indexing */
#define PAN_IS_REG (1)

static inline unsigned
pan_ssa_index(nir_ssa_def *ssa)
{
        /* Off-by-one ensures BIR_NO_ARG is skipped */
        return ((ssa->index + 1) << 1) | 0;
}

static inline unsigned
pan_src_index(nir_src *src)
{
        if (src->is_ssa)
                return pan_ssa_index(src->ssa);
        else {
                assert(!src->reg.indirect);
                return (src->reg.reg->index << 1) | PAN_IS_REG;
        }
}

static inline unsigned
pan_dest_index(nir_dest *dst)
{
        if (dst->is_ssa)
                return pan_ssa_index(&dst->ssa);
        else {
                assert(!dst->reg.indirect);
                return (dst->reg.reg->index << 1) | PAN_IS_REG;
        }
}

/* IR printing helpers */
void pan_print_alu_type(nir_alu_type t, FILE *fp);

/* Until it can be upstreamed... */
bool pan_has_source_mod(nir_alu_src *src, nir_op op);
bool pan_has_dest_mod(nir_dest **dest, nir_op op);

/* NIR passes to do some backend-specific lowering */

#define PAN_WRITEOUT_C 1
#define PAN_WRITEOUT_Z 2
#define PAN_WRITEOUT_S 4
#define PAN_WRITEOUT_2 8

bool pan_nir_lower_zs_store(nir_shader *nir);

bool pan_nir_lower_64bit_intrin(nir_shader *shader);

bool pan_lower_helper_invocation(nir_shader *shader);
bool pan_lower_sample_pos(nir_shader *shader);
bool pan_lower_xfb(nir_shader *nir);

/*
 * Helper returning the subgroup size. Generally, this is equal to the number of
 * threads in a warp. For Midgard (including warping models), this returns 1, as
 * subgroups are not supported.
 */
static inline unsigned
pan_subgroup_size(unsigned arch)
{
        if (arch >= 9)
                return 16;
        else if (arch >= 7)
                return 8;
        else if (arch >= 6)
                return 4;
        else
                return 1;
}

#endif