• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2020 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #ifndef __PAN_IR_H
25 #define __PAN_IR_H
26 
27 #include <stdint.h>
28 #include "compiler/nir/nir.h"
29 #include "util/hash_table.h"
30 #include "util/u_dynarray.h"
31 
32 /* On Valhall, the driver gives the hardware a table of resource tables.
33  * Resources are addressed as the index of the table together with the index of
34  * the resource within the table. For simplicity, we put one type of resource
35  * in each table and fix the numbering of the tables.
36  *
37  * This numbering is arbitrary. It is a software ABI between the
38  * Gallium driver and the Valhall compiler.
39  */
enum pan_resource_table {
   PAN_TABLE_UBO = 0,
   PAN_TABLE_ATTRIBUTE,
   PAN_TABLE_ATTRIBUTE_BUFFER,
   PAN_TABLE_SAMPLER,
   PAN_TABLE_TEXTURE,
   PAN_TABLE_IMAGE,

   /* Keep last: count of tables, not itself a valid table index. Do not
    * reorder the entries above — the numbering is a software ABI between the
    * Gallium driver and the Valhall compiler (see comment above). */
   PAN_NUM_RESOURCE_TABLES
};
50 
51 /* Indices for named (non-XFB) varyings that are present. These are packed
52  * tightly so they correspond to a bitfield present (P) indexed by (1 <<
53  * PAN_VARY_*). This has the nice property that you can lookup the buffer index
54  * of a given special field given a shift S by:
55  *
56  *      idx = popcount(P & ((1 << S) - 1))
57  *
 * That is: look at all of the varyings that come earlier and count them; the
 * count gives the buffer index of this varying. Likewise, the total number of
 * special buffers required is simply popcount(P).
61  */
62 
enum pan_special_varying {
   /* General-purpose (user) varyings */
   PAN_VARY_GENERAL = 0,
   /* gl_Position */
   PAN_VARY_POSITION = 1,
   /* gl_PointSize */
   PAN_VARY_PSIZ = 2,
   /* Point coordinate */
   PAN_VARY_PNTCOORD = 3,
   /* Facing bit */
   PAN_VARY_FACE = 4,
   /* Fragment coordinate */
   PAN_VARY_FRAGCOORD = 5,

   /* Keep last — values above are bitfield shifts (see comment above), so do
    * not renumber */
   PAN_VARY_MAX,
};
74 
/* Maximum number of attribute descriptors required for varyings. These include
 * up to MAX_VARYING source level varyings plus a descriptor for each
 * non-GENERAL special varying (hence PAN_VARY_MAX - 1) */
#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1)
79 
/* Special attribute slots for vertex builtins. Sort of arbitrary but let's be
 * consistent with the blob so we can compare traces easier. */

enum { PAN_VERTEX_ID = 16, PAN_INSTANCE_ID = 17, PAN_MAX_ATTRIBUTE /* = 18 */ };
84 
/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64-bits each.
 * In practice, the maximum number of FAU slots is limited by implementation.
 * All known Bifrost and Valhall devices limit to 64 FAU slots. Therefore the
 * maximum number of 32-bit words is 128, since there are 2 words per FAU slot.
 *
 * Midgard can push at most 92 words, so this bound suffices. The Midgard
 * compiler pushes less than this, as Midgard uses register-mapped uniforms
 * instead of FAU, preventing large numbers of uniforms from being pushed for
 * nontrivial programs.
 */
#define PAN_MAX_PUSH 128
96 
/* Architectural invariants (Midgard and Bifrost): a UBO must be <= 2^16 bytes,
 * so an offset to a word must be < 2^16. There are fewer than 2^8 UBOs */
99 
/* Identifies one pushed 32-bit word: which UBO it comes from and its byte
 * offset within that UBO. 16-bit fields suffice per the invariants above. */
struct panfrost_ubo_word {
   uint16_t ubo;
   uint16_t offset;
};
104 
/* The set of UBO words pushed to uniforms/FAU; only the first `count` entries
 * of `words` are valid. */
struct panfrost_ubo_push {
   unsigned count;
   struct panfrost_ubo_word words[PAN_MAX_PUSH];
};
109 
/* Helper for searching the above. Returns the index of the pushed word that
 * matches (ubo, offs). Note this is O(N) to the number of pushed constants,
 * do not run in the draw call hot path */

unsigned pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo,
                               unsigned offs);
115 
/* Driver-supplied inputs controlling a single shader compile. */
struct panfrost_compile_inputs {
   /* Callback sink for compiler debug messages */
   struct util_debug_callback *debug;

   /* Identifies the target GPU */
   unsigned gpu_id;

   /* Compiling an internal blend or blit shader, respectively */
   bool is_blend, is_blit;

   /* Blend-shader state (only meaningful when is_blend is set —
    * NOTE(review): inferred from naming, confirm at callers) */
   struct {
      unsigned nr_samples;
      uint64_t bifrost_blend_desc;
   } blend;

   /* Disable Index-Driven Vertex Shading */
   bool no_idvs;

   /* Disable promoting UBO words to push constants (see panfrost_ubo_push) */
   bool no_ubo_to_push;

   /* Used on Valhall.
    *
    * Bit mask of special desktop-only varyings (e.g VARYING_SLOT_TEX0)
    * written by the previous stage (fragment shader) or written by this
    * stage (vertex shader). Bits are slots from gl_varying_slot.
    *
    * For modern APIs (GLES or VK), this should be 0.
    */
   uint32_t fixed_varying_mask;

   union {
      struct {
         /* Per-render-target conversion words — presumably packed
          * descriptors consumed by Bifrost blend code; TODO confirm */
         uint32_t rt_conv[8];
      } bifrost;
   };
};
144 
/* A single varying slot paired with the format used to store it in memory. */
struct pan_shader_varying {
   gl_varying_slot location;
   enum pipe_format format;
};
149 
/* Per-render-target info the driver needs to hook up a Bifrost blend shader. */
struct bifrost_shader_blend_info {
   /* NIR type of the colour value returned to the blend shader */
   nir_alu_type type;
   /* Offset in the binary where the blend epilogue returns */
   uint32_t return_offset;

   /* mali_bifrost_register_file_format corresponding to nir_alu_type */
   unsigned format;
};
157 
/*
 * Unpacked form of a v7 message preload descriptor, produced by the compiler's
 * message preload optimization. By splitting out this struct, the compiler does
 * not need to know about data structure packing, avoiding a dependency on
 * GenXML.
 */
struct bifrost_message_preload {
   /* Whether to preload this message */
   bool enabled;

   /* Varying to load from */
   unsigned varying_index;

   /* If true, the value is preloaded as FP16; FP32 otherwise */
   bool fp16;

   /* Number of components, ignored if texturing */
   unsigned num_components;

   /* If texture is set, performs a texture instruction according to
    * texture_index, skip, and zero_lod. If texture is unset, only the
    * varying load is performed.
    */
   bool texture, skip, zero_lod;
   unsigned texture_index;
};
184 
/* Bifrost-specific compiled-shader metadata. */
struct bifrost_shader_info {
   /* Blend hookup info, one entry per render target */
   struct bifrost_shader_blend_info blend[8];
   nir_alu_type blend_src1_type;
   /* NOTE(review): presumably "wait on dependency slots 6/7 before entry" —
    * confirm against the Bifrost scheduler */
   bool wait_6, wait_7;
   /* v7 message preload descriptors (two preload slots) */
   struct bifrost_message_preload messages[2];

   /* Whether any flat varyings are loaded. This may disable optimizations
    * that change the provoking vertex, since that would load incorrect
    * values for flat varyings.
    */
   bool uses_flat_shading;
};
197 
/* Midgard-specific compiled-shader metadata. */
struct midgard_shader_info {
   /* Tag of the first instruction bundle — presumably needed by the driver
    * to start execution; TODO confirm at users */
   unsigned first_tag;
};
201 
/* Architecture-independent summary of a compiled shader, filled in by the
 * compiler and consumed by the driver when building descriptors. */
struct pan_shader_info {
   gl_shader_stage stage;

   /* Number of work registers used */
   unsigned work_reg_count;

   /* Thread-local storage (scratch) size in bytes */
   unsigned tls_size;

   /* Workgroup-local storage (shared memory) size in bytes */
   unsigned wls_size;

   /* Bit mask of preloaded registers */
   uint64_t preload;

   /* Per-stage information, discriminated by `stage` above */
   union {
      struct {
         bool reads_frag_coord;
         bool reads_point_coord;
         bool reads_face;
         bool can_discard;
         bool writes_depth;
         bool writes_stencil;
         bool writes_coverage;
         /* Shader has side effects — NOTE(review): inferred from name,
          * confirm how the driver consumes this */
         bool sidefx;
         bool sample_shading;
         bool early_fragment_tests;
         bool can_early_z, can_fpk;
         bool untyped_color_outputs;
         /* Bit sets of colour outputs read/written by the shader */
         BITSET_WORD outputs_read;
         BITSET_WORD outputs_written;
      } fs;

      struct {
         bool writes_point_size;

         /* If the primary shader writes point size, the Valhall
          * driver may need a variant that does not write point
          * size. Offset to such a shader in the program binary.
          *
          * Zero if no such variant is required.
          *
          * Only used with IDVS on Valhall.
          */
         unsigned no_psiz_offset;

         /* Set if Index-Driven Vertex Shading is in use */
         bool idvs;

         /* If IDVS is used, whether a varying shader is used */
         bool secondary_enable;

         /* If a varying shader is used, the varying shader's
          * offset in the program binary
          */
         unsigned secondary_offset;

         /* If IDVS is in use, number of work registers used by
          * the varying shader
          */
         unsigned secondary_work_reg_count;

         /* If IDVS is in use, bit mask of preloaded registers
          * used by the varying shader
          */
         uint64_t secondary_preload;
      } vs;

      struct {
         /* Is it legal to merge workgroups? This is true if the
          * shader uses neither barriers nor shared memory. This
          * requires caution: if the API allows specifying shared
          * memory at launch time (instead of compile time), that
          * memory will not be accounted for by the compiler.
          *
          * Used by the Valhall hardware.
          */
         bool allow_merging_workgroups;
      } cs;
   };

   /* Does the shader contains a barrier? or (for fragment shaders) does it
    * require helper invocations, which demand the same ordering guarantees
    * of the hardware? These notions are unified in the hardware, so we
    * unify them here as well.
    */
   bool contains_barrier;
   bool separable;
   bool writes_global;
   uint64_t outputs_written;

   /* Floating point controls that the driver should try to honour */
   bool ftz_fp16, ftz_fp32;

   /* Resource counts consumed when laying out descriptor tables */
   unsigned sampler_count;
   unsigned texture_count;
   unsigned ubo_count;
   unsigned attributes_read_count;
   unsigned attribute_count;
   /* Bit mask of attributes read — NOTE(review): inferred from its use
    * alongside attributes_read_count; confirm */
   unsigned attributes_read;

   /* Varyings consumed (input) and produced (output) by this shader */
   struct {
      unsigned input_count;
      struct pan_shader_varying input[PAN_MAX_VARYINGS];
      unsigned output_count;
      struct pan_shader_varying output[PAN_MAX_VARYINGS];
   } varyings;

   /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access
    * Uniforms (Bifrost) */
   struct panfrost_ubo_push push;

   /* Bit mask of UBOs that remain in use after pushing */
   uint32_t ubo_mask;

   /* Backend-specific info, discriminated by the target architecture */
   union {
      struct bifrost_shader_info bifrost;
      struct midgard_shader_info midgard;
   };
};
315 
/* A basic block in the backend IR's control flow graph. */
typedef struct pan_block {
   /* Link to next block. Must be first for mir_get_block */
   struct list_head link;

   /* List of instructions emitted for the current block */
   struct list_head instructions;

   /* Index of the block in source order */
   unsigned name;

   /* Control flow graph: up to two successor blocks (NULL if absent) and
    * the set of predecessor blocks */
   struct pan_block *successors[2];
   struct set *predecessors;
   bool unconditional_jumps;

   /* In liveness analysis, these are live masks (per-component) for
    * indices for the block. Scalar compilers have the luxury of using
    * simple bit fields, but for us, liveness is a vector idea. */
   uint16_t *live_in;
   uint16_t *live_out;
} pan_block;
337 
/* Common header embedded at the start of backend instructions so generic
 * block iteration can walk the instruction list. */
struct pan_instruction {
   struct list_head link;
};
341 
/* Iterates v backwards over the instructions of block */
#define pan_foreach_instr_in_block_rev(block, v)                               \
   list_for_each_entry_rev(struct pan_instruction, v, &block->instructions,    \
                           link)
345 
/* Iterates v over the non-NULL successors of blk (at most two). Declares the
 * iteration variables in the enclosing scope, so use at most once per scope. */
#define pan_foreach_successor(blk, v)                                          \
   pan_block *v;                                                               \
   pan_block **_v;                                                             \
   for (_v = (pan_block **)&blk->successors[0], v = *_v;                       \
        v != NULL && _v < (pan_block **)&blk->successors[2]; _v++, v = *_v)
351 
/* Iterates v over the predecessor set of blk. Declares the iteration
 * variables in the enclosing scope, so use at most once per scope. */
#define pan_foreach_predecessor(blk, v)                                        \
   struct set_entry *_entry_##v;                                               \
   struct pan_block *v;                                                        \
   for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL),            \
       v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL);          \
        _entry_##v != NULL;                                                    \
        _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v),      \
       v = (struct pan_block *)(_entry_##v ? _entry_##v->key : NULL))
360 
361 static inline pan_block *
pan_exit_block(struct list_head * blocks)362 pan_exit_block(struct list_head *blocks)
363 {
364    pan_block *last = list_last_entry(blocks, pan_block, link);
365    assert(!last->successors[0] && !last->successors[1]);
366    return last;
367 }
368 
/* Per-instruction callback invoked by pan_compute_liveness to update a live
 * mask; receives the mask, the instruction, and the node count */
typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max);

/* Marks components of `node` live (gen) in the live mask */
void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max,
                      uint16_t mask);
/* Clears components of `node` (kill) in the live mask */
void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max,
                       uint16_t mask);
/* Checks whether any component of `node` is live */
bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max);

/* Computes live_in/live_out masks for every block in `blocks` */
void pan_compute_liveness(struct list_head *blocks, unsigned temp_count,
                          pan_liveness_update callback);

/* Frees per-block liveness data allocated by pan_compute_liveness */
void pan_free_liveness(struct list_head *blocks);

/* Expands a per-component mask to a per-byte mask for `bytes`-sized
 * components */
uint16_t pan_to_bytemask(unsigned bytes, unsigned mask);

/* Adds a control-flow edge from block to successor */
void pan_block_add_successor(pan_block *block, pan_block *successor);
385 
/* IR indexing */
#define PAN_IS_REG (1)

/* IR printing helpers */
void pan_print_alu_type(nir_alu_type t, FILE *fp);

/* NIR passes to do some backend-specific lowering */

/* Writeout flags: C/Z/S are colour, depth, stencil; PAN_WRITEOUT_2 is
 * presumably the second (dual-source) colour output — TODO confirm */
#define PAN_WRITEOUT_C 1
#define PAN_WRITEOUT_Z 2
#define PAN_WRITEOUT_S 4
#define PAN_WRITEOUT_2 8

bool pan_nir_lower_zs_store(nir_shader *nir);
bool pan_nir_lower_store_component(nir_shader *shader);

bool pan_nir_lower_image_ms(nir_shader *shader);
bool pan_nir_lower_64bit_intrin(nir_shader *shader);

bool pan_lower_helper_invocation(nir_shader *shader);
bool pan_lower_sample_pos(nir_shader *shader);
bool pan_lower_xfb(nir_shader *nir);

/* Fills info->varyings from the NIR shader's inputs/outputs */
void pan_nir_collect_varyings(nir_shader *s, struct pan_shader_info *info);
410 
/*
 * Helper returning the subgroup size, i.e. the number of threads in a warp.
 * For Midgard (including warping models), this returns 1, as subgroups are
 * not supported.
 */
static inline unsigned
pan_subgroup_size(unsigned arch)
{
   /* Valhall */
   if (arch >= 9)
      return 16;

   /* Bifrost, second generation */
   if (arch >= 7)
      return 8;

   /* Bifrost, first generation */
   if (arch >= 6)
      return 4;

   /* Midgard */
   return 1;
}
428 
/* Architectural maximum number of resident threads, since this register may
 * not be implemented by a given chip. G31 is actually 512 instead of 768 but
 * it doesn't really matter. */

static inline unsigned
panfrost_max_thread_count(unsigned arch, unsigned work_reg_count)
{
   /* Midgard: thread capacity shrinks as register usage grows */
   if (arch == 4 || arch == 5) {
      if (work_reg_count > 8)
         return 64;
      if (work_reg_count > 4)
         return 128;
      return 256;
   }

   /* Bifrost, first generation */
   if (arch == 6)
      return 384;

   /* Bifrost, second generation (G31 is 512 but it doesn't matter) */
   if (arch == 7)
      return (work_reg_count > 32) ? 384 : 768;

   /* Valhall (for completeness) — also the fallback for any other arch */
   return (work_reg_count > 32) ? 512 : 1024;
}
460 
461 #endif
462