/*
 * Copyright (C) 2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef __PAN_IR_H
#define __PAN_IR_H

#include <stdint.h>
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"
#include "util/hash_table.h"
/* On Valhall, the driver gives the hardware a table of resource tables.
 * Resources are addressed as the index of the table together with the index of
 * the resource within the table. For simplicity, we put one type of resource
 * in each table and fix the numbering of the tables.
 *
 * This numbering is arbitrary. It is a software ABI between the
 * Gallium driver and the Valhall compiler.
 */
enum pan_resource_table {
   PAN_TABLE_UBO = 0,
   PAN_TABLE_ATTRIBUTE,
   PAN_TABLE_ATTRIBUTE_BUFFER,
   PAN_TABLE_SAMPLER,
   PAN_TABLE_TEXTURE,
   PAN_TABLE_IMAGE,

   PAN_NUM_RESOURCE_TABLES
};
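
/*
 * Illustrative sketch only: one way a (table, index) pair could be combined
 * into a single handle. The actual packing is part of the driver/compiler
 * ABI and is not defined by this header; the helper name and bit layout
 * below are hypothetical.
 */
static inline uint32_t
pan_example_resource_handle(enum pan_resource_table table, unsigned index)
{
   /* Hypothetical layout: table in the top byte, index in the low 24 bits */
   assert(index < (1u << 24));
   return ((uint32_t)table << 24) | index;
}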

/* Indices for named (non-XFB) varyings that are present. These are packed
 * tightly so they correspond to a bitfield of present varyings (P) indexed by
 * (1 << PAN_VARY_*). This has the nice property that you can look up the
 * buffer index of a given special field given a shift S by:
 *
 *      idx = popcount(P & ((1 << S) - 1))
 *
 * That is: count the present varyings that come before it; that count is its
 * buffer index. Likewise, the total number of special buffers required is
 * simply popcount(P). (An illustrative helper follows the enum below.)
 */

enum pan_special_varying {
   PAN_VARY_GENERAL = 0,
   PAN_VARY_POSITION = 1,
   PAN_VARY_PSIZ = 2,
   PAN_VARY_PNTCOORD = 3,
   PAN_VARY_FACE = 4,
   PAN_VARY_FRAGCOORD = 5,

   /* Keep last */
   PAN_VARY_MAX,
};
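
/*
 * Minimal sketch of the lookup described above, written as a loop rather than
 * a popcount so it is self-contained. Equivalent to
 * popcount(present & ((1 << vary) - 1)). Illustrative only; the compilers do
 * not use this helper.
 */
static inline unsigned
pan_special_varying_index_example(uint32_t present, enum pan_special_varying vary)
{
   unsigned idx = 0;

   /* Count the present varyings that come before this one */
   for (unsigned i = 0; i < (unsigned)vary; ++i) {
      if (present & (1u << i))
         idx++;
   }

   return idx;
}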

/* Maximum number of attribute descriptors required for varyings. These include
 * up to MAX_VARYING source-level varyings plus one descriptor for each
 * non-GENERAL special varying */
#define PAN_MAX_VARYINGS (MAX_VARYING + PAN_VARY_MAX - 1)

/* Define the general compiler entry point */

#define MAX_SYSVAL_COUNT 32

/* Allow 2D indexing of sysval IDs (a type and a per-type index), while
 * allowing nonparametric sysvals to equal their class for easy comparison */

#define PAN_SYSVAL(type, no) (((no) << 16) | PAN_SYSVAL_##type)
#define PAN_SYSVAL_TYPE(sysval) ((sysval) & 0xffff)
#define PAN_SYSVAL_ID(sysval) ((sysval) >> 16)

/* Define some common types. We start at one for easy indexing of hash
 * tables internal to the compiler */

enum {
   PAN_SYSVAL_VIEWPORT_SCALE = 1,
   PAN_SYSVAL_VIEWPORT_OFFSET = 2,
   PAN_SYSVAL_TEXTURE_SIZE = 3,
   PAN_SYSVAL_SSBO = 4,
   PAN_SYSVAL_NUM_WORK_GROUPS = 5,
   PAN_SYSVAL_SAMPLER = 7,
   PAN_SYSVAL_LOCAL_GROUP_SIZE = 8,
   PAN_SYSVAL_WORK_DIM = 9,
   PAN_SYSVAL_IMAGE_SIZE = 10,
   PAN_SYSVAL_SAMPLE_POSITIONS = 11,
   PAN_SYSVAL_MULTISAMPLED = 12,
   PAN_SYSVAL_RT_CONVERSION = 13,
   PAN_SYSVAL_VERTEX_INSTANCE_OFFSETS = 14,
   PAN_SYSVAL_DRAWID = 15,
   PAN_SYSVAL_BLEND_CONSTANTS = 16,
   PAN_SYSVAL_XFB = 17,
   PAN_SYSVAL_NUM_VERTICES = 18,
};

#define PAN_TXS_SYSVAL_ID(texidx, dim, is_array) \
   ((texidx) | ((dim) << 7) | ((is_array) ? (1 << 9) : 0))

#define PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) ((id) & 0x7f)
#define PAN_SYSVAL_ID_TO_TXS_DIM(id) (((id) >> 7) & 0x3)
#define PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id) !!((id) & (1 << 9))
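
/*
 * Example (illustrative only): how a texture-size sysval for texture #5 round
 * trips through the packing macros above. The dim value used here is only for
 * demonstration. The function always returns true.
 */
static inline bool
pan_sysval_packing_example(void)
{
   unsigned id = PAN_TXS_SYSVAL_ID(5, 2, true);
   unsigned sysval = PAN_SYSVAL(TEXTURE_SIZE, id);

   return PAN_SYSVAL_TYPE(sysval) == PAN_SYSVAL_TEXTURE_SIZE &&
          PAN_SYSVAL_ID(sysval) == id &&
          PAN_SYSVAL_ID_TO_TXS_TEX_IDX(id) == 5 &&
          PAN_SYSVAL_ID_TO_TXS_DIM(id) == 2 &&
          PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(id);
}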

/* Special attribute slots for vertex builtins. The numbering is somewhat
 * arbitrary, but we match the blob so traces can be compared more easily. */

enum {
   PAN_VERTEX_ID = 16,
   PAN_INSTANCE_ID = 17,
   PAN_MAX_ATTRIBUTE
};

struct panfrost_sysvals {
   /* The mapping of sysvals to uniforms and the number of entries in use */
   unsigned sysvals[MAX_SYSVAL_COUNT];
   unsigned sysval_count;
};

/* Architecturally, Bifrost/Valhall can address 128 FAU slots of 64 bits each.
 * In practice, the maximum number of FAU slots is limited by the
 * implementation: all known Bifrost and Valhall devices limit it to 64 FAU
 * slots. Therefore the maximum number of 32-bit words is 128, since there are
 * 2 words per FAU slot.
 *
 * Midgard can push at most 92 words, so this bound suffices. The Midgard
 * compiler pushes less than this, as Midgard uses register-mapped uniforms
 * instead of FAU, preventing large numbers of uniforms from being pushed for
 * nontrivial programs.
 */
#define PAN_MAX_PUSH 128

/* Architectural invariants (Midgard and Bifrost): a UBO must be <= 2^16 bytes,
 * so an offset to a word must be < 2^16. There are fewer than 2^8 UBOs. */

struct panfrost_ubo_word {
   uint16_t ubo;
   uint16_t offset;
};

struct panfrost_ubo_push {
   unsigned count;
   struct panfrost_ubo_word words[PAN_MAX_PUSH];
};

/* Helper for searching the above. Note this is O(N) in the number of pushed
 * constants; do not run it in the draw-call hot path. */

unsigned
pan_lookup_pushed_ubo(struct panfrost_ubo_push *push, unsigned ubo, unsigned offs);
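
/*
 * Sketch of what such a linear search looks like, shown only to document the
 * data structure. The real implementation lives in the compiler sources and
 * may differ, e.g. in how a missing entry is handled.
 */
static inline unsigned
pan_lookup_pushed_ubo_example(const struct panfrost_ubo_push *push,
                              unsigned ubo, unsigned offs)
{
   for (unsigned i = 0; i < push->count; ++i) {
      if (push->words[i].ubo == ubo && push->words[i].offset == offs)
         return i;
   }

   /* Not pushed; the real helper's behaviour in this case is not shown here */
   return ~0u;
}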

struct hash_table_u64 *
panfrost_init_sysvals(struct panfrost_sysvals *sysvals,
                      struct panfrost_sysvals *fixed_sysvals,
                      void *memctx);

unsigned
pan_lookup_sysval(struct hash_table_u64 *sysval_to_id,
                  struct panfrost_sysvals *sysvals,
                  int sysval);

int
panfrost_sysval_for_instr(nir_instr *instr, nir_dest *dest);

struct panfrost_compile_inputs {
   unsigned gpu_id;
   bool is_blend, is_blit;
   struct {
      unsigned rt;
      unsigned nr_samples;
      uint64_t bifrost_blend_desc;
   } blend;
   int fixed_sysval_ubo;
   struct panfrost_sysvals *fixed_sysval_layout;
   bool shaderdb;
   bool no_idvs;
   bool no_ubo_to_push;

   enum pipe_format rt_formats[8];
   uint8_t raw_fmt_mask;
   unsigned nr_cbufs;

   /* Used on Valhall.
    *
    * Bit mask of special desktop-only varyings (e.g. VARYING_SLOT_TEX0)
    * written by the previous stage (when compiling a fragment shader) or
    * written by this stage (when compiling a vertex shader). Bits are slots
    * from gl_varying_slot.
    *
    * For modern APIs (GLES or VK), this should be 0.
    */
   uint32_t fixed_varying_mask;

   union {
      struct {
         bool static_rt_conv;
         uint32_t rt_conv[8];
      } bifrost;
   };
};

struct pan_shader_varying {
   gl_varying_slot location;
   enum pipe_format format;
};

struct bifrost_shader_blend_info {
   nir_alu_type type;
   uint32_t return_offset;

   /* mali_bifrost_register_file_format corresponding to nir_alu_type */
   unsigned format;
};

/*
 * Unpacked form of a v7 message preload descriptor, produced by the compiler's
 * message preload optimization. By splitting out this struct, the compiler does
 * not need to know about data structure packing, avoiding a dependency on
 * GenXML.
 */
struct bifrost_message_preload {
   /* Whether to preload this message */
   bool enabled;

   /* Varying to load from */
   unsigned varying_index;

   /* Load into FP16 registers if set, FP32 registers otherwise */
   bool fp16;

   /* Number of components, ignored if texturing */
   unsigned num_components;

   /* If texture is set, performs a texture instruction according to
    * texture_index, skip, and zero_lod. If texture is unset, only the
    * varying load is performed.
    */
   bool texture, skip, zero_lod;
   unsigned texture_index;
};
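
/*
 * Example (illustrative values only): a preload descriptor for a plain FP32
 * vec4 varying load from varying slot 1, with no texturing.
 */
static inline struct bifrost_message_preload
pan_example_varying_preload(void)
{
   struct bifrost_message_preload msg = {
      .enabled = true,
      .varying_index = 1,
      .fp16 = false,
      .num_components = 4,
      .texture = false,
   };

   return msg;
}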

struct bifrost_shader_info {
   struct bifrost_shader_blend_info blend[8];
   nir_alu_type blend_src1_type;
   bool wait_6, wait_7;
   struct bifrost_message_preload messages[2];

   /* Whether any flat varyings are loaded. This may disable optimizations
    * that change the provoking vertex, since that would load incorrect
    * values for flat varyings.
    */
   bool uses_flat_shading;
};

struct midgard_shader_info {
   unsigned first_tag;
};

struct pan_shader_info {
   gl_shader_stage stage;
   unsigned work_reg_count;
   unsigned tls_size;
   unsigned wls_size;

   /* Bit mask of preloaded registers */
   uint64_t preload;

   union {
      struct {
         bool reads_frag_coord;
         bool reads_point_coord;
         bool reads_face;
         bool can_discard;
         bool writes_depth;
         bool writes_stencil;
         bool writes_coverage;
         bool sidefx;
         bool sample_shading;
         bool early_fragment_tests;
         bool can_early_z, can_fpk;
         BITSET_WORD outputs_read;
         BITSET_WORD outputs_written;
      } fs;

      struct {
         bool writes_point_size;

         /* If the primary shader writes point size, the Valhall
          * driver may need a variant that does not write point
          * size. Offset to such a shader in the program binary.
          *
          * Zero if no such variant is required.
          *
          * Only used with IDVS on Valhall.
          */
         unsigned no_psiz_offset;

         /* Set if Index-Driven Vertex Shading is in use */
         bool idvs;

         /* If IDVS is used, whether a varying shader is used */
         bool secondary_enable;

         /* If a varying shader is used, the varying shader's
          * offset in the program binary
          */
         unsigned secondary_offset;

         /* If IDVS is in use, number of work registers used by
          * the varying shader
          */
         unsigned secondary_work_reg_count;

         /* If IDVS is in use, bit mask of preloaded registers
          * used by the varying shader
          */
         uint64_t secondary_preload;
      } vs;

      struct {
         /* Is it legal to merge workgroups? This is true if the
          * shader uses neither barriers nor shared memory.
          *
          * Used by the Valhall hardware.
          */
         bool allow_merging_workgroups;
      } cs;
   };

   /* Does the shader contain a barrier, or (for fragment shaders) does it
    * require helper invocations, which demand the same ordering guarantees
    * from the hardware? These notions are unified in the hardware, so we
    * unify them here as well.
    */
   bool contains_barrier;
   bool separable;
   bool writes_global;
   uint64_t outputs_written;

   unsigned sampler_count;
   unsigned texture_count;
   unsigned ubo_count;
   unsigned attributes_read_count;
   unsigned attribute_count;
   unsigned attributes_read;

   struct {
      unsigned input_count;
      struct pan_shader_varying input[PAN_MAX_VARYINGS];
      unsigned output_count;
      struct pan_shader_varying output[PAN_MAX_VARYINGS];
   } varyings;

   struct panfrost_sysvals sysvals;

   /* UBOs to push to Register Mapped Uniforms (Midgard) or Fast Access
    * Uniforms (Bifrost) */
   struct panfrost_ubo_push push;

   uint32_t ubo_mask;

   union {
      struct bifrost_shader_info bifrost;
      struct midgard_shader_info midgard;
   };
};

typedef struct pan_block {
   /* Link to next block. Must be first for mir_get_block */
   struct list_head link;

   /* List of instructions emitted for the current block */
   struct list_head instructions;

   /* Index of the block in source order */
   unsigned name;

   /* Control flow graph */
   struct pan_block *successors[2];
   struct set *predecessors;
   bool unconditional_jumps;

   /* In liveness analysis, these are live masks (per-component) for
    * indices for the block. Scalar compilers have the luxury of using
    * simple bit fields, but for us, liveness is a vector idea. */
   uint16_t *live_in;
   uint16_t *live_out;
} pan_block;

struct pan_instruction {
   struct list_head link;
};

#define pan_foreach_instr_in_block_rev(block, v) \
   list_for_each_entry_rev(struct pan_instruction, v, &block->instructions, link)

#define pan_foreach_successor(blk, v) \
   pan_block *v; \
   pan_block **_v; \
   for (_v = (pan_block **) &blk->successors[0], \
        v = *_v; \
        v != NULL && _v < (pan_block **) &blk->successors[2]; \
        _v++, v = *_v) \

#define pan_foreach_predecessor(blk, v) \
   struct set_entry *_entry_##v; \
   struct pan_block *v; \
   for (_entry_##v = _mesa_set_next_entry(blk->predecessors, NULL), \
        v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL); \
        _entry_##v != NULL; \
        _entry_##v = _mesa_set_next_entry(blk->predecessors, _entry_##v), \
        v = (struct pan_block *) (_entry_##v ? _entry_##v->key : NULL))
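
/*
 * Usage sketch for the iterators above: count a block's successors. Purely
 * illustrative; the compilers use the macros directly.
 */
static inline unsigned
pan_example_num_successors(pan_block *block)
{
   unsigned count = 0;

   pan_foreach_successor(block, succ)
      count++;

   return count;
}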

static inline pan_block *
pan_exit_block(struct list_head *blocks)
{
   pan_block *last = list_last_entry(blocks, pan_block, link);
   assert(!last->successors[0] && !last->successors[1]);
   return last;
}

typedef void (*pan_liveness_update)(uint16_t *, void *, unsigned max);

void pan_liveness_gen(uint16_t *live, unsigned node, unsigned max, uint16_t mask);
void pan_liveness_kill(uint16_t *live, unsigned node, unsigned max, uint16_t mask);
bool pan_liveness_get(uint16_t *live, unsigned node, uint16_t max);

void pan_compute_liveness(struct list_head *blocks,
                          unsigned temp_count,
                          pan_liveness_update callback);

void pan_free_liveness(struct list_head *blocks);
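
/*
 * Sketch of a pan_liveness_update callback for a hypothetical backend
 * instruction with one destination and one source (both already converted to
 * the node indexing used by the liveness helpers). The struct and its field
 * names are made up for illustration; each backend supplies its own callback.
 */
struct pan_example_liveness_instr {
   struct pan_instruction base;
   unsigned dest, src;
   uint16_t dest_mask, src_mask;
};

static inline void
pan_example_liveness_update(uint16_t *live, void *data, unsigned max)
{
   struct pan_example_liveness_instr *ins = data;

   /* Walking backwards: the definition dies here, then the use becomes live */
   pan_liveness_kill(live, ins->dest, max, ins->dest_mask);
   pan_liveness_gen(live, ins->src, max, ins->src_mask);
}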

uint16_t
pan_to_bytemask(unsigned bytes, unsigned mask);

void pan_block_add_successor(pan_block *block, pan_block *successor);

/* IR indexing */
#define PAN_IS_REG (1)

static inline unsigned
pan_ssa_index(nir_ssa_def *ssa)
{
   /* Off-by-one ensures BIR_NO_ARG is skipped */
   return ((ssa->index + 1) << 1) | 0;
}

static inline unsigned
pan_src_index(nir_src *src)
{
   if (src->is_ssa)
      return pan_ssa_index(src->ssa);
   else {
      assert(!src->reg.indirect);
      return (src->reg.reg->index << 1) | PAN_IS_REG;
   }
}

static inline unsigned
pan_dest_index(nir_dest *dst)
{
   if (dst->is_ssa)
      return pan_ssa_index(&dst->ssa);
   else {
      assert(!dst->reg.indirect);
      return (dst->reg.reg->index << 1) | PAN_IS_REG;
   }
}

/* IR printing helpers */
void pan_print_alu_type(nir_alu_type t, FILE *fp);

/* Until it can be upstreamed.. */
bool pan_has_source_mod(nir_alu_src *src, nir_op op);
bool pan_has_dest_mod(nir_dest **dest, nir_op op);

/* NIR passes to do some backend-specific lowering */

#define PAN_WRITEOUT_C 1
#define PAN_WRITEOUT_Z 2
#define PAN_WRITEOUT_S 4
#define PAN_WRITEOUT_2 8

bool pan_nir_lower_zs_store(nir_shader *nir);

bool pan_nir_lower_64bit_intrin(nir_shader *shader);

bool pan_lower_helper_invocation(nir_shader *shader);
bool pan_lower_sample_pos(nir_shader *shader);
bool pan_lower_xfb(nir_shader *nir);

/*
 * Helper returning the subgroup size. Generally, this is equal to the number of
 * threads in a warp. For Midgard (including warping models), this returns 1, as
 * subgroups are not supported.
 */
static inline unsigned
pan_subgroup_size(unsigned arch)
{
   if (arch >= 9)
      return 16;
   else if (arch >= 7)
      return 8;
   else if (arch >= 6)
      return 4;
   else
      return 1;
}

#endif