1 /*
2 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <robclark@freedesktop.org>
7 */
8
9 #ifndef IR3_SHADER_H_
10 #define IR3_SHADER_H_
11
12 #include <stdio.h>
13
14 #include "c11/threads.h"
15 #include "compiler/nir/nir.h"
16 #include "compiler/shader_enums.h"
17 #include "util/bitscan.h"
18 #include "util/disk_cache.h"
19
20 #include "ir3_compiler.h"
21
22 BEGINC;
23
/* Convert a struct field's byte offset / a type's byte size into units of
 * dwords (uint32_t), rounding up.  Used for addressing driver params.
 */
#define dword_offsetof(type, name) DIV_ROUND_UP(offsetof(type, name), 4)
#define dword_sizeof(type) DIV_ROUND_UP(sizeof(type), 4)
26
/**
 * Driver params for compute shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_cs {
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer.  Keep that in mind when/if
    * adding any additional CS driver params.
    */
   uint32_t num_work_groups_x;
   uint32_t num_work_groups_y;
   uint32_t num_work_groups_z;
   uint32_t work_dim;
   uint32_t base_group_x;
   uint32_t base_group_y;
   uint32_t base_group_z;
   uint32_t subgroup_size;
   uint32_t local_group_size_x;
   uint32_t local_group_size_y;
   uint32_t local_group_size_z;
   uint32_t subgroup_id_shift;
   uint32_t workgroup_id_x;
   uint32_t workgroup_id_y;
   uint32_t workgroup_id_z;
   /* pad struct size to a vec4 multiple: */
   uint32_t __pad;
};
/* Dword offset of a field within the CS driver params: */
#define IR3_DP_CS(name) dword_offsetof(struct ir3_driver_params_cs, name)
56
/**
 * Driver params for vertex shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_vs {
   uint32_t draw_id;
   uint32_t vtxid_base;
   uint32_t instid_base;
   uint32_t vtxcnt_max;
   uint32_t is_indexed_draw; /* Note: boolean, ie. 0 or ~0 */
   /* user-clip-plane components, up to 8x vec4's: */
   struct {
      uint32_t x;
      uint32_t y;
      uint32_t z;
      uint32_t w;
   } ucp[8];
   /* pad struct size to a vec4 multiple (dwords 37..39): */
   uint32_t __pad_37_39[3];
};
/* Dword offset of a field within the VS driver params: */
#define IR3_DP_VS(name) dword_offsetof(struct ir3_driver_params_vs, name)
78
/**
 * Driver params for TCS shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_tcs {
   /* gl_TessLevelOuter / gl_TessLevelInner defaults: */
   uint32_t default_outer_level_x;
   uint32_t default_outer_level_y;
   uint32_t default_outer_level_z;
   uint32_t default_outer_level_w;
   uint32_t default_inner_level_x;
   uint32_t default_inner_level_y;
   /* pad struct size to a vec4 multiple (dwords 6..7): */
   uint32_t __pad_06_07[2];
};
/* Dword offset of a field within the TCS driver params: */
#define IR3_DP_TCS(name) dword_offsetof(struct ir3_driver_params_tcs, name)
94
/**
 * Driver params for fragment shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_fs {
   uint32_t subgroup_size;
   uint32_t __pad_01_03[3];
   /* Dynamic params (that aren't known when compiling the shader) */
#define IR3_DP_FS_DYNAMIC dword_offsetof(struct ir3_driver_params_fs, frag_invocation_count)
   uint32_t frag_invocation_count;
   uint32_t __pad_05_07[3];
   uint32_t frag_size;
   uint32_t __pad_09;
   uint32_t frag_offset;
   uint32_t __pad_11_12[2];
};
/* Dword offset of a field within the FS driver params: */
#define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
113
/* Implementation limits: */
#define IR3_MAX_SHADER_BUFFERS  32
#define IR3_MAX_SHADER_IMAGES   32
#define IR3_MAX_SO_BUFFERS      4
#define IR3_MAX_SO_STREAMS      4
#define IR3_MAX_SO_OUTPUTS      128
#define IR3_MAX_UBO_PUSH_RANGES 32
120
/* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
enum ir3_bary {
   IJ_PERSP_PIXEL,
   IJ_PERSP_SAMPLE,
   IJ_PERSP_CENTROID,
   IJ_PERSP_CENTER_RHW,
   IJ_LINEAR_PIXEL,
   IJ_LINEAR_CENTROID,
   IJ_LINEAR_SAMPLE,
   IJ_COUNT,
};
132
/* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,      /* only the single wavesize may be used */
   IR3_SINGLE_OR_DOUBLE, /* either wavesize is acceptable */
   IR3_DOUBLE_ONLY,      /* only the double wavesize may be used */
};
139
/**
 * Description of a lowered UBO.
 */
struct nir_def;

struct ir3_ubo_info {
   struct nir_def *global_base; /* For global loads, the base address */
   uint32_t block;              /* Which constant block */
   uint16_t bindless_base;      /* For bindless, which base register is used */
   bool bindless;               /* UBO is accessed via bindless descriptors */
   bool global;                 /* UBO is really a lowered global load */
};
152
/**
 * Description of a range of a lowered UBO access.
 *
 * Drivers should not assume that there are not multiple disjoint
 * lowered ranges of a single UBO.
 */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo;
   uint32_t offset;     /* start offset to push in the const register file */
   uint32_t start, end; /* range of block that's actually used */
};
164
/* Result of UBO-to-push-const lowering analysis: */
struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled; /* number of valid entries in range[] */
   uint32_t size;        /* total pushed size */
};
170
/* How (Vulkan) push constants are implemented: */
enum ir3_push_consts_type {
   IR3_PUSH_CONSTS_NONE,
   IR3_PUSH_CONSTS_PER_STAGE,
   IR3_PUSH_CONSTS_SHARED,
   IR3_PUSH_CONSTS_SHARED_PREAMBLE,
};
177
/* This represents an internal UBO filled out by the driver.  There are a few
 * common UBOs that must be filled out identically by all drivers, for example
 * for shader linkage, but drivers can also add their own that they manage
 * themselves.
 */
struct ir3_driver_ubo {
   int32_t idx;   /* UBO slot index */
   uint32_t size; /* size of the UBO, in dwords */
};
187
/* Categories of const-file allocations, in allocation-priority order.
 * See struct ir3_const_state for the overall allocation flow.
 */
enum ir3_const_alloc_type {
   /* Vulkan, push consts. */
   IR3_CONST_ALLOC_PUSH_CONSTS = 0,
   /* Vulkan, offsets required to calculate offsets of descriptors with dynamic
    * offsets.
    */
   IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET = 1,
   /* Vulkan, addresses of inline uniform buffers, to which we fallback when
    * their size is unknown.
    */
   IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS = 2,
   /* Common, stage-specific params uploaded by the driver/HW. */
   IR3_CONST_ALLOC_DRIVER_PARAMS = 3,
   /* Common, UBOs lowered to consts. */
   IR3_CONST_ALLOC_UBO_RANGES = 4,
   /* Common, consts produced by a preamble to be used in a main shader. */
   IR3_CONST_ALLOC_PREAMBLE = 5,
   /* Vulkan, inline uniforms loaded into consts in the preamble. */
   IR3_CONST_ALLOC_GLOBAL = 6,
   /* OpenGL, pre-a6xx; pointers to UBOs */
   IR3_CONST_ALLOC_UBO_PTRS = 7,
   /* OpenGL, a5xx only; needed to calculate pixel offset, but only
    * for images that have image_{load,store,size,atomic*} intrinsics.
    */
   IR3_CONST_ALLOC_IMAGE_DIMS = 8,
   /* OpenCL */
   IR3_CONST_ALLOC_KERNEL_PARAMS = 9,
   /* OpenGL, TFBO addresses only for vs on a3xx/a4xx */
   IR3_CONST_ALLOC_TFBO = 10,
   /* Common, stage-dependent primitive params:
    *   vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
    *   hs, ds: uvec4(primitive_stride, vertex_stride,
    *                 patch_stride, patch_vertices_in)
    *           uvec4(tess_param_base, tess_factor_base)
    */
   IR3_CONST_ALLOC_PRIMITIVE_PARAM = 11,
   /* Common, mapping from varying location to offset. */
   IR3_CONST_ALLOC_PRIMITIVE_MAP = 12,
   IR3_CONST_ALLOC_MAX = 13,
};
228
/* One const-file allocation, all in units of vec4: */
struct ir3_const_allocation {
   uint32_t offset_vec4; /* start of the allocation in the const file */
   uint32_t size_vec4;   /* allocated size; 0 means "not allocated" */

   /* space reserved ahead of time for this type but not yet allocated: */
   uint32_t reserved_size_vec4;
   uint32_t reserved_align_vec4;
};

/* Per-shader table of const-file allocations, indexed by allocation type: */
struct ir3_const_allocations {
   struct ir3_const_allocation consts[IR3_CONST_ALLOC_MAX];
   uint32_t max_const_offset_vec4; /* high-water mark of allocations */
   uint32_t reserved_vec4;         /* total outstanding reservations */
};
242
243 static inline bool
ir3_const_can_upload(const struct ir3_const_allocations * const_alloc,enum ir3_const_alloc_type type,uint32_t shader_const_size_vec4)244 ir3_const_can_upload(const struct ir3_const_allocations *const_alloc,
245 enum ir3_const_alloc_type type,
246 uint32_t shader_const_size_vec4)
247 {
248 return const_alloc->consts[type].size_vec4 > 0 &&
249 const_alloc->consts[type].offset_vec4 < shader_const_size_vec4;
250 }
251
/* Tracking of the a5xx image-dimension consts (IR3_CONST_ALLOC_IMAGE_DIMS): */
struct ir3_const_image_dims {
   uint32_t mask;  /* bitmask of images that have image_store */
   uint32_t count; /* number of consts allocated */
   /* three const allocated per image which has image_store:
    *   + cpp         (bytes per pixel)
    *   + pitch       (y pitch)
    *   + array_pitch (z pitch)
    */
   uint32_t off[IR3_MAX_SHADER_IMAGES];
};
262
/**
 * Describes the layout of shader consts in the const register file
 * and additional info about individual allocations.
 *
 * Each consts section is aligned to vec4.  Note that pointer
 * size (ubo, etc) changes depending on generation.
 *
 * The consts allocation flow is as follows:
 *  1) Turnip/Freedreno allocates consts required by corresponding API,
 *     e.g. push const, inline uniforms, etc.  Then passes
 *     ir3_const_allocations into IR3.
 *  2) ir3_setup_const_state allocates consts with non-negotiable size.
 *  3) IR3 lowerings afterwards allocate from the free space left.
 *
 * Note UBO size in bytes should be aligned to vec4
 */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_app_ubos;      /* # of UBOs not including driver UBOs */
   unsigned num_driver_params; /* scalar */

   /* driver-managed UBOs (see struct ir3_driver_ubo): */
   struct ir3_driver_ubo consts_ubo;
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

   struct ir3_const_allocations allocs;

   struct ir3_const_image_dims image_dims;

   /* immediate values promoted to the const file: */
   unsigned immediates_count;
   unsigned immediates_size;
   uint32_t *immediates;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   enum ir3_push_consts_type push_consts_type;
};
300
/**
 * A single output for vertex transform feedback.
 */
struct ir3_stream_output {
   unsigned register_index : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2; /**< 0 to 3 */
   unsigned num_components : 3;  /**< 1 to 4 */
   unsigned output_buffer : 3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset : 16;     /**< offset into the buffer in dwords */
   unsigned stream : 2;          /**< 0 to 3 */
};
312
/**
 * Stream output for vertex transform feedback.
 */
struct ir3_stream_output_info {
   unsigned num_outputs;
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written;
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};
331
/**
 * Starting from a4xx, HW supports pre-dispatching texture sampling
 * instructions prior to scheduling a shader stage, when the
 * coordinate maps exactly to an output of the previous stage.
 */

/**
 * There is a limit in the number of pre-dispatches allowed for any
 * given stage.
 */
#define IR3_MAX_SAMPLER_PREFETCH 4

/**
 * This is the output stream value for 'cmd', as used by blob.  It may
 * encode the return type (in 3 bits) but it hasn't been verified yet.
 */
#define IR3_SAMPLER_PREFETCH_CMD          0x4
#define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
350
/**
 * Stream output for texture sampling pre-dispatches.
 */
struct ir3_sampler_prefetch {
   uint8_t src;              /* src register holding the coordinate */
   bool bindless;            /* use the bindless descriptor ids below */
   uint8_t samp_id;          /* non-bindless sampler index */
   uint8_t tex_id;           /* non-bindless texture index */
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;              /* dst register */
   uint8_t wrmask;           /* components written */
   uint8_t half_precision;
   opc_t tex_opc;            /* texture fetch opcode to pre-dispatch */
};
366
/* Configuration key used to identify a shader variant.. different
 * shader variants can be used to implement features not supported
 * in hw (two sided color), binning-pass vertex shader, etc.
 *
 * When adding to this struct, please update ir3_shader_variant()'s debug
 * output.
 */
struct ir3_shader_key {
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader.  In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;

         /* Whether driconf "dual_color_blend_by_location" workaround is
          * enabled
          */
         unsigned force_dual_color_blend : 1;
      };
      /* all of the bitfields above, viewed as one word for fast compare: */
      uint32_t global;
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};
437
438 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)439 ir3_tess_mode(enum tess_primitive_mode tess_mode)
440 {
441 switch (tess_mode) {
442 case TESS_PRIMITIVE_ISOLINES:
443 return IR3_TESS_ISOLINES;
444 case TESS_PRIMITIVE_TRIANGLES:
445 return IR3_TESS_TRIANGLES;
446 case TESS_PRIMITIVE_QUADS:
447 return IR3_TESS_QUADS;
448 default:
449 unreachable("bad tessmode");
450 }
451 }
452
453 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)454 ir3_tess_factor_stride(unsigned patch_type)
455 {
456 /* note: this matches the stride used by ir3's build_tessfactor_base */
457 switch (patch_type) {
458 case IR3_TESS_ISOLINES:
459 return 12;
460 case IR3_TESS_TRIANGLES:
461 return 20;
462 case IR3_TESS_QUADS:
463 return 28;
464 default:
465 unreachable("bad tessmode");
466 }
467 }
468
469 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)470 ir3_shader_key_equal(const struct ir3_shader_key *a,
471 const struct ir3_shader_key *b)
472 {
473 /* slow-path if we need to check {v,f}saturate_{s,t,r} */
474 if (a->has_per_samp || b->has_per_samp)
475 return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
476 return a->global == b->global;
477 }
478
479 /* will the two keys produce different lowering for a fragment shader? */
480 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)481 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
482 struct ir3_shader_key *last_key)
483 {
484 if (last_key->has_per_samp || key->has_per_samp) {
485 if ((last_key->fsamples != key->fsamples) ||
486 (last_key->fastc_srgb != key->fastc_srgb) ||
487 memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
488 sizeof(key->fsampler_swizzles)))
489 return true;
490 }
491
492 if (last_key->rasterflat != key->rasterflat)
493 return true;
494
495 if (last_key->ucp_enables != key->ucp_enables)
496 return true;
497
498 if (last_key->safe_constlen != key->safe_constlen)
499 return true;
500
501 return false;
502 }
503
504 /* will the two keys produce different lowering for a vertex shader? */
505 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)506 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
507 struct ir3_shader_key *last_key)
508 {
509 if (last_key->has_per_samp || key->has_per_samp) {
510 if ((last_key->vsamples != key->vsamples) ||
511 (last_key->vastc_srgb != key->vastc_srgb) ||
512 memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
513 sizeof(key->vsampler_swizzles)))
514 return true;
515 }
516
517 if (last_key->ucp_enables != key->ucp_enables)
518 return true;
519
520 if (last_key->safe_constlen != key->safe_constlen)
521 return true;
522
523 return false;
524 }
525
/**
 * On a4xx+a5xx, Images share state with textures and SSBOs:
 *
 *   + Uses texture (cat5) state/instruction (isam) to read
 *   + Uses SSBO state and instructions (cat6) to write and for atomics
 *
 * Starting with a6xx, Images and SSBOs are basically the same thing,
 * with texture state and isam also used for SSBO reads.
 *
 * On top of that, gallium makes the SSBO (shader_buffers) state semi
 * sparse, with the first half of the state space used for atomic
 * counters lowered to atomic buffers.  We could ignore this, but I
 * don't think we could *really* handle the case of a single shader
 * that used the max # of textures + images + SSBOs.  And once we are
 * offsetting images by num_ssbos (or visa versa) to map them into
 * the same hardware state, the hardware state has become coupled to
 * the shader state, so at this point we might as well just use a
 * mapping table to remap things from image/SSBO idx to hw idx.
 *
 * To make things less (more?) confusing, for the hw "SSBO" state
 * (since it is really both SSBO and Image) I'll use the name "IBO"
 */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
569
/* Captured textual debug output for a variant: */
struct ir3_disasm_info {
   bool write_disasm; /* should nir/disasm text be captured? */
   char *nir;         /* NIR dump */
   char *disasm;      /* assembly dump */
};
575
/* Represents half register in regid */
#define HALF_REG_ID 0x100

/* Options for common NIR optimization passes done in ir3.  This is used for
 * both finalize and post-finalize (where it has to be in the shader).
 */
struct ir3_shader_nir_options {
   /* For the modes specified, accesses are assumed to be bounds-checked as
    * defined by VK_EXT_robustness2 and optimizations may have to be more
    * conservative.
    */
   nir_variable_mode robust_modes;
};
589
/* API/driver-level compile options for a shader (shared by all variants): */
struct ir3_shader_options {
   /* What API-visible wavesizes are allowed.  Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;
   /* What wavesizes we're allowed to actually use.  If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;
   enum ir3_push_consts_type push_consts_type;

   uint32_t push_consts_base;
   uint32_t push_consts_dwords;

   /* Some const allocations are required at API level. */
   struct ir3_const_allocations const_allocs;

   struct ir3_shader_nir_options nir_options;
};
610
/**
 * Shader variant which contains the actual hw shader instructions,
 * and necessary info for shader state setup.
 *
 * NOTE(layout): everything from 'info' onwards is serialized byte-for-byte
 * to the disk cache (see VARIANT_CACHE_START below), so field order and
 * placement of pointers matters here.
 */
struct ir3_shader_variant {
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   //   union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   //   };

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   struct ir3_disasm_info disasm_info;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant.  Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   struct ir3_shader_options shader_options;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes per fiber */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Whether multi-position output is enabled. */
   bool multi_pos_output;

   /* Whether dual-source blending is enabled. */
   bool dual_src_blend;

   /* Whether early preamble is enabled. */
   bool early_preamble;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used.  So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t view;
      bool half : 1;
   } outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_viewport,
        writes_stencilref;
   bool writes_shading_rate;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage.  In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[13 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc).  This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary : 1;       /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half : 1;
      bool flat : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */
   bool reads_primid;
   bool reads_shading_rate;
   bool reads_smask;

   /* sum of input components (scalar).  For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_full_quad;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   bool per_samp;

   bool post_depth_coverage;

   /* Are we using split or merged register file? */
   bool mergedregs;

   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   /* If true, the last use of helper invocations is the texture prefetch and
    * they should be disabled for the actual shader.  Equivalent to adding
    * (eq)nop at the beginning of the shader.
    */
   bool prefetch_end_of_quad;

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   /* per-stage info, selected by 'type': */
   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         enum gl_tess_spacing spacing : 2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw : 1;
         bool point_mode : 1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in : 3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
         bool uses_fbfetch_output : 1;
         bool fbfetch_coherent : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
         bool force_linear_dispatch;
         uint32_t local_invocation_id;
         uint32_t work_group_id;
      } cs;
   };

   uint32_t vtxid_base;

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;
};
926
927 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)928 ir3_shader_stage(struct ir3_shader_variant *v)
929 {
930 switch (v->type) {
931 case MESA_SHADER_VERTEX:
932 return v->binning_pass ? "BVERT" : "VERT";
933 case MESA_SHADER_TESS_CTRL:
934 return "TCS";
935 case MESA_SHADER_TESS_EVAL:
936 return "TES";
937 case MESA_SHADER_GEOMETRY:
938 return "GEOM";
939 case MESA_SHADER_FRAGMENT:
940 return "FRAG";
941 case MESA_SHADER_COMPUTE:
942 case MESA_SHADER_KERNEL:
943 return "CL";
944 default:
945 unreachable("invalid type");
946 return NULL;
947 }
948 }
949
950 /* Currently we do not do binning for tess. And for GS there is no
951 * cross-stage VS+GS optimization, so the full VS+GS is used in
952 * the binning pass.
953 */
954 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)955 ir3_has_binning_vs(const struct ir3_shader_key *key)
956 {
957 if (key->tessellation || key->has_gs)
958 return false;
959 return true;
960 }
961
/**
 * Represents a shader at the API level, before state-specific variants are
 * generated.
 */
struct ir3_shader {
   gl_shader_stage type;

   /* shader id (for debug): */
   uint32_t id;
   uint32_t variant_count;

   /* Set by freedreno after shader_state_create, so we can emit debug info
    * when recompiling a shader at draw time.
    */
   bool initial_variants_done;

   struct ir3_compiler *compiler;

   struct ir3_shader_options options;

   bool nir_finalized;
   struct nir_shader *nir;
   struct ir3_stream_output_info stream_output;

   /* per shader stage specific info: */
   union {
      /* for compute shaders: */
      struct {
         unsigned req_input_mem; /* in dwords */
         unsigned req_local_mem;
         bool force_linear_dispatch;
      } cs;
      /* For vertex shaders: */
      struct {
         /* If we need to generate a passthrough TCS, it will be a function of
          * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
          * in the VS keyed by # of patch_vertices-1.
          */
         unsigned passthrough_tcs_compiled;
         struct ir3_shader *passthrough_tcs[32];
      } vs;
   };

   /* linked list of variants, guarded by variants_lock: */
   struct ir3_shader_variant *variants;
   mtx_t variants_lock;

   cache_key cache_key; /* shader disk-cache key */

   /* Bitmask of bits of the shader key used by this shader.  Used to avoid
    * recompiles for GL NOS that doesn't actually apply to the shader.
    */
   struct ir3_shader_key key_mask;
};
1015
1016 /**
1017 * In order to use the same cmdstream, in particular constlen setup and const
1018 * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's
1019 * corresponding draw pass shaders const_state.
1020 */
1021 static inline const struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)1022 ir3_const_state(const struct ir3_shader_variant *v)
1023 {
1024 if (v->binning_pass)
1025 return v->nonbinning->const_state;
1026 return v->const_state;
1027 }
1028
/* Mutable access to a variant's const_state.  Binning variants alias their
 * draw-pass (nonbinning) variant's const_state (see ir3_const_state()), so
 * only non-binning variants may mutate it.
 */
static inline struct ir3_const_state *
ir3_const_state_mut(const struct ir3_shader_variant *v)
{
   assert(!v->binning_pass);
   return v->const_state;
}
1035
1036 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)1037 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
1038 {
1039 const struct ir3_compiler *compiler = v->compiler;
1040 bool shared_consts_enable =
1041 ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1042
1043 /* Shared consts size for CS and FS matches with what's acutally used,
1044 * but the size of shared consts for geomtry stages doesn't.
1045 * So we use a hw quirk for geometry shared consts.
1046 */
1047 uint32_t shared_consts_size = shared_consts_enable ?
1048 compiler->shared_consts_size : 0;
1049
1050 uint32_t shared_consts_size_geom = shared_consts_enable ?
1051 compiler->geom_shared_consts_size_quirk : 0;
1052
1053 uint32_t safe_shared_consts_size = shared_consts_enable ?
1054 ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
1055 DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
1056
1057 if ((v->type == MESA_SHADER_COMPUTE) ||
1058 (v->type == MESA_SHADER_KERNEL)) {
1059 return compiler->max_const_compute - shared_consts_size;
1060 } else if (safe_constlen) {
1061 return compiler->max_const_safe - safe_shared_consts_size;
1062 } else if (v->type == MESA_SHADER_FRAGMENT) {
1063 return compiler->max_const_frag - shared_consts_size;
1064 } else {
1065 return compiler->max_const_geom - shared_consts_size_geom;
1066 }
1067 }
1068
1069 /* Given a variant, calculate the maximum constlen it can have.
1070 */
1071 static inline unsigned
ir3_max_const(const struct ir3_shader_variant * v)1072 ir3_max_const(const struct ir3_shader_variant *v)
1073 {
1074 return _ir3_max_const(v, v->key.safe_constlen);
1075 }
1076
1077 uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm);
1078 uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm);
1079
1080 static inline unsigned
ir3_const_reg(const struct ir3_const_state * const_state,enum ir3_const_alloc_type type,unsigned offset)1081 ir3_const_reg(const struct ir3_const_state *const_state,
1082 enum ir3_const_alloc_type type,
1083 unsigned offset)
1084 {
1085 unsigned n = const_state->allocs.consts[type].offset_vec4;
1086 assert(const_state->allocs.consts[type].size_vec4 != 0);
1087 return regid(n + offset / 4, offset % 4);
1088 }
1089
1090 /* Return true if a variant may need to be recompiled due to exceeding the
1091 * maximum "safe" constlen.
1092 */
1093 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)1094 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
1095 {
1096 return v->constlen > _ir3_max_const(v, true);
1097 }
1098
/* Assemble a variant into a hw instruction buffer (caller frees): */
void *ir3_shader_assemble(struct ir3_shader_variant *v);
/* Variant creation/lookup: */
struct ir3_shader_variant *
ir3_shader_create_variant(struct ir3_shader *shader,
                          const struct ir3_shader_key *key,
                          bool keep_ir);
struct ir3_shader_variant *
ir3_shader_get_variant(struct ir3_shader *shader,
                       const struct ir3_shader_key *key, bool binning_pass,
                       bool keep_ir, bool *created);

/* Shader object lifecycle and queries: */
struct ir3_shader *
ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
                    const struct ir3_shader_options *options,
                    struct ir3_stream_output_info *stream_output);
uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
                           const struct ir3_compiler *compiler);
/* See shader->vs.passthrough_tcs cache (keyed by patch_vertices - 1): */
struct ir3_shader *
ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
void ir3_shader_destroy(struct ir3_shader *shader);
void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
uint64_t ir3_shader_outputs(const struct ir3_shader *so);

int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);

void ir3_shader_get_subgroup_size(const struct ir3_compiler *compiler,
                                  const struct ir3_shader_options *options,
                                  gl_shader_stage stage,
                                  unsigned *subgroup_size,
                                  unsigned *max_subgroup_size);
1128
1129 /*
1130 * Helper/util:
1131 */
1132
1133 /* clears shader-key flags which don't apply to the given shader.
1134 */
1135 static inline void
ir3_key_clear_unused(struct ir3_shader_key * key,struct ir3_shader * shader)1136 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
1137 {
1138 uint32_t *key_bits = (uint32_t *)key;
1139 uint32_t *key_mask = (uint32_t *)&shader->key_mask;
1140 STATIC_ASSERT(sizeof(*key) % 4 == 0);
1141 for (unsigned i = 0; i < sizeof(*key) >> 2; i++)
1142 key_bits[i] &= key_mask[i];
1143 }
1144
1145 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1146 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1147 {
1148 for (unsigned j = 0; j < so->outputs_count; j++)
1149 if (so->outputs[j].slot == slot)
1150 return j;
1151
1152 /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1153 * in the vertex shader.. but the fragment shader doesn't know this
1154 * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So
1155 * at link time if there is no matching OUT.BCOLOR[n], we must map
1156 * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only
1157 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1158 */
1159 if (slot == VARYING_SLOT_BFC0) {
1160 slot = VARYING_SLOT_COL0;
1161 } else if (slot == VARYING_SLOT_BFC1) {
1162 slot = VARYING_SLOT_COL1;
1163 } else if (slot == VARYING_SLOT_COL0) {
1164 slot = VARYING_SLOT_BFC0;
1165 } else if (slot == VARYING_SLOT_COL1) {
1166 slot = VARYING_SLOT_BFC1;
1167 } else {
1168 return -1;
1169 }
1170
1171 for (unsigned j = 0; j < so->outputs_count; j++)
1172 if (so->outputs[j].slot == slot)
1173 return j;
1174
1175 return -1;
1176 }
1177
1178 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1179 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1180 {
1181 assert(so->inputs_count <= (unsigned)INT_MAX);
1182 while (++i < (int)so->inputs_count)
1183 if (so->inputs[i].compmask && so->inputs[i].bary)
1184 break;
1185 return i;
1186 }
1187
1188 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1189 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1190 {
1191 int j = -1;
1192
1193 while (true) {
1194 j = ir3_next_varying(so, j);
1195
1196 assert(so->inputs_count <= (unsigned)INT_MAX);
1197 if (j >= (int)so->inputs_count)
1198 return -1;
1199
1200 if (so->inputs[j].slot == slot)
1201 return j;
1202 }
1203 }
1204
1205 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1206 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1207 {
1208 int var = ir3_find_input(so, slot);
1209 return var == -1 ? 0xff : so->inputs[var].inloc;
1210 }
1211
/* Describes the VS-output to FS-input linkage for a draw, built up via
 * ir3_link_shaders()/ir3_link_add().
 */
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    * 4 x 32 bits covers component locations 0..127.
    */
   uint32_t varmask[4];

   /* Map from VS output to location.  Entries are appended by
    * ir3_link_add(); at most 32 varyings.
    */
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      uint8_t loc;
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough (0xff if unused) */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough (0xff if unused) */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays (0xff if unused) */
   uint8_t clip0_loc, clip1_loc;
};
1243
1244 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1245 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1246 uint8_t compmask, uint8_t loc)
1247 {
1248 for (unsigned j = 0; j < util_last_bit(compmask); j++) {
1249 uint8_t comploc = loc + j;
1250 l->varmask[comploc / 32] |= 1 << (comploc % 32);
1251 }
1252
1253 l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1254
1255 if (regid_ != regid(63, 0)) {
1256 int i = l->cnt++;
1257 assert(i < ARRAY_SIZE(l->var));
1258
1259 l->var[i].slot = slot;
1260 l->var[i].regid = regid_;
1261 l->var[i].compmask = compmask;
1262 l->var[i].loc = loc;
1263 }
1264 }
1265
/* Build the linkage map by walking the FS's interpolated inputs in order
 * and matching each against the VS's outputs.
 */
static inline void
ir3_link_shaders(struct ir3_shader_linkage *l,
                 const struct ir3_shader_variant *vs,
                 const struct ir3_shader_variant *fs, bool pack_vs_out)
{
   /* On older platforms, varmask isn't programmed at all, and it appears
    * that the hardware generates a mask of used VPC locations using the VS
    * output map, and hangs if a FS bary instruction references a location
    * not in the list. This means that we need to have a dummy entry in the
    * VS out map for things like gl_PointCoord which aren't written by the
    * VS. Furthermore we can't use r63.x, so just pick a random register to
    * use if there is no VS output.
    */
   const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
   int j = -1, k;

   /* 0xff == location unused: */
   l->primid_loc = 0xff;
   l->viewid_loc = 0xff;
   l->clip0_loc = 0xff;
   l->clip1_loc = 0xff;

   while (l->cnt < ARRAY_SIZE(l->var)) {
      /* walk only the FS inputs that are real interpolated varyings: */
      j = ir3_next_varying(fs, j);

      assert(fs->inputs_count <= (unsigned)INT_MAX);
      if (j >= (int)fs->inputs_count)
         break;

      /* skip inputs beyond the FS's actual input footprint: */
      if (fs->inputs[j].inloc >= fs->total_in)
         continue;

      /* k < 0 when the VS doesn't write this slot (dummy entry case): */
      k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);

      if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
         l->primid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
         /* gl_ViewIndex is fixed-function passthrough, never a VS output: */
         assert(k < 0);
         l->viewid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
         l->clip0_loc = fs->inputs[j].inloc;

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
         l->clip1_loc = fs->inputs[j].inloc;

      ir3_link_add(l, fs->inputs[j].slot,
                   k >= 0 ? vs->outputs[k].regid : default_regid,
                   fs->inputs[j].compmask, fs->inputs[j].inloc);
   }
}
1319
1320 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1321 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1322 {
1323 for (unsigned j = 0; j < so->outputs_count; j++)
1324 if (so->outputs[j].slot == slot) {
1325 uint32_t regid = so->outputs[j].regid;
1326 if (so->outputs[j].half)
1327 regid |= HALF_REG_ID;
1328 return regid;
1329 }
1330 return regid(63, 0);
1331 }
1332
1333 void print_raw(FILE *out, const BITSET_WORD *data, size_t size);
1334
1335 void ir3_link_stream_out(struct ir3_shader_linkage *l,
1336 const struct ir3_shader_variant *v);
1337
1338 #define VARYING_SLOT_GS_HEADER_IR3 (VARYING_SLOT_MAX + 0)
1339 #define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
1340 #define VARYING_SLOT_TCS_HEADER_IR3 (VARYING_SLOT_MAX + 2)
1341 #define VARYING_SLOT_REL_PATCH_ID_IR3 (VARYING_SLOT_MAX + 3)
1342
1343 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1344 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1345 {
1346 if (!so)
1347 return regid(63, 0);
1348 for (unsigned j = 0; j < so->inputs_count; j++)
1349 if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1350 return so->inputs[j].regid;
1351 return regid(63, 0);
1352 }
1353
1354 /* calculate register footprint in terms of half-regs (ie. one full
1355 * reg counts as two half-regs).
1356 */
1357 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1358 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1359 {
1360 return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1361 }
1362
/* Number of IBOs used by the variant. */
static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant *v)
{
   return v->num_ibos;
}
1368
1369 static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant * v)1370 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
1371 {
1372 /* Dummy shader */
1373 if (!v->compiler)
1374 return 0;
1375
1376 if (v->compiler->gen < 5)
1377 return v->branchstack;
1378
1379 return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
1380 }
1381
1382 ENDC;
1383
1384 #endif /* IR3_SHADER_H_ */
1385