1 /*
2 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <robclark@freedesktop.org>
7 */
8
9 #ifndef IR3_SHADER_H_
10 #define IR3_SHADER_H_
11
12 #include <stdio.h>
13
14 #include "c11/threads.h"
15 #include "compiler/nir/nir.h"
16 #include "compiler/shader_enums.h"
17 #include "util/bitscan.h"
18 #include "util/disk_cache.h"
19
20 #include "ir3_compiler.h"
21
22 BEGINC;
23
24 #define dword_offsetof(type, name) DIV_ROUND_UP(offsetof(type, name), 4)
25 #define dword_sizeof(type) DIV_ROUND_UP(sizeof(type), 4)
26
27 /**
28 * Driver params for compute shaders.
29 *
30 * Note, driver param structs should be size aligned to vec4
31 */
struct ir3_driver_params_cs {
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer. Keep that in mind when/if
    * adding any additional CS driver params.
    */
   uint32_t num_work_groups_x;
   uint32_t num_work_groups_y;
   uint32_t num_work_groups_z;
   uint32_t work_dim;
   uint32_t base_group_x;
   uint32_t base_group_y;
   uint32_t base_group_z;
   uint32_t subgroup_size;
   uint32_t local_group_size_x;
   uint32_t local_group_size_y;
   uint32_t local_group_size_z;
   uint32_t subgroup_id_shift;
   uint32_t workgroup_id_x;
   uint32_t workgroup_id_y;
   uint32_t workgroup_id_z;
   uint32_t __pad; /* pad struct size to a vec4 multiple */
};
55 #define IR3_DP_CS(name) dword_offsetof(struct ir3_driver_params_cs, name)
56
57 /**
58 * Driver params for vertex shaders.
59 *
60 * Note, driver param structs should be size aligned to vec4
61 */
62 struct ir3_driver_params_vs {
63 uint32_t draw_id;
64 uint32_t vtxid_base;
65 uint32_t instid_base;
66 uint32_t vtxcnt_max;
67 uint32_t is_indexed_draw; /* Note: boolean, ie. 0 or ~0 */
68 /* user-clip-plane components, up to 8x vec4's: */
69 struct {
70 uint32_t x;
71 uint32_t y;
72 uint32_t z;
73 uint32_t w;
74 } ucp[8];
75 uint32_t __pad_37_39[3];
76 };
77 #define IR3_DP_VS(name) dword_offsetof(struct ir3_driver_params_vs, name)
78
79 /**
80 * Driver params for TCS shaders.
81 *
82 * Note, driver param structs should be size aligned to vec4
83 */
84 struct ir3_driver_params_tcs {
85 uint32_t default_outer_level_x;
86 uint32_t default_outer_level_y;
87 uint32_t default_outer_level_z;
88 uint32_t default_outer_level_w;
89 uint32_t default_inner_level_x;
90 uint32_t default_inner_level_y;
91 uint32_t __pad_06_07[2];
92 };
93 #define IR3_DP_TCS(name) dword_offsetof(struct ir3_driver_params_tcs, name)
94
95 /**
96 * Driver params for fragment shaders.
97 *
98 * Note, driver param structs should be size aligned to vec4
99 */
100 struct ir3_driver_params_fs {
101 uint32_t subgroup_size;
102 uint32_t __pad_01_03[3];
103 /* Dynamic params (that aren't known when compiling the shader) */
104 #define IR3_DP_FS_DYNAMIC dword_offsetof(struct ir3_driver_params_fs, frag_invocation_count)
105 uint32_t frag_invocation_count;
106 uint32_t __pad_05_07[3];
107 uint32_t frag_size;
108 uint32_t __pad_09;
109 uint32_t frag_offset;
110 uint32_t __pad_11_12[2];
111 };
112 #define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
113
114 #define IR3_MAX_SHADER_BUFFERS 32
115 #define IR3_MAX_SHADER_IMAGES 32
116 #define IR3_MAX_SO_BUFFERS 4
117 #define IR3_MAX_SO_STREAMS 4
118 #define IR3_MAX_SO_OUTPUTS 128
119 #define IR3_MAX_UBO_PUSH_RANGES 32
120
121 /* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
122 enum ir3_bary {
123 IJ_PERSP_PIXEL,
124 IJ_PERSP_SAMPLE,
125 IJ_PERSP_CENTROID,
126 IJ_PERSP_CENTER_RHW,
127 IJ_LINEAR_PIXEL,
128 IJ_LINEAR_CENTROID,
129 IJ_LINEAR_SAMPLE,
130 IJ_COUNT,
131 };
132
133 /* Description of what wavesizes are allowed. */
134 enum ir3_wavesize_option {
135 IR3_SINGLE_ONLY,
136 IR3_SINGLE_OR_DOUBLE,
137 IR3_DOUBLE_ONLY,
138 };
139
140 /**
141 * Description of a lowered UBO.
142 */
143 struct nir_def;
144
145 struct ir3_ubo_info {
146 struct nir_def *global_base; /* For global loads, the base address */
147 uint32_t block; /* Which constant block */
148 uint16_t bindless_base; /* For bindless, which base register is used */
149 bool bindless;
150 bool global;
151 };
152
153 /**
154 * Description of a range of a lowered UBO access.
155 *
156 * Drivers should not assume that there are not multiple disjoint
157 * lowered ranges of a single UBO.
158 */
159 struct ir3_ubo_range {
160 struct ir3_ubo_info ubo;
161 uint32_t offset; /* start offset to push in the const register file */
162 uint32_t start, end; /* range of block that's actually used */
163 };
164
165 struct ir3_ubo_analysis_state {
166 struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
167 uint32_t num_enabled;
168 uint32_t size;
169 };
170
171 enum ir3_push_consts_type {
172 IR3_PUSH_CONSTS_NONE,
173 IR3_PUSH_CONSTS_PER_STAGE,
174 IR3_PUSH_CONSTS_SHARED,
175 IR3_PUSH_CONSTS_SHARED_PREAMBLE,
176 };
177
178 /* This represents an internal UBO filled out by the driver. There are a few
179 * common UBOs that must be filled out identically by all drivers, for example
180 * for shader linkage, but drivers can also add their own that they manage
181 * themselves.
182 */
183 struct ir3_driver_ubo {
184 int32_t idx;
185 uint32_t size;
186 };
187
188 enum ir3_const_alloc_type {
189 /* Vulkan, push consts. */
190 IR3_CONST_ALLOC_PUSH_CONSTS = 0,
191 /* Vulkan, offsets required to calculate offsets of descriptors with dynamic
192 * offsets.
193 */
194 IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET = 1,
195 /* Vulkan, addresses of inline uniform buffers, to which we fallback when
196 * their size is unknown.
197 */
198 IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS = 2,
199 /* Common, stage-specific params uploaded by the driver/HW. */
200 IR3_CONST_ALLOC_DRIVER_PARAMS = 3,
201 /* Common, UBOs lowered to consts. */
202 IR3_CONST_ALLOC_UBO_RANGES = 4,
203 /* Common, consts produced by a preamble to be used in a main shader. */
204 IR3_CONST_ALLOC_PREAMBLE = 5,
205 /* Vulkan, inline uniforms loaded into consts in the preamble.*/
206 IR3_CONST_ALLOC_GLOBAL = 6,
207 /* OpenGL, pre-a6xx; pointers to UBOs */
208 IR3_CONST_ALLOC_UBO_PTRS = 7,
209 /* OpenGL, a5xx only; needed to calculate pixel offset, but only
210 * for images that have image_{load,store,size,atomic*} intrinsics.
211 */
212 IR3_CONST_ALLOC_IMAGE_DIMS = 8,
213 /* OpenCL */
214 IR3_CONST_ALLOC_KERNEL_PARAMS = 9,
215 /* OpenGL, TFBO addresses only for vs on a3xx/a4xx */
216 IR3_CONST_ALLOC_TFBO = 10,
217 /* Common, stage-dependent primitive params:
218 * vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
219 * hs, ds: uvec4(primitive_stride, vertex_stride,
220 * patch_stride, patch_vertices_in)
221 * uvec4(tess_param_base, tess_factor_base)
222 */
223 IR3_CONST_ALLOC_PRIMITIVE_PARAM = 11,
224 /* Common, mapping from varying location to offset. */
225 IR3_CONST_ALLOC_PRIMITIVE_MAP = 12,
226 IR3_CONST_ALLOC_MAX = 13,
227 };
228
/* A single contiguous allocation in the const register file. All fields
 * are in vec4 units.
 */
struct ir3_const_allocation {
   uint32_t offset_vec4; /* start offset of the allocation */
   uint32_t size_vec4;   /* allocated size; 0 means not allocated */

   /* Space/alignment reserved ahead of the actual allocation (see the
    * consts allocation flow described above ir3_const_state) -- presumably
    * consumed when the allocation is finalized; TODO confirm against
    * ir3_setup_const_state.
    */
   uint32_t reserved_size_vec4;
   uint32_t reserved_align_vec4;
};
236
/* The full set of const allocations, indexed by ir3_const_alloc_type,
 * plus running totals maintained while allocating.
 */
struct ir3_const_allocations {
   struct ir3_const_allocation consts[IR3_CONST_ALLOC_MAX];
   /* End of the highest allocation so far, in vec4 -- TODO confirm */
   uint32_t max_const_offset_vec4;
   /* Total vec4's reserved but not yet turned into allocations -- TODO confirm */
   uint32_t reserved_vec4;
};
242
243 static inline bool
ir3_const_can_upload(const struct ir3_const_allocations * const_alloc,enum ir3_const_alloc_type type,uint32_t shader_const_size_vec4)244 ir3_const_can_upload(const struct ir3_const_allocations *const_alloc,
245 enum ir3_const_alloc_type type,
246 uint32_t shader_const_size_vec4)
247 {
248 return const_alloc->consts[type].size_vec4 > 0 &&
249 const_alloc->consts[type].offset_vec4 < shader_const_size_vec4;
250 }
251
252 struct ir3_const_image_dims {
253 uint32_t mask; /* bitmask of images that have image_store */
254 uint32_t count; /* number of consts allocated */
255 /* three const allocated per image which has image_store:
256 * + cpp (bytes per pixel)
257 * + pitch (y pitch)
258 * + array_pitch (z pitch)
259 */
260 uint32_t off[IR3_MAX_SHADER_IMAGES];
261 };
262
263 /**
264 * Describes the layout of shader consts in the const register file
265 * and additional info about individual allocations.
266 *
267 * Each consts section is aligned to vec4. Note that pointer
268 * size (ubo, etc) changes depending on generation.
269 *
270 * The consts allocation flow is as follows:
271 * 1) Turnip/Freedreno allocates consts required by corresponding API,
272 * e.g. push const, inline uniforms, etc. Then passes ir3_const_allocations
273 * into IR3.
274 * 2) ir3_setup_const_state allocates consts with non-negotiable size.
275 * 3) IR3 lowerings afterwards allocate from the free space left.
276 *
277 * Note UBO size in bytes should be aligned to vec4
278 */
279 struct ir3_const_state {
280 unsigned num_ubos;
281 unsigned num_app_ubos; /* # of UBOs not including driver UBOs */
282 unsigned num_driver_params; /* scalar */
283
284 struct ir3_driver_ubo consts_ubo;
285 struct ir3_driver_ubo driver_params_ubo;
286 struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;
287
288 struct ir3_const_allocations allocs;
289
290 struct ir3_const_image_dims image_dims;
291
292 unsigned immediates_count;
293 unsigned immediates_size;
294 uint32_t *immediates;
295
296 /* State of ubo access lowered to push consts: */
297 struct ir3_ubo_analysis_state ubo_state;
298 enum ir3_push_consts_type push_consts_type;
299 };
300
301 /**
302 * A single output for vertex transform feedback.
303 */
struct ir3_stream_output {
   unsigned register_index : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2; /**< 0 to 3 */
   unsigned num_components : 3;  /**< 1 to 4 */
   unsigned output_buffer : 3;   /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset : 16;     /**< offset into the buffer in dwords */
   unsigned stream : 2;          /**< 0 to 3 */
};
312
313 /**
314 * Stream output for vertex transform feedback.
315 */
316 struct ir3_stream_output_info {
317 unsigned num_outputs;
318 /** stride for an entire vertex for each buffer in dwords */
319 uint16_t stride[IR3_MAX_SO_BUFFERS];
320
321 /* These correspond to the VPC_SO_STREAM_CNTL fields */
322 uint8_t streams_written;
323 uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];
324
325 /**
326 * Array of stream outputs, in the order they are to be written in.
327 * Selected components are tightly packed into the output buffer.
328 */
329 struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
330 };
331
332 /**
333 * Starting from a4xx, HW supports pre-dispatching texture sampling
334 * instructions prior to scheduling a shader stage, when the
335 * coordinate maps exactly to an output of the previous stage.
336 */
337
338 /**
339 * There is a limit in the number of pre-dispatches allowed for any
340 * given stage.
341 */
342 #define IR3_MAX_SAMPLER_PREFETCH 4
343
344 /**
345 * This is the output stream value for 'cmd', as used by blob. It may
346 * encode the return type (in 3 bits) but it hasn't been verified yet.
347 */
348 #define IR3_SAMPLER_PREFETCH_CMD 0x4
349 #define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
350
351 /**
352 * Stream output for texture sampling pre-dispatches.
353 */
struct ir3_sampler_prefetch {
   /* Source for the coordinate -- presumably the inloc of the matching
    * FS input (see pre-dispatch comment above); TODO confirm.
    */
   uint8_t src;
   bool bindless; /* if set, use *_bindless_id instead of samp_id/tex_id */
   uint8_t samp_id;
   uint8_t tex_id;
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;    /* destination register of the prefetched result */
   uint8_t wrmask; /* writemask of components written to dst */
   uint8_t half_precision;
   opc_t tex_opc;  /* the sampling instruction being pre-dispatched */
};
366
367 /* Configuration key used to identify a shader variant.. different
368 * shader variants can be used to implement features not supported
369 * in hw (two sided color), binning-pass vertex shader, etc.
370 *
371 * When adding to this struct, please update ir3_shader_variant()'s debug
372 * output.
373 */
374 struct ir3_shader_key {
375 union {
376 struct {
377 /*
378 * Combined Vertex/Fragment shader parameters:
379 */
380 unsigned ucp_enables : 8;
381
382 /* do we need to check {v,f}saturate_{s,t,r}? */
383 unsigned has_per_samp : 1;
384
385 /*
386 * Fragment shader variant parameters:
387 */
388 unsigned sample_shading : 1;
389 unsigned msaa : 1;
390 /* used when shader needs to handle flat varyings (a4xx)
391 * for front/back color inputs to frag shader:
392 */
393 unsigned rasterflat : 1;
394
395 /* Indicates that this is a tessellation pipeline which requires a
396 * whole different kind of vertex shader. In case of
397 * tessellation, this field also tells us which kind of output
398 * topology the TES uses, which the TCS needs to know.
399 */
400 #define IR3_TESS_NONE 0
401 #define IR3_TESS_QUADS 1
402 #define IR3_TESS_TRIANGLES 2
403 #define IR3_TESS_ISOLINES 3
404 unsigned tessellation : 2;
405
406 unsigned has_gs : 1;
407
408 /* Whether stages after TCS read gl_PrimitiveID, used to determine
409 * whether the TCS has to store it in the tess factor BO.
410 */
411 unsigned tcs_store_primid : 1;
412
413 /* Whether this variant sticks to the "safe" maximum constlen,
414 * which guarantees that the combined stages will never go over
415 * the limit:
416 */
417 unsigned safe_constlen : 1;
418
419 /* Whether driconf "dual_color_blend_by_location" workaround is
420 * enabled
421 */
422 unsigned force_dual_color_blend : 1;
423 };
424 uint32_t global;
425 };
426
427 /* bitmask of ms shifts (a3xx) */
428 uint32_t vsamples, fsamples;
429
430 /* bitmask of samplers which need astc srgb workaround (a4xx): */
431 uint16_t vastc_srgb, fastc_srgb;
432
433 /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
434 uint16_t vsampler_swizzles[16];
435 uint16_t fsampler_swizzles[16];
436 };
437
438 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)439 ir3_tess_mode(enum tess_primitive_mode tess_mode)
440 {
441 switch (tess_mode) {
442 case TESS_PRIMITIVE_ISOLINES:
443 return IR3_TESS_ISOLINES;
444 case TESS_PRIMITIVE_TRIANGLES:
445 return IR3_TESS_TRIANGLES;
446 case TESS_PRIMITIVE_QUADS:
447 return IR3_TESS_QUADS;
448 default:
449 unreachable("bad tessmode");
450 }
451 }
452
453 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)454 ir3_tess_factor_stride(unsigned patch_type)
455 {
456 /* note: this matches the stride used by ir3's build_tessfactor_base */
457 switch (patch_type) {
458 case IR3_TESS_ISOLINES:
459 return 12;
460 case IR3_TESS_TRIANGLES:
461 return 20;
462 case IR3_TESS_QUADS:
463 return 28;
464 default:
465 unreachable("bad tessmode");
466 }
467 }
468
469 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)470 ir3_shader_key_equal(const struct ir3_shader_key *a,
471 const struct ir3_shader_key *b)
472 {
473 /* slow-path if we need to check {v,f}saturate_{s,t,r} */
474 if (a->has_per_samp || b->has_per_samp)
475 return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
476 return a->global == b->global;
477 }
478
479 /* will the two keys produce different lowering for a fragment shader? */
480 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)481 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
482 struct ir3_shader_key *last_key)
483 {
484 if (last_key->has_per_samp || key->has_per_samp) {
485 if ((last_key->fsamples != key->fsamples) ||
486 (last_key->fastc_srgb != key->fastc_srgb) ||
487 memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
488 sizeof(key->fsampler_swizzles)))
489 return true;
490 }
491
492 if (last_key->rasterflat != key->rasterflat)
493 return true;
494
495 if (last_key->ucp_enables != key->ucp_enables)
496 return true;
497
498 if (last_key->safe_constlen != key->safe_constlen)
499 return true;
500
501 return false;
502 }
503
504 /* will the two keys produce different lowering for a vertex shader? */
505 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)506 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
507 struct ir3_shader_key *last_key)
508 {
509 if (last_key->has_per_samp || key->has_per_samp) {
510 if ((last_key->vsamples != key->vsamples) ||
511 (last_key->vastc_srgb != key->vastc_srgb) ||
512 memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
513 sizeof(key->vsampler_swizzles)))
514 return true;
515 }
516
517 if (last_key->ucp_enables != key->ucp_enables)
518 return true;
519
520 if (last_key->safe_constlen != key->safe_constlen)
521 return true;
522
523 return false;
524 }
525
526 /**
527 * On a4xx+a5xx, Images share state with textures and SSBOs:
528 *
529 * + Uses texture (cat5) state/instruction (isam) to read
530 * + Uses SSBO state and instructions (cat6) to write and for atomics
531 *
532 * Starting with a6xx, Images and SSBOs are basically the same thing,
533 * with texture state and isam also used for SSBO reads.
534 *
535 * On top of that, gallium makes the SSBO (shader_buffers) state semi
536 * sparse, with the first half of the state space used for atomic
537 * counters lowered to atomic buffers. We could ignore this, but I
538 * don't think we could *really* handle the case of a single shader
539 * that used the max # of textures + images + SSBOs. And once we are
540 * offsetting images by num_ssbos (or visa versa) to map them into
541 * the same hardware state, the hardware state has become coupled to
542 * the shader state, so at this point we might as well just use a
543 * mapping table to remap things from image/SSBO idx to hw idx.
544 *
545 * To make things less (more?) confusing, for the hw "SSBO" state
546 * (since it is really both SSBO and Image) I'll use the name "IBO"
547 */
548 struct ir3_ibo_mapping {
549 #define IBO_INVALID 0xff
550 /* Maps logical SSBO state to hw tex state: */
551 uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];
552
553 /* Maps logical Image state to hw tex state: */
554 uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];
555
556 /* Maps hw state back to logical SSBO or Image state:
557 *
558 * note IBO_SSBO ORd into values to indicate that the
559 * hw slot is used for SSBO state vs Image state.
560 */
561 #define IBO_SSBO 0x80
562 uint8_t tex_to_image[32];
563
564 /* including real textures */
565 uint8_t num_tex;
566 /* the number of real textures, ie. image/ssbo start here */
567 uint8_t tex_base;
568 };
569
570 struct ir3_disasm_info {
571 bool write_disasm;
572 char *nir;
573 char *disasm;
574 };
575
576 /* Represents half register in regid */
577 #define HALF_REG_ID 0x100
578
579 /* Options for common NIR optimization passes done in ir3. This is used for both
580 * finalize and post-finalize (where it has to be in the shader).
581 */
582 struct ir3_shader_nir_options {
583 /* For the modes specified, accesses are assumed to be bounds-checked as
584 * defined by VK_EXT_robustness2 and optimizations may have to be more
585 * conservative.
586 */
587 nir_variable_mode robust_modes;
588 };
589
590 struct ir3_shader_options {
591 /* What API-visible wavesizes are allowed. Even if only double wavesize is
592 * allowed, we may still use the smaller wavesize "under the hood" and the
593 * application simply sees the upper half as always disabled.
594 */
595 enum ir3_wavesize_option api_wavesize;
596 /* What wavesizes we're allowed to actually use. If the API wavesize is
597 * single-only, then this must be single-only too.
598 */
599 enum ir3_wavesize_option real_wavesize;
600 enum ir3_push_consts_type push_consts_type;
601
602 uint32_t push_consts_base;
603 uint32_t push_consts_dwords;
604
605 /* Some const allocations are required at API level. */
606 struct ir3_const_allocations const_allocs;
607
608 struct ir3_shader_nir_options nir_options;
609
610 /* Whether FRAG_RESULT_DATAi slots may be dynamically remapped by the driver.
611 * If true, ir3 will assume it cannot statically use the value of such slots
612 * anywhere (e.g., as the target of alias.rt).
613 */
614 bool fragdata_dynamic_remap;
615 };
616
struct ir3_shader_output {
   uint8_t slot;  /* gl_varying_slot (VS) / gl_frag_result (FS), see note below */
   uint8_t regid; /* output register */
   uint8_t view;  /* presumably the view index for multi-position output; TODO confirm */
   /* Components written via alias.rt rather than real registers --
    * NOTE(review): inferred from fragdata_dynamic_remap above; confirm.
    */
   uint8_t aliased_components : 4;
   bool half : 1; /* output register is half-precision */
};
624
625 /**
626 * Shader variant which contains the actual hw shader instructions,
627 * and necessary info for shader state setup.
628 */
629 struct ir3_shader_variant {
630 struct fd_bo *bo;
631
632 /* variant id (for debug) */
633 uint32_t id;
634
635 /* id of the shader the variant came from (for debug) */
636 uint32_t shader_id;
637
638 struct ir3_shader_key key;
639
640 /* vertex shaders can have an extra version for hwbinning pass,
641 * which is pointed to by so->binning:
642 */
643 bool binning_pass;
644 // union {
645 struct ir3_shader_variant *binning;
646 struct ir3_shader_variant *nonbinning;
647 // };
648
649 struct ir3 *ir; /* freed after assembling machine instructions */
650
651 /* shader variants form a linked list: */
652 struct ir3_shader_variant *next;
653
654 /* replicated here to avoid passing extra ptrs everywhere: */
655 gl_shader_stage type;
656 struct ir3_compiler *compiler;
657
658 char *name;
659
660 /* variant's copy of nir->constant_data (since we don't track the NIR in
661 * the variant, and shader->nir is before the opt pass). Moves to v->bin
662 * after assembly.
663 */
664 void *constant_data;
665
666 struct ir3_disasm_info disasm_info;
667
668 /*
669 * Below here is serialized when written to disk cache:
670 */
671
672 /* The actual binary shader instructions, size given by info.sizedwords: */
673 uint32_t *bin;
674
675 struct ir3_const_state *const_state;
676
677 /*
678 * The following macros are used by the shader disk cache save/
679 * restore paths to serialize/deserialize the variant. Any
680 * pointers that require special handling in store_variant()
681 * and retrieve_variant() should go above here.
682 */
683 #define VARIANT_CACHE_START offsetof(struct ir3_shader_variant, info)
684 #define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
685 #define VARIANT_CACHE_SIZE \
686 (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)
687
688 struct ir3_info info;
689
690 struct ir3_shader_options shader_options;
691
692 uint32_t constant_data_size;
693
694 /* Levels of nesting of flow control:
695 */
696 unsigned branchstack;
697
698 unsigned loops;
699
700 /* the instructions length is in units of instruction groups
701 * (4 instructions for a3xx, 16 instructions for a4xx.. each
702 * instruction is 2 dwords):
703 */
704 unsigned instrlen;
705
706 /* the constants length is in units of vec4's, and is the sum of
707 * the uniforms and the built-in compiler constants
708 */
709 unsigned constlen;
710
711 /* The private memory size in bytes per fiber */
712 unsigned pvtmem_size;
713 /* Whether we should use the new per-wave layout rather than per-fiber. */
714 bool pvtmem_per_wave;
715
716 /* Whether multi-position output is enabled. */
717 bool multi_pos_output;
718
719 /* Whether dual-source blending is enabled. */
720 bool dual_src_blend;
721
722 /* Whether early preamble is enabled. */
723 bool early_preamble;
724
725 /* Size in bytes of required shared memory */
726 unsigned shared_size;
727
728 /* About Linkage:
729 * + Let the frag shader determine the position/compmask for the
730 * varyings, since it is the place where we know if the varying
731 * is actually used, and if so, which components are used. So
732 * what the hw calls "outloc" is taken from the "inloc" of the
733 * frag shader.
734 * + From the vert shader, we only need the output regid
735 */
736
737 bool frag_face, color0_mrt;
738 uint8_t fragcoord_compmask;
739
740 /* NOTE: for input/outputs, slot is:
741 * gl_vert_attrib - for VS inputs
742 * gl_varying_slot - for VS output / FS input
743 * gl_frag_result - for FS output
744 */
745
746 /* varyings/outputs: */
747 unsigned outputs_count;
748 struct ir3_shader_output outputs[32 + 2]; /* +POSITION +PSIZE */
749 bool writes_pos, writes_smask, writes_psize, writes_viewport, writes_stencilref;
750 bool writes_shading_rate;
751
752 /* Size in dwords of all outputs for VS, size of entire patch for HS. */
753 uint32_t output_size;
754
755 /* Expected size of incoming output_loc for HS, DS, and GS */
756 uint32_t input_size;
757
758 /* Map from location to offset in per-primitive storage. In dwords for
759 * HS, where varyings are read in the next stage via ldg with a dword
760 * offset, and in bytes for all other stages.
761 * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
762 */
763 unsigned output_loc[13 + 32];
764
765 /* attributes (VS) / varyings (FS):
766 * Note that sysval's should come *after* normal inputs.
767 */
768 unsigned inputs_count;
769 struct {
770 uint8_t slot;
771 uint8_t regid;
772 uint8_t compmask;
773 /* location of input (ie. offset passed to bary.f, etc). This
774 * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
775 * have the OUTLOCn value offset by 8, presumably to account
776 * for gl_Position/gl_PointSize)
777 */
778 uint8_t inloc;
779 /* vertex shader specific: */
780 bool sysval : 1; /* slot is a gl_system_value */
781 /* fragment shader specific: */
782 bool bary : 1; /* fetched varying (vs one loaded into reg) */
783 bool rasterflat : 1; /* special handling for emit->rasterflat */
784 bool half : 1;
785 bool flat : 1;
786 } inputs[32 + 2]; /* +POSITION +FACE */
787 bool reads_primid;
788 bool reads_shading_rate;
789 bool reads_smask;
790
791 /* sum of input components (scalar). For frag shaders, it only counts
792 * the varying inputs:
793 */
794 unsigned total_in;
795
796 /* sum of sysval input components (scalar). */
797 unsigned sysval_in;
798
799 /* For frag shaders, the total number of inputs (not scalar,
800 * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
801 */
802 unsigned varying_in;
803
804 /* Remapping table to map Image and SSBO to hw state: */
805 struct ir3_ibo_mapping image_mapping;
806
807 /* number of samplers/textures (which are currently 1:1): */
808 int num_samp;
809
810 /* is there an implicit sampler to read framebuffer (FS only).. if
811 * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
812 * the last "real" texture)
813 */
814 bool fb_read;
815
816 /* do we have one or more SSBO instructions: */
817 bool has_ssbo;
818
819 /* Which bindless resources are used, for filling out sp_xs_config */
820 bool bindless_tex;
821 bool bindless_samp;
822 bool bindless_ibo;
823 bool bindless_ubo;
824
825 /* do we need derivatives: */
826 bool need_pixlod;
827
828 bool need_full_quad;
829
830 /* do we need VS driver params? */
831 bool need_driver_params;
832
833 /* do we have image write, etc (which prevents early-z): */
834 bool no_earlyz;
835
836 /* do we have kill, which also prevents early-z, but not necessarily
837 * early-lrz (as long as lrz-write is disabled, which must be handled
838 * outside of ir3. Unlike other no_earlyz cases, kill doesn't have
839 * side effects that prevent early-lrz discard.
840 */
841 bool has_kill;
842
843 bool per_samp;
844
845 bool post_depth_coverage;
846
847 /* Are we using split or merged register file? */
848 bool mergedregs;
849
850 uint8_t clip_mask, cull_mask;
851
852 /* for astc srgb workaround, the number/base of additional
853 * alpha tex states we need, and index of original tex states
854 */
855 struct {
856 unsigned base, count;
857 unsigned orig_idx[16];
858 } astc_srgb;
859
860 /* for tg4 workaround, the number/base of additional
861 * unswizzled tex states we need, and index of original tex states
862 */
863 struct {
864 unsigned base, count;
865 unsigned orig_idx[16];
866 } tg4;
867
868 /* texture sampler pre-dispatches */
869 uint32_t num_sampler_prefetch;
870 struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];
871
872 /* If true, the last use of helper invocations is the texture prefetch and
873 * they should be disabled for the actual shader. Equivalent to adding
874 * (eq)nop at the beginning of the shader.
875 */
876 bool prefetch_end_of_quad;
877
878 uint16_t local_size[3];
879 bool local_size_variable;
880
881 /* Important for compute shader to determine max reg footprint */
882 bool has_barrier;
883
884 /* The offset where images start in the IBO array. */
885 unsigned num_ssbos;
886
887 /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
888 unsigned num_ibos;
889
890 union {
891 struct {
892 enum tess_primitive_mode primitive_mode;
893
894 /** The number of vertices in the TCS output patch. */
895 uint8_t tcs_vertices_out;
896 enum gl_tess_spacing spacing:2; /*gl_tess_spacing*/
897
898 /** Is the vertex order counterclockwise? */
899 bool ccw:1;
900 bool point_mode:1;
901 } tess;
902 struct {
903 /** The output primitive type */
904 uint16_t output_primitive;
905
906 /** The maximum number of vertices the geometry shader might write. */
907 uint16_t vertices_out;
908
909 /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
910 uint8_t invocations;
911
912 /** The number of vertices received per input primitive (max. 6) */
913 uint8_t vertices_in:3;
914 } gs;
915 struct {
916 bool early_fragment_tests : 1;
917 bool color_is_dual_source : 1;
918 bool uses_fbfetch_output : 1;
919 bool fbfetch_coherent : 1;
920 } fs;
921 struct {
922 unsigned req_input_mem;
923 unsigned req_local_mem;
924 bool force_linear_dispatch;
925 uint32_t local_invocation_id;
926 uint32_t work_group_id;
927 } cs;
928 };
929
930 uint32_t vtxid_base;
931
932 /* For when we don't have a shader, variant's copy of streamout state */
933 struct ir3_stream_output_info stream_output;
934 };
935
936 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)937 ir3_shader_stage(struct ir3_shader_variant *v)
938 {
939 switch (v->type) {
940 case MESA_SHADER_VERTEX:
941 return v->binning_pass ? "BVERT" : "VERT";
942 case MESA_SHADER_TESS_CTRL:
943 return "TCS";
944 case MESA_SHADER_TESS_EVAL:
945 return "TES";
946 case MESA_SHADER_GEOMETRY:
947 return "GEOM";
948 case MESA_SHADER_FRAGMENT:
949 return "FRAG";
950 case MESA_SHADER_COMPUTE:
951 case MESA_SHADER_KERNEL:
952 return "CL";
953 default:
954 unreachable("invalid type");
955 return NULL;
956 }
957 }
958
959 /* Currently we do not do binning for tess. And for GS there is no
960 * cross-stage VS+GS optimization, so the full VS+GS is used in
961 * the binning pass.
962 */
963 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)964 ir3_has_binning_vs(const struct ir3_shader_key *key)
965 {
966 if (key->tessellation || key->has_gs)
967 return false;
968 return true;
969 }
970
971 /**
972 * Represents a shader at the API level, before state-specific variants are
973 * generated.
974 */
975 struct ir3_shader {
976 gl_shader_stage type;
977
978 /* shader id (for debug): */
979 uint32_t id;
980 uint32_t variant_count;
981
982 /* Set by freedreno after shader_state_create, so we can emit debug info
983 * when recompiling a shader at draw time.
984 */
985 bool initial_variants_done;
986
987 struct ir3_compiler *compiler;
988
989 struct ir3_shader_options options;
990
991 bool nir_finalized;
992 struct nir_shader *nir;
993 struct ir3_stream_output_info stream_output;
994
995 /* per shader stage specific info: */
996 union {
997 /* for compute shaders: */
998 struct {
999 unsigned req_input_mem; /* in dwords */
1000 unsigned req_local_mem;
1001 bool force_linear_dispatch;
1002 } cs;
1003 /* For vertex shaders: */
1004 struct {
1005 /* If we need to generate a passthrough TCS, it will be a function of
1006 * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
1007 * in the VS keyed by # of patch_vertices-1.
1008 */
1009 unsigned passthrough_tcs_compiled;
1010 struct ir3_shader *passthrough_tcs[32];
1011 } vs;
1012 };
1013
1014 struct ir3_shader_variant *variants;
1015 mtx_t variants_lock;
1016
1017 cache_key cache_key; /* shader disk-cache key */
1018
1019 /* Bitmask of bits of the shader key used by this shader. Used to avoid
1020 * recompiles for GL NOS that doesn't actually apply to the shader.
1021 */
1022 struct ir3_shader_key key_mask;
1023 };
1024
1025 /**
1026 * In order to use the same cmdstream, in particular constlen setup and const
 * emit, for both binning and draw pass (a6xx+), the binning pass re-uses its
 * corresponding draw pass shader's const_state.
1029 */
1030 static inline const struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)1031 ir3_const_state(const struct ir3_shader_variant *v)
1032 {
1033 if (v->binning_pass)
1034 return v->nonbinning->const_state;
1035 return v->const_state;
1036 }
1037
1038 static inline struct ir3_const_state *
ir3_const_state_mut(const struct ir3_shader_variant * v)1039 ir3_const_state_mut(const struct ir3_shader_variant *v)
1040 {
1041 assert(!v->binning_pass);
1042 return v->const_state;
1043 }
1044
1045 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)1046 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
1047 {
1048 const struct ir3_compiler *compiler = v->compiler;
1049 bool shared_consts_enable =
1050 ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1051
1052 /* Shared consts size for CS and FS matches with what's acutally used,
1053 * but the size of shared consts for geomtry stages doesn't.
1054 * So we use a hw quirk for geometry shared consts.
1055 */
1056 uint32_t shared_consts_size = shared_consts_enable ?
1057 compiler->shared_consts_size : 0;
1058
1059 uint32_t shared_consts_size_geom = shared_consts_enable ?
1060 compiler->geom_shared_consts_size_quirk : 0;
1061
1062 uint32_t safe_shared_consts_size = shared_consts_enable ?
1063 ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
1064 DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
1065
1066 if ((v->type == MESA_SHADER_COMPUTE) ||
1067 (v->type == MESA_SHADER_KERNEL)) {
1068 return compiler->max_const_compute - shared_consts_size;
1069 } else if (safe_constlen) {
1070 return compiler->max_const_safe - safe_shared_consts_size;
1071 } else if (v->type == MESA_SHADER_FRAGMENT) {
1072 return compiler->max_const_frag - shared_consts_size;
1073 } else {
1074 return compiler->max_const_geom - shared_consts_size_geom;
1075 }
1076 }
1077
/* Given a variant, calculate the maximum constlen it can have, honoring
 * the variant's own safe_constlen shader-key bit.
 */
static inline unsigned
ir3_max_const(const struct ir3_shader_variant *v)
{
   return _ir3_max_const(v, v->key.safe_constlen);
}
1085
1086 uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm);
1087 uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm);
1088
1089 static inline unsigned
ir3_const_reg(const struct ir3_const_state * const_state,enum ir3_const_alloc_type type,unsigned offset)1090 ir3_const_reg(const struct ir3_const_state *const_state,
1091 enum ir3_const_alloc_type type,
1092 unsigned offset)
1093 {
1094 unsigned n = const_state->allocs.consts[type].offset_vec4;
1095 assert(const_state->allocs.consts[type].size_vec4 != 0);
1096 return regid(n + offset / 4, offset % 4);
1097 }
1098
1099 /* Return true if a variant may need to be recompiled due to exceeding the
1100 * maximum "safe" constlen.
1101 */
1102 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)1103 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
1104 {
1105 return v->constlen > _ir3_max_const(v, true);
1106 }
1107
1108 void *ir3_shader_assemble(struct ir3_shader_variant *v);
1109 struct ir3_shader_variant *
1110 ir3_shader_create_variant(struct ir3_shader *shader,
1111 const struct ir3_shader_key *key,
1112 bool keep_ir);
1113 struct ir3_shader_variant *
1114 ir3_shader_get_variant(struct ir3_shader *shader,
1115 const struct ir3_shader_key *key, bool binning_pass,
1116 bool keep_ir, bool *created);
1117
1118 struct ir3_shader *
1119 ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
1120 const struct ir3_shader_options *options,
1121 struct ir3_stream_output_info *stream_output);
1122 uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
1123 const struct ir3_compiler *compiler);
1124 struct ir3_shader *
1125 ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
1126 void ir3_shader_destroy(struct ir3_shader *shader);
1127 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
1128 uint64_t ir3_shader_outputs(const struct ir3_shader *so);
1129
1130 int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
1131
1132 void ir3_shader_get_subgroup_size(const struct ir3_compiler *compiler,
1133 const struct ir3_shader_options *options,
1134 gl_shader_stage stage,
1135 unsigned *subgroup_size,
1136 unsigned *max_subgroup_size);
1137
1138 /*
1139 * Helper/util:
1140 */
1141
/* clears shader-key flags which don't apply to the given shader.
 *
 * The key is masked word-by-word against the shader's key_mask so that
 * state changes irrelevant to this shader don't produce distinct keys
 * (and hence spurious variant recompiles).
 */
static inline void
ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
{
   uint32_t *key_bits = (uint32_t *)key;
   uint32_t *key_mask = (uint32_t *)&shader->key_mask;
   STATIC_ASSERT(sizeof(*key) % 4 == 0);
   for (unsigned i = 0; i < sizeof(*key) >> 2; i++)
      key_bits[i] &= key_mask[i];
}
1153
1154 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1155 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1156 {
1157 for (unsigned j = 0; j < so->outputs_count; j++)
1158 if (so->outputs[j].slot == slot)
1159 return j;
1160
1161 /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1162 * in the vertex shader.. but the fragment shader doesn't know this
1163 * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So
1164 * at link time if there is no matching OUT.BCOLOR[n], we must map
1165 * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only
1166 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1167 */
1168 if (slot == VARYING_SLOT_BFC0) {
1169 slot = VARYING_SLOT_COL0;
1170 } else if (slot == VARYING_SLOT_BFC1) {
1171 slot = VARYING_SLOT_COL1;
1172 } else if (slot == VARYING_SLOT_COL0) {
1173 slot = VARYING_SLOT_BFC0;
1174 } else if (slot == VARYING_SLOT_COL1) {
1175 slot = VARYING_SLOT_BFC1;
1176 } else {
1177 return -1;
1178 }
1179
1180 for (unsigned j = 0; j < so->outputs_count; j++)
1181 if (so->outputs[j].slot == slot)
1182 return j;
1183
1184 return -1;
1185 }
1186
1187 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1188 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1189 {
1190 assert(so->inputs_count <= (unsigned)INT_MAX);
1191 while (++i < (int)so->inputs_count)
1192 if (so->inputs[i].compmask && so->inputs[i].bary)
1193 break;
1194 return i;
1195 }
1196
1197 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1198 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1199 {
1200 int j = -1;
1201
1202 while (true) {
1203 j = ir3_next_varying(so, j);
1204
1205 assert(so->inputs_count <= (unsigned)INT_MAX);
1206 if (j >= (int)so->inputs_count)
1207 return -1;
1208
1209 if (so->inputs[j].slot == slot)
1210 return j;
1211 }
1212 }
1213
1214 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1215 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1216 {
1217 int var = ir3_find_input(so, slot);
1218 return var == -1 ? 0xff : so->inputs[var].inloc;
1219 }
1220
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location. */
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      uint8_t loc;
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough;
    * 0xff means "not present" (see ir3_link_shaders())
    */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough (0xff if unused) */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays (0xff if unused) */
   uint8_t clip0_loc, clip1_loc;
};
1252
1253 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1254 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1255 uint8_t compmask, uint8_t loc)
1256 {
1257 for (unsigned j = 0; j < util_last_bit(compmask); j++) {
1258 uint8_t comploc = loc + j;
1259 l->varmask[comploc / 32] |= 1 << (comploc % 32);
1260 }
1261
1262 l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1263
1264 if (regid_ != regid(63, 0)) {
1265 int i = l->cnt++;
1266 assert(i < ARRAY_SIZE(l->var));
1267
1268 l->var[i].slot = slot;
1269 l->var[i].regid = regid_;
1270 l->var[i].compmask = compmask;
1271 l->var[i].loc = loc;
1272 }
1273 }
1274
1275 static inline void
ir3_link_shaders(struct ir3_shader_linkage * l,const struct ir3_shader_variant * vs,const struct ir3_shader_variant * fs,bool pack_vs_out)1276 ir3_link_shaders(struct ir3_shader_linkage *l,
1277 const struct ir3_shader_variant *vs,
1278 const struct ir3_shader_variant *fs, bool pack_vs_out)
1279 {
1280 /* On older platforms, varmask isn't programmed at all, and it appears
1281 * that the hardware generates a mask of used VPC locations using the VS
1282 * output map, and hangs if a FS bary instruction references a location
1283 * not in the list. This means that we need to have a dummy entry in the
1284 * VS out map for things like gl_PointCoord which aren't written by the
1285 * VS. Furthermore we can't use r63.x, so just pick a random register to
1286 * use if there is no VS output.
1287 */
1288 const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
1289 int j = -1, k;
1290
1291 l->primid_loc = 0xff;
1292 l->viewid_loc = 0xff;
1293 l->clip0_loc = 0xff;
1294 l->clip1_loc = 0xff;
1295
1296 while (l->cnt < ARRAY_SIZE(l->var)) {
1297 j = ir3_next_varying(fs, j);
1298
1299 assert(fs->inputs_count <= (unsigned)INT_MAX);
1300 if (j >= (int)fs->inputs_count)
1301 break;
1302
1303 if (fs->inputs[j].inloc >= fs->total_in)
1304 continue;
1305
1306 k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);
1307
1308 if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
1309 l->primid_loc = fs->inputs[j].inloc;
1310 }
1311
1312 if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
1313 assert(k < 0);
1314 l->viewid_loc = fs->inputs[j].inloc;
1315 }
1316
1317 if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
1318 l->clip0_loc = fs->inputs[j].inloc;
1319
1320 if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
1321 l->clip1_loc = fs->inputs[j].inloc;
1322
1323 ir3_link_add(l, fs->inputs[j].slot,
1324 k >= 0 ? vs->outputs[k].regid : default_regid,
1325 fs->inputs[j].compmask, fs->inputs[j].inloc);
1326 }
1327 }
1328
1329 static inline uint32_t
ir3_get_output_regid(const struct ir3_shader_output * output)1330 ir3_get_output_regid(const struct ir3_shader_output *output)
1331 {
1332 return output->regid | (output->half ? HALF_REG_ID : 0);
1333 }
1334
1335 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1336 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1337 {
1338 int output_idx = ir3_find_output(so, (gl_varying_slot)slot);
1339
1340 if (output_idx < 0) {
1341 return INVALID_REG;
1342 }
1343
1344 return ir3_get_output_regid(&so->outputs[output_idx]);
1345 }
1346
1347 void print_raw(FILE *out, const BITSET_WORD *data, size_t size);
1348
1349 void ir3_link_stream_out(struct ir3_shader_linkage *l,
1350 const struct ir3_shader_variant *v);
1351
1352 #define VARYING_SLOT_GS_HEADER_IR3 (VARYING_SLOT_MAX + 0)
1353 #define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
1354 #define VARYING_SLOT_TCS_HEADER_IR3 (VARYING_SLOT_MAX + 2)
1355 #define VARYING_SLOT_REL_PATCH_ID_IR3 (VARYING_SLOT_MAX + 3)
1356
1357 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1358 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1359 {
1360 if (!so)
1361 return regid(63, 0);
1362 for (unsigned j = 0; j < so->inputs_count; j++)
1363 if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1364 return so->inputs[j].regid;
1365 return regid(63, 0);
1366 }
1367
1368 /* calculate register footprint in terms of half-regs (ie. one full
1369 * reg counts as two half-regs).
1370 */
1371 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1372 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1373 {
1374 return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1375 }
1376
/* Number of IBOs referenced by the variant. */
static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant *v)
{
   return v->num_ibos;
}
1382
/* Translate the variant's branchstack depth into the value programmed into
 * hardware registers.
 */
static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
{
   /* Dummy shader */
   if (!v->compiler)
      return 0;

   /* pre-gen5 programs the raw depth directly */
   if (v->compiler->gen < 5)
      return v->branchstack;

   /* gen5+: clamp to the compiler-reported maximum, then halve (rounding
    * up) -- presumably the hw field counts pairs of entries; confirm
    * against the register docs.
    */
   return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
}
1395
1396 ENDC;
1397
1398 #endif /* IR3_SHADER_H_ */
1399