• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <robclark@freedesktop.org>
7  */
8 
9 #ifndef IR3_SHADER_H_
10 #define IR3_SHADER_H_
11 
12 #include <stdio.h>
13 
14 #include "c11/threads.h"
15 #include "compiler/nir/nir.h"
16 #include "compiler/shader_enums.h"
17 #include "util/bitscan.h"
18 #include "util/disk_cache.h"
19 
20 #include "ir3_compiler.h"
21 
22 BEGINC;
23 
24 #define dword_offsetof(type, name) DIV_ROUND_UP(offsetof(type, name), 4)
25 #define dword_sizeof(type)         DIV_ROUND_UP(sizeof(type), 4)
26 
27 /**
28  * Driver params for compute shaders.
29  *
30  * Note, driver param structs should be size aligned to vec4
31  */
struct ir3_driver_params_cs {
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer.  Keep that in mind when/if
    * adding any additional CS driver params.
    */
   uint32_t num_work_groups_x;
   uint32_t num_work_groups_y;
   uint32_t num_work_groups_z;
   uint32_t work_dim;
   uint32_t base_group_x;
   uint32_t base_group_y;
   uint32_t base_group_z;
   uint32_t subgroup_size;
   uint32_t local_group_size_x;
   uint32_t local_group_size_y;
   uint32_t local_group_size_z;
   uint32_t subgroup_id_shift;
   uint32_t workgroup_id_x;
   uint32_t workgroup_id_y;
   uint32_t workgroup_id_z;
   uint32_t __pad;  /* keeps the struct size a multiple of vec4 (see note above) */
};
/* Dword offset of a CS driver param within the driver-params const block. */
#define IR3_DP_CS(name) dword_offsetof(struct ir3_driver_params_cs, name)
56 
57 /**
58  * Driver params for vertex shaders.
59  *
60  * Note, driver param structs should be size aligned to vec4
61  */
struct ir3_driver_params_vs {
   uint32_t draw_id;
   uint32_t vtxid_base;   /* value added to the vertex id (first-vertex/base-vertex) */
   uint32_t instid_base;  /* value added to the instance id (base-instance) */
   uint32_t vtxcnt_max;
   uint32_t is_indexed_draw;  /* Note: boolean, ie. 0 or ~0 */
   /* user-clip-plane components, up to 8x vec4's: */
   struct {
      uint32_t x;
      uint32_t y;
      uint32_t z;
      uint32_t w;
   } ucp[8];
   uint32_t __pad_37_39[3];  /* pad struct size to a vec4 multiple */
};
/* Dword offset of a VS driver param within the driver-params const block. */
#define IR3_DP_VS(name) dword_offsetof(struct ir3_driver_params_vs, name)
78 
79 /**
80  * Driver params for TCS shaders.
81  *
82  * Note, driver param structs should be size aligned to vec4
83  */
struct ir3_driver_params_tcs {
   /* Default tess levels, used when no TCS is present (see
    * gl_TessLevelOuter/gl_TessLevelInner):
    */
   uint32_t default_outer_level_x;
   uint32_t default_outer_level_y;
   uint32_t default_outer_level_z;
   uint32_t default_outer_level_w;
   uint32_t default_inner_level_x;
   uint32_t default_inner_level_y;
   uint32_t __pad_06_07[2];  /* pad struct size to a vec4 multiple */
};
/* Dword offset of a TCS driver param within the driver-params const block. */
#define IR3_DP_TCS(name) dword_offsetof(struct ir3_driver_params_tcs, name)
94 
95 /**
96  * Driver params for fragment shaders.
97  *
98  * Note, driver param structs should be size aligned to vec4
99  */
struct ir3_driver_params_fs {
   uint32_t subgroup_size;
   uint32_t __pad_01_03[3];
   /* Dynamic params (that aren't known when compiling the shader) */
#define IR3_DP_FS_DYNAMIC dword_offsetof(struct ir3_driver_params_fs, frag_invocation_count)
   uint32_t frag_invocation_count;
   uint32_t __pad_05_07[3];
   uint32_t frag_size;
   uint32_t __pad_09;
   uint32_t frag_offset;
   uint32_t __pad_11_12[2];
};
/* Dword offset of an FS driver param within the driver-params const block. */
#define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
113 
114 #define IR3_MAX_SHADER_BUFFERS  32
115 #define IR3_MAX_SHADER_IMAGES   32
116 #define IR3_MAX_SO_BUFFERS      4
117 #define IR3_MAX_SO_STREAMS      4
118 #define IR3_MAX_SO_OUTPUTS      128
119 #define IR3_MAX_UBO_PUSH_RANGES 32
120 
121 /* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
enum ir3_bary {
   IJ_PERSP_PIXEL,       /* perspective-correct, per-pixel */
   IJ_PERSP_SAMPLE,      /* perspective-correct, per-sample */
   IJ_PERSP_CENTROID,    /* perspective-correct, at centroid */
   IJ_PERSP_CENTER_RHW,
   IJ_LINEAR_PIXEL,      /* non-perspective (linear), per-pixel */
   IJ_LINEAR_CENTROID,   /* non-perspective, at centroid */
   IJ_LINEAR_SAMPLE,     /* non-perspective, per-sample */
   IJ_COUNT,             /* not a real mode; number of entries above */
};
132 
133 /* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,       /* only the smaller wavesize is allowed */
   IR3_SINGLE_OR_DOUBLE,  /* either wavesize may be chosen */
   IR3_DOUBLE_ONLY,       /* only the larger wavesize is allowed */
};
139 
140 /**
141  * Description of a lowered UBO.
142  */
struct nir_def;

struct ir3_ubo_info {
   struct nir_def *global_base; /* For global loads, the base address */
   uint32_t block;         /* Which constant block */
   uint16_t bindless_base; /* For bindless, which base register is used */
   bool bindless;          /* UBO is accessed through the bindless path */
   bool global;            /* UBO is really a lowered global-memory access */
};
152 
153 /**
154  * Description of a range of a lowered UBO access.
155  *
 * Note that a single UBO may have multiple disjoint lowered ranges;
 * drivers must not assume there is at most one.
158  */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo; /* which UBO this range belongs to */
   uint32_t offset;     /* start offset to push in the const register file */
   uint32_t start, end; /* range of block that's actually used */
};
164 
/* Result of analyzing which UBO ranges can be pushed to consts. */
struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled; /* number of valid entries in range[] */
   uint32_t size;        /* total const file space used by the ranges */
};
170 
/* How (Vulkan) push constants are made available to the shader. */
enum ir3_push_consts_type {
   IR3_PUSH_CONSTS_NONE,
   IR3_PUSH_CONSTS_PER_STAGE,       /* each stage gets its own copy */
   IR3_PUSH_CONSTS_SHARED,          /* one copy shared across stages */
   IR3_PUSH_CONSTS_SHARED_PREAMBLE, /* shared, loaded by a preamble */
};
177 
178 /* This represents an internal UBO filled out by the driver. There are a few
179  * common UBOs that must be filled out identically by all drivers, for example
180  * for shader linkage, but drivers can also add their own that they manage
181  * themselves.
182  */
struct ir3_driver_ubo {
   int32_t idx;   /* UBO binding index; negative when not allocated */
   uint32_t size; /* size of the UBO contents, in dwords -- TODO confirm units */
};
187 
/* Identifies each kind of const-file allocation; see ir3_const_allocations.
 * Explicit values are kept stable since they index consts[] below.
 */
enum ir3_const_alloc_type {
   /* Vulkan, push consts. */
   IR3_CONST_ALLOC_PUSH_CONSTS = 0,
   /* Vulkan, offsets required to calculate offsets of descriptors with dynamic
    * offsets.
    */
   IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET = 1,
   /* Vulkan, addresses of inline uniform buffers, to which we fallback when
    * their size is unknown.
    */
   IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS = 2,
   /* Common, stage-specific params uploaded by the driver/HW. */
   IR3_CONST_ALLOC_DRIVER_PARAMS = 3,
   /* Common, UBOs lowered to consts. */
   IR3_CONST_ALLOC_UBO_RANGES = 4,
   /* Common, consts produced by a preamble to be used in a main shader. */
   IR3_CONST_ALLOC_PREAMBLE = 5,
   /* Vulkan, inline uniforms loaded into consts in the preamble.*/
   IR3_CONST_ALLOC_GLOBAL = 6,
   /* OpenGL, pre-a6xx; pointers to UBOs */
   IR3_CONST_ALLOC_UBO_PTRS = 7,
   /* OpenGL, a5xx only; needed to calculate pixel offset, but only
    * for images that have image_{load,store,size,atomic*} intrinsics.
    */
   IR3_CONST_ALLOC_IMAGE_DIMS = 8,
   /* OpenCL */
   IR3_CONST_ALLOC_KERNEL_PARAMS = 9,
   /* OpenGL, TFBO addresses only for vs on a3xx/a4xx */
   IR3_CONST_ALLOC_TFBO = 10,
   /* Common, stage-dependent primitive params:
    *  vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
    *  hs, ds: uvec4(primitive_stride, vertex_stride,
    *                patch_stride, patch_vertices_in)
    *          uvec4(tess_param_base, tess_factor_base)
    */
   IR3_CONST_ALLOC_PRIMITIVE_PARAM = 11,
   /* Common, mapping from varying location to offset. */
   IR3_CONST_ALLOC_PRIMITIVE_MAP = 12,
   IR3_CONST_ALLOC_MAX = 13,
};
228 
/* A single allocation in the const register file (all units are vec4s). */
struct ir3_const_allocation {
   uint32_t offset_vec4; /* where the allocation starts */
   uint32_t size_vec4;   /* size actually allocated; 0 means "not allocated" */

   /* Space reserved ahead of time (before the final size is known): */
   uint32_t reserved_size_vec4;
   uint32_t reserved_align_vec4;
};
236 
/* The full set of const-file allocations, indexed by ir3_const_alloc_type. */
struct ir3_const_allocations {
   struct ir3_const_allocation consts[IR3_CONST_ALLOC_MAX];
   uint32_t max_const_offset_vec4; /* high-water mark of all allocations */
   uint32_t reserved_vec4;         /* total space currently reserved */
};
242 
243 static inline bool
ir3_const_can_upload(const struct ir3_const_allocations * const_alloc,enum ir3_const_alloc_type type,uint32_t shader_const_size_vec4)244 ir3_const_can_upload(const struct ir3_const_allocations *const_alloc,
245                      enum ir3_const_alloc_type type,
246                      uint32_t shader_const_size_vec4)
247 {
248    return const_alloc->consts[type].size_vec4 > 0 &&
249           const_alloc->consts[type].offset_vec4 < shader_const_size_vec4;
250 }
251 
/* Const-file storage of image dimension info (a5xx, see
 * IR3_CONST_ALLOC_IMAGE_DIMS).
 */
struct ir3_const_image_dims {
   uint32_t mask;  /* bitmask of images that have image_store */
   uint32_t count; /* number of consts allocated */
   /* three const allocated per image which has image_store:
    *  + cpp         (bytes per pixel)
    *  + pitch       (y pitch)
    *  + array_pitch (z pitch)
    */
   uint32_t off[IR3_MAX_SHADER_IMAGES];
};
262 
263 /**
264  * Describes the layout of shader consts in the const register file
265  * and additional info about individual allocations.
266  *
267  * Each consts section is aligned to vec4. Note that pointer
268  * size (ubo, etc) changes depending on generation.
269  *
270  * The consts allocation flow is as follows:
271  * 1) Turnip/Freedreno allocates consts required by corresponding API,
272  *    e.g. push const, inline uniforms, etc. Then passes ir3_const_allocations
273  *    into IR3.
274  * 2) ir3_setup_const_state allocates consts with non-negotiable size.
275  * 3) IR3 lowerings afterwards allocate from the free space left.
276  *
277  * Note UBO size in bytes should be aligned to vec4
278  */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_app_ubos;      /* # of UBOs not including driver UBOs */
   unsigned num_driver_params; /* scalar */

   /* Driver-internal UBOs (see struct ir3_driver_ubo): */
   struct ir3_driver_ubo consts_ubo;
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

   /* Layout of the const register file (see file comment above): */
   struct ir3_const_allocations allocs;

   struct ir3_const_image_dims image_dims;

   /* Compiler-generated immediate constants appended to the const file: */
   unsigned immediates_count; /* number of immediates used */
   unsigned immediates_size;  /* allocated capacity of the array */
   uint32_t *immediates;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   enum ir3_push_consts_type push_consts_type;
};
300 
301 /**
302  * A single output for vertex transform feedback.
303  */
struct ir3_stream_output {
   unsigned register_index  : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2;  /**< 0 to 3 */
   unsigned num_components  : 3;  /**< 1 to 4 */
   unsigned output_buffer   : 3;  /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset      : 16; /**< offset into the buffer in dwords */
   unsigned stream          : 2;  /**< 0 to 3 */
};
312 
313 /**
314  * Stream output for vertex transform feedback.
315  */
struct ir3_stream_output_info {
   /** number of valid entries in output[] */
   unsigned num_outputs;
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written;
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};
331 
332 /**
333  * Starting from a4xx, HW supports pre-dispatching texture sampling
334  * instructions prior to scheduling a shader stage, when the
335  * coordinate maps exactly to an output of the previous stage.
336  */
337 
338 /**
339  * There is a limit in the number of pre-dispatches allowed for any
340  * given stage.
341  */
342 #define IR3_MAX_SAMPLER_PREFETCH 4
343 
344 /**
345  * This is the output stream value for 'cmd', as used by blob. It may
346  * encode the return type (in 3 bits) but it hasn't been verified yet.
347  */
348 #define IR3_SAMPLER_PREFETCH_CMD          0x4
349 #define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
350 
351 /**
352  * Stream output for texture sampling pre-dispatches.
353  */
struct ir3_sampler_prefetch {
   uint8_t src;              /* input (inloc) used as the texture coordinate */
   bool bindless;            /* use the *_bindless_id fields below */
   uint8_t samp_id;          /* non-bindless sampler index */
   uint8_t tex_id;           /* non-bindless texture index */
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;              /* destination register */
   uint8_t wrmask;           /* component writemask of the result */
   uint8_t half_precision;   /* result written as fp16 */
   opc_t tex_opc;            /* which texture-fetch opcode to pre-dispatch */
};
366 
367 /* Configuration key used to identify a shader variant.. different
368  * shader variants can be used to implement features not supported
369  * in hw (two sided color), binning-pass vertex shader, etc.
370  *
371  * When adding to this struct, please update ir3_shader_variant()'s debug
372  * output.
373  */
struct ir3_shader_key {
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa           : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader.  In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;

         /* Whether driconf "dual_color_blend_by_location" workaround is
          * enabled
          */
         unsigned force_dual_color_blend : 1;
      };
      /* All of the bitfields above viewed as one dword, for fast compares
       * (see ir3_shader_key_equal()):
       */
      uint32_t global;
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};
437 
438 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)439 ir3_tess_mode(enum tess_primitive_mode tess_mode)
440 {
441    switch (tess_mode) {
442    case TESS_PRIMITIVE_ISOLINES:
443       return IR3_TESS_ISOLINES;
444    case TESS_PRIMITIVE_TRIANGLES:
445       return IR3_TESS_TRIANGLES;
446    case TESS_PRIMITIVE_QUADS:
447       return IR3_TESS_QUADS;
448    default:
449       unreachable("bad tessmode");
450    }
451 }
452 
453 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)454 ir3_tess_factor_stride(unsigned patch_type)
455 {
456    /* note: this matches the stride used by ir3's build_tessfactor_base */
457    switch (patch_type) {
458    case IR3_TESS_ISOLINES:
459       return 12;
460    case IR3_TESS_TRIANGLES:
461       return 20;
462    case IR3_TESS_QUADS:
463       return 28;
464    default:
465       unreachable("bad tessmode");
466    }
467 }
468 
469 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)470 ir3_shader_key_equal(const struct ir3_shader_key *a,
471                      const struct ir3_shader_key *b)
472 {
473    /* slow-path if we need to check {v,f}saturate_{s,t,r} */
474    if (a->has_per_samp || b->has_per_samp)
475       return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
476    return a->global == b->global;
477 }
478 
479 /* will the two keys produce different lowering for a fragment shader? */
480 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)481 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
482                           struct ir3_shader_key *last_key)
483 {
484    if (last_key->has_per_samp || key->has_per_samp) {
485       if ((last_key->fsamples != key->fsamples) ||
486           (last_key->fastc_srgb != key->fastc_srgb) ||
487           memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
488                 sizeof(key->fsampler_swizzles)))
489          return true;
490    }
491 
492    if (last_key->rasterflat != key->rasterflat)
493       return true;
494 
495    if (last_key->ucp_enables != key->ucp_enables)
496       return true;
497 
498    if (last_key->safe_constlen != key->safe_constlen)
499       return true;
500 
501    return false;
502 }
503 
504 /* will the two keys produce different lowering for a vertex shader? */
505 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)506 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
507                           struct ir3_shader_key *last_key)
508 {
509    if (last_key->has_per_samp || key->has_per_samp) {
510       if ((last_key->vsamples != key->vsamples) ||
511           (last_key->vastc_srgb != key->vastc_srgb) ||
512           memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
513                 sizeof(key->vsampler_swizzles)))
514          return true;
515    }
516 
517    if (last_key->ucp_enables != key->ucp_enables)
518       return true;
519 
520    if (last_key->safe_constlen != key->safe_constlen)
521       return true;
522 
523    return false;
524 }
525 
526 /**
527  * On a4xx+a5xx, Images share state with textures and SSBOs:
528  *
529  *   + Uses texture (cat5) state/instruction (isam) to read
530  *   + Uses SSBO state and instructions (cat6) to write and for atomics
531  *
532  * Starting with a6xx, Images and SSBOs are basically the same thing,
533  * with texture state and isam also used for SSBO reads.
534  *
535  * On top of that, gallium makes the SSBO (shader_buffers) state semi
536  * sparse, with the first half of the state space used for atomic
537  * counters lowered to atomic buffers.  We could ignore this, but I
538  * don't think we could *really* handle the case of a single shader
539  * that used the max # of textures + images + SSBOs.  And once we are
540  * offsetting images by num_ssbos (or visa versa) to map them into
541  * the same hardware state, the hardware state has become coupled to
542  * the shader state, so at this point we might as well just use a
543  * mapping table to remap things from image/SSBO idx to hw idx.
544  *
545  * To make things less (more?) confusing, for the hw "SSBO" state
546  * (since it is really both SSBO and Image) I'll use the name "IBO"
547  */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
569 
/* Captured disassembly/debug text for a shader variant. */
struct ir3_disasm_info {
   bool write_disasm; /* whether disasm text should be captured at all */
   char *nir;         /* printed NIR for the variant */
   char *disasm;      /* printed hw disassembly for the variant */
};
575 
576 /* Represents half register in regid */
577 #define HALF_REG_ID 0x100
578 
579 /* Options for common NIR optimization passes done in ir3. This is used for both
580  * finalize and post-finalize (where it has to be in the shader).
581  */
struct ir3_shader_nir_options {
   /* For the modes specified, accesses are assumed to be bounds-checked as
    * defined by VK_EXT_robustness2 and optimizations may have to be more
    * conservative.
    */
   nir_variable_mode robust_modes;
};
589 
struct ir3_shader_options {
   /* What API-visible wavesizes are allowed. Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;
   /* What wavesizes we're allowed to actually use. If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;
   enum ir3_push_consts_type push_consts_type;

   /* Push-const region within the const file, in dwords: */
   uint32_t push_consts_base;
   uint32_t push_consts_dwords;

   /* Some const allocations are required at API level. */
   struct ir3_const_allocations const_allocs;

   struct ir3_shader_nir_options nir_options;

   /* Whether FRAG_RESULT_DATAi slots may be dynamically remapped by the driver.
    * If true, ir3 will assume it cannot statically use the value of such slots
    * anywhere (e.g., as the target of alias.rt).
    */
   bool fragdata_dynamic_remap;
};
616 
/* A single shader output and the register it lives in. */
struct ir3_shader_output {
   uint8_t slot;  /* gl_varying_slot / gl_frag_result (see note in variant) */
   uint8_t regid; /* hw register holding the output */
   uint8_t view;  /* view index, for multiview/multi-position output */
   uint8_t aliased_components : 4;
   bool half : 1; /* output register is half-precision */
};
624 
625 /**
626  * Shader variant which contains the actual hw shader instructions,
627  * and necessary info for shader state setup.
628  */
struct ir3_shader_variant {
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   /* the key this variant was compiled for: */
   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   //	union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   //	};

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   struct ir3_disasm_info disasm_info;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant.  Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   struct ir3_shader_options shader_options;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes per fiber */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Whether multi-position output is enabled. */
   bool multi_pos_output;

   /* Whether dual-source blending is enabled. */
   bool dual_src_blend;

   /* Whether early preamble is enabled. */
   bool early_preamble;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used.  So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct ir3_shader_output outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_viewport, writes_stencilref;
   bool writes_shading_rate;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage. In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[13 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc).  This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary       : 1; /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half       : 1;
      bool flat       : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */
   bool reads_primid;
   bool reads_shading_rate;
   bool reads_smask;

   /* sum of input components (scalar).  For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_full_quad;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   bool per_samp;

   bool post_depth_coverage;

   /* Are we using split or merged register file? */
   bool mergedregs;

   /* bitmasks of clip/cull distances written -- NOTE(review): assumed to
    * correspond to gl_ClipDistance/gl_CullDistance components; confirm.
    */
   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   /* If true, the last use of helper invocations is the texture prefetch and
    * they should be disabled for the actual shader. Equivalent to adding
    * (eq)nop at the beginning of the shader.
    */
   bool prefetch_end_of_quad;

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   /* Stage-specific info, selected by 'type': */
   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         enum gl_tess_spacing spacing:2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw:1;
         bool point_mode:1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in:3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
         bool uses_fbfetch_output  : 1;
         bool fbfetch_coherent     : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
         bool force_linear_dispatch;
         uint32_t local_invocation_id;
         uint32_t work_group_id;
      } cs;
   };

   /* base vertex-id value -- NOTE(review): appears to mirror the VS driver
    * param of the same name; confirm against users.
    */
   uint32_t vtxid_base;

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;
};
935 
936 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)937 ir3_shader_stage(struct ir3_shader_variant *v)
938 {
939    switch (v->type) {
940    case MESA_SHADER_VERTEX:
941       return v->binning_pass ? "BVERT" : "VERT";
942    case MESA_SHADER_TESS_CTRL:
943       return "TCS";
944    case MESA_SHADER_TESS_EVAL:
945       return "TES";
946    case MESA_SHADER_GEOMETRY:
947       return "GEOM";
948    case MESA_SHADER_FRAGMENT:
949       return "FRAG";
950    case MESA_SHADER_COMPUTE:
951    case MESA_SHADER_KERNEL:
952       return "CL";
953    default:
954       unreachable("invalid type");
955       return NULL;
956    }
957 }
958 
959 /* Currently we do not do binning for tess.  And for GS there is no
960  * cross-stage VS+GS optimization, so the full VS+GS is used in
961  * the binning pass.
962  */
963 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)964 ir3_has_binning_vs(const struct ir3_shader_key *key)
965 {
966    if (key->tessellation || key->has_gs)
967       return false;
968    return true;
969 }
970 
971 /**
972  * Represents a shader at the API level, before state-specific variants are
973  * generated.
974  */
975 struct ir3_shader {
976    gl_shader_stage type;
977 
978    /* shader id (for debug): */
979    uint32_t id;
980    uint32_t variant_count;
981 
982    /* Set by freedreno after shader_state_create, so we can emit debug info
983     * when recompiling a shader at draw time.
984     */
985    bool initial_variants_done;
986 
987    struct ir3_compiler *compiler;
988 
989    struct ir3_shader_options options;
990 
991    bool nir_finalized;
992    struct nir_shader *nir;
993    struct ir3_stream_output_info stream_output;
994 
995    /* per shader stage specific info: */
996    union {
997       /* for compute shaders: */
998       struct {
999          unsigned req_input_mem;    /* in dwords */
1000          unsigned req_local_mem;
1001          bool force_linear_dispatch;
1002       } cs;
1003       /* For vertex shaders: */
1004       struct {
1005          /* If we need to generate a passthrough TCS, it will be a function of
1006           * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
1007           * in the VS keyed by # of patch_vertices-1.
1008           */
1009          unsigned passthrough_tcs_compiled;
1010          struct ir3_shader *passthrough_tcs[32];
1011       } vs;
1012    };
1013 
1014    struct ir3_shader_variant *variants;
1015    mtx_t variants_lock;
1016 
1017    cache_key cache_key; /* shader disk-cache key */
1018 
1019    /* Bitmask of bits of the shader key used by this shader.  Used to avoid
1020     * recompiles for GL NOS that doesn't actually apply to the shader.
1021     */
1022    struct ir3_shader_key key_mask;
1023 };
1024 
1025 /**
1026  * In order to use the same cmdstream, in particular constlen setup and const
1027  * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's
1028  * corresponding draw pass shaders const_state.
1029  */
1030 static inline const struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)1031 ir3_const_state(const struct ir3_shader_variant *v)
1032 {
1033    if (v->binning_pass)
1034       return v->nonbinning->const_state;
1035    return v->const_state;
1036 }
1037 
1038 static inline struct ir3_const_state *
ir3_const_state_mut(const struct ir3_shader_variant * v)1039 ir3_const_state_mut(const struct ir3_shader_variant *v)
1040 {
1041    assert(!v->binning_pass);
1042    return v->const_state;
1043 }
1044 
1045 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)1046 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
1047 {
1048    const struct ir3_compiler *compiler = v->compiler;
1049    bool shared_consts_enable =
1050       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1051 
1052    /* Shared consts size for CS and FS matches with what's acutally used,
1053     * but the size of shared consts for geomtry stages doesn't.
1054     * So we use a hw quirk for geometry shared consts.
1055     */
1056    uint32_t shared_consts_size = shared_consts_enable ?
1057          compiler->shared_consts_size : 0;
1058 
1059    uint32_t shared_consts_size_geom = shared_consts_enable ?
1060          compiler->geom_shared_consts_size_quirk : 0;
1061 
1062    uint32_t safe_shared_consts_size = shared_consts_enable ?
1063       ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
1064                      DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
1065 
1066    if ((v->type == MESA_SHADER_COMPUTE) ||
1067        (v->type == MESA_SHADER_KERNEL)) {
1068       return compiler->max_const_compute - shared_consts_size;
1069    } else if (safe_constlen) {
1070       return compiler->max_const_safe - safe_shared_consts_size;
1071    } else if (v->type == MESA_SHADER_FRAGMENT) {
1072       return compiler->max_const_frag - shared_consts_size;
1073    } else {
1074       return compiler->max_const_geom - shared_consts_size_geom;
1075    }
1076 }
1077 
1078 /* Given a variant, calculate the maximum constlen it can have.
1079  */
1080 static inline unsigned
ir3_max_const(const struct ir3_shader_variant * v)1081 ir3_max_const(const struct ir3_shader_variant *v)
1082 {
1083    return _ir3_max_const(v, v->key.safe_constlen);
1084 }
1085 
1086 uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm);
1087 uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm);
1088 
1089 static inline unsigned
ir3_const_reg(const struct ir3_const_state * const_state,enum ir3_const_alloc_type type,unsigned offset)1090 ir3_const_reg(const struct ir3_const_state *const_state,
1091               enum ir3_const_alloc_type type,
1092               unsigned offset)
1093 {
1094    unsigned n = const_state->allocs.consts[type].offset_vec4;
1095    assert(const_state->allocs.consts[type].size_vec4 != 0);
1096    return regid(n + offset / 4, offset % 4);
1097 }
1098 
1099 /* Return true if a variant may need to be recompiled due to exceeding the
1100  * maximum "safe" constlen.
1101  */
1102 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)1103 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
1104 {
1105    return v->constlen > _ir3_max_const(v, true);
1106 }
1107 
1108 void *ir3_shader_assemble(struct ir3_shader_variant *v);
1109 struct ir3_shader_variant *
1110 ir3_shader_create_variant(struct ir3_shader *shader,
1111                           const struct ir3_shader_key *key,
1112                           bool keep_ir);
1113 struct ir3_shader_variant *
1114 ir3_shader_get_variant(struct ir3_shader *shader,
1115                        const struct ir3_shader_key *key, bool binning_pass,
1116                        bool keep_ir, bool *created);
1117 
1118 struct ir3_shader *
1119 ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
1120                     const struct ir3_shader_options *options,
1121                     struct ir3_stream_output_info *stream_output);
1122 uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
1123                            const struct ir3_compiler *compiler);
1124 struct ir3_shader *
1125 ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
1126 void ir3_shader_destroy(struct ir3_shader *shader);
1127 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
1128 uint64_t ir3_shader_outputs(const struct ir3_shader *so);
1129 
1130 int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
1131 
1132 void ir3_shader_get_subgroup_size(const struct ir3_compiler *compiler,
1133                                   const struct ir3_shader_options *options,
1134                                   gl_shader_stage stage,
1135                                   unsigned *subgroup_size,
1136                                   unsigned *max_subgroup_size);
1137 
1138 /*
1139  * Helper/util:
1140  */
1141 
1142 /* clears shader-key flags which don't apply to the given shader.
1143  */
1144 static inline void
ir3_key_clear_unused(struct ir3_shader_key * key,struct ir3_shader * shader)1145 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
1146 {
1147    uint32_t *key_bits = (uint32_t *)key;
1148    uint32_t *key_mask = (uint32_t *)&shader->key_mask;
1149    STATIC_ASSERT(sizeof(*key) % 4 == 0);
1150    for (unsigned i = 0; i < sizeof(*key) >> 2; i++)
1151       key_bits[i] &= key_mask[i];
1152 }
1153 
1154 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1155 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1156 {
1157    for (unsigned j = 0; j < so->outputs_count; j++)
1158       if (so->outputs[j].slot == slot)
1159          return j;
1160 
1161    /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1162     * in the vertex shader.. but the fragment shader doesn't know this
1163     * so  it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
1164     * at link time if there is no matching OUT.BCOLOR[n], we must map
1165     * OUT.COLOR[n] to IN.BCOLOR[n].  And visa versa if there is only
1166     * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1167     */
1168    if (slot == VARYING_SLOT_BFC0) {
1169       slot = VARYING_SLOT_COL0;
1170    } else if (slot == VARYING_SLOT_BFC1) {
1171       slot = VARYING_SLOT_COL1;
1172    } else if (slot == VARYING_SLOT_COL0) {
1173       slot = VARYING_SLOT_BFC0;
1174    } else if (slot == VARYING_SLOT_COL1) {
1175       slot = VARYING_SLOT_BFC1;
1176    } else {
1177       return -1;
1178    }
1179 
1180    for (unsigned j = 0; j < so->outputs_count; j++)
1181       if (so->outputs[j].slot == slot)
1182          return j;
1183 
1184    return -1;
1185 }
1186 
1187 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1188 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1189 {
1190    assert(so->inputs_count <= (unsigned)INT_MAX);
1191    while (++i < (int)so->inputs_count)
1192       if (so->inputs[i].compmask && so->inputs[i].bary)
1193          break;
1194    return i;
1195 }
1196 
1197 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1198 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1199 {
1200    int j = -1;
1201 
1202    while (true) {
1203       j = ir3_next_varying(so, j);
1204 
1205       assert(so->inputs_count <= (unsigned)INT_MAX);
1206       if (j >= (int)so->inputs_count)
1207          return -1;
1208 
1209       if (so->inputs[j].slot == slot)
1210          return j;
1211    }
1212 }
1213 
1214 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1215 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1216 {
1217    int var = ir3_find_input(so, slot);
1218    return var == -1 ? 0xff : so->inputs[var].inloc;
1219 }
1220 
/* Accumulated VS-output <-> FS-input linkage state, built up by
 * ir3_link_add()/ir3_link_shaders().
 */
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location. */
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      uint8_t loc;
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough (0xff = unused) */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough (0xff = unused) */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays (0xff = unused) */
   uint8_t clip0_loc, clip1_loc;
};
1252 
1253 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1254 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1255              uint8_t compmask, uint8_t loc)
1256 {
1257    for (unsigned j = 0; j < util_last_bit(compmask); j++) {
1258       uint8_t comploc = loc + j;
1259       l->varmask[comploc / 32] |= 1 << (comploc % 32);
1260    }
1261 
1262    l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1263 
1264    if (regid_ != regid(63, 0)) {
1265       int i = l->cnt++;
1266       assert(i < ARRAY_SIZE(l->var));
1267 
1268       l->var[i].slot = slot;
1269       l->var[i].regid = regid_;
1270       l->var[i].compmask = compmask;
1271       l->var[i].loc = loc;
1272    }
1273 }
1274 
/* Populate the linkage from the FS's varying inputs, pairing each with the
 * matching VS output (if any), and recording the special passthrough
 * locations (primid/viewid/clip) along the way.
 */
static inline void
ir3_link_shaders(struct ir3_shader_linkage *l,
                 const struct ir3_shader_variant *vs,
                 const struct ir3_shader_variant *fs, bool pack_vs_out)
{
   /* On older platforms, varmask isn't programmed at all, and it appears
    * that the hardware generates a mask of used VPC locations using the VS
    * output map, and hangs if a FS bary instruction references a location
    * not in the list. This means that we need to have a dummy entry in the
    * VS out map for things like gl_PointCoord which aren't written by the
    * VS. Furthermore we can't use r63.x, so just pick a random register to
    * use if there is no VS output.
    */
   const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
   int j = -1, k;

   /* Special passthrough locations start out unused (0xff). */
   l->primid_loc = 0xff;
   l->viewid_loc = 0xff;
   l->clip0_loc = 0xff;
   l->clip1_loc = 0xff;

   /* Iterate the FS varyings until they are exhausted or the out map is
    * full (ir3_link_add asserts on overflow).
    */
   while (l->cnt < ARRAY_SIZE(l->var)) {
      j = ir3_next_varying(fs, j);

      assert(fs->inputs_count <= (unsigned)INT_MAX);
      if (j >= (int)fs->inputs_count)
         break;

      /* Skip inputs beyond what the FS actually consumes. */
      if (fs->inputs[j].inloc >= fs->total_in)
         continue;

      /* Matching VS output index, or negative if the VS doesn't write it. */
      k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);

      if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
         l->primid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
         /* gl_ViewIndex is never a VS output; it is fixed-function
          * passthrough only.
          */
         assert(k < 0);
         l->viewid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
         l->clip0_loc = fs->inputs[j].inloc;

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
         l->clip1_loc = fs->inputs[j].inloc;

      ir3_link_add(l, fs->inputs[j].slot,
                   k >= 0 ? vs->outputs[k].regid : default_regid,
                   fs->inputs[j].compmask, fs->inputs[j].inloc);
   }
}
1328 
1329 static inline uint32_t
ir3_get_output_regid(const struct ir3_shader_output * output)1330 ir3_get_output_regid(const struct ir3_shader_output *output)
1331 {
1332    return output->regid | (output->half ? HALF_REG_ID : 0);
1333 }
1334 
1335 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1336 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1337 {
1338    int output_idx = ir3_find_output(so, (gl_varying_slot)slot);
1339 
1340    if (output_idx < 0) {
1341       return INVALID_REG;
1342    }
1343 
1344    return ir3_get_output_regid(&so->outputs[output_idx]);
1345 }
1346 
1347 void print_raw(FILE *out, const BITSET_WORD *data, size_t size);
1348 
1349 void ir3_link_stream_out(struct ir3_shader_linkage *l,
1350                          const struct ir3_shader_variant *v);
1351 
1352 #define VARYING_SLOT_GS_HEADER_IR3       (VARYING_SLOT_MAX + 0)
1353 #define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
1354 #define VARYING_SLOT_TCS_HEADER_IR3      (VARYING_SLOT_MAX + 2)
1355 #define VARYING_SLOT_REL_PATCH_ID_IR3    (VARYING_SLOT_MAX + 3)
1356 
1357 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1358 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1359 {
1360    if (!so)
1361       return regid(63, 0);
1362    for (unsigned j = 0; j < so->inputs_count; j++)
1363       if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1364          return so->inputs[j].regid;
1365    return regid(63, 0);
1366 }
1367 
1368 /* calculate register footprint in terms of half-regs (ie. one full
1369  * reg counts as two half-regs).
1370  */
1371 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1372 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1373 {
1374    return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1375 }
1376 
/* Number of hardware IBOs (SSBOs + images) used by the variant. */
static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant *v)
{
   return v->num_ibos;
}
1382 
1383 static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant * v)1384 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
1385 {
1386    /* Dummy shader */
1387    if (!v->compiler)
1388       return 0;
1389 
1390    if (v->compiler->gen < 5)
1391       return v->branchstack;
1392 
1393    return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
1394 }
1395 
1396 ENDC;
1397 
1398 #endif /* IR3_SHADER_H_ */
1399