• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <robclark@freedesktop.org>
7  */
8 
9 #ifndef IR3_SHADER_H_
10 #define IR3_SHADER_H_
11 
12 #include <stdio.h>
13 
14 #include "c11/threads.h"
15 #include "compiler/nir/nir.h"
16 #include "compiler/shader_enums.h"
17 #include "util/bitscan.h"
18 #include "util/disk_cache.h"
19 
20 #include "ir3_compiler.h"
21 
22 BEGINC;
23 
24 #define dword_offsetof(type, name) DIV_ROUND_UP(offsetof(type, name), 4)
25 #define dword_sizeof(type)         DIV_ROUND_UP(sizeof(type), 4)
26 
/**
 * Driver params for compute shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_cs {
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer.  Keep that in mind when/if
    * adding any additional CS driver params.
    */
   uint32_t num_work_groups_x;
   uint32_t num_work_groups_y;
   uint32_t num_work_groups_z;
   uint32_t work_dim;
   uint32_t base_group_x;
   uint32_t base_group_y;
   uint32_t base_group_z;
   uint32_t subgroup_size;
   uint32_t local_group_size_x;
   uint32_t local_group_size_y;
   uint32_t local_group_size_z;
   uint32_t subgroup_id_shift;
   uint32_t workgroup_id_x;
   uint32_t workgroup_id_y;
   uint32_t workgroup_id_z;
   uint32_t __pad; /* keep struct size vec4 aligned */
};
/* Dword offset of a CS driver param within struct ir3_driver_params_cs. */
#define IR3_DP_CS(name) dword_offsetof(struct ir3_driver_params_cs, name)
56 
/**
 * Driver params for vertex shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_vs {
   uint32_t draw_id;
   uint32_t vtxid_base;
   uint32_t instid_base;
   uint32_t vtxcnt_max;
   uint32_t is_indexed_draw; /* Note: boolean, ie. 0 or ~0 */
   /* user-clip-plane components, up to 8x vec4's: */
   struct {
      uint32_t x;
      uint32_t y;
      uint32_t z;
      uint32_t w;
   } ucp[8];
   uint32_t __pad_37_39[3]; /* keep struct size vec4 aligned */
};
/* Dword offset of a VS driver param within struct ir3_driver_params_vs. */
#define IR3_DP_VS(name) dword_offsetof(struct ir3_driver_params_vs, name)
78 
/**
 * Driver params for TCS shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_tcs {
   /* default outer/inner tessellation levels (used when there is no TES
    * input or explicit gl_TessLevel* write to override them):
    */
   uint32_t default_outer_level_x;
   uint32_t default_outer_level_y;
   uint32_t default_outer_level_z;
   uint32_t default_outer_level_w;
   uint32_t default_inner_level_x;
   uint32_t default_inner_level_y;
   uint32_t __pad_06_07[2]; /* keep struct size vec4 aligned */
};
/* Dword offset of a TCS driver param within struct ir3_driver_params_tcs. */
#define IR3_DP_TCS(name) dword_offsetof(struct ir3_driver_params_tcs, name)
94 
/**
 * Driver params for fragment shaders.
 *
 * Note, driver param structs should be size aligned to vec4
 */
struct ir3_driver_params_fs {
   uint32_t subgroup_size;
   uint32_t __pad_01_03[3];
   /* Dynamic params (that aren't known when compiling the shader) */
#define IR3_DP_FS_DYNAMIC dword_offsetof(struct ir3_driver_params_fs, frag_invocation_count)
   uint32_t frag_invocation_count;
   uint32_t __pad_05_07[3];
   uint32_t frag_size;
   uint32_t __pad_09;
   uint32_t frag_offset;
   uint32_t __pad_11_12[2];
};
/* Dword offset of an FS driver param within struct ir3_driver_params_fs. */
#define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
113 
114 #define IR3_MAX_SHADER_BUFFERS  32
115 #define IR3_MAX_SHADER_IMAGES   32
116 #define IR3_MAX_SO_BUFFERS      4
117 #define IR3_MAX_SO_STREAMS      4
118 #define IR3_MAX_SO_OUTPUTS      128
119 #define IR3_MAX_UBO_PUSH_RANGES 32
120 
/* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */
enum ir3_bary {
   IJ_PERSP_PIXEL,
   IJ_PERSP_SAMPLE,
   IJ_PERSP_CENTROID,
   IJ_PERSP_CENTER_RHW,
   IJ_LINEAR_PIXEL,
   IJ_LINEAR_CENTROID,
   IJ_LINEAR_SAMPLE,
   IJ_COUNT, /* number of interpolation modes, not a real mode */
};

/* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,
   IR3_SINGLE_OR_DOUBLE,
   IR3_DOUBLE_ONLY,
};
139 
/**
 * Description of a lowered UBO.
 */
struct nir_def;

struct ir3_ubo_info {
   struct nir_def *global_base; /* For global loads, the base address */
   uint32_t block;         /* Which constant block */
   uint16_t bindless_base; /* For bindless, which base register is used */
   bool bindless;          /* UBO is accessed via bindless descriptors */
   bool global;            /* UBO is really a lowered global-memory access */
};

/**
 * Description of a range of a lowered UBO access.
 *
 * Drivers should not assume that there are not multiple disjoint
 * lowered ranges of a single UBO.
 */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo;
   uint32_t offset;     /* start offset to push in the const register file */
   uint32_t start, end; /* range of block that's actually used */
};

/* Result of analyzing which UBO ranges get lowered to const-file pushes. */
struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled;
   uint32_t size;
};
170 
/* How push constants are made available to the shader. */
enum ir3_push_consts_type {
   IR3_PUSH_CONSTS_NONE,
   IR3_PUSH_CONSTS_PER_STAGE,
   IR3_PUSH_CONSTS_SHARED,
   IR3_PUSH_CONSTS_SHARED_PREAMBLE,
};

/* This represents an internal UBO filled out by the driver. There are a few
 * common UBOs that must be filled out identically by all drivers, for example
 * for shader linkage, but drivers can also add their own that they manage
 * themselves.
 */
struct ir3_driver_ubo {
   int32_t idx;   /* UBO index; signed — presumably <0 means unused, TODO confirm */
   uint32_t size; /* size of the UBO contents */
};
187 
/* The distinct consumers that can be allocated space in the const file. */
enum ir3_const_alloc_type {
   /* Vulkan, push consts. */
   IR3_CONST_ALLOC_PUSH_CONSTS = 0,
   /* Vulkan, offsets required to calculate offsets of descriptors with dynamic
    * offsets.
    */
   IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET = 1,
   /* Vulkan, addresses of inline uniform buffers, to which we fallback when
    * their size is unknown.
    */
   IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS = 2,
   /* Common, stage-specific params uploaded by the driver/HW. */
   IR3_CONST_ALLOC_DRIVER_PARAMS = 3,
   /* Common, UBOs lowered to consts. */
   IR3_CONST_ALLOC_UBO_RANGES = 4,
   /* Common, consts produced by a preamble to be used in a main shader. */
   IR3_CONST_ALLOC_PREAMBLE = 5,
   /* Vulkan, inline uniforms loaded into consts in the preamble. */
   IR3_CONST_ALLOC_GLOBAL = 6,
   /* OpenGL, pre-a6xx; pointers to UBOs */
   IR3_CONST_ALLOC_UBO_PTRS = 7,
   /* OpenGL, a5xx only; needed to calculate pixel offset, but only
    * for images that have image_{load,store,size,atomic*} intrinsics.
    */
   IR3_CONST_ALLOC_IMAGE_DIMS = 8,
   /* OpenCL */
   IR3_CONST_ALLOC_KERNEL_PARAMS = 9,
   /* OpenGL, TFBO addresses only for vs on a3xx/a4xx */
   IR3_CONST_ALLOC_TFBO = 10,
   /* Common, stage-dependent primitive params:
    *  vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
    *  hs, ds: uvec4(primitive_stride, vertex_stride,
    *                patch_stride, patch_vertices_in)
    *          uvec4(tess_param_base, tess_factor_base)
    */
   IR3_CONST_ALLOC_PRIMITIVE_PARAM = 11,
   /* Common, mapping from varying location to offset. */
   IR3_CONST_ALLOC_PRIMITIVE_MAP = 12,
   IR3_CONST_ALLOC_MAX = 13,
};

/* One allocation within the const register file (all units are vec4s). */
struct ir3_const_allocation {
   uint32_t offset_vec4; /* start of the allocation */
   uint32_t size_vec4;   /* size of the allocation */

   uint32_t reserved_size_vec4;
   uint32_t reserved_align_vec4;
};

/* Per-type allocation table for the const register file. */
struct ir3_const_allocations {
   struct ir3_const_allocation consts[IR3_CONST_ALLOC_MAX];
   uint32_t max_const_offset_vec4;
   uint32_t reserved_vec4;
};
242 
243 static inline bool
ir3_const_can_upload(const struct ir3_const_allocations * const_alloc,enum ir3_const_alloc_type type,uint32_t shader_const_size_vec4)244 ir3_const_can_upload(const struct ir3_const_allocations *const_alloc,
245                      enum ir3_const_alloc_type type,
246                      uint32_t shader_const_size_vec4)
247 {
248    return const_alloc->consts[type].size_vec4 > 0 &&
249           const_alloc->consts[type].offset_vec4 < shader_const_size_vec4;
250 }
251 
/* Tracking for image-dimension consts (see IR3_CONST_ALLOC_IMAGE_DIMS). */
struct ir3_const_image_dims {
   uint32_t mask;  /* bitmask of images that have image_store */
   uint32_t count; /* number of consts allocated */
   /* three const allocated per image which has image_store:
    *  + cpp         (bytes per pixel)
    *  + pitch       (y pitch)
    *  + array_pitch (z pitch)
    */
   uint32_t off[IR3_MAX_SHADER_IMAGES];
};
262 
/**
 * Describes the layout of shader consts in the const register file
 * and additional info about individual allocations.
 *
 * Each consts section is aligned to vec4. Note that pointer
 * size (ubo, etc) changes depending on generation.
 *
 * The consts allocation flow is as follows:
 * 1) Turnip/Freedreno allocates consts required by corresponding API,
 *    e.g. push const, inline uniforms, etc. Then passes ir3_const_allocations
 *    into IR3.
 * 2) ir3_setup_const_state allocates consts with non-negotiable size.
 * 3) IR3 lowerings afterwards allocate from the free space left.
 *
 * Note UBO size in bytes should be aligned to vec4
 */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_app_ubos;      /* # of UBOs not including driver UBOs */
   unsigned num_driver_params; /* scalar */

   /* driver-managed internal UBOs (see struct ir3_driver_ubo): */
   struct ir3_driver_ubo consts_ubo;
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

   struct ir3_const_allocations allocs;

   struct ir3_const_image_dims image_dims;

   /* immediate values promoted into the const file: */
   unsigned immediates_count;
   unsigned immediates_size;
   uint32_t *immediates;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   enum ir3_push_consts_type push_consts_type;
};
300 
/**
 * A single output for vertex transform feedback.
 */
struct ir3_stream_output {
   unsigned register_index  : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2;  /**< 0 to 3 */
   unsigned num_components  : 3;  /**< 1 to 4 */
   unsigned output_buffer   : 3;  /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset      : 16; /**< offset into the buffer in dwords */
   unsigned stream          : 2;  /**< 0 to 3 */
};

/**
 * Stream output for vertex transform feedback.
 */
struct ir3_stream_output_info {
   unsigned num_outputs;
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written;
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};
331 
332 /**
333  * Starting from a4xx, HW supports pre-dispatching texture sampling
334  * instructions prior to scheduling a shader stage, when the
335  * coordinate maps exactly to an output of the previous stage.
336  */
337 
338 /**
339  * There is a limit in the number of pre-dispatches allowed for any
340  * given stage.
341  */
342 #define IR3_MAX_SAMPLER_PREFETCH 4
343 
344 /**
345  * This is the output stream value for 'cmd', as used by blob. It may
346  * encode the return type (in 3 bits) but it hasn't been verified yet.
347  */
348 #define IR3_SAMPLER_PREFETCH_CMD          0x4
349 #define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
350 
/**
 * Stream output for texture sampling pre-dispatches.
 */
struct ir3_sampler_prefetch {
   uint8_t src;
   bool bindless;              /* if set, use *_bindless_id fields below */
   uint8_t samp_id;
   uint8_t tex_id;
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;
   uint8_t wrmask;
   uint8_t half_precision;
   opc_t tex_opc;
};
366 
/* Configuration key used to identify a shader variant.. different
 * shader variants can be used to implement features not supported
 * in hw (two sided color), binning-pass vertex shader, etc.
 *
 * When adding to this struct, please update ir3_shader_variant()'s debug
 * output.
 */
struct ir3_shader_key {
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa           : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader.  In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;

         /* Whether driconf "dual_color_blend_by_location" workaround is
          * enabled
          */
         unsigned force_dual_color_blend : 1;
      };
      /* all the bitfields above, viewed as one word for fast compares */
      uint32_t global;
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};
437 
438 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)439 ir3_tess_mode(enum tess_primitive_mode tess_mode)
440 {
441    switch (tess_mode) {
442    case TESS_PRIMITIVE_ISOLINES:
443       return IR3_TESS_ISOLINES;
444    case TESS_PRIMITIVE_TRIANGLES:
445       return IR3_TESS_TRIANGLES;
446    case TESS_PRIMITIVE_QUADS:
447       return IR3_TESS_QUADS;
448    default:
449       unreachable("bad tessmode");
450    }
451 }
452 
453 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)454 ir3_tess_factor_stride(unsigned patch_type)
455 {
456    /* note: this matches the stride used by ir3's build_tessfactor_base */
457    switch (patch_type) {
458    case IR3_TESS_ISOLINES:
459       return 12;
460    case IR3_TESS_TRIANGLES:
461       return 20;
462    case IR3_TESS_QUADS:
463       return 28;
464    default:
465       unreachable("bad tessmode");
466    }
467 }
468 
469 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)470 ir3_shader_key_equal(const struct ir3_shader_key *a,
471                      const struct ir3_shader_key *b)
472 {
473    /* slow-path if we need to check {v,f}saturate_{s,t,r} */
474    if (a->has_per_samp || b->has_per_samp)
475       return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
476    return a->global == b->global;
477 }
478 
479 /* will the two keys produce different lowering for a fragment shader? */
480 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)481 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
482                           struct ir3_shader_key *last_key)
483 {
484    if (last_key->has_per_samp || key->has_per_samp) {
485       if ((last_key->fsamples != key->fsamples) ||
486           (last_key->fastc_srgb != key->fastc_srgb) ||
487           memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
488                 sizeof(key->fsampler_swizzles)))
489          return true;
490    }
491 
492    if (last_key->rasterflat != key->rasterflat)
493       return true;
494 
495    if (last_key->ucp_enables != key->ucp_enables)
496       return true;
497 
498    if (last_key->safe_constlen != key->safe_constlen)
499       return true;
500 
501    return false;
502 }
503 
504 /* will the two keys produce different lowering for a vertex shader? */
505 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)506 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
507                           struct ir3_shader_key *last_key)
508 {
509    if (last_key->has_per_samp || key->has_per_samp) {
510       if ((last_key->vsamples != key->vsamples) ||
511           (last_key->vastc_srgb != key->vastc_srgb) ||
512           memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
513                 sizeof(key->vsampler_swizzles)))
514          return true;
515    }
516 
517    if (last_key->ucp_enables != key->ucp_enables)
518       return true;
519 
520    if (last_key->safe_constlen != key->safe_constlen)
521       return true;
522 
523    return false;
524 }
525 
/**
 * On a4xx+a5xx, Images share state with textures and SSBOs:
 *
 *   + Uses texture (cat5) state/instruction (isam) to read
 *   + Uses SSBO state and instructions (cat6) to write and for atomics
 *
 * Starting with a6xx, Images and SSBOs are basically the same thing,
 * with texture state and isam also used for SSBO reads.
 *
 * On top of that, gallium makes the SSBO (shader_buffers) state semi
 * sparse, with the first half of the state space used for atomic
 * counters lowered to atomic buffers.  We could ignore this, but I
 * don't think we could *really* handle the case of a single shader
 * that used the max # of textures + images + SSBOs.  And once we are
 * offsetting images by num_ssbos (or vice versa) to map them into
 * the same hardware state, the hardware state has become coupled to
 * the shader state, so at this point we might as well just use a
 * mapping table to remap things from image/SSBO idx to hw idx.
 *
 * To make things less (more?) confusing, for the hw "SSBO" state
 * (since it is really both SSBO and Image) I'll use the name "IBO"
 */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
569 
/* Captured NIR/assembly text for debug output of a variant. */
struct ir3_disasm_info {
   bool write_disasm; /* whether to capture disassembly for this variant */
   char *nir;         /* printed NIR text */
   char *disasm;      /* printed shader disassembly */
};
575 
576 /* Represents half register in regid */
577 #define HALF_REG_ID 0x100
578 
/* Options for common NIR optimization passes done in ir3. This is used for both
 * finalize and post-finalize (where it has to be in the shader).
 */
struct ir3_shader_nir_options {
   /* For the modes specified, accesses are assumed to be bounds-checked as
    * defined by VK_EXT_robustness2 and optimizations may have to be more
    * conservative.
    */
   nir_variable_mode robust_modes;
};

/* Compile options that apply to a shader (all of its variants). */
struct ir3_shader_options {
   /* What API-visible wavesizes are allowed. Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;
   /* What wavesizes we're allowed to actually use. If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;
   enum ir3_push_consts_type push_consts_type;

   uint32_t push_consts_base;
   uint32_t push_consts_dwords;

   /* Some const allocations are required at API level. */
   struct ir3_const_allocations const_allocs;

   struct ir3_shader_nir_options nir_options;
};
610 
/**
 * Shader variant which contains the actual hw shader instructions,
 * and necessary info for shader state setup.
 *
 * NOTE: everything from `info` onward is serialized to the disk cache
 * (see VARIANT_CACHE_START below), so field order in that region matters.
 */
struct ir3_shader_variant {
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   /* the key this variant was compiled for */
   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   //	union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   //	};

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   struct ir3_disasm_info disasm_info;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant.  Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   struct ir3_shader_options shader_options;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes per fiber */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Whether multi-position output is enabled. */
   bool multi_pos_output;

   /* Whether dual-source blending is enabled. */
   bool dual_src_blend;

   /* Whether early preamble is enabled. */
   bool early_preamble;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used.  So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t view;
      bool half : 1;
   } outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_viewport, writes_stencilref;
   bool writes_shading_rate;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage. In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[13 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc).  This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary       : 1; /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half       : 1;
      bool flat       : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */
   bool reads_primid;
   bool reads_shading_rate;
   bool reads_smask;

   /* sum of input components (scalar).  For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_full_quad;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   bool per_samp;

   bool post_depth_coverage;

   /* Are we using split or merged register file? */
   bool mergedregs;

   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   /* If true, the last use of helper invocations is the texture prefetch and
    * they should be disabled for the actual shader. Equivalent to adding
    * (eq)nop at the beginning of the shader.
    */
   bool prefetch_end_of_quad;

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   /* per-stage specific info, keyed by `type`: */
   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         enum gl_tess_spacing spacing:2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw:1;
         bool point_mode:1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in:3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
         bool uses_fbfetch_output  : 1;
         bool fbfetch_coherent     : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
         bool force_linear_dispatch;
         uint32_t local_invocation_id;
         uint32_t work_group_id;
      } cs;
   };

   uint32_t vtxid_base;

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;
};
926 
927 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)928 ir3_shader_stage(struct ir3_shader_variant *v)
929 {
930    switch (v->type) {
931    case MESA_SHADER_VERTEX:
932       return v->binning_pass ? "BVERT" : "VERT";
933    case MESA_SHADER_TESS_CTRL:
934       return "TCS";
935    case MESA_SHADER_TESS_EVAL:
936       return "TES";
937    case MESA_SHADER_GEOMETRY:
938       return "GEOM";
939    case MESA_SHADER_FRAGMENT:
940       return "FRAG";
941    case MESA_SHADER_COMPUTE:
942    case MESA_SHADER_KERNEL:
943       return "CL";
944    default:
945       unreachable("invalid type");
946       return NULL;
947    }
948 }
949 
950 /* Currently we do not do binning for tess.  And for GS there is no
951  * cross-stage VS+GS optimization, so the full VS+GS is used in
952  * the binning pass.
953  */
954 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)955 ir3_has_binning_vs(const struct ir3_shader_key *key)
956 {
957    if (key->tessellation || key->has_gs)
958       return false;
959    return true;
960 }
961 
962 /**
963  * Represents a shader at the API level, before state-specific variants are
964  * generated.
965  */
966 struct ir3_shader {
967    gl_shader_stage type;
968 
969    /* shader id (for debug): */
970    uint32_t id;
971    uint32_t variant_count;
972 
973    /* Set by freedreno after shader_state_create, so we can emit debug info
974     * when recompiling a shader at draw time.
975     */
976    bool initial_variants_done;
977 
978    struct ir3_compiler *compiler;
979 
980    struct ir3_shader_options options;
981 
982    bool nir_finalized;
983    struct nir_shader *nir;
984    struct ir3_stream_output_info stream_output;
985 
986    /* per shader stage specific info: */
987    union {
988       /* for compute shaders: */
989       struct {
990          unsigned req_input_mem;    /* in dwords */
991          unsigned req_local_mem;
992          bool force_linear_dispatch;
993       } cs;
994       /* For vertex shaders: */
995       struct {
996          /* If we need to generate a passthrough TCS, it will be a function of
997           * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
998           * in the VS keyed by # of patch_vertices-1.
999           */
1000          unsigned passthrough_tcs_compiled;
1001          struct ir3_shader *passthrough_tcs[32];
1002       } vs;
1003    };
1004 
1005    struct ir3_shader_variant *variants;
1006    mtx_t variants_lock;
1007 
1008    cache_key cache_key; /* shader disk-cache key */
1009 
1010    /* Bitmask of bits of the shader key used by this shader.  Used to avoid
1011     * recompiles for GL NOS that doesn't actually apply to the shader.
1012     */
1013    struct ir3_shader_key key_mask;
1014 };
1015 
1016 /**
1017  * In order to use the same cmdstream, in particular constlen setup and const
1018  * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's
1019  * corresponding draw pass shaders const_state.
1020  */
1021 static inline const struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)1022 ir3_const_state(const struct ir3_shader_variant *v)
1023 {
1024    if (v->binning_pass)
1025       return v->nonbinning->const_state;
1026    return v->const_state;
1027 }
1028 
1029 static inline struct ir3_const_state *
ir3_const_state_mut(const struct ir3_shader_variant * v)1030 ir3_const_state_mut(const struct ir3_shader_variant *v)
1031 {
1032    assert(!v->binning_pass);
1033    return v->const_state;
1034 }
1035 
1036 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)1037 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
1038 {
1039    const struct ir3_compiler *compiler = v->compiler;
1040    bool shared_consts_enable =
1041       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1042 
1043    /* Shared consts size for CS and FS matches with what's acutally used,
1044     * but the size of shared consts for geomtry stages doesn't.
1045     * So we use a hw quirk for geometry shared consts.
1046     */
1047    uint32_t shared_consts_size = shared_consts_enable ?
1048          compiler->shared_consts_size : 0;
1049 
1050    uint32_t shared_consts_size_geom = shared_consts_enable ?
1051          compiler->geom_shared_consts_size_quirk : 0;
1052 
1053    uint32_t safe_shared_consts_size = shared_consts_enable ?
1054       ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
1055                      DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
1056 
1057    if ((v->type == MESA_SHADER_COMPUTE) ||
1058        (v->type == MESA_SHADER_KERNEL)) {
1059       return compiler->max_const_compute - shared_consts_size;
1060    } else if (safe_constlen) {
1061       return compiler->max_const_safe - safe_shared_consts_size;
1062    } else if (v->type == MESA_SHADER_FRAGMENT) {
1063       return compiler->max_const_frag - shared_consts_size;
1064    } else {
1065       return compiler->max_const_geom - shared_consts_size_geom;
1066    }
1067 }
1068 
1069 /* Given a variant, calculate the maximum constlen it can have.
1070  */
1071 static inline unsigned
ir3_max_const(const struct ir3_shader_variant * v)1072 ir3_max_const(const struct ir3_shader_variant *v)
1073 {
1074    return _ir3_max_const(v, v->key.safe_constlen);
1075 }
1076 
/* Lookup / append an immediate value in the variant's immediates const
 * block; returns its offset (presumably in dwords — confirm at callsites).
 */
uint16_t ir3_const_find_imm(struct ir3_shader_variant *v, uint32_t imm);
uint16_t ir3_const_add_imm(struct ir3_shader_variant *v, uint32_t imm);
1079 
1080 static inline unsigned
ir3_const_reg(const struct ir3_const_state * const_state,enum ir3_const_alloc_type type,unsigned offset)1081 ir3_const_reg(const struct ir3_const_state *const_state,
1082               enum ir3_const_alloc_type type,
1083               unsigned offset)
1084 {
1085    unsigned n = const_state->allocs.consts[type].offset_vec4;
1086    assert(const_state->allocs.consts[type].size_vec4 != 0);
1087    return regid(n + offset / 4, offset % 4);
1088 }
1089 
1090 /* Return true if a variant may need to be recompiled due to exceeding the
1091  * maximum "safe" constlen.
1092  */
1093 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)1094 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
1095 {
1096    return v->constlen > _ir3_max_const(v, true);
1097 }
1098 
/* Assemble the variant's IR into a binary blob: */
void *ir3_shader_assemble(struct ir3_shader_variant *v);
/* Compile a new variant for the given key (does not cache it): */
struct ir3_shader_variant *
ir3_shader_create_variant(struct ir3_shader *shader,
                          const struct ir3_shader_key *key,
                          bool keep_ir);
/* Look up (or compile and cache) the variant matching key; *created is set
 * when a new variant had to be compiled:
 */
struct ir3_shader_variant *
ir3_shader_get_variant(struct ir3_shader *shader,
                       const struct ir3_shader_key *key, bool binning_pass,
                       bool keep_ir, bool *created);

/* Wrap a nir_shader into an ir3_shader (takes ownership of the NIR): */
struct ir3_shader *
ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
                    const struct ir3_shader_options *options,
                    struct ir3_stream_output_info *stream_output);
uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
                           const struct ir3_compiler *compiler);
/* Get (building if necessary) the cached passthrough TCS for a VS, see
 * ir3_shader::vs.passthrough_tcs:
 */
struct ir3_shader *
ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
void ir3_shader_destroy(struct ir3_shader *shader);
void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
uint64_t ir3_shader_outputs(const struct ir3_shader *so);

int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);

void ir3_shader_get_subgroup_size(const struct ir3_compiler *compiler,
                                  const struct ir3_shader_options *options,
                                  gl_shader_stage stage,
                                  unsigned *subgroup_size,
                                  unsigned *max_subgroup_size);
1128 
1129 /*
1130  * Helper/util:
1131  */
1132 
1133 /* clears shader-key flags which don't apply to the given shader.
1134  */
1135 static inline void
ir3_key_clear_unused(struct ir3_shader_key * key,struct ir3_shader * shader)1136 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
1137 {
1138    uint32_t *key_bits = (uint32_t *)key;
1139    uint32_t *key_mask = (uint32_t *)&shader->key_mask;
1140    STATIC_ASSERT(sizeof(*key) % 4 == 0);
1141    for (unsigned i = 0; i < sizeof(*key) >> 2; i++)
1142       key_bits[i] &= key_mask[i];
1143 }
1144 
1145 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1146 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1147 {
1148    for (unsigned j = 0; j < so->outputs_count; j++)
1149       if (so->outputs[j].slot == slot)
1150          return j;
1151 
1152    /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1153     * in the vertex shader.. but the fragment shader doesn't know this
1154     * so  it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
1155     * at link time if there is no matching OUT.BCOLOR[n], we must map
1156     * OUT.COLOR[n] to IN.BCOLOR[n].  And visa versa if there is only
1157     * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1158     */
1159    if (slot == VARYING_SLOT_BFC0) {
1160       slot = VARYING_SLOT_COL0;
1161    } else if (slot == VARYING_SLOT_BFC1) {
1162       slot = VARYING_SLOT_COL1;
1163    } else if (slot == VARYING_SLOT_COL0) {
1164       slot = VARYING_SLOT_BFC0;
1165    } else if (slot == VARYING_SLOT_COL1) {
1166       slot = VARYING_SLOT_BFC1;
1167    } else {
1168       return -1;
1169    }
1170 
1171    for (unsigned j = 0; j < so->outputs_count; j++)
1172       if (so->outputs[j].slot == slot)
1173          return j;
1174 
1175    return -1;
1176 }
1177 
1178 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1179 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1180 {
1181    assert(so->inputs_count <= (unsigned)INT_MAX);
1182    while (++i < (int)so->inputs_count)
1183       if (so->inputs[i].compmask && so->inputs[i].bary)
1184          break;
1185    return i;
1186 }
1187 
1188 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1189 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1190 {
1191    int j = -1;
1192 
1193    while (true) {
1194       j = ir3_next_varying(so, j);
1195 
1196       assert(so->inputs_count <= (unsigned)INT_MAX);
1197       if (j >= (int)so->inputs_count)
1198          return -1;
1199 
1200       if (so->inputs[j].slot == slot)
1201          return j;
1202    }
1203 }
1204 
1205 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1206 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1207 {
1208    int var = ir3_find_input(so, slot);
1209    return var == -1 ? 0xff : so->inputs[var].inloc;
1210 }
1211 
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location. */
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      uint8_t loc;
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough
    * (0xff if unused, see ir3_link_shaders())
    */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough
    * (0xff if unused)
    */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays (0xff if unused) */
   uint8_t clip0_loc, clip1_loc;
};
1243 
1244 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1245 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1246              uint8_t compmask, uint8_t loc)
1247 {
1248    for (unsigned j = 0; j < util_last_bit(compmask); j++) {
1249       uint8_t comploc = loc + j;
1250       l->varmask[comploc / 32] |= 1 << (comploc % 32);
1251    }
1252 
1253    l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1254 
1255    if (regid_ != regid(63, 0)) {
1256       int i = l->cnt++;
1257       assert(i < ARRAY_SIZE(l->var));
1258 
1259       l->var[i].slot = slot;
1260       l->var[i].regid = regid_;
1261       l->var[i].compmask = compmask;
1262       l->var[i].loc = loc;
1263    }
1264 }
1265 
/* Build the VS->FS linkage map by walking the FS's interpolated inputs and
 * pairing each with the matching VS output (or a default regid when the VS
 * doesn't write it).  Also records the special passthrough locations
 * (primid/viewid/clip dists), which default to 0xff (unused).
 */
static inline void
ir3_link_shaders(struct ir3_shader_linkage *l,
                 const struct ir3_shader_variant *vs,
                 const struct ir3_shader_variant *fs, bool pack_vs_out)
{
   /* On older platforms, varmask isn't programmed at all, and it appears
    * that the hardware generates a mask of used VPC locations using the VS
    * output map, and hangs if a FS bary instruction references a location
    * not in the list. This means that we need to have a dummy entry in the
    * VS out map for things like gl_PointCoord which aren't written by the
    * VS. Furthermore we can't use r63.x, so just pick a random register to
    * use if there is no VS output.
    */
   const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
   int j = -1, k;

   l->primid_loc = 0xff;
   l->viewid_loc = 0xff;
   l->clip0_loc = 0xff;
   l->clip1_loc = 0xff;

   /* Iterate the FS's varying inputs (bounded by the linkage map size): */
   while (l->cnt < ARRAY_SIZE(l->var)) {
      j = ir3_next_varying(fs, j);

      assert(fs->inputs_count <= (unsigned)INT_MAX);
      if (j >= (int)fs->inputs_count)
         break;

      /* skip inputs that don't actually occupy a VPC location: */
      if (fs->inputs[j].inloc >= fs->total_in)
         continue;

      k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);

      if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
         l->primid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
         /* view index is fixed-function passthrough, never a VS output: */
         assert(k < 0);
         l->viewid_loc = fs->inputs[j].inloc;
      }

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
         l->clip0_loc = fs->inputs[j].inloc;

      if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
         l->clip1_loc = fs->inputs[j].inloc;

      ir3_link_add(l, fs->inputs[j].slot,
                   k >= 0 ? vs->outputs[k].regid : default_regid,
                   fs->inputs[j].compmask, fs->inputs[j].inloc);
   }
}
1319 
1320 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1321 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1322 {
1323    for (unsigned j = 0; j < so->outputs_count; j++)
1324       if (so->outputs[j].slot == slot) {
1325          uint32_t regid = so->outputs[j].regid;
1326          if (so->outputs[j].half)
1327             regid |= HALF_REG_ID;
1328          return regid;
1329       }
1330    return regid(63, 0);
1331 }
1332 
void print_raw(FILE *out, const BITSET_WORD *data, size_t size);

void ir3_link_stream_out(struct ir3_shader_linkage *l,
                         const struct ir3_shader_variant *v);

/* ir3-internal "virtual" varying slots, allocated past the gallium/NIR
 * VARYING_SLOT_MAX range:
 */
#define VARYING_SLOT_GS_HEADER_IR3       (VARYING_SLOT_MAX + 0)
#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
#define VARYING_SLOT_TCS_HEADER_IR3      (VARYING_SLOT_MAX + 2)
#define VARYING_SLOT_REL_PATCH_ID_IR3    (VARYING_SLOT_MAX + 3)
1342 
1343 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1344 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1345 {
1346    if (!so)
1347       return regid(63, 0);
1348    for (unsigned j = 0; j < so->inputs_count; j++)
1349       if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1350          return so->inputs[j].regid;
1351    return regid(63, 0);
1352 }
1353 
1354 /* calculate register footprint in terms of half-regs (ie. one full
1355  * reg counts as two half-regs).
1356  */
1357 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1358 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1359 {
1360    return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1361 }
1362 
/* Number of hardware IBOs (SSBOs + images, see num_ibos) used by the
 * variant.
 */
static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant *v)
{
   return v->num_ibos;
}
1368 
1369 static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant * v)1370 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
1371 {
1372    /* Dummy shader */
1373    if (!v->compiler)
1374       return 0;
1375 
1376    if (v->compiler->gen < 5)
1377       return v->branchstack;
1378 
1379    return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
1380 }
1381 
1382 ENDC;
1383 
1384 #endif /* IR3_SHADER_H_ */
1385