• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #ifndef IR3_SHADER_H_
28 #define IR3_SHADER_H_
29 
30 #include <stdio.h>
31 
32 #include "c11/threads.h"
33 #include "compiler/nir/nir.h"
34 #include "compiler/shader_enums.h"
35 #include "util/bitscan.h"
36 #include "util/disk_cache.h"
37 
38 #include "ir3_compiler.h"
39 
40 BEGINC;
41 
42 /* driver param indices: */
/* driver param indices:
 *
 * Note that the indices are per shader-stage namespaces, so e.g.
 * IR3_DP_DRAWID (vs) deliberately re-uses index 0 that is also
 * IR3_DP_NUM_WORK_GROUPS_X (cs); a given shader only ever sees the
 * params of its own stage.
 */
enum ir3_driver_param {
   /* compute shader driver params: */
   IR3_DP_NUM_WORK_GROUPS_X = 0,
   IR3_DP_NUM_WORK_GROUPS_Y = 1,
   IR3_DP_NUM_WORK_GROUPS_Z = 2,
   IR3_DP_WORK_DIM          = 3,
   IR3_DP_BASE_GROUP_X = 4,
   IR3_DP_BASE_GROUP_Y = 5,
   IR3_DP_BASE_GROUP_Z = 6,
   IR3_DP_CS_SUBGROUP_SIZE = 7,
   IR3_DP_LOCAL_GROUP_SIZE_X = 8,
   IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
   IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
   IR3_DP_SUBGROUP_ID_SHIFT = 11,
   IR3_DP_WORKGROUP_ID_X = 12,
   IR3_DP_WORKGROUP_ID_Y = 13,
   IR3_DP_WORKGROUP_ID_Z = 14,
   /* NOTE: gl_NumWorkGroups should be vec4 aligned because
    * glDispatchComputeIndirect() needs to load these from
    * the info->indirect buffer.  Keep that in mind when/if
    * adding any additional CS driver params.
    */
   IR3_DP_CS_COUNT = 16, /* must be aligned to vec4 */

   /* vertex shader driver params: */
   IR3_DP_DRAWID = 0,
   IR3_DP_VTXID_BASE = 1,
   IR3_DP_INSTID_BASE = 2,
   IR3_DP_VTXCNT_MAX = 3,
   IR3_DP_IS_INDEXED_DRAW = 4,  /* Note: boolean, ie. 0 or ~0 */
   /* user-clip-plane components, up to 8x vec4's: */
   IR3_DP_UCP0_X = 5,
   /* .... */
   IR3_DP_UCP7_W = 36,
   IR3_DP_VS_COUNT = 40, /* must be aligned to vec4 */

   /* TCS driver params: */
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_X = 0,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_Y = 1,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_Z = 2,
   IR3_DP_HS_DEFAULT_OUTER_LEVEL_W = 3,
   IR3_DP_HS_DEFAULT_INNER_LEVEL_X = 4,
   IR3_DP_HS_DEFAULT_INNER_LEVEL_Y = 5,
   IR3_DP_HS_COUNT = 8, /* must be aligned to vec4 */

   /* fragment shader driver params: */
   IR3_DP_FS_SUBGROUP_SIZE = 0,
   /* Dynamic params (that aren't known when compiling the shader) */
   IR3_DP_FS_DYNAMIC = 4,
   IR3_DP_FS_FRAG_INVOCATION_COUNT = IR3_DP_FS_DYNAMIC,
   IR3_DP_FS_FRAG_SIZE = IR3_DP_FS_DYNAMIC + 4,
   IR3_DP_FS_FRAG_OFFSET = IR3_DP_FS_DYNAMIC + 6,
};
96 
/* Compile-time resource limits, used to size the mapping tables and
 * state arrays below:
 */
#define IR3_MAX_SHADER_BUFFERS  32
#define IR3_MAX_SHADER_IMAGES   32
#define IR3_MAX_SO_BUFFERS      4
#define IR3_MAX_SO_STREAMS      4
#define IR3_MAX_SO_OUTPUTS      128
#define IR3_MAX_UBO_PUSH_RANGES 32
103 
/* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0, so that the
 * values can be used directly as compact indices/bitmask positions:
 */
enum ir3_bary {
   IJ_PERSP_PIXEL,
   IJ_PERSP_SAMPLE,
   IJ_PERSP_CENTROID,
   IJ_PERSP_CENTER_RHW,
   IJ_LINEAR_PIXEL,
   IJ_LINEAR_CENTROID,
   IJ_LINEAR_SAMPLE,
   IJ_COUNT,
};
115 
/* Description of what wavesizes are allowed. */
enum ir3_wavesize_option {
   IR3_SINGLE_ONLY,      /* only the smaller wavesize */
   IR3_SINGLE_OR_DOUBLE, /* either wavesize may be chosen */
   IR3_DOUBLE_ONLY,      /* only the larger wavesize */
};
122 
/**
 * Description of a lowered UBO.
 */
struct nir_def; /* forward declaration, avoids needing nir internals here */

struct ir3_ubo_info {
   struct nir_def *global_base; /* For global loads, the base address */
   uint32_t block;         /* Which constant block */
   uint16_t bindless_base; /* For bindless, which base register is used */
   bool bindless;          /* accessed through bindless_base? */
   bool global;            /* global load (addressed via global_base)? */
};
135 
/**
 * Description of a range of a lowered UBO access.
 *
 * Drivers should not assume that there are not multiple disjoint
 * lowered ranges of a single UBO (i.e. the same UBO may appear in
 * several entries of ir3_ubo_analysis_state::range).
 */
struct ir3_ubo_range {
   struct ir3_ubo_info ubo;
   uint32_t offset;     /* start offset to push in the const register file */
   uint32_t start, end; /* range of block that's actually used */
};
147 
/* Result of analyzing which UBO ranges can be pushed to consts. */
struct ir3_ubo_analysis_state {
   struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES];
   uint32_t num_enabled; /* number of valid entries in range[] */
   uint32_t size;        /* total size pushed (units per users of offset above) */
};
153 
/* How push constants are delivered to the shader: */
enum ir3_push_consts_type {
   IR3_PUSH_CONSTS_NONE,
   IR3_PUSH_CONSTS_PER_STAGE,
   IR3_PUSH_CONSTS_SHARED,
   IR3_PUSH_CONSTS_SHARED_PREAMBLE,
};
160 
/* This represents an internal UBO filled out by the driver. There are a few
 * common UBOs that must be filled out identically by all drivers, for example
 * for shader linkage, but drivers can also add their own that they manage
 * themselves.
 */
struct ir3_driver_ubo {
   int32_t idx;   /* UBO slot index (negative presumably means unused — verify) */
   uint32_t size; /* size of the UBO contents */
};
170 
/**
 * Describes the layout of shader consts in the const register file.
 *
 * Layout of constant registers, each section aligned to vec4.  Note
 * that pointer size (ubo, etc) changes depending on generation.
 *
 *   + user consts: only used for turnip push consts
 *   + lowered UBO ranges
 *   + preamble consts
 *   + UBO addresses: turnip is bindless and these are wasted
 *   + image dimensions: a5xx only; needed to calculate pixel offset, but only
 *     for images that have image_{load,store,size,atomic*} intrinsics
 *   + kernel params: cl only
 *   + driver params: these are stage-dependent; see ir3_driver_param
 *   + TFBO addresses: only for vs on a3xx/a4xx
 *   + primitive params: these are stage-dependent
 *       vs, gs: uvec4(primitive_stride, vertex_stride, 0, 0)
 *       hs, ds: uvec4(primitive_stride, vertex_stride,
 *                     patch_stride, patch_vertices_in)
 *               uvec4(tess_param_base, tess_factor_base)
 *   + primitive map
 *   + lowered immediates
 *
 * Immediates go last mostly because they are inserted in the CP pass
 * after the nir -> ir3 frontend.
 *
 * Note UBO size in bytes should be aligned to vec4
 */
struct ir3_const_state {
   unsigned num_ubos;
   unsigned num_driver_params; /* scalar */

   /* Driver-managed UBOs (see struct ir3_driver_ubo): */
   struct ir3_driver_ubo consts_ubo;
   struct ir3_driver_ubo driver_params_ubo;
   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;

   int32_t constant_data_dynamic_offsets;

   /* Start offsets of each section of the layout described above;
    * presumably in vec4 units like the rest of the const file — confirm
    * against the code that emits these.
    */
   struct {
      /* user const start at zero */
      unsigned ubo;
      unsigned image_dims;
      unsigned kernel_params;
      unsigned driver_param;
      unsigned tfbo;
      unsigned primitive_param;
      unsigned primitive_map;
      unsigned immediate;
   } offsets;

   struct {
      uint32_t mask;  /* bitmask of images that have image_store */
      uint32_t count; /* number of consts allocated */
      /* three const allocated per image which has image_store:
       *  + cpp         (bytes per pixel)
       *  + pitch       (y pitch)
       *  + array_pitch (z pitch)
       */
      uint32_t off[IR3_MAX_SHADER_IMAGES];
   } image_dims;

   unsigned immediates_count;  /* number of immediates used */
   unsigned immediates_size;   /* allocated size of immediates[] */
   uint32_t *immediates;

   unsigned preamble_size;
   unsigned global_size;

   /* State of ubo access lowered to push consts: */
   struct ir3_ubo_analysis_state ubo_state;
   enum ir3_push_consts_type push_consts_type;
};
243 
/**
 * A single output for vertex transform feedback.
 */
struct ir3_stream_output {
   unsigned register_index  : 6;  /**< 0 to 63 (OUT index) */
   unsigned start_component : 2;  /**< 0 to 3 */
   unsigned num_components  : 3;  /**< 1 to 4 */
   unsigned output_buffer   : 3;  /**< 0 to PIPE_MAX_SO_BUFFERS */
   unsigned dst_offset      : 16; /**< offset into the buffer in dwords */
   unsigned stream          : 2;  /**< 0 to 3 */
};
255 
/**
 * Stream output for vertex transform feedback.
 */
struct ir3_stream_output_info {
   unsigned num_outputs; /* number of valid entries in output[] */
   /** stride for an entire vertex for each buffer in dwords */
   uint16_t stride[IR3_MAX_SO_BUFFERS];

   /* These correspond to the VPC_SO_STREAM_CNTL fields */
   uint8_t streams_written; /* bitmask of streams with outputs */
   uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS];

   /**
    * Array of stream outputs, in the order they are to be written in.
    * Selected components are tightly packed into the output buffer.
    */
   struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS];
};
274 
/**
 * Starting from a4xx, HW supports pre-dispatching texture sampling
 * instructions prior to scheduling a shader stage, when the
 * coordinate maps exactly to an output of the previous stage.
 */

/**
 * There is a limit in the number of pre-dispatches allowed for any
 * given stage.
 */
#define IR3_MAX_SAMPLER_PREFETCH 4

/**
 * This is the output stream value for 'cmd', as used by blob. It may
 * encode the return type (in 3 bits) but it hasn't been verified yet.
 */
#define IR3_SAMPLER_PREFETCH_CMD          0x4
#define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6
293 
/**
 * Stream output for texture sampling pre-dispatches.
 */
struct ir3_sampler_prefetch {
   uint8_t src;              /* source (presumably input loc of the coord — verify) */
   bool bindless;            /* use the *_bindless_id fields instead of ids? */
   uint8_t samp_id;          /* sampler index (non-bindless) */
   uint8_t tex_id;           /* texture index (non-bindless) */
   uint16_t samp_bindless_id;
   uint16_t tex_bindless_id;
   uint8_t dst;              /* destination register */
   uint8_t wrmask;           /* write-mask of fetched components */
   uint8_t half_precision;
   opc_t tex_opc;            /* the texture-fetch opcode to pre-dispatch */
};
309 
/* Configuration key used to identify a shader variant.. different
 * shader variants can be used to implement features not supported
 * in hw (two sided color), binning-pass vertex shader, etc.
 *
 * When adding to this struct, please update ir3_shader_variant()'s debug
 * output.
 */
struct ir3_shader_key {
   union {
      struct {
         /*
          * Combined Vertex/Fragment shader parameters:
          */
         unsigned ucp_enables : 8;

         /* do we need to check {v,f}saturate_{s,t,r}? */
         unsigned has_per_samp : 1;

         /*
          * Fragment shader variant parameters:
          */
         unsigned sample_shading : 1;
         unsigned msaa           : 1;
         /* used when shader needs to handle flat varyings (a4xx)
          * for front/back color inputs to frag shader:
          */
         unsigned rasterflat : 1;

         /* Indicates that this is a tessellation pipeline which requires a
          * whole different kind of vertex shader.  In case of
          * tessellation, this field also tells us which kind of output
          * topology the TES uses, which the TCS needs to know.
          */
#define IR3_TESS_NONE      0
#define IR3_TESS_QUADS     1
#define IR3_TESS_TRIANGLES 2
#define IR3_TESS_ISOLINES  3
         unsigned tessellation : 2;

         unsigned has_gs : 1;

         /* Whether stages after TCS read gl_PrimitiveID, used to determine
          * whether the TCS has to store it in the tess factor BO.
          */
         unsigned tcs_store_primid : 1;

         /* Whether this variant sticks to the "safe" maximum constlen,
          * which guarantees that the combined stages will never go over
          * the limit:
          */
         unsigned safe_constlen : 1;
      };
      /* All the bitfields above viewed as one word, for cheap compares
       * (see ir3_shader_key_equal()):
       */
      uint32_t global;
   };

   /* bitmask of ms shifts (a3xx) */
   uint32_t vsamples, fsamples;

   /* bitmask of samplers which need astc srgb workaround (a4xx): */
   uint16_t vastc_srgb, fastc_srgb;

   /* per-component (3-bit) swizzles of each sampler (a4xx tg4): */
   uint16_t vsampler_swizzles[16];
   uint16_t fsampler_swizzles[16];
};
375 
376 static inline unsigned
ir3_tess_mode(enum tess_primitive_mode tess_mode)377 ir3_tess_mode(enum tess_primitive_mode tess_mode)
378 {
379    switch (tess_mode) {
380    case TESS_PRIMITIVE_ISOLINES:
381       return IR3_TESS_ISOLINES;
382    case TESS_PRIMITIVE_TRIANGLES:
383       return IR3_TESS_TRIANGLES;
384    case TESS_PRIMITIVE_QUADS:
385       return IR3_TESS_QUADS;
386    default:
387       unreachable("bad tessmode");
388    }
389 }
390 
391 static inline uint32_t
ir3_tess_factor_stride(unsigned patch_type)392 ir3_tess_factor_stride(unsigned patch_type)
393 {
394    /* note: this matches the stride used by ir3's build_tessfactor_base */
395    switch (patch_type) {
396    case IR3_TESS_ISOLINES:
397       return 12;
398    case IR3_TESS_TRIANGLES:
399       return 20;
400    case IR3_TESS_QUADS:
401       return 28;
402    default:
403       unreachable("bad tessmode");
404    }
405 }
406 
407 static inline bool
ir3_shader_key_equal(const struct ir3_shader_key * a,const struct ir3_shader_key * b)408 ir3_shader_key_equal(const struct ir3_shader_key *a,
409                      const struct ir3_shader_key *b)
410 {
411    /* slow-path if we need to check {v,f}saturate_{s,t,r} */
412    if (a->has_per_samp || b->has_per_samp)
413       return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0;
414    return a->global == b->global;
415 }
416 
417 /* will the two keys produce different lowering for a fragment shader? */
418 static inline bool
ir3_shader_key_changes_fs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)419 ir3_shader_key_changes_fs(struct ir3_shader_key *key,
420                           struct ir3_shader_key *last_key)
421 {
422    if (last_key->has_per_samp || key->has_per_samp) {
423       if ((last_key->fsamples != key->fsamples) ||
424           (last_key->fastc_srgb != key->fastc_srgb) ||
425           memcmp(last_key->fsampler_swizzles, key->fsampler_swizzles,
426                 sizeof(key->fsampler_swizzles)))
427          return true;
428    }
429 
430    if (last_key->rasterflat != key->rasterflat)
431       return true;
432 
433    if (last_key->ucp_enables != key->ucp_enables)
434       return true;
435 
436    if (last_key->safe_constlen != key->safe_constlen)
437       return true;
438 
439    return false;
440 }
441 
442 /* will the two keys produce different lowering for a vertex shader? */
443 static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key * key,struct ir3_shader_key * last_key)444 ir3_shader_key_changes_vs(struct ir3_shader_key *key,
445                           struct ir3_shader_key *last_key)
446 {
447    if (last_key->has_per_samp || key->has_per_samp) {
448       if ((last_key->vsamples != key->vsamples) ||
449           (last_key->vastc_srgb != key->vastc_srgb) ||
450           memcmp(last_key->vsampler_swizzles, key->vsampler_swizzles,
451                 sizeof(key->vsampler_swizzles)))
452          return true;
453    }
454 
455    if (last_key->ucp_enables != key->ucp_enables)
456       return true;
457 
458    if (last_key->safe_constlen != key->safe_constlen)
459       return true;
460 
461    return false;
462 }
463 
/**
 * On a4xx+a5xx, Images share state with textures and SSBOs:
 *
 *   + Uses texture (cat5) state/instruction (isam) to read
 *   + Uses SSBO state and instructions (cat6) to write and for atomics
 *
 * Starting with a6xx, Images and SSBOs are basically the same thing,
 * with texture state and isam also used for SSBO reads.
 *
 * On top of that, gallium makes the SSBO (shader_buffers) state semi
 * sparse, with the first half of the state space used for atomic
 * counters lowered to atomic buffers.  We could ignore this, but I
 * don't think we could *really* handle the case of a single shader
 * that used the max # of textures + images + SSBOs.  And once we are
 * offsetting images by num_ssbos (or vice versa) to map them into
 * the same hardware state, the hardware state has become coupled to
 * the shader state, so at this point we might as well just use a
 * mapping table to remap things from image/SSBO idx to hw idx.
 *
 * To make things less (more?) confusing, for the hw "SSBO" state
 * (since it is really both SSBO and Image) I'll use the name "IBO"
 */
struct ir3_ibo_mapping {
#define IBO_INVALID 0xff /* marks an unused slot in the maps below */
   /* Maps logical SSBO state to hw tex state: */
   uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS];

   /* Maps logical Image state to hw tex state: */
   uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES];

   /* Maps hw state back to logical SSBO or Image state:
    *
    * note IBO_SSBO ORd into values to indicate that the
    * hw slot is used for SSBO state vs Image state.
    */
#define IBO_SSBO 0x80
   uint8_t tex_to_image[32];

   /* including real textures */
   uint8_t num_tex;
   /* the number of real textures, ie. image/ssbo start here */
   uint8_t tex_base;
};
507 
/* Captured disassembly/debug text for a variant (only filled out when
 * write_disasm is set):
 */
struct ir3_disasm_info {
   bool write_disasm;
   char *nir;    /* NIR dump */
   char *disasm; /* hw disassembly */
};

/* Represents half register in regid */
#define HALF_REG_ID 0x100
516 
/* Per-shader compile options supplied by the driver: */
struct ir3_shader_options {
   /* number of user const vec4s to leave untouched at the start of the
    * const file (presumably for driver-managed consts — verify callers):
    */
   unsigned num_reserved_user_consts;
   /* What API-visible wavesizes are allowed. Even if only double wavesize is
    * allowed, we may still use the smaller wavesize "under the hood" and the
    * application simply sees the upper half as always disabled.
    */
   enum ir3_wavesize_option api_wavesize;
   /* What wavesizes we're allowed to actually use. If the API wavesize is
    * single-only, then this must be single-only too.
    */
   enum ir3_wavesize_option real_wavesize;
   enum ir3_push_consts_type push_consts_type;

   uint32_t push_consts_base;
   uint32_t push_consts_dwords;
};
533 
/**
 * Shader variant which contains the actual hw shader instructions,
 * and necessary info for shader state setup.
 *
 * NOTE: the tail of this struct (from 'info' onwards) is serialized
 * raw into the disk cache (see VARIANT_CACHE_*), so field order and
 * layout there matter.
 */
struct ir3_shader_variant {
   struct fd_bo *bo;

   /* variant id (for debug) */
   uint32_t id;

   /* id of the shader the variant came from (for debug) */
   uint32_t shader_id;

   struct ir3_shader_key key;

   /* vertex shaders can have an extra version for hwbinning pass,
    * which is pointed to by so->binning:
    */
   bool binning_pass;
   //	union {
   struct ir3_shader_variant *binning;
   struct ir3_shader_variant *nonbinning;
   //	};

   struct ir3 *ir; /* freed after assembling machine instructions */

   /* shader variants form a linked list: */
   struct ir3_shader_variant *next;

   /* replicated here to avoid passing extra ptrs everywhere: */
   gl_shader_stage type;
   struct ir3_compiler *compiler;

   char *name;

   /* variant's copy of nir->constant_data (since we don't track the NIR in
    * the variant, and shader->nir is before the opt pass).  Moves to v->bin
    * after assembly.
    */
   void *constant_data;

   struct ir3_disasm_info disasm_info;

   /*
    * Below here is serialized when written to disk cache:
    */

   /* The actual binary shader instructions, size given by info.sizedwords: */
   uint32_t *bin;

   struct ir3_const_state *const_state;

   /*
    * The following macros are used by the shader disk cache save/
    * restore paths to serialize/deserialize the variant.  Any
    * pointers that require special handling in store_variant()
    * and retrieve_variant() should go above here.
    */
#define VARIANT_CACHE_START  offsetof(struct ir3_shader_variant, info)
#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START)
#define VARIANT_CACHE_SIZE                                                     \
   (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START)

   struct ir3_info info;

   struct ir3_shader_options shader_options;

   uint32_t constant_data_size;

   /* Levels of nesting of flow control:
    */
   unsigned branchstack;

   /* Number of loops (for debug/statistics — see branchstack above): */
   unsigned loops;

   /* the instructions length is in units of instruction groups
    * (4 instructions for a3xx, 16 instructions for a4xx.. each
    * instruction is 2 dwords):
    */
   unsigned instrlen;

   /* the constants length is in units of vec4's, and is the sum of
    * the uniforms and the built-in compiler constants
    */
   unsigned constlen;

   /* The private memory size in bytes per fiber */
   unsigned pvtmem_size;
   /* Whether we should use the new per-wave layout rather than per-fiber. */
   bool pvtmem_per_wave;

   /* Whether multi-position output is enabled. */
   bool multi_pos_output;

   /* Whether dual-source blending is enabled. */
   bool dual_src_blend;

   /* Size in bytes of required shared memory */
   unsigned shared_size;

   /* About Linkage:
    *   + Let the frag shader determine the position/compmask for the
    *     varyings, since it is the place where we know if the varying
    *     is actually used, and if so, which components are used.  So
    *     what the hw calls "outloc" is taken from the "inloc" of the
    *     frag shader.
    *   + From the vert shader, we only need the output regid
    */

   bool frag_face, color0_mrt;
   uint8_t fragcoord_compmask;

   /* NOTE: for input/outputs, slot is:
    *   gl_vert_attrib  - for VS inputs
    *   gl_varying_slot - for VS output / FS input
    *   gl_frag_result  - for FS output
    */

   /* varyings/outputs: */
   unsigned outputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t view;
      bool half : 1;
   } outputs[32 + 2]; /* +POSITION +PSIZE */
   bool writes_pos, writes_smask, writes_psize, writes_viewport, writes_stencilref;

   /* Size in dwords of all outputs for VS, size of entire patch for HS. */
   uint32_t output_size;

   /* Expected size of incoming output_loc for HS, DS, and GS */
   uint32_t input_size;

   /* Map from location to offset in per-primitive storage. In dwords for
    * HS, where varyings are read in the next stage via ldg with a dword
    * offset, and in bytes for all other stages.
    * +POSITION, +PSIZE, ... - see shader_io_get_unique_index
    */
   unsigned output_loc[12 + 32];

   /* attributes (VS) / varyings (FS):
    * Note that sysval's should come *after* normal inputs.
    */
   unsigned inputs_count;
   struct {
      uint8_t slot;
      uint8_t regid;
      uint8_t compmask;
      /* location of input (ie. offset passed to bary.f, etc).  This
       * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx
       * have the OUTLOCn value offset by 8, presumably to account
       * for gl_Position/gl_PointSize)
       */
      uint8_t inloc;
      /* vertex shader specific: */
      bool sysval : 1; /* slot is a gl_system_value */
      /* fragment shader specific: */
      bool bary       : 1; /* fetched varying (vs one loaded into reg) */
      bool rasterflat : 1; /* special handling for emit->rasterflat */
      bool half       : 1;
      bool flat       : 1;
   } inputs[32 + 2]; /* +POSITION +FACE */
   bool reads_primid;

   /* sum of input components (scalar).  For frag shaders, it only counts
    * the varying inputs:
    */
   unsigned total_in;

   /* sum of sysval input components (scalar). */
   unsigned sysval_in;

   /* For frag shaders, the total number of inputs (not scalar,
    * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR)
    */
   unsigned varying_in;

   /* Remapping table to map Image and SSBO to hw state: */
   struct ir3_ibo_mapping image_mapping;

   /* number of samplers/textures (which are currently 1:1): */
   int num_samp;

   /* is there an implicit sampler to read framebuffer (FS only).. if
    * so the sampler-idx is 'num_samp - 1' (ie. it is appended after
    * the last "real" texture)
    */
   bool fb_read;

   /* do we have one or more SSBO instructions: */
   bool has_ssbo;

   /* Which bindless resources are used, for filling out sp_xs_config */
   bool bindless_tex;
   bool bindless_samp;
   bool bindless_ibo;
   bool bindless_ubo;

   /* do we need derivatives: */
   bool need_pixlod;

   bool need_full_quad;

   /* do we need VS driver params? */
   bool need_driver_params;

   /* do we have image write, etc (which prevents early-z): */
   bool no_earlyz;

   /* do we have kill, which also prevents early-z, but not necessarily
    * early-lrz (as long as lrz-write is disabled, which must be handled
    * outside of ir3.  Unlike other no_earlyz cases, kill doesn't have
    * side effects that prevent early-lrz discard.
    */
   bool has_kill;

   bool per_samp;

   bool post_depth_coverage;

   /* Are we using split or merged register file? */
   bool mergedregs;

   uint8_t clip_mask, cull_mask;

   /* for astc srgb workaround, the number/base of additional
    * alpha tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } astc_srgb;

   /* for tg4 workaround, the number/base of additional
    * unswizzled tex states we need, and index of original tex states
    */
   struct {
      unsigned base, count;
      unsigned orig_idx[16];
   } tg4;

   /* texture sampler pre-dispatches */
   uint32_t num_sampler_prefetch;
   struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH];

   /* If true, the last use of helper invocations is the texture prefetch and
    * they should be disabled for the actual shader. Equivalent to adding
    * (eq)nop at the beginning of the shader.
    */
   bool prefetch_end_of_quad;

   uint16_t local_size[3];
   bool local_size_variable;

   /* Important for compute shader to determine max reg footprint */
   bool has_barrier;

   /* The offset where images start in the IBO array. */
   unsigned num_ssbos;

   /* The total number of SSBOs and images, i.e. the number of hardware IBOs. */
   unsigned num_ibos;

   /* Stage-specific execution parameters: */
   union {
      struct {
         enum tess_primitive_mode primitive_mode;

         /** The number of vertices in the TCS output patch. */
         uint8_t tcs_vertices_out;
         enum gl_tess_spacing spacing:2; /*gl_tess_spacing*/

         /** Is the vertex order counterclockwise? */
         bool ccw:1;
         bool point_mode:1;
      } tess;
      struct {
         /** The output primitive type */
         uint16_t output_primitive;

         /** The maximum number of vertices the geometry shader might write. */
         uint16_t vertices_out;

         /** 1 .. MAX_GEOMETRY_SHADER_INVOCATIONS */
         uint8_t invocations;

         /** The number of vertices received per input primitive (max. 6) */
         uint8_t vertices_in:3;
      } gs;
      struct {
         bool early_fragment_tests : 1;
         bool color_is_dual_source : 1;
         bool uses_fbfetch_output  : 1;
         bool fbfetch_coherent     : 1;
      } fs;
      struct {
         unsigned req_input_mem;
         unsigned req_local_mem;
      } cs;
   };

   /* For when we don't have a shader, variant's copy of streamout state */
   struct ir3_stream_output_info stream_output;
};
838 
839 static inline const char *
ir3_shader_stage(struct ir3_shader_variant * v)840 ir3_shader_stage(struct ir3_shader_variant *v)
841 {
842    switch (v->type) {
843    case MESA_SHADER_VERTEX:
844       return v->binning_pass ? "BVERT" : "VERT";
845    case MESA_SHADER_TESS_CTRL:
846       return "TCS";
847    case MESA_SHADER_TESS_EVAL:
848       return "TES";
849    case MESA_SHADER_GEOMETRY:
850       return "GEOM";
851    case MESA_SHADER_FRAGMENT:
852       return "FRAG";
853    case MESA_SHADER_COMPUTE:
854    case MESA_SHADER_KERNEL:
855       return "CL";
856    default:
857       unreachable("invalid type");
858       return NULL;
859    }
860 }
861 
862 /* Currently we do not do binning for tess.  And for GS there is no
863  * cross-stage VS+GS optimization, so the full VS+GS is used in
864  * the binning pass.
865  */
866 static inline bool
ir3_has_binning_vs(const struct ir3_shader_key * key)867 ir3_has_binning_vs(const struct ir3_shader_key *key)
868 {
869    if (key->tessellation || key->has_gs)
870       return false;
871    return true;
872 }
873 
/**
 * Represents a shader at the API level, before state-specific variants are
 * generated.
 */
struct ir3_shader {
   gl_shader_stage type;

   /* shader id (for debug): */
   uint32_t id;
   uint32_t variant_count;

   /* Set by freedreno after shader_state_create, so we can emit debug info
    * when recompiling a shader at draw time.
    */
   bool initial_variants_done;

   struct ir3_compiler *compiler;

   struct ir3_shader_options options;

   bool nir_finalized;
   struct nir_shader *nir;
   struct ir3_stream_output_info stream_output;

   /* per shader stage specific info: */
   union {
      /* for compute shaders: */
      struct {
         unsigned req_input_mem;    /* in dwords */
         unsigned req_local_mem;
      } cs;
      /* For vertex shaders: */
      struct {
         /* If we need to generate a passthrough TCS, it will be a function of
          * (a) the VS and (b) the # of patch_vertices (max 32), so cache them
          * in the VS keyed by # of patch_vertices-1.
          */
         unsigned passthrough_tcs_compiled;
         struct ir3_shader *passthrough_tcs[32];
      } vs;
   };

   /* Linked list of variants (see ir3_shader_variant::next), guarded by
    * variants_lock:
    */
   struct ir3_shader_variant *variants;
   mtx_t variants_lock;

   cache_key cache_key; /* shader disk-cache key */

   /* Bitmask of bits of the shader key used by this shader.  Used to avoid
    * recompiles for GL NOS that doesn't actually apply to the shader.
    */
   struct ir3_shader_key key_mask;
};
926 
927 /**
928  * In order to use the same cmdstream, in particular constlen setup and const
929  * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's
930  * corresponding draw pass shaders const_state.
931  */
932 static inline struct ir3_const_state *
ir3_const_state(const struct ir3_shader_variant * v)933 ir3_const_state(const struct ir3_shader_variant *v)
934 {
935    if (v->binning_pass)
936       return v->nonbinning->const_state;
937    return v->const_state;
938 }
939 
940 static inline unsigned
_ir3_max_const(const struct ir3_shader_variant * v,bool safe_constlen)941 _ir3_max_const(const struct ir3_shader_variant *v, bool safe_constlen)
942 {
943    const struct ir3_compiler *compiler = v->compiler;
944    bool shared_consts_enable =
945       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
946 
947    /* Shared consts size for CS and FS matches with what's acutally used,
948     * but the size of shared consts for geomtry stages doesn't.
949     * So we use a hw quirk for geometry shared consts.
950     */
951    uint32_t shared_consts_size = shared_consts_enable ?
952          compiler->shared_consts_size : 0;
953 
954    uint32_t shared_consts_size_geom = shared_consts_enable ?
955          compiler->geom_shared_consts_size_quirk : 0;
956 
957    uint32_t safe_shared_consts_size = shared_consts_enable ?
958       ALIGN_POT(MAX2(DIV_ROUND_UP(shared_consts_size_geom, 4),
959                      DIV_ROUND_UP(shared_consts_size, 5)), 4) : 0;
960 
961    if ((v->type == MESA_SHADER_COMPUTE) ||
962        (v->type == MESA_SHADER_KERNEL)) {
963       return compiler->max_const_compute - shared_consts_size;
964    } else if (safe_constlen) {
965       return compiler->max_const_safe - safe_shared_consts_size;
966    } else if (v->type == MESA_SHADER_FRAGMENT) {
967       return compiler->max_const_frag - shared_consts_size;
968    } else {
969       return compiler->max_const_geom - shared_consts_size_geom;
970    }
971 }
972 
973 /* Given a variant, calculate the maximum constlen it can have.
974  */
975 static inline unsigned
ir3_max_const(const struct ir3_shader_variant * v)976 ir3_max_const(const struct ir3_shader_variant *v)
977 {
978    return _ir3_max_const(v, v->key.safe_constlen);
979 }
980 
981 /* Return true if a variant may need to be recompiled due to exceeding the
982  * maximum "safe" constlen.
983  */
984 static inline bool
ir3_exceeds_safe_constlen(const struct ir3_shader_variant * v)985 ir3_exceeds_safe_constlen(const struct ir3_shader_variant *v)
986 {
987    return v->constlen > _ir3_max_const(v, true);
988 }
989 
/* Assemble the variant's compiled instructions into a binary blob. */
void *ir3_shader_assemble(struct ir3_shader_variant *v);

/* Compile a new variant of `shader` for `key`.
 * NOTE(review): confirm `keep_ir` semantics (retain IR after compile?) at the
 * definition.
 */
struct ir3_shader_variant *
ir3_shader_create_variant(struct ir3_shader *shader,
                          const struct ir3_shader_key *key,
                          bool keep_ir);

/* Look up (and possibly create) the variant for `key`; `*created` reports
 * whether a new variant was compiled.
 */
struct ir3_shader_variant *
ir3_shader_get_variant(struct ir3_shader *shader,
                       const struct ir3_shader_key *key, bool binning_pass,
                       bool keep_ir, bool *created);

/* Wrap a nir_shader in a new ir3_shader. */
struct ir3_shader *
ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir,
                    const struct ir3_shader_options *options,
                    struct ir3_stream_output_info *stream_output);
/* NOTE(review): presumably trims per-stage constlens so the pipeline fits in
 * the shared const limit — confirm at the definition.
 */
uint32_t ir3_trim_constlen(const struct ir3_shader_variant **variants,
                           const struct ir3_compiler *compiler);
/* See ir3_shader::vs.passthrough_tcs — cached passthrough TCS for a VS. */
struct ir3_shader *
ir3_shader_passthrough_tcs(struct ir3_shader *vs, unsigned patch_vertices);
void ir3_shader_destroy(struct ir3_shader *shader);
/* Disassemble `bin` to `out` (debug output). */
void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
uint64_t ir3_shader_outputs(const struct ir3_shader *so);

int ir3_glsl_type_size(const struct glsl_type *type, bool bindless);
1013 
1014 /*
1015  * Helper/util:
1016  */
1017 
1018 /* clears shader-key flags which don't apply to the given shader.
1019  */
1020 static inline void
ir3_key_clear_unused(struct ir3_shader_key * key,struct ir3_shader * shader)1021 ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader)
1022 {
1023    uint32_t *key_bits = (uint32_t *)key;
1024    uint32_t *key_mask = (uint32_t *)&shader->key_mask;
1025    STATIC_ASSERT(sizeof(*key) % 4 == 0);
1026    for (int i = 0; i < sizeof(*key) >> 2; i++)
1027       key_bits[i] &= key_mask[i];
1028 }
1029 
1030 static inline int
ir3_find_output(const struct ir3_shader_variant * so,gl_varying_slot slot)1031 ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
1032 {
1033    int j;
1034 
1035    for (j = 0; j < so->outputs_count; j++)
1036       if (so->outputs[j].slot == slot)
1037          return j;
1038 
1039    /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n]
1040     * in the vertex shader.. but the fragment shader doesn't know this
1041     * so  it will always have both IN.COLOR[n] and IN.BCOLOR[n].  So
1042     * at link time if there is no matching OUT.BCOLOR[n], we must map
1043     * OUT.COLOR[n] to IN.BCOLOR[n].  And visa versa if there is only
1044     * a OUT.BCOLOR[n] but no matching OUT.COLOR[n]
1045     */
1046    if (slot == VARYING_SLOT_BFC0) {
1047       slot = VARYING_SLOT_COL0;
1048    } else if (slot == VARYING_SLOT_BFC1) {
1049       slot = VARYING_SLOT_COL1;
1050    } else if (slot == VARYING_SLOT_COL0) {
1051       slot = VARYING_SLOT_BFC0;
1052    } else if (slot == VARYING_SLOT_COL1) {
1053       slot = VARYING_SLOT_BFC1;
1054    } else {
1055       return -1;
1056    }
1057 
1058    for (j = 0; j < so->outputs_count; j++)
1059       if (so->outputs[j].slot == slot)
1060          return j;
1061 
1062    return -1;
1063 }
1064 
1065 static inline int
ir3_next_varying(const struct ir3_shader_variant * so,int i)1066 ir3_next_varying(const struct ir3_shader_variant *so, int i)
1067 {
1068    while (++i < so->inputs_count)
1069       if (so->inputs[i].compmask && so->inputs[i].bary)
1070          break;
1071    return i;
1072 }
1073 
1074 static inline int
ir3_find_input(const struct ir3_shader_variant * so,gl_varying_slot slot)1075 ir3_find_input(const struct ir3_shader_variant *so, gl_varying_slot slot)
1076 {
1077    int j = -1;
1078 
1079    while (true) {
1080       j = ir3_next_varying(so, j);
1081 
1082       if (j >= so->inputs_count)
1083          return -1;
1084 
1085       if (so->inputs[j].slot == slot)
1086          return j;
1087    }
1088 }
1089 
1090 static inline unsigned
ir3_find_input_loc(const struct ir3_shader_variant * so,gl_varying_slot slot)1091 ir3_find_input_loc(const struct ir3_shader_variant *so, gl_varying_slot slot)
1092 {
1093    int var = ir3_find_input(so, slot);
1094    return var == -1 ? 0xff : so->inputs[var].inloc;
1095 }
1096 
/* Accumulated VS->FS linkage map, filled in by ir3_link_shaders() /
 * ir3_link_add().
 */
struct ir3_shader_linkage {
   /* Maximum location either consumed by the fragment shader or produced by
    * the last geometry stage, i.e. the size required for each vertex in the
    * VPC in DWORD's.
    */
   uint8_t max_loc;

   /* Number of entries in var. */
   uint8_t cnt;

   /* Bitset of locations used, including ones which are only used by the FS.
    */
   uint32_t varmask[4];

   /* Map from VS output to location. */
   struct {
      uint8_t slot;     /* varying slot */
      uint8_t regid;    /* VS output register */
      uint8_t compmask; /* mask of components consumed */
      uint8_t loc;      /* location (FS inloc) */
   } var[32];

   /* location for fixed-function gl_PrimitiveID passthrough */
   uint8_t primid_loc;

   /* location for fixed-function gl_ViewIndex passthrough */
   uint8_t viewid_loc;

   /* location for combined clip/cull distance arrays */
   uint8_t clip0_loc, clip1_loc;
};
1128 
1129 static inline void
ir3_link_add(struct ir3_shader_linkage * l,uint8_t slot,uint8_t regid_,uint8_t compmask,uint8_t loc)1130 ir3_link_add(struct ir3_shader_linkage *l, uint8_t slot, uint8_t regid_,
1131              uint8_t compmask, uint8_t loc)
1132 {
1133    for (int j = 0; j < util_last_bit(compmask); j++) {
1134       uint8_t comploc = loc + j;
1135       l->varmask[comploc / 32] |= 1 << (comploc % 32);
1136    }
1137 
1138    l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
1139 
1140    if (regid_ != regid(63, 0)) {
1141       int i = l->cnt++;
1142       assert(i < ARRAY_SIZE(l->var));
1143 
1144       l->var[i].slot = slot;
1145       l->var[i].regid = regid_;
1146       l->var[i].compmask = compmask;
1147       l->var[i].loc = loc;
1148    }
1149 }
1150 
1151 static inline void
ir3_link_shaders(struct ir3_shader_linkage * l,const struct ir3_shader_variant * vs,const struct ir3_shader_variant * fs,bool pack_vs_out)1152 ir3_link_shaders(struct ir3_shader_linkage *l,
1153                  const struct ir3_shader_variant *vs,
1154                  const struct ir3_shader_variant *fs, bool pack_vs_out)
1155 {
1156    /* On older platforms, varmask isn't programmed at all, and it appears
1157     * that the hardware generates a mask of used VPC locations using the VS
1158     * output map, and hangs if a FS bary instruction references a location
1159     * not in the list. This means that we need to have a dummy entry in the
1160     * VS out map for things like gl_PointCoord which aren't written by the
1161     * VS. Furthermore we can't use r63.x, so just pick a random register to
1162     * use if there is no VS output.
1163     */
1164    const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
1165    int j = -1, k;
1166 
1167    l->primid_loc = 0xff;
1168    l->viewid_loc = 0xff;
1169    l->clip0_loc = 0xff;
1170    l->clip1_loc = 0xff;
1171 
1172    while (l->cnt < ARRAY_SIZE(l->var)) {
1173       j = ir3_next_varying(fs, j);
1174 
1175       if (j >= fs->inputs_count)
1176          break;
1177 
1178       if (fs->inputs[j].inloc >= fs->total_in)
1179          continue;
1180 
1181       k = ir3_find_output(vs, (gl_varying_slot)fs->inputs[j].slot);
1182 
1183       if (fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
1184          l->primid_loc = fs->inputs[j].inloc;
1185       }
1186 
1187       if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) {
1188          assert(k < 0);
1189          l->viewid_loc = fs->inputs[j].inloc;
1190       }
1191 
1192       if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0)
1193          l->clip0_loc = fs->inputs[j].inloc;
1194 
1195       if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1)
1196          l->clip1_loc = fs->inputs[j].inloc;
1197 
1198       ir3_link_add(l, fs->inputs[j].slot,
1199                    k >= 0 ? vs->outputs[k].regid : default_regid,
1200                    fs->inputs[j].compmask, fs->inputs[j].inloc);
1201    }
1202 }
1203 
1204 static inline uint32_t
ir3_find_output_regid(const struct ir3_shader_variant * so,unsigned slot)1205 ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot)
1206 {
1207    int j;
1208    for (j = 0; j < so->outputs_count; j++)
1209       if (so->outputs[j].slot == slot) {
1210          uint32_t regid = so->outputs[j].regid;
1211          if (so->outputs[j].half)
1212             regid |= HALF_REG_ID;
1213          return regid;
1214       }
1215    return regid(63, 0);
1216 }
1217 
/* NOTE(review): presumably dumps the raw bitset for debugging — confirm at
 * the definition.
 */
void print_raw(FILE *out, const BITSET_WORD *data, size_t size);

/* Append the variant's stream-output varyings to the linkage map. */
void ir3_link_stream_out(struct ir3_shader_linkage *l,
                         const struct ir3_shader_variant *v);

/* ir3-internal varying slots, allocated just past the gallium/NIR range: */
#define VARYING_SLOT_GS_HEADER_IR3       (VARYING_SLOT_MAX + 0)
#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1)
#define VARYING_SLOT_TCS_HEADER_IR3      (VARYING_SLOT_MAX + 2)
#define VARYING_SLOT_REL_PATCH_ID_IR3    (VARYING_SLOT_MAX + 3)
1227 
1228 static inline uint32_t
ir3_find_sysval_regid(const struct ir3_shader_variant * so,unsigned slot)1229 ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot)
1230 {
1231    if (!so)
1232       return regid(63, 0);
1233    for (int j = 0; j < so->inputs_count; j++)
1234       if (so->inputs[j].sysval && (so->inputs[j].slot == slot))
1235          return so->inputs[j].regid;
1236    return regid(63, 0);
1237 }
1238 
1239 /* calculate register footprint in terms of half-regs (ie. one full
1240  * reg counts as two half-regs).
1241  */
1242 static inline uint32_t
ir3_shader_halfregs(const struct ir3_shader_variant * v)1243 ir3_shader_halfregs(const struct ir3_shader_variant *v)
1244 {
1245    return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
1246 }
1247 
1248 static inline uint32_t
ir3_shader_nibo(const struct ir3_shader_variant * v)1249 ir3_shader_nibo(const struct ir3_shader_variant *v)
1250 {
1251    return v->num_ibos;
1252 }
1253 
1254 static inline uint32_t
ir3_shader_branchstack_hw(const struct ir3_shader_variant * v)1255 ir3_shader_branchstack_hw(const struct ir3_shader_variant *v)
1256 {
1257    /* Dummy shader */
1258    if (!v->compiler)
1259       return 0;
1260 
1261    if (v->compiler->gen < 5)
1262       return v->branchstack;
1263 
1264    return DIV_ROUND_UP(MIN2(v->branchstack, v->compiler->branchstack_size), 2);
1265 }
1266 
1267 ENDC;
1268 
1269 #endif /* IR3_SHADER_H_ */
1270