/*
 * Copyright © 2010 - 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef ELK_COMPILER_H
#define ELK_COMPILER_H

#include <stdio.h>
#include "c11/threads.h"
#include "dev/intel_device_info.h"
#include "isl/isl.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/enum_operators.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#include "elk_isa_info.h"
#include "../intel_shader_enums.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ra_regs;
struct nir_shader;
struct shader_info;

struct nir_shader_compiler_options;
typedef struct nir_shader nir_shader;

struct elk_compiler {
   const struct intel_device_info *devinfo;

   /* This lock must be taken if the compiler is to be modified in any way,
    * including adding something to the ralloc child list.
    */
   mtx_t mutex;

   struct elk_isa_info isa;

   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used.
       */
      struct ra_class **classes;
   } vec4_reg_set;

   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used, indexed by register size.
       */
      struct ra_class *classes[16];

      /**
       * ra class for the aligned barycentrics we use for PLN, which doesn't
       * appear in *classes.
       */
      struct ra_class *aligned_bary_class;
   } fs_reg_sets[3];

   void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
   void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);

   bool scalar_stage[MESA_ALL_SHADER_STAGES];
   bool use_tcs_multi_patch;
   struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];

   /**
    * Apply workarounds for SIN and COS output range problems.
    * This can negatively impact performance.
    */
   bool precise_trig;

   /**
    * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State
    * Base Address? (If not, it's a normal GPU address.)
    */
   bool constant_buffer_0_is_relative;

   /**
    * Whether or not the driver supports NIR shader constants. This controls
    * whether nir_opt_large_constants will be run.
    */
   bool supports_shader_constants;

   /**
    * Whether indirect UBO loads should use the sampler or go through the
    * data/constant cache. For the sampler, UBO surface states have to be set
    * up with VK_FORMAT_R32G32B32A32_FLOAT, whereas if it's going through the
    * constant or data cache, UBOs must use VK_FORMAT_RAW.
    */
   bool indirect_ubos_use_sampler;

   /**
    * Gfx12.5+ has a bit in the SEND instruction extending the bindless
    * surface offset range from 20 to 26 bits, effectively giving us 4GB of
    * bindless surface descriptors instead of the previous 64MB.
    */
   bool extended_bindless_surface_offset;

   /**
    * Gfx11+ has a bit in dword 3 of the sampler message header that
    * indicates whether the sampler handle is relative to the dynamic state
    * base address (0) or the bindless sampler base address (1). The driver
    * can select this.
    */
   bool use_bindless_sampler_offset;

   /**
    * Should DPAS instructions be lowered?
    *
    * This will be set for all platforms before Gfx12.5. It may also be set
    * on platforms that support DPAS, for testing purposes.
    */
   bool lower_dpas;

   /**
    * Calling the ra_allocate function after each register spill can take
    * several minutes. This option speeds up shader compilation by spilling
    * more registers after an ra_allocate failure. Required for
    * Cyberpunk 2077, which uses a watchdog thread to terminate the process
    * if the render thread hasn't responded within 2 minutes.
    */
   int spilling_rate;

   struct nir_shader *clc_shader;
};

#define elk_shader_debug_log(compiler, data, fmt, ...) do {     \
   static unsigned id = 0;                                      \
   compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__);   \
} while (0)

#define elk_shader_perf_log(compiler, data, fmt, ...) do {      \
   static unsigned id = 0;                                      \
   compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__);    \
} while (0)
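
/* Illustrative usage (a sketch; log_data and spill_count are hypothetical
 * caller state, not part of this header):
 *
 *    elk_shader_perf_log(compiler, log_data,
 *                        "FS compile spilled %u registers\n", spill_count);
 *
 * The static `id` inside the macro gives each call site a stable identifier
 * that the driver's log callback can use to de-duplicate messages.
 */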

/**
 * We use a constant subgroup size of 32. It really only needs to be a
 * maximum and, since we do SIMD32 for compute shaders in some cases, it
 * needs to be at least 32. SIMD8 and SIMD16 shaders will still claim a
 * subgroup size of 32 but will act as if 16 or 24 of those channels are
 * disabled.
 */
#define ELK_SUBGROUP_SIZE 32

/**
 * Program key structures.
 *
 * When drawing, we look for the currently bound shaders in the program
 * cache. This is essentially a hash table lookup, and these are the keys.
 *
 * Sometimes OpenGL features specified as state need to be simulated via
 * shader code, due to a mismatch between the API and the hardware. This
 * is often referred to as "non-orthogonal state" or "NOS". We store NOS
 * in the program key so it's considered when searching for a program. If
 * we haven't seen a particular combination before, we have to recompile a
 * new specialized version.
 *
 * Shader compilation should not look up state in gl_context directly, but
 * instead use the copy in the program key. This guarantees recompiles will
 * happen correctly.
 *
 * @{
 */
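
/* A minimal sketch of the lookup this implies (the cache and key-population
 * helpers here are hypothetical; real drivers have their own program cache):
 *
 *    struct elk_vs_prog_key key;
 *    memset(&key, 0, sizeof(key));           // zero padding: keys are memcmp'd
 *    populate_vs_key_from_state(ctx, &key);  // copy NOS out of gl_context
 *    prog = program_cache_lookup(cache, &key, sizeof(key));
 *    if (prog == NULL)
 *       prog = compile_and_cache_vs(compiler, cache, &key);
 */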

enum PACKED elk_gfx6_gather_sampler_wa {
   ELK_WA_SIGN = 1,  /* whether we need to sign extend */
   ELK_WA_8BIT = 2,  /* if we have an 8-bit format needing a workaround */
   ELK_WA_16BIT = 4, /* if we have a 16-bit format needing a workaround */
};

#define ELK_MAX_SAMPLERS 32

/* Provide explicit padding for each member, to ensure that the compiler
 * initializes every bit in the shader cache keys. The keys will be compared
 * with memcmp.
 */
PRAGMA_DIAGNOSTIC_PUSH
PRAGMA_DIAGNOSTIC_ERROR(-Wpadded)

/**
 * Sampler information needed by VS, WM, and GS program cache keys.
 */
struct elk_sampler_prog_key_data {
   /**
    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
    *
    * This field is not consumed by the back-end compiler and is only relevant
    * for the crocus OpenGL driver for Broadwell and earlier hardware.
    */
   uint16_t swizzles[ELK_MAX_SAMPLERS];

   uint32_t gl_clamp_mask[3];

   /**
    * For RG32F, gather4's channel select is broken.
    */
   uint32_t gather_channel_quirk_mask;

   /**
    * For Sandybridge, which shader workaround we need for gather quirks.
    */
   enum elk_gfx6_gather_sampler_wa gfx6_gather_wa[ELK_MAX_SAMPLERS];
};

enum elk_robustness_flags {
   ELK_ROBUSTNESS_UBO  = BITFIELD_BIT(0),
   ELK_ROBUSTNESS_SSBO = BITFIELD_BIT(1),
};

struct elk_base_prog_key {
   unsigned program_string_id;

   enum elk_robustness_flags robust_flags:2;

   unsigned padding:22;

   /**
    * Apply workarounds for SIN and COS input range problems.
    * This limits the input range for SIN and COS to [-2π, 2π] to
    * avoid precision issues.
    */
   bool limit_trig_input_range;

   struct elk_sampler_prog_key_data tex;
};

/**
 * The VF can't natively handle certain types of attributes, such as GL_FIXED
 * or most 10_10_10_2 types. These flags enable various VS workarounds to
 * "fix" attributes at the beginning of shaders.
 */
#define ELK_ATTRIB_WA_COMPONENT_MASK 7  /* mask for GL_FIXED scale channel count */
#define ELK_ATTRIB_WA_NORMALIZE      8  /* normalize in shader */
#define ELK_ATTRIB_WA_BGRA           16 /* swap r/b channels in shader */
#define ELK_ATTRIB_WA_SIGN           32 /* interpret as signed in shader */
#define ELK_ATTRIB_WA_SCALE          64 /* interpret as scaled in shader */
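
/* For example, a 3-component GL_FIXED attribute that also needs normalizing
 * would carry the following flags (sketch):
 *
 *    uint8_t wa = 3 | ELK_ATTRIB_WA_NORMALIZE;
 *    unsigned chans = wa & ELK_ATTRIB_WA_COMPONENT_MASK;    // 3
 *    bool normalize = (wa & ELK_ATTRIB_WA_NORMALIZE) != 0;  // true
 */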

/**
 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range
 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user
 * input vertex attributes. In Vulkan, we expose up to 28 user vertex input
 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0.
 */
#define MAX_GL_VERT_ATTRIB VERT_ATTRIB_MAX
#define MAX_VK_VERT_ATTRIB (VERT_ATTRIB_GENERIC0 + 28)

/**
 * Max number of binding table entries used for stream output.
 *
 * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
 * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
 *
 * On Gfx6, the size of transform feedback data is limited not by the number
 * of components but by the number of binding table entries we set aside. We
 * use one binding table entry for a float, one entry for a vector, and one
 * entry per matrix column. Since the only way we can communicate our
 * transform feedback capabilities to the client is via
 * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
 * worst case, in which all the varyings are floats, so we use up one binding
 * table entry per component. Therefore we need to set aside at least 64
 * binding table entries for use by transform feedback.
 *
 * Note: since we don't currently pack varyings, it is currently impossible
 * for the client to actually use up all of these binding table entries--if
 * all of their varyings were floats, they would run out of varying slots and
 * fail to link. But that's a bug, so it seems prudent to go ahead and
 * allocate the number of binding table entries we will need once the bug is
 * fixed.
 */
#define ELK_MAX_SOL_BINDINGS 64

/** The program key for Vertex Shaders. */
struct elk_vs_prog_key {
   struct elk_base_prog_key base;

   /**
    * Per-attribute workaround flags
    *
    * For each attribute, a combination of ELK_ATTRIB_WA_*.
    *
    * For OpenGL, where we expose a maximum of 16 user input attributes,
    * we only need up to VERT_ATTRIB_MAX slots. In Vulkan, however,
    * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can
    * expose up to 28 user input vertex attributes that are mapped to slots
    * starting at VERT_ATTRIB_GENERIC0, so this array needs to be large
    * enough to hold that many slots.
    */
   uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)];

   /**
    * For pre-Gfx6 hardware, a bitfield indicating which texture coordinates
    * are going to be replaced with point coordinates (as a consequence of a
    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)). Because
    * our SF thread requires exact matching between VS outputs and FS inputs,
    * these texture coordinates will need to be unconditionally included in
    * the VUE, even if they aren't written by the vertex shader.
    */
   uint8_t point_coord_replace;
   unsigned clamp_pointsize:1;

   bool copy_edgeflag:1;

   bool clamp_vertex_color:1;

   /**
    * How many user clipping planes are being uploaded to the vertex shader as
    * push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;

   uint32_t padding:25;
};

/** The program key for Tessellation Control Shaders. */
struct elk_tcs_prog_key
{
   struct elk_base_prog_key base;

   /** A bitfield of per-vertex outputs written. */
   uint64_t outputs_written;

   enum tess_primitive_mode _tes_primitive_mode;

   /** Number of input vertices, 0 means dynamic */
   unsigned input_vertices;

   /** A bitfield of per-patch outputs written. */
   uint32_t patch_outputs_written;

   bool quads_workaround;
   uint32_t padding:24;
};

#define ELK_MAX_TCS_INPUT_VERTICES (32)

static inline uint32_t
elk_tcs_prog_key_input_vertices(const struct elk_tcs_prog_key *key)
{
   return key->input_vertices != 0 ?
          key->input_vertices : ELK_MAX_TCS_INPUT_VERTICES;
}

/** The program key for Tessellation Evaluation Shaders. */
struct elk_tes_prog_key
{
   struct elk_base_prog_key base;

   /** A bitfield of per-vertex inputs read. */
   uint64_t inputs_read;

   /** A bitfield of per-patch inputs read. */
   uint32_t patch_inputs_read;

   /**
    * How many user clipping planes are being uploaded to the tessellation
    * evaluation shader as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
   unsigned clamp_pointsize:1;
   uint32_t padding:27;
};

/** The program key for Geometry Shaders. */
struct elk_gs_prog_key
{
   struct elk_base_prog_key base;

   /**
    * How many user clipping planes are being uploaded to the geometry shader
    * as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
   unsigned clamp_pointsize:1;
   unsigned padding:27;
};

enum elk_sf_primitive {
   ELK_SF_PRIM_POINTS = 0,
   ELK_SF_PRIM_LINES = 1,
   ELK_SF_PRIM_TRIANGLES = 2,
   ELK_SF_PRIM_UNFILLED_TRIS = 3,
};

struct elk_sf_prog_key {
   uint64_t attrs;
   bool contains_flat_varying;
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */
   uint8_t point_sprite_coord_replace;
   enum elk_sf_primitive primitive:2;
   bool do_twoside_color:1;
   bool frontface_ccw:1;
   bool do_point_sprite:1;
   bool do_point_coord:1;
   bool sprite_origin_lower_left:1;
   bool userclip_active:1;
   unsigned padding:32;
};

enum elk_clip_mode {
   ELK_CLIP_MODE_NORMAL = 0,
   ELK_CLIP_MODE_CLIP_ALL = 1,
   ELK_CLIP_MODE_CLIP_NON_REJECTED = 2,
   ELK_CLIP_MODE_REJECT_ALL = 3,
   ELK_CLIP_MODE_ACCEPT_ALL = 4,
   ELK_CLIP_MODE_KERNEL_CLIP = 5,
};

enum elk_clip_fill_mode {
   ELK_CLIP_FILL_MODE_LINE = 0,
   ELK_CLIP_FILL_MODE_POINT = 1,
   ELK_CLIP_FILL_MODE_FILL = 2,
   ELK_CLIP_FILL_MODE_CULL = 3,
};

/* Note that if unfilled primitives are being emitted, we have to fix
 * up polygon offset and flatshading at this point:
 */
struct elk_clip_prog_key {
   uint64_t attrs;
   float offset_factor;
   float offset_units;
   float offset_clamp;
   bool contains_flat_varying;
   bool contains_noperspective_varying;
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */
   unsigned primitive:4;
   unsigned nr_userclip:4;
   bool pv_first:1;
   bool do_unfilled:1;
   enum elk_clip_fill_mode fill_cw:2;  /* includes cull information */
   enum elk_clip_fill_mode fill_ccw:2; /* includes cull information */
   bool offset_cw:1;
   bool offset_ccw:1;
   bool copy_bfc_cw:1;
   bool copy_bfc_ccw:1;
   enum elk_clip_mode clip_mode:3;
   uint64_t padding:51;
};

/* A big lookup table is used to figure out which and how many
 * additional regs will be inserted before the main payload in the WM
 * program execution. These mainly relate to depth and stencil
 * processing and the early-depth-test optimization.
 */
enum elk_wm_iz_bits {
   ELK_WM_IZ_PS_KILL_ALPHATEST_BIT    = 0x1,
   ELK_WM_IZ_PS_COMPUTES_DEPTH_BIT    = 0x2,
   ELK_WM_IZ_DEPTH_WRITE_ENABLE_BIT   = 0x4,
   ELK_WM_IZ_DEPTH_TEST_ENABLE_BIT    = 0x8,
   ELK_WM_IZ_STENCIL_WRITE_ENABLE_BIT = 0x10,
   ELK_WM_IZ_STENCIL_TEST_ENABLE_BIT  = 0x20,
   ELK_WM_IZ_BIT_MAX                  = 0x40
};

enum elk_sometimes {
   ELK_NEVER = 0,
   ELK_SOMETIMES,
   ELK_ALWAYS
};

static inline enum elk_sometimes
elk_sometimes_invert(enum elk_sometimes x)
{
   return (enum elk_sometimes)((int)ELK_ALWAYS - (int)x);
}

/** The program key for Fragment/Pixel Shaders. */
struct elk_wm_prog_key {
   struct elk_base_prog_key base;

   uint64_t input_slots_valid;
   float alpha_test_ref;
   uint8_t color_outputs_valid;

   /* Some collection of ELK_WM_IZ_* */
   uint8_t iz_lookup;
   bool stats_wm:1;
   bool flat_shade:1;
   unsigned nr_color_regions:5;
   bool emit_alpha_test:1;
   enum compare_func alpha_test_func:3; /**< For Gfx4/5 MRT alpha test */
   bool alpha_test_replicate_alpha:1;
   enum elk_sometimes alpha_to_coverage:2;
   bool clamp_fragment_color:1;

   bool force_dual_color_blend:1;

   /** Whether or not inputs are interpolated at sample rate by default
    *
    * This corresponds to the sample shading API bit in Vulkan or OpenGL which
    * controls how inputs with no interpolation qualifier are interpolated.
    * This is distinct from the way that using gl_SampleID or similar requires
    * us to run per-sample. Even when running per-sample due to gl_SampleID,
    * we may still interpolate unqualified inputs at the pixel center.
    */
   enum elk_sometimes persample_interp:2;

   /* Whether or not we are running on a multisampled framebuffer */
   enum elk_sometimes multisample_fbo:2;

   enum elk_sometimes line_aa:2;

   bool coherent_fb_fetch:1;
   bool ignore_sample_mask_out:1;
   bool coarse_pixel:1;

   uint64_t padding:55;
};

struct elk_cs_prog_key {
   struct elk_base_prog_key base;
};

struct elk_ff_gs_prog_key {
   uint64_t attrs;

   /**
    * Map from the index of a transform feedback binding table entry to the
    * gl_varying_slot that should be streamed out through that binding table
    * entry.
    */
   unsigned char transform_feedback_bindings[ELK_MAX_SOL_BINDINGS];

   /**
    * Map from the index of a transform feedback binding table entry to the
    * swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[ELK_MAX_SOL_BINDINGS];

   /**
    * Hardware primitive type being drawn, e.g. _3DPRIM_TRILIST.
    */
   unsigned primitive:8;

   unsigned pv_first:1;
   unsigned need_gs_prog:1;

   /**
    * Number of varyings that are output to transform feedback.
    */
   unsigned num_transform_feedback_bindings:7; /* 0-ELK_MAX_SOL_BINDINGS */
   uint64_t padding:47;
};

/* elk_any_prog_key is any of the keys that map to an API stage */
union elk_any_prog_key {
   struct elk_base_prog_key base;
   struct elk_vs_prog_key vs;
   struct elk_tcs_prog_key tcs;
   struct elk_tes_prog_key tes;
   struct elk_gs_prog_key gs;
   struct elk_wm_prog_key wm;
   struct elk_cs_prog_key cs;
};

PRAGMA_DIAGNOSTIC_POP

/** Max number of render targets in a shader */
#define ELK_MAX_DRAW_BUFFERS 8

/**
 * Binding table index for the first gfx6 SOL binding.
 */
#define ELK_GFX6_SOL_BINDING_START 0

struct elk_ubo_range
{
   uint16_t block;

   /* In units of 32-byte registers */
   uint8_t start;
   uint8_t length;
};

/* We reserve the first 2^16 values for builtins */
#define ELK_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0)

enum elk_param_builtin {
   ELK_PARAM_BUILTIN_ZERO,

   ELK_PARAM_BUILTIN_CLIP_PLANE_0_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_0_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_1_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_2_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_3_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_4_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_5_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_6_W,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_X,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_Y,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_Z,
   ELK_PARAM_BUILTIN_CLIP_PLANE_7_W,

   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z,
   ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_W,
   ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
   ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,

   ELK_PARAM_BUILTIN_PATCH_VERTICES_IN,

   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X,
   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
   ELK_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
   ELK_PARAM_BUILTIN_SUBGROUP_ID,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
   ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
   ELK_PARAM_BUILTIN_WORK_DIM,
};

#define ELK_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
   (ELK_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp))

#define ELK_PARAM_BUILTIN_IS_CLIP_PLANE(param) \
   ((param) >= ELK_PARAM_BUILTIN_CLIP_PLANE_0_X && \
    (param) <= ELK_PARAM_BUILTIN_CLIP_PLANE_7_W)

#define ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \
   (((param) - ELK_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2)

#define ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \
   (((param) - ELK_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3)
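
/* These macros round-trip; e.g. for clip plane 3, component W (comp 3):
 *
 *    enum elk_param_builtin p = ELK_PARAM_BUILTIN_CLIP_PLANE(3, 3);
 *    assert(ELK_PARAM_BUILTIN_IS_CLIP_PLANE(p));
 *    assert(ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(p) == 3);
 *    assert(ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(p) == 3);
 */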

enum elk_shader_reloc_id {
   ELK_SHADER_RELOC_CONST_DATA_ADDR_LOW,
   ELK_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
   ELK_SHADER_RELOC_SHADER_START_OFFSET,
   ELK_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
};

enum elk_shader_reloc_type {
   /** An arbitrary 32-bit value */
   ELK_SHADER_RELOC_TYPE_U32,
   /** A MOV instruction with an immediate source */
   ELK_SHADER_RELOC_TYPE_MOV_IMM,
};

/** Represents a code relocation
 *
 * Relocatable constants are immediates in the code which we want to be able
 * to replace post-compile with the actual value.
 */
struct elk_shader_reloc {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** Type of this relocation */
   enum elk_shader_reloc_type type;

   /** The offset in the shader to the relocated value
    *
    * For MOV_IMM relocs, this is an offset to the MOV instruction. This
    * allows us to do some sanity checking while we update the value.
    */
   uint32_t offset;

   /** Value to be added to the relocated value before it is written */
   uint32_t delta;
};

/** A value to write to a relocation */
struct elk_shader_reloc_value {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** The value with which to replace the relocated immediate */
   uint32_t value;
};
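
/* After placing the shader in memory, a driver resolves relocations with
 * elk_write_shader_relocs() (declared at the end of this header). A sketch,
 * where descriptors_addr is a hypothetical driver-computed address:
 *
 *    struct elk_shader_reloc_value values[] = {
 *       { .id = ELK_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
 *         .value = (uint32_t)(descriptors_addr >> 32) },
 *    };
 *    elk_write_shader_relocs(&compiler->isa, program, prog_data,
 *                            values, ARRAY_SIZE(values));
 */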

struct elk_stage_prog_data {
   struct elk_ubo_range ubo_ranges[4];

   unsigned nr_params; /**< number of float params/constants */

   gl_shader_stage stage;

   /* zero_push_reg is a bitfield which indicates what push registers (if any)
    * should be zeroed by SW at the start of the shader. The corresponding
    * push_reg_mask_param specifies the param index (in 32-bit units) where
    * the actual runtime 64-bit mask will be pushed. The shader will zero
    * push reg i if
    *
    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
    *
    * If this field is set, elk_compiler::compact_params must be false.
    */
   uint64_t zero_push_reg;
   unsigned push_reg_mask_param;

   unsigned curb_read_length;
   unsigned total_scratch;
   unsigned total_shared;

   unsigned program_size;

   unsigned const_data_size;
   unsigned const_data_offset;

   unsigned num_relocs;
   const struct elk_shader_reloc *relocs;

   /** Does this program pull from any UBO or other constant buffers? */
   bool has_ubo_pull;

   /** Number of ray query objects in this shader. */
   unsigned ray_queries;

   /**
    * Register where the thread expects to find input data from the URB
    * (typically uniforms, followed by vertex or fragment attributes).
    */
   unsigned dispatch_grf_start_reg;

   bool use_alt_mode; /**< Use ALT floating point mode? Otherwise, IEEE. */

   /* 32-bit identifiers for all push/pull parameters. These can be anything
    * the driver wishes them to be; the core of the back-end compiler simply
    * re-arranges them. The one restriction is that the bottom 2^16 values
    * are reserved for builtins defined in the elk_param_builtin enum defined
    * above.
    */
   uint32_t *param;

   /* Whether the shader uses atomic operations. */
   bool uses_atomic_load_store;
};

static inline uint32_t *
elk_stage_prog_data_add_params(struct elk_stage_prog_data *prog_data,
                               unsigned nr_new_params)
{
   unsigned old_nr_params = prog_data->nr_params;
   prog_data->nr_params += nr_new_params;
   prog_data->param = reralloc(ralloc_parent(prog_data->param),
                               prog_data->param, uint32_t,
                               prog_data->nr_params);
   return prog_data->param + old_nr_params;
}
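
/* For example, pushing two user clip planes appends eight 32-bit params
 * (sketch):
 *
 *    uint32_t *param = elk_stage_prog_data_add_params(prog_data, 2 * 4);
 *    for (unsigned i = 0; i < 2 * 4; i++)
 *       param[i] = ELK_PARAM_BUILTIN_CLIP_PLANE(i / 4, i % 4);
 */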

enum elk_barycentric_mode {
   ELK_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
   ELK_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
   ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
   ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
   ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
   ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
   ELK_BARYCENTRIC_MODE_COUNT              = 6
};
#define ELK_BARYCENTRIC_PERSPECTIVE_BITS \
   ((1 << ELK_BARYCENTRIC_PERSPECTIVE_PIXEL) | \
    (1 << ELK_BARYCENTRIC_PERSPECTIVE_CENTROID) | \
    (1 << ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE))
#define ELK_BARYCENTRIC_NONPERSPECTIVE_BITS \
   ((1 << ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
    (1 << ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
    (1 << ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
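
/* E.g. a shader using only pixel- and centroid-qualified perspective
 * interpolation would report:
 *
 *    modes = BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_PIXEL) |
 *            BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_CENTROID);
 *    assert((modes & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) == 0);
 */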

enum elk_pixel_shader_computed_depth_mode {
   ELK_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
   ELK_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
   ELK_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
   ELK_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
};

/* Data about a particular attempt to compile a program. Note that
 * there can be many of these, each in a different GL state
 * corresponding to a different elk_wm_prog_key struct, with different
 * compiled programs.
 */
struct elk_wm_prog_data {
   struct elk_stage_prog_data base;

   unsigned num_per_primitive_inputs;
   unsigned num_varying_inputs;

   uint8_t reg_blocks_8;
   uint8_t reg_blocks_16;
   uint8_t reg_blocks_32;

   uint8_t dispatch_grf_start_reg_16;
   uint8_t dispatch_grf_start_reg_32;
   uint32_t prog_offset_16;
   uint32_t prog_offset_32;

   struct {
      /** @{
       * Surface indices for the WM-specific surfaces.
       */
      uint32_t render_target_read_start;
      /** @} */
   } binding_table;

   uint8_t color_outputs_written;
   uint8_t computed_depth_mode;

   /**
    * Number of polygons handled in parallel by the multi-polygon PS
    * kernel.
    */
   uint8_t max_polygons;

   /**
    * Dispatch width of the multi-polygon PS kernel, or 0 if no
    * multi-polygon kernel was built.
    */
   uint8_t dispatch_multi;

   bool computed_stencil;
   bool early_fragment_tests;
   bool post_depth_coverage;
   bool inner_coverage;
   bool dispatch_8;
   bool dispatch_16;
   bool dispatch_32;
   bool dual_src_blend;
   bool uses_pos_offset;
   bool uses_omask;
   bool uses_kill;
   bool uses_src_depth;
   bool uses_src_w;
   bool uses_depth_w_coefficients;
   bool uses_sample_mask;
   bool uses_vmask;
   bool has_side_effects;
   bool pulls_bary;

   bool contains_flat_varying;
   bool contains_noperspective_varying;

   /** True if the shader wants sample shading
    *
    * This corresponds to whether or not gl_SampleID, gl_SamplePosition, or
    * a sample-qualified input is used in the shader. It is independent of
    * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan.
    */
   bool sample_shading;

   /** Should this shader be dispatched per-sample */
   enum elk_sometimes persample_dispatch;

   /**
    * Shader runs at the coarse pixel shading dispatch rate (3DSTATE_CPS).
    */
   enum elk_sometimes coarse_pixel_dispatch;

   /**
    * Shader writes the SampleMask and this is AND-ed with the API's
    * SampleMask to generate a new coverage mask.
    */
   enum elk_sometimes alpha_to_coverage;

   unsigned msaa_flags_param;

   /**
    * Mask of which interpolation modes are required by the fragment shader.
    * Those interpolations are delivered as part of the thread payload. Used
    * in hardware setup on gfx6+.
    */
   uint32_t barycentric_interp_modes;

   /**
    * Whether nonperspective interpolation modes are used, either in
    * barycentric_interp_modes or by the fragment shader through interpolator
    * messages.
    */
   bool uses_nonperspective_interp_modes;

   /**
    * Mask of which FS inputs are marked flat by the shader source. This is
    * needed for setting up 3DSTATE_SF/SBE.
    */
   uint32_t flat_inputs;

   /**
    * The FS inputs
    */
   uint64_t inputs;

   /* Mapping of VUE slots to interpolation modes.
    * Used by the Gfx4-5 clip/sf/wm stages.
    */
   unsigned char interp_mode[65]; /* ELK_VARYING_SLOT_COUNT */

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.
    * For varying slots that are not used by the FS, the value is -1.
    */
   int urb_setup[VARYING_SLOT_MAX];
   int urb_setup_channel[VARYING_SLOT_MAX];

   /**
    * Cache of the urb_setup array above: the attribute numbers of the
    * active varyings. The actual count is stored in
    * urb_setup_attribs_count.
    */
   uint8_t urb_setup_attribs[VARYING_SLOT_MAX];
   uint8_t urb_setup_attribs_count;
};

#ifdef GFX_VERx10

#if GFX_VERx10 >= 200

/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example, in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled. This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is enabled, a SIMD width of 8, 16, or 32 is
 * returned. Note that for a multipolygon dispatch kernel 8 is always
 * returned, since multipolygon kernels use the "_8" fields from
 * elk_wm_prog_data regardless of their SIMD width. If the KSP is
 * invalid, 0 is returned.
 */
static inline unsigned
elk_fs_simd_width_for_ksp(unsigned ksp_idx, bool enabled, unsigned width_sel)
{
   assert(ksp_idx < 2);
   return !enabled ? 0 :
          width_sel ? 32 :
          16;
}

#define elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
   (ksp_idx == 0 && (wm_state).Kernel0MaximumPolysperThread ? 8 : \
    ksp_idx == 0 ? elk_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel0Enable, \
                                             (wm_state).Kernel0SIMDWidth) : \
    elk_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel1Enable, \
                              (wm_state).Kernel1SIMDWidth))

#else

/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example, in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled. This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD
 * width of 8, 16, or 32 is returned. If the KSP is invalid, 0 is returned.
 */
static inline unsigned
elk_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled,
                          bool simd16_enabled, bool simd32_enabled)
{
   /* This function strictly ignores contiguous dispatch */
   switch (ksp_idx) {
   case 0:
      return simd8_enabled ? 8 :
             (simd16_enabled && !simd32_enabled) ? 16 :
             (simd32_enabled && !simd16_enabled) ? 32 : 0;
   case 1:
      return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0;
   case 2:
      return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0;
   default:
      unreachable("Invalid KSP index");
   }
}
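
/* For instance, with SIMD8 and SIMD16 kernels enabled but no SIMD32:
 *
 *    elk_fs_simd_width_for_ksp(0, true, true, false) == 8
 *    elk_fs_simd_width_for_ksp(1, true, true, false) == 0   (KSP 1 unused)
 *    elk_fs_simd_width_for_ksp(2, true, true, false) == 16
 */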

#define elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
   elk_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \
                             (wm_state)._16PixelDispatchEnable, \
                             (wm_state)._32PixelDispatchEnable)

#endif

#endif

#define elk_wm_state_has_ksp(wm_state, ksp_idx) \
   (elk_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0)

static inline uint32_t
_elk_wm_prog_data_prog_offset(const struct elk_wm_prog_data *prog_data,
                              unsigned simd_width)
{
   switch (simd_width) {
   case 8: return 0;
   case 16: return prog_data->prog_offset_16;
   case 32: return prog_data->prog_offset_32;
   default: return 0;
   }
}

#define elk_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_prog_offset(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline uint8_t
_elk_wm_prog_data_dispatch_grf_start_reg(const struct elk_wm_prog_data *prog_data,
                                         unsigned simd_width)
{
   switch (simd_width) {
   case 8: return prog_data->base.dispatch_grf_start_reg;
   case 16: return prog_data->dispatch_grf_start_reg_16;
   case 32: return prog_data->dispatch_grf_start_reg_32;
   default: return 0;
   }
}

#define elk_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_dispatch_grf_start_reg(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline uint8_t
_elk_wm_prog_data_reg_blocks(const struct elk_wm_prog_data *prog_data,
                             unsigned simd_width)
{
   switch (simd_width) {
   case 8: return prog_data->reg_blocks_8;
   case 16: return prog_data->reg_blocks_16;
   case 32: return prog_data->reg_blocks_32;
   default: return 0;
   }
}

#define elk_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \
   _elk_wm_prog_data_reg_blocks(prog_data, \
      elk_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline bool
elk_wm_prog_data_is_persample(const struct elk_wm_prog_data *prog_data,
                              enum intel_msaa_flags pushed_msaa_flags)
{
   if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) {
      if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO))
         return false;

      if (prog_data->sample_shading)
         assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);

      if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)
         assert(prog_data->persample_dispatch != ELK_NEVER);
      else
         assert(prog_data->persample_dispatch != ELK_ALWAYS);

      return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0;
   }

   assert(prog_data->persample_dispatch == ELK_ALWAYS ||
          prog_data->persample_dispatch == ELK_NEVER);

   return prog_data->persample_dispatch;
}

static inline uint32_t
elk_wm_prog_data_barycentric_modes(const struct elk_wm_prog_data *prog_data,
                                   enum intel_msaa_flags pushed_msaa_flags)
{
   uint32_t modes = prog_data->barycentric_interp_modes;

   /* In the non-dynamic case, we can just return the modes computed at
    * compile time.
    */
   if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC))
      return modes;

   if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) {
      assert(prog_data->persample_dispatch == ELK_ALWAYS ||
             (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH));

      /* Making dynamic per-sample interpolation work is a bit tricky. The
       * hardware will hang if SAMPLE is requested but per-sample dispatch is
       * not enabled. This means we can't preemptively add SAMPLE to the
       * barycentrics bitfield. Instead, we have to add it late and only
       * on-demand. Annoyingly, changing the number of barycentrics requested
       * changes the whole PS shader payload so we very much don't want to do
       * that. Instead, if the dynamic per-sample interpolation flag is set,
       * we check to see if SAMPLE was requested and, if not, replace the
       * highest barycentric bit in the [non]perspective grouping (CENTROID,
       * if it exists, else PIXEL) with SAMPLE. The shader will stomp all the
       * barycentrics in the shader with SAMPLE so it really doesn't matter
       * which one we replace. The important thing is that we keep the number
       * of barycentrics in each [non]perspective grouping the same.
       */
      if ((modes & ELK_BARYCENTRIC_PERSPECTIVE_BITS) &&
          !(modes & BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE))) {
         int sample_mode =
            util_last_bit(modes & ELK_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
         assert(modes & BITFIELD_BIT(sample_mode));

         modes &= ~BITFIELD_BIT(sample_mode);
         modes |= BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
      }

      if ((modes & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) &&
          !(modes & BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) {
         int sample_mode =
            util_last_bit(modes & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
         assert(modes & BITFIELD_BIT(sample_mode));

         modes &= ~BITFIELD_BIT(sample_mode);
         modes |= BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE);
      }
   } else {
      /* If we're not using per-sample interpolation, we need to disable the
       * per-sample bits.
       *
       * SKL PRMs, Volume 2a: Command Reference: Instructions,
       * 3DSTATE_WM: Barycentric Interpolation Mode:
       *
       *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
       *     Sample or Non-perspective Sample barycentric coordinates."
       */
      modes &= ~(BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
                 BITFIELD_BIT(ELK_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
   }

   return modes;
}

static inline bool
elk_wm_prog_data_is_coarse(const struct elk_wm_prog_data *prog_data,
                           enum intel_msaa_flags pushed_msaa_flags)
{
   if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) {
      if (pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES)
         assert(prog_data->coarse_pixel_dispatch != ELK_NEVER);
      else
         assert(prog_data->coarse_pixel_dispatch != ELK_ALWAYS);

      return pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES;
   }

   assert(prog_data->coarse_pixel_dispatch == ELK_ALWAYS ||
          prog_data->coarse_pixel_dispatch == ELK_NEVER);

   return prog_data->coarse_pixel_dispatch;
}

struct elk_push_const_block {
   unsigned dwords; /* Dword count, not reg aligned */
   unsigned regs;
   unsigned size;   /* Bytes, register aligned */
};

struct elk_cs_prog_data {
   struct elk_stage_prog_data base;

   unsigned local_size[3];

   /* Program offsets for the 8/16/32 SIMD variants. Multiple variants are
    * kept when using variable group size, and the right one can only be
    * decided at dispatch time.
    */
   unsigned prog_offset[3];

   /* Bitmask indicating which program offsets are valid. */
   unsigned prog_mask;

   /* Bitmask indicating which programs have spilled. */
   unsigned prog_spilled;

   bool uses_barrier;
   bool uses_num_work_groups;
   bool uses_inline_data;
   bool uses_btd_stack_ids;
   bool uses_systolic;
   uint8_t generate_local_id;
   enum intel_compute_walk_order walk_order;

   struct {
      struct elk_push_const_block cross_thread;
      struct elk_push_const_block per_thread;
   } push;

   struct {
      /** @{
       * Surface indices for the CS-specific surfaces.
       */
      uint32_t work_groups_start;
      /** @} */
   } binding_table;
};

static inline uint32_t
elk_cs_prog_data_prog_offset(const struct elk_cs_prog_data *prog_data,
                             unsigned dispatch_width)
{
   assert(dispatch_width == 8 ||
          dispatch_width == 16 ||
          dispatch_width == 32);
   const unsigned index = dispatch_width / 16;
   assert(prog_data->prog_mask & (1 << index));
   return prog_data->prog_offset[index];
}
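
/* E.g. if only the SIMD16 variant was kept (prog_mask == 1 << 1), then
 *
 *    elk_cs_prog_data_prog_offset(prog_data, 16) == prog_data->prog_offset[1]
 *
 * and calling it with a dispatch width of 8 or 32 trips the prog_mask
 * assert.
 */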

struct elk_ff_gs_prog_data {
   unsigned urb_read_length;
   unsigned total_grf;

   /**
    * Gfx6 transform feedback: Amount by which the streaming vertex buffer
    * indices should be incremented each time the GS is invoked.
    */
   unsigned svbi_postincrement_value;
};

/**
 * Enum representing the i965-specific vertex results that don't correspond
 * exactly to any element of gl_varying_slot. The values of this enum are
 * assigned such that they don't conflict with gl_varying_slot.
 */
typedef enum
{
   ELK_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
   ELK_VARYING_SLOT_PAD,
   /**
    * Technically this is not a varying but just a placeholder that
    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
    * builtin variable to be compiled correctly. See compile_sf_prog() for
    * more info.
    */
   ELK_VARYING_SLOT_PNTC,
   ELK_VARYING_SLOT_COUNT
} elk_varying_slot;

/**
 * We always program SF to start reading at an offset of 1 (2 varying slots)
 * from the start of the vertex URB entry. This causes it to skip:
 * - VARYING_SLOT_PSIZ and ELK_VARYING_SLOT_NDC on gfx4-5
 * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gfx6+
 */
#define ELK_SF_URB_ENTRY_READ_OFFSET 1

/**
 * Bitmask indicating which fragment shader inputs represent varyings (and
 * hence have to be delivered to the fragment shader by the SF/SBE stage).
 */
#define ELK_FS_VARYING_INPUT_MASK \
   (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
    ~VARYING_BIT_POS & ~VARYING_BIT_FACE)

void elk_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
                       gl_shader_stage stage);

/**
 * Convert a VUE slot number into a byte offset within the VUE.
 */
static inline unsigned elk_vue_slot_to_offset(unsigned slot)
{
   return 16 * slot;
}

/**
 * Convert a vertex output (elk_varying_slot) into a byte offset within the
 * VUE.
 */
static inline unsigned
elk_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
{
   return elk_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
}
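
/* Each VUE slot is one vec4 (16 bytes), so a varying that lands in slot 2
 * starts 32 bytes into the VUE:
 *
 *    assert(elk_vue_slot_to_offset(2) == 32);
 */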

void elk_compute_vue_map(const struct intel_device_info *devinfo,
                         struct intel_vue_map *vue_map,
                         uint64_t slots_valid,
                         bool separate_shader,
                         uint32_t pos_slots);

void elk_compute_tess_vue_map(struct intel_vue_map *const vue_map,
                              uint64_t slots_valid,
                              uint32_t is_patch);

/* elk_interpolation_map.c */
void elk_setup_vue_interpolation(const struct intel_vue_map *vue_map,
                                 struct nir_shader *nir,
                                 struct elk_wm_prog_data *prog_data);

struct elk_vue_prog_data {
   struct elk_stage_prog_data base;
   struct intel_vue_map vue_map;

   /** Should the hardware deliver input VUE handles for URB pull loads? */
   bool include_vue_handles;

   unsigned urb_read_length;
   unsigned total_grf;

   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;

   /* Used for calculating urb partitions. In the VS, this is the size of the
    * URB entry used for both input and output to the thread. In the GS, this
    * is the size of the URB entry used for output.
    */
   unsigned urb_entry_size;

   enum intel_shader_dispatch_mode dispatch_mode;
};

struct elk_vs_prog_data {
   struct elk_vue_prog_data base;

   uint64_t inputs_read;
   uint64_t double_inputs_read;

   unsigned nr_attribute_slots;

   bool uses_vertexid;
   bool uses_instanceid;
   bool uses_is_indexed_draw;
   bool uses_firstvertex;
   bool uses_baseinstance;
   bool uses_drawid;
};

struct elk_tcs_prog_data
{
   struct elk_vue_prog_data base;

   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
   bool include_primitive_id;

   /** Number of vertices in the output patch */
   int instances;

   /** Track patch count threshold */
   int patch_count_threshold;
};


struct elk_tes_prog_data
{
   struct elk_vue_prog_data base;

   enum intel_tess_partitioning partitioning;
   enum intel_tess_output_topology output_topology;
   enum intel_tess_domain domain;
   bool include_primitive_id;
};

struct elk_gs_prog_data
{
   struct elk_vue_prog_data base;

   unsigned vertices_in;

   /**
    * Size of an output vertex, measured in HWORDS (32 bytes).
    */
   unsigned output_vertex_size_hwords;

   unsigned output_topology;

   /**
    * Size of the control data (cut bits or StreamID bits), in hwords (32
    * bytes). 0 if there is no control data.
    */
   unsigned control_data_header_size_hwords;

   /**
    * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
    * if the control data is StreamID bits, or
    * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
    * Ignored if control_data_header_size is 0.
    */
   unsigned control_data_format;

   bool include_primitive_id;

   /**
    * The number of vertices emitted, if constant - otherwise -1.
    */
   int static_vertex_count;

   int invocations;

   /**
    * Gfx6: Provoking vertex convention for odd-numbered triangles
    * in tristrips.
    */
   unsigned pv_first:1;

   /**
    * Gfx6: Number of varyings that are output to transform feedback.
    */
   unsigned num_transform_feedback_bindings:7; /* 0-ELK_MAX_SOL_BINDINGS */

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to
    * the gl_varying_slot that should be streamed out through that binding
    * table entry.
    */
   unsigned char transform_feedback_bindings[64 /* ELK_MAX_SOL_BINDINGS */];

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to
    * the swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[64 /* ELK_MAX_SOL_BINDINGS */];
};

struct elk_sf_prog_data {
   uint32_t urb_read_length;
   uint32_t total_grf;

   /* Each vertex may have up to 12 attributes, 4 components each,
    * except WPOS which requires only 2. (11*4 + 2) == 46 ==> 12
    * rows.
    *
    * Actually we use 4 for each, so call it 12 rows.
    */
   unsigned urb_entry_size;
};

struct elk_clip_prog_data {
   uint32_t curb_read_length; /* user planes? */
   uint32_t clip_mode;
   uint32_t urb_read_length;
   uint32_t total_grf;
};

/* elk_any_prog_data is prog_data for any stage that maps to an API stage */
union elk_any_prog_data {
   struct elk_stage_prog_data base;
   struct elk_vue_prog_data vue;
   struct elk_vs_prog_data vs;
   struct elk_tcs_prog_data tcs;
   struct elk_tes_prog_data tes;
   struct elk_gs_prog_data gs;
   struct elk_wm_prog_data wm;
   struct elk_cs_prog_data cs;
};

#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK)                               \
   static inline struct elk_##STAGE##_prog_data *                            \
   elk_##STAGE##_prog_data(struct elk_stage_prog_data *prog_data)            \
   {                                                                         \
      if (prog_data)                                                         \
         assert(CHECK);                                                      \
      return (struct elk_##STAGE##_prog_data *) prog_data;                   \
   }                                                                         \
   static inline const struct elk_##STAGE##_prog_data *                      \
   elk_##STAGE##_prog_data_const(const struct elk_stage_prog_data *prog_data) \
   {                                                                         \
      if (prog_data)                                                         \
         assert(CHECK);                                                      \
      return (const struct elk_##STAGE##_prog_data *) prog_data;             \
   }

DEFINE_PROG_DATA_DOWNCAST(vs,  prog_data->stage == MESA_SHADER_VERTEX)
DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL)
DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL)
DEFINE_PROG_DATA_DOWNCAST(gs,  prog_data->stage == MESA_SHADER_GEOMETRY)
DEFINE_PROG_DATA_DOWNCAST(wm,  prog_data->stage == MESA_SHADER_FRAGMENT)
DEFINE_PROG_DATA_DOWNCAST(cs,  gl_shader_stage_uses_workgroup(prog_data->stage))

DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX ||
                               prog_data->stage == MESA_SHADER_TESS_CTRL ||
                               prog_data->stage == MESA_SHADER_TESS_EVAL ||
                               prog_data->stage == MESA_SHADER_GEOMETRY)

/* These are not really elk_stage_prog_data. */
DEFINE_PROG_DATA_DOWNCAST(ff_gs, true)
DEFINE_PROG_DATA_DOWNCAST(clip,  true)
DEFINE_PROG_DATA_DOWNCAST(sf,    true)
#undef DEFINE_PROG_DATA_DOWNCAST

struct elk_compile_stats {
   uint32_t dispatch_width; /**< 0 for vec4 */
   uint32_t max_polygons;
   uint32_t max_dispatch_width;
   uint32_t instructions;
   uint32_t sends;
   uint32_t loops;
   uint32_t cycles;
   uint32_t spills;
   uint32_t fills;
   uint32_t max_live_registers;
};

/** @} */

struct elk_compiler *
elk_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo);

/**
 * Returns a compiler configuration for use with disk shader cache
 *
 * This value only needs to change for settings that can cause different
 * program generation between two runs on the same hardware.
 *
 * For example, it doesn't need to be different for gen 8 and gen 9 hardware,
 * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used.
 */
uint64_t
elk_get_compiler_config_value(const struct elk_compiler *compiler);

unsigned
elk_prog_data_size(gl_shader_stage stage);

unsigned
elk_prog_key_size(gl_shader_stage stage);

struct elk_compile_params {
   void *mem_ctx;

   nir_shader *nir;

   struct elk_compile_stats *stats;

   void *log_data;

   char *error_str;

   uint64_t debug_flag;

   uint32_t source_hash;
};

/**
 * Parameters for compiling a vertex shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_vs_params {
   struct elk_compile_params base;

   const struct elk_vs_prog_key *key;
   struct elk_vs_prog_data *prog_data;

   bool edgeflag_is_last; /* true for gallium */
};

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_vs(const struct elk_compiler *compiler,
               struct elk_compile_vs_params *params);
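
/* Typical driver usage (a sketch; the ralloc context, NIR shader, key and
 * prog_data setup are driver-specific and hypothetical here):
 *
 *    struct elk_compile_vs_params params = {
 *       .base = { .mem_ctx = mem_ctx, .nir = nir, .log_data = log_data },
 *       .key = &key,
 *       .prog_data = &prog_data,
 *    };
 *    const unsigned *code = elk_compile_vs(compiler, &params);
 *    if (code == NULL)
 *       fprintf(stderr, "VS compile failed: %s\n", params.base.error_str);
 *
 * The other elk_compile_* entry points below follow the same pattern.
 */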
1598
1599 /**
1600 * Parameters for compiling a tessellation control shader.
1601 *
1602 * Some of these will be modified during the shader compilation.
1603 */
1604 struct elk_compile_tcs_params {
1605 struct elk_compile_params base;
1606
1607 const struct elk_tcs_prog_key *key;
1608 struct elk_tcs_prog_data *prog_data;
1609 };
1610
1611 /**
1612 * Compile a tessellation control shader.
1613 *
1614 * Returns the final assembly and updates the parameters structure.
1615 */
1616 const unsigned *
1617 elk_compile_tcs(const struct elk_compiler *compiler,
1618 struct elk_compile_tcs_params *params);
1619
1620 /**
1621 * Parameters for compiling a tessellation evaluation shader.
1622 *
1623 * Some of these will be modified during the shader compilation.
1624 */
1625 struct elk_compile_tes_params {
1626 struct elk_compile_params base;
1627
1628 const struct elk_tes_prog_key *key;
1629 struct elk_tes_prog_data *prog_data;
1630 const struct intel_vue_map *input_vue_map;
1631 };
1632
1633 /**
1634 * Compile a tessellation evaluation shader.
1635 *
1636 * Returns the final assembly and updates the parameters structure.
1637 */
1638 const unsigned *
1639 elk_compile_tes(const struct elk_compiler *compiler,
1640 struct elk_compile_tes_params *params);
1641
1642 /**
1643 * Parameters for compiling a geometry shader.
1644 *
1645 * Some of these will be modified during the shader compilation.
1646 */
1647 struct elk_compile_gs_params {
1648 struct elk_compile_params base;
1649
1650 const struct elk_gs_prog_key *key;
1651 struct elk_gs_prog_data *prog_data;
1652 };
1653
1654 /**
1655 * Compile a geometry shader.
1656 *
1657 * Returns the final assembly and updates the parameters structure.
1658 */
1659 const unsigned *
1660 elk_compile_gs(const struct elk_compiler *compiler,
1661 struct elk_compile_gs_params *params);
1662
1663 /**
1664 * Compile a strips and fans shader.
1665 *
1666 * This is a fixed-function shader determined entirely by the shader key and
1667 * a VUE map.
1668 *
1669 * Returns the final assembly and the program's size.
1670 */
1671 const unsigned *
1672 elk_compile_sf(const struct elk_compiler *compiler,
1673 void *mem_ctx,
1674 const struct elk_sf_prog_key *key,
1675 struct elk_sf_prog_data *prog_data,
1676 struct intel_vue_map *vue_map,
1677 unsigned *final_assembly_size);
1678
1679 /**
1680 * Compile a clipper shader.
1681 *
1682 * This is a fixed-function shader determined entirely by the shader key and
1683 * a VUE map.
1684 *
1685 * Returns the final assembly and the program's size.
1686 */
1687 const unsigned *
1688 elk_compile_clip(const struct elk_compiler *compiler,
1689 void *mem_ctx,
1690 const struct elk_clip_prog_key *key,
1691 struct elk_clip_prog_data *prog_data,
1692 struct intel_vue_map *vue_map,
1693 unsigned *final_assembly_size);

/**
 * Parameters for compiling a fragment shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_fs_params {
   struct elk_compile_params base;

   const struct elk_wm_prog_key *key;
   struct elk_wm_prog_data *prog_data;

   const struct intel_vue_map *vue_map;
   const struct elk_mue_map *mue_map;

   bool allow_spilling;   /* may the register allocator spill to scratch? */
   bool use_rep_send;     /* use replicated-data FB writes (simple clears) */
   uint8_t max_polygons;  /* max polygons dispatched per fragment thread */
};

/**
 * Compile a fragment shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_fs(const struct elk_compiler *compiler,
               struct elk_compile_fs_params *params);

/**
 * Parameters for compiling a compute shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct elk_compile_cs_params {
   struct elk_compile_params base;

   const struct elk_cs_prog_key *key;
   struct elk_cs_prog_data *prog_data;
};

/**
 * Compile a compute shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
elk_compile_cs(const struct elk_compiler *compiler,
               struct elk_compile_cs_params *params);

/**
 * Compile a fixed-function geometry shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
elk_compile_ff_gs_prog(struct elk_compiler *compiler,
                       void *mem_ctx,
                       const struct elk_ff_gs_prog_key *key,
                       struct elk_ff_gs_prog_data *prog_data,
                       struct intel_vue_map *vue_map,
                       unsigned *final_assembly_size);

void elk_debug_key_recompile(const struct elk_compiler *c, void *log,
                             gl_shader_stage stage,
                             const struct elk_base_prog_key *old_key,
                             const struct elk_base_prog_key *key);

/* Shared Local Memory Size is specified in powers of two, and also has a
 * Gen-dependent minimum value when not zero.
 */
static inline uint32_t
elk_calculate_slm_size(unsigned gen, uint32_t bytes)
{
   assert(bytes <= 64 * 1024);
   if (bytes > 0)
      return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
   else
      return 0;
}
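
/*
 * A few worked values, following directly from the code above:
 *
 *    elk_calculate_slm_size(7, 0)    == 0      (zero stays zero)
 *    elk_calculate_slm_size(7, 1000) == 4096   (pre-Gfx9 minimum of 4 kB)
 *    elk_calculate_slm_size(9, 1000) == 1024   (Gfx9+ minimum of 1 kB)
 *    elk_calculate_slm_size(9, 5000) == 8192   (rounded up to a power of two)
 */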

static inline uint32_t
elk_encode_slm_size(unsigned gen, uint32_t bytes)
{
   uint32_t slm_size = 0;

   /* Shared Local Memory is specified as powers of two, and encoded in
    * INTERFACE_DESCRIPTOR_DATA with the following representations:
    *
    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
    * -------------------------------------------------------------------
    * Gfx7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
    * -------------------------------------------------------------------
    * Gfx9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
    */

   if (bytes > 0) {
      slm_size = elk_calculate_slm_size(gen, bytes);
      assert(util_is_power_of_two_nonzero(slm_size));

      if (gen >= 9) {
         /* Turn an exponent of 10 (1024 bytes, i.e. 1 kB) into 1. */
         assert(slm_size >= 1024);
         slm_size = ffs(slm_size) - 10;
      } else {
         assert(slm_size >= 4096);
         /* Convert to the pre-Gfx9 representation. */
         slm_size = slm_size / 4096;
      }
   }

   return slm_size;
}
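
/*
 * Worked examples against the table above:
 *
 *    elk_encode_slm_size(7, 16 * 1024) == 4   (16 kB on Gfx7-8)
 *    elk_encode_slm_size(9, 16 * 1024) == 5   (16 kB on Gfx9+)
 *    elk_encode_slm_size(9, 1)         == 1   (rounds up to the 1 kB minimum)
 */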

unsigned
elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
                             unsigned threads);

void
elk_write_shader_relocs(const struct elk_isa_info *isa,
                        void *program,
                        const struct elk_stage_prog_data *prog_data,
                        struct elk_shader_reloc_value *values,
                        unsigned num_values);

/**
 * Get the dispatch information for a shader to be used with GPGPU_WALKER and
 * similar instructions.
 *
 * If override_local_size is not NULL, it must point to a 3-element array
 * that will override the value from prog_data->local_size. This is used by
 * ARB_compute_variable_group_size, where the size is set only at dispatch
 * time (so prog_data is outdated).
 */
struct intel_cs_dispatch_info
elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct elk_cs_prog_data *prog_data,
                         const unsigned *override_local_size);
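
/*
 * Example: with ARB_compute_variable_group_size the group size is only known
 * at dispatch time, so the driver passes it in rather than relying on the
 * stale prog_data value (a sketch; group_size is assumed to come from the
 * dispatch-time state):
 *
 *    const unsigned group_size[3] = { 8, 8, 1 };
 *    struct intel_cs_dispatch_info info =
 *       elk_cs_get_dispatch_info(devinfo, cs_prog_data, group_size);
 *
 * Passing NULL for the override uses prog_data->local_size as compiled.
 */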

/**
 * Return true if the given shader stage is dispatched contiguously by the
 * relevant fixed function starting from channel 0 of the SIMD thread, which
 * implies that the dispatch mask of a thread can be assumed to have the form
 * '2^n - 1' for some n.
 */
static inline bool
elk_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
                              gl_shader_stage stage, unsigned max_polygons,
                              const struct elk_stage_prog_data *prog_data)
{
   /* The code below makes assumptions about the hardware's thread dispatch
    * behavior that could be proven wrong in future generations -- Make sure
    * to do a full test run with elk_fs_test_dispatch_packing() hooked up to
    * the NIR front-end before changing this assertion.
    */
   assert(devinfo->ver <= 12);

   switch (stage) {
   case MESA_SHADER_FRAGMENT: {
      /* The PSD discards subspans coming in with no lit samples, which in the
       * per-pixel shading case implies that each subspan will either be fully
       * lit (due to the VMask being used to allow derivative computations),
       * or not dispatched at all. In per-sample dispatch mode individual
       * samples from the same subspan have a fixed relative location within
       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
       * general and we should return false.
       */
      const struct elk_wm_prog_data *wm_prog_data =
         (const struct elk_wm_prog_data *)prog_data;
      return devinfo->verx10 < 125 &&
             !wm_prog_data->persample_dispatch &&
             wm_prog_data->uses_vmask &&
             max_polygons < 2;
   }
   case MESA_SHADER_COMPUTE:
      /* Compute shaders will be spawned with either a fully enabled dispatch
       * mask or with whatever bottom/right execution mask was given to the
       * GPGPU walker command to be used along the workgroup edges -- In both
       * cases the dispatch mask is required to be tightly packed for our
       * invocation index calculations to work.
       */
      return true;
   default:
      /* Most remaining fixed functions are limited to use a packed dispatch
       * mask due to the hardware representation of the dispatch mask as a
       * single counter representing the number of enabled channels.
       */
      return true;
   }
}

/**
 * Computes the first varying slot in the URB produced by the previous stage
 * that is used in the next stage. We do this by testing the varying slots in
 * the previous stage's vue map against the inputs read in the next stage.
 *
 * Note that:
 *
 * - Each URB offset contains two varying slots and we can only skip a full
 *   offset if both slots are unused, so the value we return here is always
 *   rounded down to the closest multiple of two.
 *
 * - gl_Layer and gl_ViewportIndex don't have their own varying slots; they
 *   are part of the vue header, so if these are read we can't skip anything.
 */
static inline int
elk_compute_first_urb_slot_required(uint64_t inputs_read,
                                    const struct intel_vue_map *prev_stage_vue_map)
{
   if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT |
                       VARYING_BIT_PRIMITIVE_SHADING_RATE)) == 0) {
      for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
         int varying = prev_stage_vue_map->slot_to_varying[i];
         if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0)
            return ROUND_DOWN_TO(i, 2);
      }
   }

   return 0;
}
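
/*
 * For example, if the first input the next stage reads lives in slot 5 of
 * the previous stage's VUE map, and neither gl_Layer, gl_ViewportIndex nor
 * the primitive shading rate is read, this returns ROUND_DOWN_TO(5, 2) == 4:
 * slots 0-3, i.e. the first two URB offsets, can be skipped.
 */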

/**
 * This enum is used as the base index of the nir_load_topology_id_intel
 * intrinsic. This is used to return different values based on some aspect of
 * the topology of the device.
 */
enum elk_topology_id
{
   /* A value based on the DSS identifier the shader is currently running on.
    * Be mindful that the DSS ID can be higher than the total number of DSS
    * on the device, because of the fusing that can occur on different parts.
    */
   ELK_TOPOLOGY_ID_DSS,

   /* A value composed of the EU ID, thread ID and SIMD lane ID. */
   ELK_TOPOLOGY_ID_EU_THREAD_SIMD,
};

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* ELK_COMPILER_H */