/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

#pragma once

#include "brw_cfg.h"
#include "brw_compiler.h"
#include "brw_ir_allocator.h"
#include "brw_ir_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_ir_performance.h"
#include "compiler/nir/nir.h"

struct bblock_t;
namespace {
   struct acp_entry;
}

struct fs_visitor;

namespace brw {
   /**
    * Register pressure analysis of a shader.  Estimates how many registers
    * are live at any point of the program in GRF units.
    */
   struct register_pressure {
      register_pressure(const fs_visitor *v);
      ~register_pressure();

      analysis_dependency_class
      dependency_class() const
      {
         return (DEPENDENCY_INSTRUCTION_IDENTITY |
                 DEPENDENCY_INSTRUCTION_DATA_FLOW |
                 DEPENDENCY_VARIABLES);
      }

      bool
      validate(const fs_visitor *) const
      {
         /* FINISHME */
         return true;
      }

      unsigned *regs_live_at_ip;
   };
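
   /* Usage sketch (illustrative, not part of the interface): regs_live_at_ip
    * is indexed by instruction IP, so assuming the analysis is obtained via
    * the brw_analysis framework's require() and that ip_count is the number
    * of instructions in the program, peak pressure could be found with:
    *
    *    const brw::register_pressure &rp = s.regpressure_analysis.require();
    *    unsigned peak = 0;
    *    for (unsigned ip = 0; ip < ip_count; ip++)
    *       peak = MAX2(peak, rp.regs_live_at_ip[ip]);
    */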

   class def_analysis {
   public:
      def_analysis(const fs_visitor *v);
      ~def_analysis();

      fs_inst *
      get(const brw_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_insts[reg.nr] : NULL;
      }

      bblock_t *
      get_block(const brw_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_blocks[reg.nr] : NULL;
      }

      uint32_t
      get_use_count(const brw_reg &reg) const
      {
         return reg.file == VGRF && reg.nr < def_count ?
                def_use_counts[reg.nr] : 0;
      }

      unsigned count() const { return def_count; }
      unsigned ssa_count() const;

      void print_stats(const fs_visitor *) const;

      analysis_dependency_class
      dependency_class() const
      {
         return DEPENDENCY_INSTRUCTION_IDENTITY |
                DEPENDENCY_INSTRUCTION_DATA_FLOW |
                DEPENDENCY_VARIABLES |
                DEPENDENCY_BLOCKS;
      }

      bool validate(const fs_visitor *) const;

   private:
      void mark_invalid(int);
      bool fully_defines(const fs_visitor *v, fs_inst *);
      void update_for_reads(const idom_tree &idom, bblock_t *block, fs_inst *);
      void update_for_write(const fs_visitor *v, bblock_t *block, fs_inst *);

      fs_inst **def_insts;
      bblock_t **def_blocks;
      uint32_t *def_use_counts;
      unsigned def_count;
   };
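
   /* Usage sketch (illustrative): the def analysis maps each VGRF that is
    * written exactly once (an SSA-like "def") to its defining instruction
    * and block, and returns NULL otherwise.  Assuming require() from the
    * brw_analysis framework, a pass could query it like:
    *
    *    const brw::def_analysis &defs = s.def_analysis.require();
    *    fs_inst *def = defs.get(inst->src[0]);
    *    if (def != NULL && defs.get_use_count(inst->src[0]) == 1) {
    *       // inst->src[0] has a unique def, and this is its only use
    *    }
    */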
}

#define UBO_START ((1 << 16) - 4)

/**
 * Scratch data used when compiling a GLSL geometry shader.
 */
struct brw_gs_compile
{
   struct brw_gs_prog_key key;
   struct intel_vue_map input_vue_map;

   unsigned control_data_bits_per_vertex;
   unsigned control_data_header_size_bits;
};

class brw_builder;

struct brw_shader_stats {
   const char *scheduler_mode;
   unsigned promoted_constants;
   unsigned spill_count;
   unsigned fill_count;
   unsigned max_register_pressure;
   unsigned non_ssa_registers_after_nir;
};

/** Register numbers for thread payload fields. */
struct thread_payload {
   /** The number of thread payload registers the hardware will supply. */
   uint8_t num_regs;

   virtual ~thread_payload() = default;

protected:
   thread_payload() : num_regs() {}
};

struct vs_thread_payload : public thread_payload {
   vs_thread_payload(const fs_visitor &v);

   brw_reg urb_handles;
};

struct tcs_thread_payload : public thread_payload {
   tcs_thread_payload(const fs_visitor &v);

   brw_reg patch_urb_output;
   brw_reg primitive_id;
   brw_reg icp_handle_start;
};

struct tes_thread_payload : public thread_payload {
   tes_thread_payload(const fs_visitor &v);

   brw_reg patch_urb_input;
   brw_reg primitive_id;
   brw_reg coords[3];
   brw_reg urb_output;
};

struct gs_thread_payload : public thread_payload {
   gs_thread_payload(fs_visitor &v);

   brw_reg urb_handles;
   brw_reg primitive_id;
   brw_reg instance_id;
   brw_reg icp_handle_start;
};

struct fs_thread_payload : public thread_payload {
   fs_thread_payload(const fs_visitor &v,
                     bool &source_depth_to_render_target);

   uint8_t subspan_coord_reg[2];
   uint8_t source_depth_reg[2];
   uint8_t source_w_reg[2];
   uint8_t aa_dest_stencil_reg[2];
   uint8_t dest_depth_reg[2];
   uint8_t sample_pos_reg[2];
   uint8_t sample_mask_in_reg[2];
   uint8_t barycentric_coord_reg[INTEL_BARYCENTRIC_MODE_COUNT][2];

   uint8_t depth_w_coef_reg;
   uint8_t pc_bary_coef_reg;
   uint8_t npc_bary_coef_reg;
   uint8_t sample_offsets_reg;
};

struct cs_thread_payload : public thread_payload {
   cs_thread_payload(const fs_visitor &v);

   void load_subgroup_id(const brw_builder &bld, brw_reg &dest) const;

   brw_reg local_invocation_id[3];

   brw_reg inline_parameter;

protected:
   brw_reg subgroup_id_;
};
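
/* Usage sketch (illustrative): a caller holding a brw_builder can load the
 * subgroup ID into a fresh register; vgrf() is assumed here to be the
 * builder's virtual-GRF allocator:
 *
 *    brw_reg id = bld.vgrf(BRW_TYPE_UD);
 *    s.cs_payload().load_subgroup_id(bld, id);
 */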

struct task_mesh_thread_payload : public cs_thread_payload {
   task_mesh_thread_payload(fs_visitor &v);

   brw_reg extended_parameter_0;
   brw_reg local_index;

   brw_reg urb_output;

   /* URB handle used to read Task memory inputs.  Only valid for the MESH
    * stage.
    */
   brw_reg task_urb_input;
};

struct bs_thread_payload : public thread_payload {
   bs_thread_payload(const fs_visitor &v);

   brw_reg global_arg_ptr;
   brw_reg local_arg_ptr;

   void load_shader_type(const brw_builder &bld, brw_reg &dest) const;
};

enum brw_shader_phase {
   BRW_SHADER_PHASE_INITIAL = 0,
   BRW_SHADER_PHASE_AFTER_NIR,
   BRW_SHADER_PHASE_AFTER_OPT_LOOP,
   BRW_SHADER_PHASE_AFTER_EARLY_LOWERING,
   BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING,
   BRW_SHADER_PHASE_AFTER_LATE_LOWERING,
   BRW_SHADER_PHASE_AFTER_REGALLOC,

   /* Larger value than any other phase. */
   BRW_SHADER_PHASE_INVALID,
};

/**
 * The shader front-end.
 *
 * Translates NIR (see nir_to_brw()) into the back-end FS IR.
 */
struct fs_visitor
{
public:
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_base_prog_key *key,
              struct brw_stage_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              bool needs_register_pressure,
              bool debug_enabled);
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              const brw_wm_prog_key *key,
              struct brw_wm_prog_data *prog_data,
              const nir_shader *shader,
              unsigned dispatch_width,
              unsigned num_polygons,
              bool needs_register_pressure,
              bool debug_enabled);
   fs_visitor(const struct brw_compiler *compiler,
              const struct brw_compile_params *params,
              struct brw_gs_compile *gs_compile,
              struct brw_gs_prog_data *prog_data,
              const nir_shader *shader,
              bool needs_register_pressure,
              bool debug_enabled);
   void init();
   ~fs_visitor();

   void import_uniforms(fs_visitor *v);

   void assign_curb_setup();
   void convert_attr_sources_to_hw_regs(fs_inst *inst);
   void calculate_payload_ranges(bool allow_spilling,
                                 unsigned payload_node_count,
                                 int *payload_last_use_ip) const;
   void invalidate_analysis(brw::analysis_dependency_class c);

   void vfail(const char *msg, va_list args);
   void fail(const char *msg, ...);
   void limit_dispatch_width(unsigned n, const char *msg);

   void emit_urb_writes(const brw_reg &gs_vertex_count = brw_reg());
   void emit_gs_control_data_bits(const brw_reg &vertex_count);
   brw_reg gs_urb_channel_mask(const brw_reg &dword_index);
   brw_reg gs_urb_per_slot_dword_index(const brw_reg &vertex_count);
   bool mark_last_urb_write_with_eot();
   void emit_cs_terminate();

   const struct brw_compiler *compiler;
   void *log_data; /* Passed to compiler->*_log functions */

   const struct intel_device_info * const devinfo;
   const nir_shader *nir;

   /** ralloc context for temporary data used during compile */
   void *mem_ctx;

   /** List of fs_inst. */
   exec_list instructions;

   cfg_t *cfg;

   gl_shader_stage stage;
   bool debug_enabled;

   brw::simple_allocator alloc;

   const brw_base_prog_key *const key;

   struct brw_gs_compile *gs_compile;

   struct brw_stage_prog_data *prog_data;

   brw_analysis<brw::fs_live_variables, fs_visitor> live_analysis;
   brw_analysis<brw::register_pressure, fs_visitor> regpressure_analysis;
   brw_analysis<brw::performance, fs_visitor> performance_analysis;
   brw_analysis<brw::idom_tree, fs_visitor> idom_analysis;
   brw_analysis<brw::def_analysis, fs_visitor> def_analysis;

   /** Number of uniform variable components visited. */
   unsigned uniforms;

   /** Byte-offset for the next available spot in the scratch space buffer. */
   unsigned last_scratch;

   brw_reg frag_depth;
   brw_reg frag_stencil;
   brw_reg sample_mask;
   brw_reg outputs[VARYING_SLOT_MAX];
   brw_reg dual_src_output;
   int first_non_payload_grf;

   enum brw_shader_phase phase;

   bool failed;
   char *fail_msg;

   thread_payload *payload_;

   thread_payload &payload() {
      return *this->payload_;
   }

   vs_thread_payload &vs_payload() {
      assert(stage == MESA_SHADER_VERTEX);
      return *static_cast<vs_thread_payload *>(this->payload_);
   }

   tcs_thread_payload &tcs_payload() {
      assert(stage == MESA_SHADER_TESS_CTRL);
      return *static_cast<tcs_thread_payload *>(this->payload_);
   }

   tes_thread_payload &tes_payload() {
      assert(stage == MESA_SHADER_TESS_EVAL);
      return *static_cast<tes_thread_payload *>(this->payload_);
   }

   gs_thread_payload &gs_payload() {
      assert(stage == MESA_SHADER_GEOMETRY);
      return *static_cast<gs_thread_payload *>(this->payload_);
   }

   fs_thread_payload &fs_payload() {
      assert(stage == MESA_SHADER_FRAGMENT);
      return *static_cast<fs_thread_payload *>(this->payload_);
   }

   const fs_thread_payload &fs_payload() const {
      assert(stage == MESA_SHADER_FRAGMENT);
      return *static_cast<const fs_thread_payload *>(this->payload_);
   }

   cs_thread_payload &cs_payload() {
      assert(gl_shader_stage_uses_workgroup(stage));
      return *static_cast<cs_thread_payload *>(this->payload_);
   }

   task_mesh_thread_payload &task_mesh_payload() {
      assert(stage == MESA_SHADER_TASK || stage == MESA_SHADER_MESH);
      return *static_cast<task_mesh_thread_payload *>(this->payload_);
   }

   bs_thread_payload &bs_payload() {
      assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
      return *static_cast<bs_thread_payload *>(this->payload_);
   }

   bool source_depth_to_render_target;

   brw_reg pixel_x;
   brw_reg pixel_y;
   brw_reg pixel_z;
   brw_reg wpos_w;
   brw_reg pixel_w;
   brw_reg delta_xy[INTEL_BARYCENTRIC_MODE_COUNT];
   brw_reg final_gs_vertex_count;
   brw_reg control_data_bits;
   brw_reg invocation_id;

   unsigned grf_used;
   bool spilled_any_registers;
   bool needs_register_pressure;

   const unsigned dispatch_width; /**< 8, 16 or 32 */
   const unsigned max_polygons;
   unsigned max_dispatch_width;

   /* The API-selected subgroup size */
   unsigned api_subgroup_size; /**< 0, 8, 16, 32 */

   unsigned next_address_register_nr;

   struct brw_shader_stats shader_stats;

   void debug_optimizer(const nir_shader *nir,
                        const char *pass_name,
                        int iteration, int pass_num) const;
};

void brw_print_instructions(const fs_visitor &s, FILE *file = stderr);

void brw_print_instruction(const fs_visitor &s, const fs_inst *inst,
                           FILE *file = stderr,
                           const brw::def_analysis *defs = nullptr);

void brw_print_swsb(FILE *f, const struct intel_device_info *devinfo, const tgl_swsb swsb);

/**
 * Return the flag subregister used in fragment shaders to keep track of
 * live samples.  On Gfx7+ we use f1.0-f1.1 to allow discard jumps in
 * SIMD32 dispatch mode.
 */
static inline unsigned
sample_mask_flag_subreg(const fs_visitor &s)
{
   assert(s.stage == MESA_SHADER_FRAGMENT);
   return 2;
}

inline brw_reg
brw_dynamic_msaa_flags(const struct brw_wm_prog_data *wm_prog_data)
{
   return brw_uniform_reg(wm_prog_data->msaa_flags_param, BRW_TYPE_UD);
}

enum intel_barycentric_mode brw_barycentric_mode(const struct brw_wm_prog_key *key,
                                                 nir_intrinsic_instr *intr);

uint32_t brw_fb_write_msg_control(const fs_inst *inst,
                                  const struct brw_wm_prog_data *prog_data);

void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data);

int brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                    const brw_stage_prog_data *prog_data);

void nir_to_brw(fs_visitor *s);

void brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase);

#ifndef NDEBUG
void brw_validate(const fs_visitor &s);
#else
static inline void brw_validate(const fs_visitor &s) {}
#endif

void brw_calculate_cfg(fs_visitor &s);

void brw_optimize(fs_visitor &s);

enum brw_instruction_scheduler_mode {
   BRW_SCHEDULE_PRE,
   BRW_SCHEDULE_PRE_NON_LIFO,
   BRW_SCHEDULE_PRE_LIFO,
   BRW_SCHEDULE_POST,
   BRW_SCHEDULE_NONE,
};

class brw_instruction_scheduler;

brw_instruction_scheduler *brw_prepare_scheduler(fs_visitor &s, void *mem_ctx);
void brw_schedule_instructions_pre_ra(fs_visitor &s, brw_instruction_scheduler *sched,
                                      brw_instruction_scheduler_mode mode);
void brw_schedule_instructions_post_ra(fs_visitor &s);
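
/* Usage sketch (illustrative): a scheduler is prepared once and can then be
 * run with different modes, e.g. falling back to a lower-pressure mode when
 * register allocation fails:
 *
 *    brw_instruction_scheduler *sched = brw_prepare_scheduler(s, mem_ctx);
 *    brw_schedule_instructions_pre_ra(s, sched, BRW_SCHEDULE_PRE);
 *    if (!brw_assign_regs(s, allow_spilling, false))
 *       brw_schedule_instructions_pre_ra(s, sched, BRW_SCHEDULE_PRE_NON_LIFO);
 */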

void brw_allocate_registers(fs_visitor &s, bool allow_spilling);
bool brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all);
void brw_assign_regs_trivial(fs_visitor &s);
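
/* Typical compile flow (illustrative sketch; the real brw_compile_*() entry
 * points add stage-specific setup around these steps):
 *
 *    fs_visitor s(compiler, params, key, prog_data, nir, dispatch_width,
 *                 needs_register_pressure, debug_enabled);
 *    nir_to_brw(&s);
 *    if (!s.failed) {
 *       brw_calculate_cfg(s);
 *       brw_optimize(s);
 *       brw_allocate_registers(s, allow_spilling);
 *    }
 */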

bool brw_lower_3src_null_dest(fs_visitor &s);
bool brw_lower_alu_restrictions(fs_visitor &s);
bool brw_lower_barycentrics(fs_visitor &s);
bool brw_lower_constant_loads(fs_visitor &s);
bool brw_lower_csel(fs_visitor &s);
bool brw_lower_derivatives(fs_visitor &s);
bool brw_lower_dpas(fs_visitor &s);
bool brw_lower_find_live_channel(fs_visitor &s);
bool brw_lower_indirect_mov(fs_visitor &s);
bool brw_lower_integer_multiplication(fs_visitor &s);
bool brw_lower_load_payload(fs_visitor &s);
bool brw_lower_load_subgroup_invocation(fs_visitor &s);
bool brw_lower_logical_sends(fs_visitor &s);
bool brw_lower_pack(fs_visitor &s);
bool brw_lower_regioning(fs_visitor &s);
bool brw_lower_scalar_fp64_MAD(fs_visitor &s);
bool brw_lower_scoreboard(fs_visitor &s);
bool brw_lower_send_descriptors(fs_visitor &s);
bool brw_lower_send_gather(fs_visitor &s);
bool brw_lower_sends_overlapping_payload(fs_visitor &s);
bool brw_lower_simd_width(fs_visitor &s);
bool brw_lower_src_modifiers(fs_visitor &s, bblock_t *block, fs_inst *inst, unsigned i);
bool brw_lower_sub_sat(fs_visitor &s);
bool brw_lower_subgroup_ops(fs_visitor &s);
bool brw_lower_uniform_pull_constant_loads(fs_visitor &s);
void brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s);

bool brw_opt_address_reg_load(fs_visitor &s);
bool brw_opt_algebraic(fs_visitor &s);
bool brw_opt_bank_conflicts(fs_visitor &s);
bool brw_opt_cmod_propagation(fs_visitor &s);
bool brw_opt_combine_constants(fs_visitor &s);
bool brw_opt_combine_convergent_txf(fs_visitor &s);
bool brw_opt_compact_virtual_grfs(fs_visitor &s);
bool brw_opt_constant_fold_instruction(const intel_device_info *devinfo, fs_inst *inst);
bool brw_opt_copy_propagation(fs_visitor &s);
bool brw_opt_copy_propagation_defs(fs_visitor &s);
bool brw_opt_cse_defs(fs_visitor &s);
bool brw_opt_dead_code_eliminate(fs_visitor &s);
bool brw_opt_eliminate_find_live_channel(fs_visitor &s);
bool brw_opt_register_coalesce(fs_visitor &s);
bool brw_opt_remove_extra_rounding_modes(fs_visitor &s);
bool brw_opt_remove_redundant_halts(fs_visitor &s);
bool brw_opt_saturate_propagation(fs_visitor &s);
bool brw_opt_send_gather_to_send(fs_visitor &s);
bool brw_opt_send_to_send_gather(fs_visitor &s);
bool brw_opt_split_sends(fs_visitor &s);
bool brw_opt_split_virtual_grfs(fs_visitor &s);
bool brw_opt_zero_samples(fs_visitor &s);
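
/* Usage sketch (illustrative): the brw_opt_*() passes return true on
 * progress, so brw_optimize() can run a subset of them to a fixed point
 * along these lines:
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       progress |= brw_opt_algebraic(s);
 *       progress |= brw_opt_cmod_propagation(s);
 *       progress |= brw_opt_copy_propagation(s);
 *       progress |= brw_opt_dead_code_eliminate(s);
 *    } while (progress);
 */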

bool brw_workaround_emit_dummy_mov_instruction(fs_visitor &s);
bool brw_workaround_memory_fence_before_eot(fs_visitor &s);
bool brw_workaround_nomask_control_flow(fs_visitor &s);
bool brw_workaround_source_arf_before_eot(fs_visitor &s);

/* Helpers. */
unsigned brw_get_lowered_simd_width(const fs_visitor *shader,
                                    const fs_inst *inst);
