1 /* -*- mesa-c++ -*-
2 *
3 * Copyright (c) 2022 Collabora LTD
4 *
5 * Author: Gert Wollny <gert.wollny@collabora.com>
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "sfn_shader.h"
28
29 #include "gallium/drivers/r600/r600_shader.h"
30 #include "nir.h"
31 #include "nir_intrinsics.h"
32 #include "nir_intrinsics_indices.h"
33 #include "sfn_debug.h"
34 #include "sfn_instr.h"
35 #include "sfn_instr_alu.h"
36 #include "sfn_instr_alugroup.h"
37 #include "sfn_instr_controlflow.h"
38 #include "sfn_instr_export.h"
39 #include "sfn_instr_fetch.h"
40 #include "sfn_instr_lds.h"
41 #include "sfn_instr_mem.h"
42 #include "sfn_liverangeevaluator.h"
43 #include "sfn_shader_cs.h"
44 #include "sfn_shader_fs.h"
45 #include "sfn_shader_gs.h"
46 #include "sfn_shader_tess.h"
47 #include "sfn_shader_vs.h"
48 #include "util/u_math.h"
49
50 #include <numeric>
51 #include <sstream>
52
53 namespace r600 {
54
55 using std::string;
56
/* Print this IO record in the textual form that Shader::read_input() /
 * Shader::read_output() can parse back (LOC:, VARYING_SLOT:, NO_VARYING
 * tokens, followed by the subclass-specific fields). */
void
ShaderIO::print(std::ostream& os) const
{
   os << m_type << " LOC:" << m_location;
   if (m_varying_slot != NUM_TOTAL_VARYING_SLOTS)
      os << " VARYING_SLOT:" << static_cast<int>(m_varying_slot);
   if (m_no_varying)
      os << " NO_VARYING";
   /* Let the concrete input/output class append its own fields. */
   do_print(os);
}
67
68 int
spi_sid() const69 ShaderIO::spi_sid() const
70 {
71 if (no_varying())
72 return 0;
73
74 switch (varying_slot()) {
75 case NUM_TOTAL_VARYING_SLOTS:
76 case VARYING_SLOT_POS:
77 case VARYING_SLOT_PSIZ:
78 case VARYING_SLOT_EDGE:
79 case VARYING_SLOT_FACE:
80 case VARYING_SLOT_CLIP_VERTEX:
81 return 0;
82 default:
83 static_assert(static_cast<int>(NUM_TOTAL_VARYING_SLOTS) <= 0x100 - 1,
84 "All varying slots plus 1 must be usable as 8-bit SPI semantic IDs");
85 return static_cast<int>(varying_slot()) + 1;
86 }
87 }
88
/* Base constructor: @type is the record tag ("INPUT"/"OUTPUT"), @loc the
 * driver location, @varying_slot the associated GL varying slot (or
 * NUM_TOTAL_VARYING_SLOTS when there is none). */
ShaderIO::ShaderIO(const char *type, int loc, gl_varying_slot varying_slot):
    m_type(type),
    m_location(loc),
    m_varying_slot(varying_slot)
{
}
95
/* Construct an output record with its driver location, component write
 * mask and varying slot. */
ShaderOutput::ShaderOutput(int location, int writemask, gl_varying_slot varying_slot):
    ShaderIO("OUTPUT", location, varying_slot),
    m_writemask(writemask)
{
}
101
/* Default output: invalid location (-1) and empty writemask; fields are
 * filled in later, e.g. by Shader::read_output(). */
ShaderOutput::ShaderOutput():
    ShaderOutput(-1, 0)
{
}
106
/* Append the output-specific fields (fragment result and writemask) in
 * the form read_output() parses back. */
void
ShaderOutput::do_print(std::ostream& os) const
{
   if (m_frag_result != static_cast<gl_frag_result>(FRAG_RESULT_MAX))
      os << " FRAG_RESULT:" << static_cast<int>(m_frag_result);
   os << " MASK:" << m_writemask;
}
114
/* Construct an input record with its driver location and varying slot. */
ShaderInput::ShaderInput(int location, gl_varying_slot varying_slot):
    ShaderIO("INPUT", location, varying_slot)
{
}
119
/* Default input: invalid location (-1); fields are filled in later,
 * e.g. by Shader::read_input(). */
ShaderInput::ShaderInput():
    ShaderInput(-1)
{
}
124
/* Append the input-specific fields (system value, interpolator info) in
 * the form read_input() parses back. */
void
ShaderInput::do_print(std::ostream& os) const
{
   if (m_system_value != SYSTEM_VALUE_MAX)
      os << " SYSVALUE: " << static_cast<int>(m_system_value);
   if (m_interpolator)
      os << " INTERP:" << m_interpolator;
   if (m_interpolate_loc)
      os << " ILOC:" << m_interpolate_loc;
   if (m_uses_interpolate_at_centroid)
      os << " USE_CENTROID";
}
137
/* Record how this input is interpolated: @interp selects the
 * interpolator, @interp_loc the sample location, and
 * @uses_interpolate_at_centroid flags explicit centroid interpolation. */
void
ShaderInput::set_interpolator(int interp,
                              int interp_loc,
                              bool uses_interpolate_at_centroid)
{
   m_interpolator = interp;
   m_interpolate_loc = interp_loc;
   m_uses_interpolate_at_centroid = uses_interpolate_at_centroid;
}
147
/* Mark this input as being read with interpolate-at-centroid. */
void
ShaderInput::set_uses_interpolate_at_centroid()
{
   m_uses_interpolate_at_centroid = true;
}
153
154 int64_t Shader::s_next_shader_id = 1;
155
/* Base shader constructor: @type_id tags the stage for debug output,
 * @atomic_base is the first HW atomic counter index this stage may use.
 * Creates the instruction factory and opens the initial (top-level)
 * instruction block. */
Shader::Shader(const char *type_id, unsigned atomic_base):
    m_current_block(nullptr),
    m_type_id(type_id),
    m_chip_class(ISA_CC_R600),
    m_next_block(0),
    m_atomic_base(atomic_base),
    m_shader_id(s_next_shader_id++)
{
   m_instr_factory = new InstrFactory();
   m_chain_instr.this_shader = this;
   start_new_block(0);
}
168
169 void
set_input_gpr(int driver_lcation,int gpr)170 Shader::set_input_gpr(int driver_lcation, int gpr)
171 {
172 auto i = m_inputs.find(driver_lcation);
173 assert(i != m_inputs.end());
174 i->second.set_gpr(gpr);
175 }
176
177 bool
add_info_from_string(std::istream & is)178 Shader::add_info_from_string(std::istream& is)
179 {
180 std::string type;
181 is >> type;
182
183 if (type == "CHIPCLASS")
184 return read_chipclass(is);
185 if (type == "FAMILY")
186 return read_family(is);
187 if (type == "OUTPUT")
188 return read_output(is);
189 if (type == "INPUT")
190 return read_input(is);
191 if (type == "PROP")
192 return read_prop(is);
193 if (type == "SYSVALUES")
194 return allocate_registers_from_string(is, pin_fully);
195 if (type == "REGISTERS")
196 return allocate_registers_from_string(is, pin_free);
197 if (type == "ARRAYS")
198 return allocate_arrays_from_string(is);
199
200 return false;
201 }
202
203 void
emit_instruction_from_string(const std::string & s)204 Shader::emit_instruction_from_string(const std::string& s)
205 {
206
207 sfn_log << SfnLog::instr << "Create Instr from '" << s << "'\n";
208 if (s == "BLOCK_START") {
209 if (!m_current_block->empty()) {
210 start_new_block(m_current_block->nesting_offset());
211 sfn_log << SfnLog::instr << " Emit start block\n";
212 }
213 return;
214 }
215
216 if (s == "BLOCK_END") {
217 return;
218 }
219
220 auto ir = m_instr_factory->from_string(s, m_current_block->nesting_depth(),
221 m_chip_class == ISA_CC_CAYMAN);
222 if (ir) {
223 emit_instruction(ir);
224 if (ir->end_block())
225 start_new_block(ir->nesting_offset());
226 sfn_log << SfnLog::instr << " " << *ir << "\n";
227 }
228 }
229
230 bool
read_output(std::istream & is)231 Shader::read_output(std::istream& is)
232 {
233 ShaderOutput output;
234
235 std::string token;
236 for (is >> token; !token.empty(); token.clear(), is >> token) {
237 int value;
238 if (int_from_string_with_prefix_optional(token, "LOC:", value))
239 output.set_location(value);
240 else if (int_from_string_with_prefix_optional(token, "VARYING_SLOT:", value))
241 output.set_varying_slot(static_cast<gl_varying_slot>(value));
242 else if (token == "NO_VARYING")
243 output.set_no_varying(true);
244 else if (int_from_string_with_prefix_optional(token, "FRAG_RESULT:", value))
245 output.set_frag_result(static_cast<gl_frag_result>(value));
246 else if (int_from_string_with_prefix_optional(token, "MASK:", value))
247 output.set_writemask(value);
248 else {
249 std::cerr << "Unknown parse value '" << token << "'";
250 assert(!"Unknown parse value in read_output");
251 }
252 }
253
254 add_output(output);
255 return true;
256 }
257
258 bool
read_input(std::istream & is)259 Shader::read_input(std::istream& is)
260 {
261 ShaderInput input;
262
263 int interp = 0;
264 int interp_loc = 0;
265 bool use_centroid = false;
266
267 std::string token;
268 for (is >> token; !token.empty(); token.clear(), is >> token) {
269 int value;
270 if (int_from_string_with_prefix_optional(token, "LOC:", value))
271 input.set_location(value);
272 else if (int_from_string_with_prefix_optional(token, "VARYING_SLOT:", value))
273 input.set_varying_slot(static_cast<gl_varying_slot>(value));
274 else if (token == "NO_VARYING")
275 input.set_no_varying(true);
276 else if (int_from_string_with_prefix_optional(token, "SYSVALUE:", value))
277 input.set_system_value(static_cast<gl_system_value>(value));
278 else if (int_from_string_with_prefix_optional(token, "INTERP:", interp))
279 ;
280 else if (int_from_string_with_prefix_optional(token, "ILOC:", interp_loc))
281 ;
282 else if (token == "USE_CENTROID")
283 use_centroid = true;
284 else {
285 std::cerr << "Unknown parse value '" << token << "'";
286 assert(!"Unknown parse value in read_input");
287 }
288 }
289
290 input.set_interpolator(interp, interp_loc, use_centroid);
291
292 add_input(input);
293 return true;
294 }
295
296 bool
allocate_registers_from_string(std::istream & is,Pin pin)297 Shader::allocate_registers_from_string(std::istream& is, Pin pin)
298 {
299 std::string line;
300 if (!std::getline(is, line))
301 return false;
302
303 std::istringstream iline(line);
304
305 while (!iline.eof()) {
306 string reg_str;
307 iline >> reg_str;
308
309 if (reg_str.empty())
310 break;
311
312 if (strchr(reg_str.c_str(), '@') ||
313 reg_str == "AR" ||
314 reg_str.substr(0,3) == "IDX") {
315 value_factory().dest_from_string(reg_str);
316 } else {
317 RegisterVec4::Swizzle swz = {0, 1, 2, 3};
318 auto regs = value_factory().dest_vec4_from_string(reg_str, swz, pin);
319 for (int i = 0; i < 4; ++i) {
320 if (swz[i] < 4 && pin == pin_fully) {
321 regs[i]->set_flag(Register::pin_start);
322 }
323 }
324 }
325 }
326 return true;
327 }
328
329 bool
allocate_arrays_from_string(std::istream & is)330 Shader::allocate_arrays_from_string(std::istream& is)
331 {
332 std::string line;
333 if (!std::getline(is, line))
334 return false;
335
336 std::istringstream iline(line);
337
338 while (!iline.eof()) {
339 string reg_str;
340 iline >> reg_str;
341
342 if (reg_str.empty())
343 break;
344
345 value_factory().array_from_string(reg_str);
346 }
347 return true;
348 }
349
350 bool
read_chipclass(std::istream & is)351 Shader::read_chipclass(std::istream& is)
352 {
353 string name;
354 is >> name;
355 if (name == "R600")
356 m_chip_class = ISA_CC_R600;
357 else if (name == "R700")
358 m_chip_class = ISA_CC_R700;
359 else if (name == "EVERGREEN")
360 m_chip_class = ISA_CC_EVERGREEN;
361 else if (name == "CAYMAN")
362 m_chip_class = ISA_CC_CAYMAN;
363 else
364 return false;
365 return true;
366 }
367
368 bool
read_family(std::istream & is)369 Shader::read_family(std::istream& is)
370 {
371 string name;
372 is >> name;
373 #define CHECK_FAMILY(F) if (name == #F) m_chip_family = CHIP_ ## F
374
375 CHECK_FAMILY(R600);
376 else CHECK_FAMILY(R600);
377 else CHECK_FAMILY(RV610);
378 else CHECK_FAMILY(RV630);
379 else CHECK_FAMILY(RV670);
380 else CHECK_FAMILY(RV620);
381 else CHECK_FAMILY(RV635);
382 else CHECK_FAMILY(RS780);
383 else CHECK_FAMILY(RS880);
384 /* GFX3 (R7xx) */
385 else CHECK_FAMILY(RV770);
386 else CHECK_FAMILY(RV730);
387 else CHECK_FAMILY(RV710);
388 else CHECK_FAMILY(RV740);
389 /* GFX4 (Evergreen) */
390 else CHECK_FAMILY(CEDAR);
391 else CHECK_FAMILY(REDWOOD);
392 else CHECK_FAMILY(JUNIPER);
393 else CHECK_FAMILY(CYPRESS);
394 else CHECK_FAMILY(HEMLOCK);
395 else CHECK_FAMILY(PALM);
396 else CHECK_FAMILY(SUMO);
397 else CHECK_FAMILY(SUMO2);
398 else CHECK_FAMILY(BARTS);
399 else CHECK_FAMILY(TURKS);
400 else CHECK_FAMILY(CAICOS);
401 /* GFX5 (Northern Islands) */
402 else CHECK_FAMILY(CAYMAN);
403 else CHECK_FAMILY(ARUBA);
404 else
405 return false;
406 return true;
407 }
408
/* Allocate the registers that must live at fixed GPR positions and emit
 * the setup code that depends on them. The stage-specific
 * do_allocate_reserved_registers() returns the first register index
 * free for general allocation; virtual registers are numbered from
 * there on. */
void
Shader::allocate_reserved_registers()
{
   m_instr_factory->value_factory().set_virtual_register_base(0);
   auto reserved_registers_end = do_allocate_reserved_registers();
   m_instr_factory->value_factory().set_virtual_register_base(reserved_registers_end);

   /* When atomics are used, keep a register preloaded with the integer
    * constant 1 as the update value. */
   if (!m_atomics.empty()) {
      m_atomic_update = value_factory().temp_register();
      auto alu = new AluInstr(op1_mov,
                              m_atomic_update,
                              value_factory().one_i(),
                              AluInstr::last_write);
      alu->set_alu_flag(alu_no_schedule_bias);
      emit_instruction(alu);
   }

   /* Compute a per-lane RAT return address from SE id, HW wave id and
    * the lane index within the wave (mbcnt). */
   if (m_flags.test(sh_needs_sbo_ret_address)) {
      /* NOTE(review): temp_register(0) is called twice with the same
       * index for m_rat_return_address and temp0 — presumably each call
       * returns a distinct register; confirm against ValueFactory. */
      m_rat_return_address = value_factory().temp_register(0);
      auto temp0 = value_factory().temp_register(0);
      auto temp1 = value_factory().temp_register(1);
      auto temp2 = value_factory().temp_register(2);

      /* mbcnt lo/hi pair yields the lane index within the wave. */
      auto group = new AluGroup();
      group->add_instruction(new AluInstr(
         op1_mbcnt_32lo_accum_prev_int, temp0, value_factory().literal(-1), {alu_write}));
      group->add_instruction(new AluInstr(
         op1_mbcnt_32hi_int, temp1, value_factory().literal(-1), {alu_write}));
      emit_instruction(group);
      /* temp2 = SE_ID * 256 + HW_WAVE_ID */
      emit_instruction(new AluInstr(op3_muladd_uint24,
                                    temp2,
                                    value_factory().inline_const(ALU_SRC_SE_ID, 0),
                                    value_factory().literal(256),
                                    value_factory().inline_const(ALU_SRC_HW_WAVE_ID, 0),
                                    {alu_write, alu_last_instr}));
      /* rat_return_address = temp2 * 0x40 + lane_index */
      emit_instruction(new AluInstr(op3_muladd_uint24,
                                    m_rat_return_address,
                                    temp2,
                                    value_factory().literal(0x40),
                                    temp0,
                                    {alu_write, alu_last_instr}));
   }
}
451
452 Shader *
translate_from_nir(nir_shader * nir,const pipe_stream_output_info * so_info,struct r600_shader * gs_shader,const r600_shader_key & key,r600_chip_class chip_class,radeon_family family)453 Shader::translate_from_nir(nir_shader *nir,
454 const pipe_stream_output_info *so_info,
455 struct r600_shader *gs_shader,
456 const r600_shader_key& key,
457 r600_chip_class chip_class,
458 radeon_family family)
459 {
460 Shader *shader = nullptr;
461
462 switch (nir->info.stage) {
463 case MESA_SHADER_FRAGMENT:
464 if (chip_class >= ISA_CC_EVERGREEN)
465 shader = new FragmentShaderEG(key);
466 else
467 shader = new FragmentShaderR600(key);
468 break;
469 case MESA_SHADER_VERTEX:
470 shader = new VertexShader(so_info, gs_shader, key);
471 break;
472 case MESA_SHADER_GEOMETRY:
473 shader = new GeometryShader(key);
474 break;
475 case MESA_SHADER_TESS_CTRL:
476 shader = new TCSShader(key);
477 break;
478 case MESA_SHADER_TESS_EVAL:
479 shader = new TESShader(so_info, gs_shader, key);
480 break;
481 case MESA_SHADER_KERNEL:
482 case MESA_SHADER_COMPUTE:
483 shader = new ComputeShader(key, BITSET_COUNT(nir->info.samplers_used));
484 break;
485 default:
486 return nullptr;
487 }
488
489 shader->set_info(nir);
490
491 shader->set_chip_class(chip_class);
492 shader->set_chip_family(family);
493
494 if (!shader->process(nir))
495 return nullptr;
496
497 return shader;
498 }
499
/* Copy the shader-level info needed later; currently only the scratch
 * (spill) size in bytes. */
void
Shader::set_info(nir_shader *nir)
{
   m_scratch_size = nir->scratch_size;
}
505
/* Accessor for the value factory owned by the instruction factory. */
ValueFactory&
Shader::value_factory()
{
   return m_instr_factory->value_factory();
}
511
/* Translate the (fully lowered and inlined) NIR shader into r600 IR:
 * scan uniforms and instructions first, then allocate reserved
 * registers, then emit code for the single remaining function. */
bool
Shader::process(nir_shader *nir)
{
   m_ssbo_image_offset = nir->info.num_images;

   if (nir->info.use_legacy_math_rules)
      set_flag(sh_legacy_math_rules);

   nir_foreach_uniform_variable(var, nir) scan_uniforms(var);

   // at this point all functions should be inlined
   const nir_function *func =
      reinterpret_cast<const nir_function *>(exec_list_get_head_const(&nir->functions));

   /* First pass: gather IO, atomic and register-declaration info before
    * any code is emitted. */
   if (!scan_shader(func))
      return false;

   allocate_reserved_registers();

   value_factory().allocate_registers(m_register_allocations);
   m_required_registers = value_factory().array_registers();

   /* Second pass: emit the instructions for the function body. */
   sfn_log << SfnLog::trans << "Process shader \n";
   foreach_list_typed(nir_cf_node, node, node, &func->impl->body)
   {
      if (!process_cf_node(node))
         return false;
   }

   finalize();

   return true;
}
545
/* Pre-scan every instruction of @func to collect system-value usage and
 * register declarations, then assign LDS positions to inputs and export
 * parameter indices to outputs. Returns false on an unhandled
 * system-value access. */
bool
Shader::scan_shader(const nir_function *func)
{

   nir_foreach_block(block, func->impl)
   {
      nir_foreach_instr(instr, block)
      {
         if (!scan_instruction(instr)) {
            fprintf(stderr, "Unhandled sysvalue access ");
            nir_print_instr(instr, stderr);
            fprintf(stderr, "\n");
            return false;
         }
      }
   }

   /* Assign consecutive LDS positions to the inputs that need them;
    * pre-Evergreen parts additionally use that index as the GPR. */
   int lds_pos = 0;
   for (auto& [index, input] : m_inputs) {
      if (input.need_lds_pos()) {
         if (chip_class() < ISA_CC_EVERGREEN)
            input.set_gpr(lds_pos);
         input.set_lds_pos(lds_pos++);
      }
   }

   /* Number the export parameters of all outputs that carry a SPI
    * semantic ID. */
   int export_param = 0;
   for (auto& [index, out] : m_outputs) {
      if (out.spi_sid())
         out.set_export_param(export_param++);
   }

   return true;
}
580
/* Record per-uniform information: hardware atomic counters get a
 * r600_shader_atomic entry and a base-map slot; images and SSBOs set
 * the corresponding usage flags and indirect-access bits. */
bool
Shader::scan_uniforms(nir_variable *uniform)
{
   if (glsl_contains_atomic(uniform->type)) {
      int natomics = glsl_atomic_size(uniform->type) / 4; /* ATOMIC_COUNTER_SIZE */
      m_nhwatomic += natomics;

      if (glsl_type_is_array(uniform->type))
         m_indirect_files |= 1 << TGSI_FILE_HW_ATOMIC;

      m_flags.set(sh_uses_atomics);

      r600_shader_atomic atom = {0};

      atom.buffer_id = uniform->data.binding;
      atom.hw_idx = m_atomic_base + m_next_hwatomic_loc;

      /* offset is in bytes; atomic counters are 4 bytes each. */
      atom.start = uniform->data.offset >> 2;
      atom.end = atom.start + natomics - 1;

      /* Remember the first HW location used for each binding point. */
      if (m_atomic_base_map.find(uniform->data.binding) == m_atomic_base_map.end())
         m_atomic_base_map[uniform->data.binding] = m_next_hwatomic_loc;

      m_next_hwatomic_loc += natomics;

      m_atomic_file_count += atom.end - atom.start + 1;

      sfn_log << SfnLog::io << "HW_ATOMIC file count: " << m_atomic_file_count << "\n";

      m_atomics.push_back(atom);
   }

   auto type = glsl_without_array(uniform->type);
   if (glsl_type_is_image(type) || uniform->data.mode == nir_var_mem_ssbo) {
      m_flags.set(sh_uses_images);
      /* Only image arrays (not SSBOs) imply indirect image access. */
      if (glsl_type_is_array(uniform->type) && !(uniform->data.mode == nir_var_mem_ssbo))
         m_indirect_files |= 1 << TGSI_FILE_IMAGE;
   }

   return true;
}
622
/* Scan one instruction during the pre-pass. Stage-specific scanning is
 * tried first; afterwards a few intrinsics update shader-wide flags
 * (memory writes, barrier handling, register declarations). Always
 * returns true unless the stage scan rejects the instruction. */
bool
Shader::scan_instruction(nir_instr *instr)
{
   if (do_scan_instruction(instr))
      return true;

   if (instr->type != nir_instr_type_intrinsic)
      return true;

   auto intr = nir_instr_as_intrinsic(instr);

   // handle unhandled instructions
   switch (intr->intrinsic) {
   case nir_intrinsic_ssbo_atomic:
   case nir_intrinsic_ssbo_atomic_swap:
   case nir_intrinsic_image_load:
   case nir_intrinsic_image_atomic:
   case nir_intrinsic_image_atomic_swap:
      /* These read back a value, so a RAT return address is needed. */
      m_flags.set(sh_needs_sbo_ret_address);
      FALLTHROUGH;
   case nir_intrinsic_image_store:
   case nir_intrinsic_store_ssbo:
      m_flags.set(sh_writes_memory);
      m_flags.set(sh_uses_images);
      break;
   case nir_intrinsic_barrier:
      /* Only barriers that cover external memory with a real scope need
       * the memory-barrier preparation. */
      m_chain_instr.prepare_mem_barrier |=
         (nir_intrinsic_memory_modes(intr) &
             (nir_var_mem_ssbo | nir_var_mem_global | nir_var_image) &&
          nir_intrinsic_memory_scope(intr) != SCOPE_NONE);
      break;
   case nir_intrinsic_decl_reg:
      /* Collected here, allocated later in process(). */
      m_register_allocations.push_back(intr);
      break;
   default:;
   }
   return true;
}
661
662 bool
process_cf_node(nir_cf_node * node)663 Shader::process_cf_node(nir_cf_node *node)
664 {
665 SFN_TRACE_FUNC(SfnLog::flow, "CF");
666
667 switch (node->type) {
668 case nir_cf_node_block:
669 return process_block(nir_cf_node_as_block(node));
670 case nir_cf_node_if:
671 return process_if(nir_cf_node_as_if(node));
672 case nir_cf_node_loop:
673 return process_loop(nir_cf_node_as_loop(node));
674 default:
675 return false;
676 }
677 }
678
679 static bool
child_block_empty(const exec_list & list)680 child_block_empty(const exec_list& list)
681 {
682 if (list.is_empty())
683 return true;
684
685 bool result = true;
686
687 foreach_list_typed(nir_cf_node, n, node, &list)
688 {
689
690 if (n->type == nir_cf_node_block) {
691 if (!nir_cf_node_as_block(n)->instr_list.is_empty())
692 return false;
693 }
694 if (n->type == nir_cf_node_if)
695 return false;
696 }
697 return result;
698 }
699
value_has_non_const_source(VirtualValue * value)700 static bool value_has_non_const_source(VirtualValue *value)
701 {
702 auto reg = value->as_register();
703 if (reg) {
704 // Non-ssa registers are probably the result of some control flow
705 // that makes the values non-uniform across the work group
706 if (!reg->has_flag(Register::ssa))
707 return true;
708
709 for (const auto& p : reg->parents()) {
710 auto alu = p->as_alu();
711 if (alu) {
712 for (auto& s : p->as_alu()->sources()) {
713 return value_has_non_const_source(s);
714 }
715 } else {
716 return true;
717 }
718 }
719 }
720 return false;
721 }
722
/* Emit code for a NIR if-statement. A predicate ALU op pushes the exec
 * mask; then/else children are emitted in their own blocks, and the
 * control-flow depth counter is bumped only for conditions that may be
 * non-uniform across the work group. */
bool
Shader::process_if(nir_if *if_stmt)
{
   SFN_TRACE_FUNC(SfnLog::flow, "IF");

   auto value = value_factory().src(if_stmt->condition, 0);

   bool non_const_cond = value_has_non_const_source(value);

   /* An empty then-branch uses the "else"-style predicate so only the
    * else-branch code runs under the inverted condition. */
   EAluOp op = child_block_empty(if_stmt->then_list) ? op2_prede_int :
                                                       op2_pred_setne_int;

   AluInstr *pred = new AluInstr(op,
                                 value_factory().temp_register(),
                                 value,
                                 value_factory().zero(),
                                 AluInstr::last);
   pred->set_alu_flag(alu_update_exec);
   pred->set_alu_flag(alu_update_pred);
   pred->set_cf_type(cf_alu_push_before);

   IfInstr *ir = new IfInstr(pred);
   emit_instruction(ir);
   if (non_const_cond)
      ++m_control_flow_depth;
   start_new_block(1);

   if (!child_block_empty(if_stmt->then_list)) {
      foreach_list_typed(nir_cf_node, n, node, &if_stmt->then_list)
      {
         SFN_TRACE_FUNC(SfnLog::flow, "IF-then");
         if (!process_cf_node(n))
            return false;
      }
      /* Only emit an ELSE when the else-branch actually holds code. */
      if (!child_block_empty(if_stmt->else_list)) {
         if (!emit_control_flow(ControlFlowInstr::cf_else))
            return false;
         foreach_list_typed(nir_cf_node,
                            n,
                            node,
                            &if_stmt->else_list)
            if (!process_cf_node(n)) return false;
      }
   } else {
      /* Empty then-branch: the inverted predicate above lets us emit
       * the else-branch directly without a cf_else. */
      assert(!child_block_empty(if_stmt->else_list));
      foreach_list_typed(nir_cf_node,
                         n,
                         node,
                         &if_stmt->else_list)
         if (!process_cf_node(n)) return false;
   }

   if (!emit_control_flow(ControlFlowInstr::cf_endif))
      return false;

   if (non_const_cond)
      --m_control_flow_depth;

   return true;
}
783
784 bool
emit_control_flow(ControlFlowInstr::CFType type)785 Shader::emit_control_flow(ControlFlowInstr::CFType type)
786 {
787 auto ir = new ControlFlowInstr(type);
788 emit_instruction(ir);
789 int depth = 0;
790 switch (type) {
791 case ControlFlowInstr::cf_loop_begin:
792 m_loops.push_back(ir);
793 m_nloops++;
794 depth = 1;
795 break;
796 case ControlFlowInstr::cf_loop_end:
797 m_loops.pop_back();
798 FALLTHROUGH;
799 case ControlFlowInstr::cf_endif:
800 depth = -1;
801 break;
802 default:;
803 }
804
805 start_new_block(depth);
806 return true;
807 }
808
809 bool
process_loop(nir_loop * node)810 Shader::process_loop(nir_loop *node)
811 {
812 assert(!nir_loop_has_continue_construct(node));
813 SFN_TRACE_FUNC(SfnLog::flow, "LOOP");
814 if (!emit_control_flow(ControlFlowInstr::cf_loop_begin))
815 return false;
816
817 foreach_list_typed(nir_cf_node,
818 n,
819 node,
820 &node->body) if (!process_cf_node(n)) return false;
821
822 if (!emit_control_flow(ControlFlowInstr::cf_loop_end))
823 return false;
824
825 return true;
826 }
827
828 bool
process_block(nir_block * block)829 Shader::process_block(nir_block *block)
830 {
831 SFN_TRACE_FUNC(SfnLog::flow, "BLOCK");
832
833 nir_foreach_instr(instr, block)
834 {
835 sfn_log << SfnLog::instr << "FROM:" << *instr << "\n";
836 bool r = process_instr(instr);
837 if (!r) {
838 sfn_log << SfnLog::err << "R600: Unsupported instruction: " << *instr << "\n";
839 return false;
840 }
841 }
842 return true;
843 }
844
/* Translate a single NIR instruction; all per-instruction work is
 * delegated to the instruction factory. */
bool
Shader::process_instr(nir_instr *instr)
{
   return m_instr_factory->from_nir(instr, *this);
}
850
/* Translate an intrinsic instruction. Stage-specific intrinsics, atomic
 * counters and RAT accesses are tried first; the remaining common
 * intrinsics are dispatched below. Returns false for unsupported
 * intrinsics. */
bool
Shader::process_intrinsic(nir_intrinsic_instr *intr)
{
   if (process_stage_intrinsic(intr))
      return true;

   if (GDSInstr::emit_atomic_counter(intr, *this)) {
      set_flag(sh_writes_memory);
      return true;
   }

   if (RatInstr::emit(intr, *this))
      return true;

   switch (intr->intrinsic) {
   case nir_intrinsic_store_output:
      return store_output(intr);
   case nir_intrinsic_load_input:
      return load_input(intr);
   case nir_intrinsic_load_ubo_vec4:
      return load_ubo(intr);
   case nir_intrinsic_store_scratch:
      return emit_store_scratch(intr);
   case nir_intrinsic_load_scratch:
      return emit_load_scratch(intr);
   case nir_intrinsic_store_local_shared_r600:
      return emit_local_store(intr);
   case nir_intrinsic_load_global:
   case nir_intrinsic_load_global_constant:
      return emit_load_global(intr);
   case nir_intrinsic_load_local_shared_r600:
      return emit_local_load(intr);
   /* TCS parameter bases: inputs start at offset 0, outputs at 16. */
   case nir_intrinsic_load_tcs_in_param_base_r600:
      return emit_load_tcs_param_base(intr, 0);
   case nir_intrinsic_load_tcs_out_param_base_r600:
      return emit_load_tcs_param_base(intr, 16);
   case nir_intrinsic_barrier:
      return emit_barrier(intr);
   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_shared_atomic_swap:
      return emit_atomic_local_shared(intr);
   case nir_intrinsic_shader_clock:
      return emit_shader_clock(intr);
   case nir_intrinsic_load_reg:
      return emit_load_reg(intr);
   case nir_intrinsic_load_reg_indirect:
      return emit_load_reg_indirect(intr);
   case nir_intrinsic_store_reg:
      return emit_store_reg(intr);
   case nir_intrinsic_store_reg_indirect:
      return emit_store_reg_indirect(intr);
   case nir_intrinsic_decl_reg:
      // Registers and arrays are allocated at
      // conversion startup time
      return true;
   default:
      return false;
   }
}
910
911 static ESDOp
lds_op_from_intrinsic(nir_atomic_op op,bool ret)912 lds_op_from_intrinsic(nir_atomic_op op, bool ret)
913 {
914 switch (op) {
915 case nir_atomic_op_iadd:
916 return ret ? LDS_ADD_RET : LDS_ADD;
917 case nir_atomic_op_iand:
918 return ret ? LDS_AND_RET : LDS_AND;
919 case nir_atomic_op_ior:
920 return ret ? LDS_OR_RET : LDS_OR;
921 case nir_atomic_op_imax:
922 return ret ? LDS_MAX_INT_RET : LDS_MAX_INT;
923 case nir_atomic_op_umax:
924 return ret ? LDS_MAX_UINT_RET : LDS_MAX_UINT;
925 case nir_atomic_op_imin:
926 return ret ? LDS_MIN_INT_RET : LDS_MIN_INT;
927 case nir_atomic_op_umin:
928 return ret ? LDS_MIN_UINT_RET : LDS_MIN_UINT;
929 case nir_atomic_op_ixor:
930 return ret ? LDS_XOR_RET : LDS_XOR;
931 case nir_atomic_op_xchg:
932 return LDS_XCHG_RET;
933 case nir_atomic_op_cmpxchg:
934 return LDS_CMP_XCHG_RET;
935 default:
936 unreachable("Unsupported shared atomic_op opcode");
937 }
938 }
939
940 PRegister
emit_load_to_register(PVirtualValue src)941 Shader::emit_load_to_register(PVirtualValue src)
942 {
943 assert(src);
944 PRegister dest = src->as_register();
945
946 if (!dest) {
947 dest = value_factory().temp_register();
948 emit_instruction(new AluInstr(op1_mov, dest, src, AluInstr::last_write));
949 }
950 return dest;
951 }
952
953 // add visitor to resolve array and register
// add visitor to resolve array and register
/* Base visitor used to resolve the target of a nir load/store_reg
 * intrinsic. A decl_reg handle can only resolve to a plain register or
 * a local array; every other value kind indicates a bug. */
class RegisterAccessHandler : public RegisterVisitor {

public:
   RegisterAccessHandler(Shader& shader, nir_intrinsic_instr *intr);

   /* These value kinds can never back a register handle. */
   void visit(LocalArrayValue& value) override {(void)value; assert(0);}
   void visit(UniformValue& value) override {(void)value; assert(0);}
   void visit(LiteralConstant& value) override {(void)value; assert(0);}
   void visit(InlineConstant& value) override {(void)value; assert(0);}

   Shader& sh;                  /* shader receiving the emitted moves */
   nir_intrinsic_instr *ir;     /* the load/store intrinsic being handled */
   PVirtualValue addr{nullptr}; /* indirect address, null for direct access */
   bool success{true};
};
969
/* Visitor that emits the moves for load_reg / load_reg_indirect, for
 * both plain registers and local arrays. */
class RegisterReadHandler : public RegisterAccessHandler {

public:
   using RegisterAccessHandler::RegisterAccessHandler;
   using RegisterAccessHandler::visit;

   void visit(LocalArray& value) override;
   void visit(Register& value) override;
};
979
emit_load_reg(nir_intrinsic_instr * intr)980 bool Shader::emit_load_reg(nir_intrinsic_instr *intr)
981 {
982 RegisterReadHandler visitor(*this, intr);
983 auto handle = value_factory().src(intr->src[0], 0);
984 handle->accept(visitor);
985 return visitor.success;
986 }
987
emit_load_reg_indirect(nir_intrinsic_instr * intr)988 bool Shader::emit_load_reg_indirect(nir_intrinsic_instr *intr)
989 {
990 RegisterReadHandler visitor(*this, intr);
991 visitor.addr = value_factory().src(intr->src[1], 0);
992 auto handle = value_factory().src(intr->src[0], 0);
993 handle->accept(visitor);
994 return visitor.success;
995 }
996
/* Visitor that emits the moves for store_reg / store_reg_indirect, for
 * both plain registers and local arrays. */
class RegisterWriteHandler : public RegisterAccessHandler {

public:
   using RegisterAccessHandler::RegisterAccessHandler;
   using RegisterAccessHandler::visit;

   void visit(LocalArray& value) override;
   void visit(Register& value) override;
};
1006
1007
emit_store_reg(nir_intrinsic_instr * intr)1008 bool Shader::emit_store_reg(nir_intrinsic_instr *intr)
1009 {
1010 RegisterWriteHandler visitor(*this, intr);
1011 auto handle = value_factory().src(intr->src[1], 0);
1012 handle->accept(visitor);
1013 return visitor.success;
1014 }
1015
emit_store_reg_indirect(nir_intrinsic_instr * intr)1016 bool Shader::emit_store_reg_indirect(nir_intrinsic_instr *intr)
1017 {
1018 RegisterWriteHandler visitor(*this, intr);
1019 visitor.addr = value_factory().src(intr->src[2], 0);
1020
1021 auto handle = value_factory().src(intr->src[1], 0);
1022 handle->accept(visitor);
1023 return visitor.success;
1024 }
1025
/* Bind the visitor to the shader that receives the emitted code and to
 * the intrinsic being translated. */
RegisterAccessHandler::RegisterAccessHandler(Shader& shader, nir_intrinsic_instr *intr):
    sh(shader),
    ir(intr)
{}
1030
/* Load from a local-array element: one mov per 32-bit channel. Values
 * wider than 32 bit occupy bit_size/32 consecutive channels per
 * component. */
void RegisterReadHandler::visit(LocalArray& array)
{
   int slots = ir->def.bit_size / 32;
   /* NOTE(review): multi-component defs use pin_none, scalars pin_free —
    * presumably so vector channels stay allocatable as a group; confirm
    * against the Pin semantics in the value factory. */
   auto pin = ir->def.num_components > 1 ? pin_none : pin_free;
   for (int i = 0; i < ir->def.num_components; ++i) {
      for (int s = 0; s < slots; ++s) {
         int chan = i * slots + s;
         auto dest = sh.value_factory().dest(ir->def, chan, pin);
         auto src = array.element(nir_intrinsic_base(ir), addr, chan);
         sh.emit_instruction(new AluInstr(op1_mov, dest, src, AluInstr::write));
      }
   }
}
1044
visit(Register & reg)1045 void RegisterReadHandler::visit(Register& reg)
1046 {
1047 auto dest = sh.value_factory().dest(ir->def, 0, pin_free);
1048 sh.emit_instruction(new AluInstr(op1_mov, dest, ®, AluInstr::write));
1049 }
1050
/* Store into a local-array element: one mov per enabled 32-bit channel.
 * Values wider than 32 bit occupy bit_size/32 consecutive channels per
 * component; the write mask selects components, not channels. */
void RegisterWriteHandler::visit(LocalArray& array)
{
   int writemask = nir_intrinsic_write_mask(ir);
   int slots = ir->src->ssa->bit_size / 32;

   for (int i = 0; i < ir->num_components; ++i) {
      if (!(writemask & (1 << i)))
         continue;
      for (int s = 0; s < slots; ++s) {
         int chan = i * slots + s;

         auto dest = array.element(nir_intrinsic_base(ir), addr, chan);
         auto src = sh.value_factory().src(ir->src[0], chan);
         sh.emit_instruction(new AluInstr(op1_mov, dest, src, AluInstr::write));
      }
   }
}
1068
visit(Register & dest)1069 void RegisterWriteHandler::visit(Register& dest)
1070 {
1071 int writemask = nir_intrinsic_write_mask(ir);
1072 assert(writemask == 1);
1073 auto src = sh.value_factory().src(ir->src[0], 0);
1074 sh.emit_instruction(new AluInstr(op1_mov, &dest, src, AluInstr::write));
1075 }
1076
/* Emit an LDS atomic for a shared-memory atomic intrinsic. src[0] is
 * the address, src[1] the operand, and for compare-and-swap src[2] the
 * swap value. */
bool
Shader::emit_atomic_local_shared(nir_intrinsic_instr *instr)
{
   /* Only allocate a result register when the value is actually used. */
   bool uses_retval = !list_is_empty(&instr->def.uses);

   auto& vf = value_factory();

   auto dest_value = uses_retval ? vf.dest(instr->def, 0, pin_free) : nullptr;

   auto op = lds_op_from_intrinsic(nir_intrinsic_atomic_op(instr), uses_retval);

   /* For these two instructions we don't have opcodes that don't read back
    * the result, so we have to add a dummy-readback to remove the return
    * value from read queue. */
   if (!uses_retval &&
       (op == LDS_XCHG_RET || op == LDS_CMP_XCHG_RET)) {
      dest_value = vf.dest(instr->def, 0, pin_free);
   }

   auto address = vf.src(instr->src[0], 0);

   AluInstr::SrcValues src;
   src.push_back(vf.src(instr->src[1], 0));

   /* Compare-and-swap carries the swap value in src[2]. */
   if (unlikely(instr->intrinsic == nir_intrinsic_shared_atomic_swap))
      src.push_back(vf.src(instr->src[2], 0));
   emit_instruction(new LDSAtomicInstr(op, dest_value, address, src));
   return true;
}
1106
1107 auto
evaluate_resource_offset(nir_intrinsic_instr * instr,int src_id)1108 Shader::evaluate_resource_offset(nir_intrinsic_instr *instr, int src_id)
1109 -> std::pair<int, PRegister>
1110 {
1111 auto& vf = value_factory();
1112
1113 PRegister uav_id{nullptr};
1114 int offset = nir_intrinsic_has_range_base(instr) ?
1115 nir_intrinsic_range_base(instr) : 0;
1116
1117 auto uav_id_const = nir_src_as_const_value(instr->src[src_id]);
1118 if (uav_id_const) {
1119 offset += uav_id_const->u32;
1120 } else {
1121 auto uav_id_val = vf.src(instr->src[src_id], 0);
1122 if (uav_id_val->as_register()) {
1123 uav_id = uav_id_val->as_register();
1124 } else {
1125 uav_id = vf.temp_register();
1126 emit_instruction(new AluInstr(op1_mov, uav_id, uav_id_val, AluInstr::last_write));
1127 }
1128 }
1129 return std::make_pair(offset, uav_id);
1130 }
1131
1132 bool
emit_store_scratch(nir_intrinsic_instr * intr)1133 Shader::emit_store_scratch(nir_intrinsic_instr *intr)
1134 {
1135 auto& vf = m_instr_factory->value_factory();
1136
1137 int writemask = nir_intrinsic_write_mask(intr);
1138
1139 RegisterVec4::Swizzle swz = {7, 7, 7, 7};
1140
1141 for (unsigned i = 0; i < intr->num_components; ++i)
1142 swz[i] = (1 << i) & writemask ? i : 7;
1143
1144 auto value = vf.temp_vec4(pin_group, swz);
1145 AluInstr *ir = nullptr;
1146 for (unsigned i = 0; i < intr->num_components; ++i) {
1147 if (value[i]->chan() < 4) {
1148 ir = new AluInstr(op1_mov, value[i], vf.src(intr->src[0], i), AluInstr::write);
1149 ir->set_alu_flag(alu_no_schedule_bias);
1150 emit_instruction(ir);
1151 }
1152 }
1153 if (!ir)
1154 return true;
1155
1156 ir->set_alu_flag(alu_last_instr);
1157
1158 auto address = vf.src(intr->src[1], 0);
1159
1160 int align = nir_intrinsic_align_mul(intr);
1161 int align_offset = nir_intrinsic_align_offset(intr);
1162
1163 ScratchIOInstr *ws_ir = nullptr;
1164
1165 int offset = -1;
1166 if (address->as_literal()) {
1167 offset = address->as_literal()->value();
1168 } else if (address->as_inline_const()) {
1169 auto il = address->as_inline_const();
1170 if (il->sel() == ALU_SRC_0)
1171 offset = 0;
1172 else if (il->sel() == ALU_SRC_1_INT)
1173 offset = 1;
1174 }
1175
1176 if (offset >= 0) {
1177 ws_ir = new ScratchIOInstr(value, offset, align, align_offset, writemask);
1178 } else {
1179 auto addr_temp = vf.temp_register(0);
1180 auto load_addr = new AluInstr(op1_mov, addr_temp, address, AluInstr::last_write);
1181 load_addr->set_alu_flag(alu_no_schedule_bias);
1182 emit_instruction(load_addr);
1183
1184 ws_ir = new ScratchIOInstr(
1185 value, addr_temp, align, align_offset, writemask, m_scratch_size);
1186 }
1187 emit_instruction(ws_ir);
1188
1189 m_flags.set(sh_needs_scratch_space);
1190 return true;
1191 }
1192
1193 bool
emit_load_scratch(nir_intrinsic_instr * intr)1194 Shader::emit_load_scratch(nir_intrinsic_instr *intr)
1195 {
1196 auto addr = value_factory().src(intr->src[0], 0);
1197 auto dest = value_factory().dest_vec4(intr->def, pin_group);
1198
1199 if (chip_class() >= ISA_CC_R700) {
1200 RegisterVec4::Swizzle dest_swz = {7, 7, 7, 7};
1201
1202 for (unsigned i = 0; i < intr->num_components; ++i)
1203 dest_swz[i] = i;
1204
1205 auto *ir = new LoadFromScratch(dest, dest_swz, addr, m_scratch_size);
1206 emit_instruction(ir);
1207 chain_scratch_read(ir);
1208 } else {
1209 int align = nir_intrinsic_align_mul(intr);
1210 int align_offset = nir_intrinsic_align_offset(intr);
1211
1212 int offset = -1;
1213 if (addr->as_literal()) {
1214 offset = addr->as_literal()->value();
1215 } else if (addr->as_inline_const()) {
1216 auto il = addr->as_inline_const();
1217 if (il->sel() == ALU_SRC_0)
1218 offset = 0;
1219 else if (il->sel() == ALU_SRC_1_INT)
1220 offset = 1;
1221 }
1222
1223 ScratchIOInstr *ir = nullptr;
1224 if (offset >= 0) {
1225 ir = new ScratchIOInstr(dest, offset, align, align_offset, 0xf, true);
1226 } else {
1227 auto addr_temp = value_factory().temp_register(0);
1228 auto load_addr = new AluInstr(op1_mov, addr_temp, addr, AluInstr::last_write);
1229 load_addr->set_alu_flag(alu_no_schedule_bias);
1230 emit_instruction(load_addr);
1231
1232 ir = new ScratchIOInstr(
1233 dest, addr_temp, align, align_offset, 0xf, m_scratch_size, true);
1234 }
1235 emit_instruction(ir);
1236 }
1237
1238 m_flags.set(sh_needs_scratch_space);
1239
1240 return true;
1241 }
1242
emit_load_global(nir_intrinsic_instr * intr)1243 bool Shader::emit_load_global(nir_intrinsic_instr *intr)
1244 {
1245 auto dest = value_factory().dest_vec4(intr->def, pin_group);
1246
1247 auto src_value = value_factory().src(intr->src[0], 0);
1248 auto src = src_value->as_register();
1249 if (!src) {
1250 src = value_factory().temp_register();
1251 emit_instruction(new AluInstr(op1_mov, src, src_value, AluInstr::last_write));
1252 }
1253 auto load = new LoadFromBuffer(dest, {0,7,7,7}, src, 0, 1, NULL, fmt_32);
1254 load->set_mfc(4);
1255 load->set_num_format(vtx_nf_int);
1256 load->reset_fetch_flag(FetchInstr::format_comp_signed);
1257
1258 emit_instruction(load);
1259 return true;
1260 }
1261
1262 bool
emit_local_store(nir_intrinsic_instr * instr)1263 Shader::emit_local_store(nir_intrinsic_instr *instr)
1264 {
1265 unsigned write_mask = nir_intrinsic_write_mask(instr);
1266
1267 auto address = value_factory().src(instr->src[1], 0);
1268 int swizzle_base = 0;
1269 unsigned w = write_mask;
1270 while (!(w & 1)) {
1271 ++swizzle_base;
1272 w >>= 1;
1273 }
1274 write_mask = write_mask >> swizzle_base;
1275
1276 if ((write_mask & 3) != 3) {
1277 auto value = value_factory().src(instr->src[0], swizzle_base);
1278 emit_instruction(new LDSAtomicInstr(LDS_WRITE, nullptr, address, {value}));
1279 } else {
1280 auto value = value_factory().src(instr->src[0], swizzle_base);
1281 auto value1 = value_factory().src(instr->src[0], swizzle_base + 1);
1282 emit_instruction(
1283 new LDSAtomicInstr(LDS_WRITE_REL, nullptr, address, {value, value1}));
1284 }
1285 return true;
1286 }
1287
1288 bool
emit_local_load(nir_intrinsic_instr * instr)1289 Shader::emit_local_load(nir_intrinsic_instr *instr)
1290 {
1291 auto address = value_factory().src_vec(instr->src[0], instr->num_components);
1292 auto dest_value = value_factory().dest_vec(instr->def, instr->num_components);
1293 emit_instruction(new LDSReadInstr(dest_value, address));
1294 return true;
1295 }
1296
1297 void
chain_scratch_read(Instr * instr)1298 Shader::chain_scratch_read(Instr *instr)
1299 {
1300 m_chain_instr.apply(instr, &m_chain_instr.last_scratch_instr);
1301 }
1302
1303 void
chain_ssbo_read(Instr * instr)1304 Shader::chain_ssbo_read(Instr *instr)
1305 {
1306 m_chain_instr.apply(instr, &m_chain_instr.last_ssbo_instr);
1307 }
1308
1309 bool
emit_wait_ack()1310 Shader::emit_wait_ack()
1311 {
1312 start_new_block(0);
1313 emit_instruction(new ControlFlowInstr(ControlFlowInstr::cf_wait_ack));
1314 start_new_block(0);
1315 return true;
1316 }
1317
get_array_hash(const VirtualValue & value)1318 static uint32_t get_array_hash(const VirtualValue& value)
1319 {
1320 assert (value.pin() == pin_array);
1321 const LocalArrayValue& av = static_cast<const LocalArrayValue&>(value);
1322 return av.chan() | (av.array().base_sel() << 2);
1323 }
1324
visit(AluInstr * instr)1325 void Shader::InstructionChain::visit(AluInstr *instr)
1326 {
1327 if (instr->is_kill()) {
1328 last_kill_instr = instr;
1329
1330 // these instructions have side effects, they should
1331 // not be re-order with kill
1332 if (last_gds_instr)
1333 instr->add_required_instr(last_gds_instr);
1334
1335 if (last_ssbo_instr)
1336 instr->add_required_instr(last_ssbo_instr);
1337 }
1338
1339 /* Make sure array reads and writes depends on the last indirect access
1340 * so that we don't overwrite array elements too early */
1341
1342 if (auto d = instr->dest()) {
1343 if (d->pin() == pin_array) {
1344 if (d->addr()) {
1345 last_alu_with_indirect_reg[get_array_hash(*d)] = instr;
1346 return;
1347 }
1348 auto pos = last_alu_with_indirect_reg.find(get_array_hash(*d));
1349 if (pos != last_alu_with_indirect_reg.end()) {
1350 instr->add_required_instr(pos->second);
1351 }
1352 }
1353 }
1354
1355 for (auto& s : instr->sources()) {
1356 if (s->pin() == pin_array) {
1357 if (s->get_addr()) {
1358 last_alu_with_indirect_reg[get_array_hash(*s)] = instr;
1359 return;
1360 }
1361 auto pos = last_alu_with_indirect_reg.find(get_array_hash(*s));
1362 if (pos != last_alu_with_indirect_reg.end()) {
1363 instr->add_required_instr(pos->second);
1364 }
1365 }
1366 }
1367 }
1368
1369 void
visit(ScratchIOInstr * instr)1370 Shader::InstructionChain::visit(ScratchIOInstr *instr)
1371 {
1372 apply(instr, &last_scratch_instr);
1373 }
1374
1375 void
visit(GDSInstr * instr)1376 Shader::InstructionChain::visit(GDSInstr *instr)
1377 {
1378 apply(instr, &last_gds_instr);
1379 Instr::Flags flag = instr->has_instr_flag(Instr::helper) ? Instr::helper : Instr::vpm;
1380 for (auto& loop : this_shader->m_loops) {
1381 loop->set_instr_flag(flag);
1382 }
1383 if (last_kill_instr)
1384 instr->add_required_instr(last_kill_instr);
1385
1386 }
1387
1388 void
visit(RatInstr * instr)1389 Shader::InstructionChain::visit(RatInstr *instr)
1390 {
1391 apply(instr, &last_ssbo_instr);
1392 Instr::Flags flag = instr->has_instr_flag(Instr::helper) ? Instr::helper : Instr::vpm;
1393 for (auto& loop : this_shader->m_loops) {
1394 loop->set_instr_flag(flag);
1395 }
1396
1397 if (prepare_mem_barrier)
1398 instr->set_ack();
1399
1400 if (this_shader->m_current_block->inc_rat_emitted() > 15)
1401 this_shader->start_new_block(0);
1402
1403 if (last_kill_instr)
1404 instr->add_required_instr(last_kill_instr);
1405 }
1406
1407 void
apply(Instr * current,Instr ** last)1408 Shader::InstructionChain::apply(Instr *current, Instr **last)
1409 {
1410 if (*last)
1411 current->add_required_instr(*last);
1412 *last = current;
1413 }
1414
1415 void
emit_instruction(PInst instr)1416 Shader::emit_instruction(PInst instr)
1417 {
1418 sfn_log << SfnLog::instr << " " << *instr << "\n";
1419 instr->accept(m_chain_instr);
1420 m_current_block->push_back(instr);
1421 }
1422
1423 bool
emit_load_tcs_param_base(nir_intrinsic_instr * instr,int offset)1424 Shader::emit_load_tcs_param_base(nir_intrinsic_instr *instr, int offset)
1425 {
1426 auto src = value_factory().temp_register();
1427 emit_instruction(
1428 new AluInstr(op1_mov, src, value_factory().zero(), AluInstr::last_write));
1429
1430 auto dest = value_factory().dest_vec4(instr->def, pin_group);
1431 auto fetch = new LoadFromBuffer(dest,
1432 {0, 1, 2, 3},
1433 src,
1434 offset,
1435 R600_LDS_INFO_CONST_BUFFER,
1436 nullptr,
1437 fmt_32_32_32_32);
1438
1439 fetch->set_fetch_flag(LoadFromBuffer::srf_mode);
1440 emit_instruction(fetch);
1441
1442 return true;
1443 }
1444
1445 bool
emit_shader_clock(nir_intrinsic_instr * instr)1446 Shader::emit_shader_clock(nir_intrinsic_instr *instr)
1447 {
1448 auto& vf = value_factory();
1449 auto group = new AluGroup();
1450 group->add_instruction(new AluInstr(op1_mov,
1451 vf.dest(instr->def, 0, pin_chan),
1452 vf.inline_const(ALU_SRC_TIME_LO, 0),
1453 AluInstr::write));
1454 group->add_instruction(new AluInstr(op1_mov,
1455 vf.dest(instr->def, 1, pin_chan),
1456 vf.inline_const(ALU_SRC_TIME_HI, 0),
1457 AluInstr::last_write));
1458 emit_instruction(group);
1459 return true;
1460 }
1461
1462 bool
emit_group_barrier(nir_intrinsic_instr * intr)1463 Shader::emit_group_barrier(nir_intrinsic_instr *intr)
1464 {
1465 assert(m_control_flow_depth == 0);
1466 (void)intr;
1467 /* Put barrier into it's own block, so that optimizers and the
1468 * scheduler don't move code */
1469 start_new_block(0);
1470 auto op = new AluInstr(op0_group_barrier, 0);
1471 op->set_alu_flag(alu_last_instr);
1472 emit_instruction(op);
1473 start_new_block(0);
1474 return true;
1475 }
1476
emit_barrier(nir_intrinsic_instr * intr)1477 bool Shader::emit_barrier(nir_intrinsic_instr *intr)
1478 {
1479
1480 if ((nir_intrinsic_execution_scope(intr) == SCOPE_WORKGROUP)) {
1481 if (!emit_group_barrier(intr))
1482 return false;
1483 }
1484
1485 /* We don't check nir_var_mem_shared because we don't emit a real barrier -
1486 * for this we need to implement GWS (Global Wave Sync).
1487 * Here we just emit a wait_ack - this is no real barrier,
1488 * it's just a wait for RAT writes to be finished (if they
1489 * are emitted with the _ACK opcode and the `mark` flag set - it
1490 * is very likely that WAIT_ACK is also only relevant for this
1491 * shader instance). */
1492 auto full_barrier_mem_modes = nir_var_mem_ssbo | nir_var_image | nir_var_mem_global;
1493
1494 if ((nir_intrinsic_memory_scope(intr) != SCOPE_NONE) &&
1495 (nir_intrinsic_memory_modes(intr) & full_barrier_mem_modes)) {
1496 return emit_wait_ack();
1497 }
1498
1499 return true;
1500 }
1501
1502 bool
load_ubo(nir_intrinsic_instr * instr)1503 Shader::load_ubo(nir_intrinsic_instr *instr)
1504 {
1505 auto bufid = nir_src_as_const_value(instr->src[0]);
1506 auto buf_offset = nir_src_as_const_value(instr->src[1]);
1507 auto base_id = nir_intrinsic_base(instr);
1508
1509 if (!buf_offset) {
1510 /* TODO: if bufid is constant then this can also be solved by using the
1511 * CF index on the ALU block, and this would probably make sense when
1512 * there are more then one loads with the same buffer ID. */
1513
1514 auto addr = value_factory().src(instr->src[1], 0)->as_register();
1515 RegisterVec4::Swizzle dest_swz{7, 7, 7, 7};
1516 auto dest = value_factory().dest_vec4(instr->def, pin_group);
1517
1518 for (unsigned i = 0; i < instr->def.num_components; ++i) {
1519 dest_swz[i] = i + nir_intrinsic_component(instr);
1520 }
1521
1522 LoadFromBuffer *ir;
1523 if (bufid) {
1524 ir = new LoadFromBuffer(
1525 dest, dest_swz, addr, 0, bufid->u32, nullptr, fmt_32_32_32_32_float);
1526 } else {
1527 auto buffer_id = emit_load_to_register(value_factory().src(instr->src[0], 0));
1528 ir = new LoadFromBuffer(
1529 dest, dest_swz, addr, 0, base_id, buffer_id, fmt_32_32_32_32_float);
1530 }
1531 emit_instruction(ir);
1532 return true;
1533 }
1534
1535 /* direct load using the constant cache */
1536 if (bufid) {
1537 int buf_cmp = nir_intrinsic_component(instr);
1538
1539 AluInstr *ir = nullptr;
1540 auto pin = instr->def.num_components == 1
1541 ? pin_free
1542 : pin_none;
1543 for (unsigned i = 0; i < instr->def.num_components; ++i) {
1544
1545 sfn_log << SfnLog::io << "UBO[" << bufid << "] " << instr->def.index
1546 << " const[" << i << "]: " << instr->const_index[i] << "\n";
1547
1548 auto uniform =
1549 value_factory().uniform(512 + buf_offset->u32, i + buf_cmp, bufid->u32);
1550 ir = new AluInstr(op1_mov,
1551 value_factory().dest(instr->def, i, pin),
1552 uniform,
1553 {alu_write});
1554 emit_instruction(ir);
1555 }
1556 if (ir)
1557 ir->set_alu_flag(alu_last_instr);
1558 return true;
1559 } else {
1560 int buf_cmp = nir_intrinsic_component(instr);
1561 AluInstr *ir = nullptr;
1562 auto kc_id = value_factory().src(instr->src[0], 0);
1563
1564 for (unsigned i = 0; i < instr->def.num_components; ++i) {
1565 int cmp = buf_cmp + i;
1566 auto u =
1567 new UniformValue(512 + buf_offset->u32, cmp, kc_id, nir_intrinsic_base(instr));
1568 auto dest = value_factory().dest(instr->def, i, pin_none);
1569 ir = new AluInstr(op1_mov, dest, u, AluInstr::write);
1570 emit_instruction(ir);
1571 }
1572 if (ir)
1573 ir->set_alu_flag(alu_last_instr);
1574 m_indirect_files |= 1 << TGSI_FILE_CONSTANT;
1575 return true;
1576 }
1577 }
1578
1579 void
start_new_block(int depth)1580 Shader::start_new_block(int depth)
1581 {
1582 int depth_offset = m_current_block ? m_current_block->nesting_depth() : 0;
1583 m_current_block = new Block(depth + depth_offset, m_next_block++);
1584 m_root.push_back(m_current_block);
1585 }
1586
1587 bool
emit_simple_mov(nir_def & def,int chan,PVirtualValue src,Pin pin)1588 Shader::emit_simple_mov(nir_def& def, int chan, PVirtualValue src, Pin pin)
1589 {
1590 auto dst = value_factory().dest(def, chan, pin);
1591 emit_instruction(new AluInstr(op1_mov, dst, src, AluInstr::last_write));
1592 return true;
1593 }
1594
1595 void
print(std::ostream & os) const1596 Shader::print(std::ostream& os) const
1597 {
1598 print_header(os);
1599
1600 for (auto& [dummy, i] : m_inputs) {
1601 i.print(os);
1602 os << "\n";
1603 }
1604
1605 for (auto& [dummy, o] : m_outputs) {
1606 o.print(os);
1607 os << "\n";
1608 }
1609
1610 os << "SHADER\n";
1611 for (auto& b : m_root)
1612 b->print(os);
1613 }
1614
/* Printable ISA names, indexed by the chip class (up to ISA_CC_CAYMAN);
 * used by Shader::print_header(). */
const char *chip_class_names[] = {"R600", "R700", "EVERGREEN", "CAYMAN"};
1616
1617 void
print_header(std::ostream & os) const1618 Shader::print_header(std::ostream& os) const
1619 {
1620 assert(m_chip_class <= ISA_CC_CAYMAN);
1621 os << "Shader: " << m_shader_id << "\n";
1622 os << m_type_id << "\n";
1623 os << "CHIPCLASS " << chip_class_names[m_chip_class] << "\n";
1624 print_properties(os);
1625 }
1626
1627 void
print_properties(std::ostream & os) const1628 Shader::print_properties(std::ostream& os) const
1629 {
1630 do_print_properties(os);
1631 }
1632
1633 bool
equal_to(const Shader & other) const1634 Shader::equal_to(const Shader& other) const
1635 {
1636 if (m_root.size() != other.m_root.size())
1637 return false;
1638 return std::inner_product(
1639 m_root.begin(),
1640 m_root.end(),
1641 other.m_root.begin(),
1642 true,
1643 [](bool lhs, bool rhs) { return lhs & rhs; },
1644 [](const Block::Pointer lhs, const Block::Pointer rhs) -> bool {
1645 return lhs->is_equal_to(*rhs);
1646 });
1647 }
1648
1649 void
get_shader_info(r600_shader * sh_info)1650 Shader::get_shader_info(r600_shader *sh_info)
1651 {
1652 sh_info->ninput = m_inputs.size();
1653 sh_info->nlds = 0;
1654 int input_array_array_loc = 0;
1655 for (auto& [index, info] : m_inputs) {
1656 r600_shader_io& io = sh_info->input[input_array_array_loc++];
1657
1658 io.varying_slot = info.varying_slot();
1659 io.system_value = info.system_value();
1660 io.gpr = info.gpr();
1661 io.spi_sid = info.spi_sid();
1662 io.ij_index = info.ij_index();
1663 io.interpolate = info.interpolator();
1664 io.interpolate_location = info.interpolate_loc();
1665 if (info.need_lds_pos()) {
1666 io.lds_pos = info.lds_pos();
1667 sh_info->nlds = MAX2(unsigned(info.lds_pos() + 1), sh_info->nlds);
1668 } else {
1669 io.lds_pos = 0;
1670 }
1671
1672 io.ring_offset = info.ring_offset();
1673 io.uses_interpolate_at_centroid = info.uses_interpolate_at_centroid();
1674
1675 sfn_log << SfnLog::io << "Emit input [" << index << "]";
1676 if (io.varying_slot != NUM_TOTAL_VARYING_SLOTS)
1677 sfn_log << " varying_slot:" << static_cast<int>(io.varying_slot);
1678 if (io.system_value != SYSTEM_VALUE_MAX)
1679 sfn_log << " system_value:" << static_cast<int>(io.system_value);
1680 sfn_log << " spi_sid:" << io.spi_sid << "\n";
1681 assert(io.spi_sid >= 0);
1682 }
1683
1684 sh_info->noutput = m_outputs.size();
1685 /* VS is required to export at least one parameter. */
1686 sh_info->highest_export_param = 0;
1687 sh_info->num_loops = m_nloops;
1688 int output_array_array_loc = 0;
1689
1690 for (auto& [index, info] : m_outputs) {
1691 r600_shader_io& io = sh_info->output[output_array_array_loc++];
1692 io.varying_slot = info.varying_slot();
1693 io.frag_result = info.frag_result();
1694 io.gpr = info.gpr();
1695 io.spi_sid = info.spi_sid();
1696 io.write_mask = info.writemask();
1697 io.export_param = info.export_param();
1698 if (info.export_param() >= 0)
1699 sh_info->highest_export_param = MAX2(unsigned(info.export_param()),
1700 sh_info->highest_export_param);
1701
1702 sfn_log << SfnLog::io << "Emit output[" << index << "]";
1703 if (io.varying_slot != NUM_TOTAL_VARYING_SLOTS)
1704 sfn_log << " varying_slot:" << static_cast<int>(io.varying_slot);
1705 if (io.frag_result != static_cast<gl_frag_result>(FRAG_RESULT_MAX))
1706 sfn_log << " frag_result:" << static_cast<int>(io.frag_result);
1707 sfn_log << " spi_sid:" << io.spi_sid << " write_mask:" << io.write_mask << "\n";
1708 assert(io.spi_sid >= 0);
1709 }
1710
1711 sh_info->nhwatomic = m_nhwatomic;
1712 sh_info->atomic_base = m_atomic_base;
1713 sh_info->nhwatomic_ranges = m_atomics.size();
1714 for (unsigned i = 0; i < m_atomics.size(); ++i)
1715 sh_info->atomics[i] = m_atomics[i];
1716
1717 if (m_flags.test(sh_indirect_const_file))
1718 sh_info->indirect_files |= 1 << TGSI_FILE_CONSTANT;
1719
1720 if (m_flags.test(sh_indirect_atomic))
1721 sh_info->indirect_files |= 1 << TGSI_FILE_HW_ATOMIC;
1722
1723 sh_info->uses_tex_buffers = m_flags.test(sh_uses_tex_buffer);
1724
1725 value_factory().get_shader_info(sh_info);
1726
1727 sh_info->needs_scratch_space = m_flags.test(sh_needs_scratch_space);
1728 sh_info->uses_images = m_flags.test(sh_uses_images);
1729 sh_info->uses_atomics = m_flags.test(sh_uses_atomics);
1730 sh_info->disable_sb = m_flags.test(sh_disble_sb);
1731 sh_info->has_txq_cube_array_z_comp = m_flags.test(sh_txs_cube_array_comp);
1732 sh_info->indirect_files = m_indirect_files;
1733 do_get_shader_info(sh_info);
1734 }
1735
1736 PRegister
atomic_update()1737 Shader::atomic_update()
1738 {
1739 assert(m_atomic_update);
1740 return m_atomic_update;
1741 }
1742
1743 int
remap_atomic_base(int base)1744 Shader::remap_atomic_base(int base)
1745 {
1746 return m_atomic_base_map[base];
1747 }
1748
1749 void
do_get_shader_info(r600_shader * sh_info)1750 Shader::do_get_shader_info(r600_shader *sh_info)
1751 {
1752 sh_info->uses_atomics = m_nhwatomic > 0;
1753 }
1754
1755 const ShaderInput&
input(int base) const1756 Shader::input(int base) const
1757 {
1758 auto io = m_inputs.find(base);
1759 assert(io != m_inputs.end());
1760 return io->second;
1761 }
1762
1763 const ShaderOutput&
output(int base) const1764 Shader::output(int base) const
1765 {
1766 auto io = m_outputs.find(base);
1767 assert(io != m_outputs.end());
1768 return io->second;
1769 }
1770
1771 LiveRangeMap
prepare_live_range_map()1772 Shader::prepare_live_range_map()
1773 {
1774 return m_instr_factory->value_factory().prepare_live_range_map();
1775 }
1776
1777 void
reset_function(ShaderBlocks & new_root)1778 Shader::reset_function(ShaderBlocks& new_root)
1779 {
1780 std::swap(m_root, new_root);
1781 }
1782
1783 void
finalize()1784 Shader::finalize()
1785 {
1786 do_finalize();
1787 }
1788
1789 void
do_finalize()1790 Shader::do_finalize()
1791 {
1792 }
1793
1794 } // namespace r600
1795