1 /* -*- mesa-c++ -*-
2 *
3 * Copyright (c) 2022 Collabora LTD
4 *
5 * Author: Gert Wollny <gert.wollny@collabora.com>
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "sfn_assembler.h"
28
29 #include "../eg_sq.h"
30 #include "../r600_asm.h"
31
32 #include "sfn_callstack.h"
33 #include "sfn_conditionaljumptracker.h"
34 #include "sfn_debug.h"
35 #include "sfn_instr_alugroup.h"
36 #include "sfn_instr_controlflow.h"
37 #include "sfn_instr_export.h"
38 #include "sfn_instr_fetch.h"
39 #include "sfn_instr_mem.h"
40 #include "sfn_instr_tex.h"
41
namespace r600 {

/* Driver class that translates the finalized SFN IR of a shader into
 * r600 bytecode.  It only captures the target shader and the compile key;
 * the actual emission is done by the AssamblerVisitor below. */
Assembler::Assembler(r600_shader *sh, const r600_shader_key& key):
    m_sh(sh),
    m_key(key)
{
}
48
49 extern const std::map<ESDOp, int> ds_opcode_map;
50
/* Instruction visitor that walks the IR and emits r600 bytecode through
 * the legacy r600_asm interface.
 * NB: the class name keeps its historic (misspelled) form; it is referenced
 * as-is throughout this file. */
class AssamblerVisitor : public ConstInstrVisitor {
public:
   AssamblerVisitor(r600_shader *sh, const r600_shader_key& key, bool legacy_math_rules);

   /* One visit method per IR instruction type. */
   void visit(const AluInstr& instr) override;
   void visit(const AluGroup& instr) override;
   void visit(const TexInstr& instr) override;
   void visit(const ExportInstr& instr) override;
   void visit(const FetchInstr& instr) override;
   void visit(const Block& instr) override;
   void visit(const IfInstr& instr) override;
   void visit(const ControlFlowInstr& instr) override;
   void visit(const ScratchIOInstr& instr) override;
   void visit(const StreamOutInstr& instr) override;
   void visit(const MemRingOutInstr& instr) override;
   void visit(const EmitVertexInstr& instr) override;
   void visit(const GDSInstr& instr) override;
   void visit(const WriteTFInstr& instr) override;
   void visit(const LDSAtomicInstr& instr) override;
   void visit(const LDSReadInstr& instr) override;
   void visit(const RatInstr& instr) override;

   /* Terminate the bytecode stream (EOP bit resp. CF_END on Cayman). */
   void finalize();

   /* Bit flags describing which cached emission state to invalidate. */
   const uint32_t sf_vtx = 1;
   const uint32_t sf_tex = 2;
   const uint32_t sf_alu = 4;
   const uint32_t sf_addr_register = 8;
   const uint32_t sf_all = 0xf;

   void clear_states(const uint32_t& states);
   bool copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write);
   PVirtualValue copy_src(r600_bytecode_alu_src& src, const VirtualValue& s);

   EBufferIndexMode emit_index_reg(const VirtualValue& addr, unsigned idx);

   void emit_endif();
   void emit_else();
   void emit_loop_begin(bool vpm);
   void emit_loop_end();
   void emit_loop_break();
   void emit_loop_cont();

   void emit_alu_op(const AluInstr& ai);
   void emit_lds_op(const AluInstr& lds);

   /* Map *_IEEE opcodes to their legacy (DX9-style) counterparts. */
   auto translate_for_mathrules(EAluOp op) -> EAluOp;

   void emit_wait_ack();

   /* Start initialized in constructor */
   const r600_shader_key& m_key;
   r600_shader *m_shader;
   r600_bytecode *m_bc;

   ConditionalJumpTracker m_jump_tracker;
   CallStack m_callstack;
   bool ps_alpha_to_one;
   /* End initialized in constructor */

   /* Number of literal slots used by the current ALU group. */
   std::set<uint32_t> m_nliterals_in_group;
   /* Destination GPRs of in-flight VTX/TEX fetches; a read from one of
    * these forces a new CF clause. */
   std::set<int> vtx_fetch_results;
   std::set<int> tex_fetch_results;

   /* Register whose value is currently loaded into AR (if any). */
   const VirtualValue *m_last_addr{nullptr};

   unsigned m_max_color_exports{0};
   int m_loop_nesting{0};

   /* Set when an outstanding (acked) write should be waited on. */
   bool m_ack_suggested{false};
   bool m_has_param_output{false};
   bool m_has_pos_output{false};
   bool m_last_op_was_barrier{false};
   /* Overall success flag; cleared on the first emission error. */
   bool m_result{true};
   bool m_legacy_math_rules{false};
};
127
128 bool
lower(Shader * shader)129 Assembler::lower(Shader *shader)
130 {
131 AssamblerVisitor ass(m_sh, m_key, shader->has_flag(Shader::sh_legacy_math_rules));
132
133 auto& blocks = shader->func();
134 for (auto b : blocks) {
135 b->accept(ass);
136 if (!ass.m_result)
137 return false;
138 }
139
140 ass.finalize();
141
142 return ass.m_result;
143 }
144
AssamblerVisitor::AssamblerVisitor(r600_shader *sh, const r600_shader_key& key,
                                   bool legacy_math_rules):
    m_key(key),
    m_shader(sh),

    m_bc(&sh->bc),
    m_callstack(sh->bc),
    ps_alpha_to_one(key.ps.alpha_to_one),
    m_legacy_math_rules(legacy_math_rules)
{
   /* A fragment shader always exports at least one color. */
   if (m_shader->processor_type == PIPE_SHADER_FRAGMENT)
      m_max_color_exports = MAX2(m_key.ps.nr_cbufs, 1);

   /* Vertex shaders with inputs start by calling the fetch shader. */
   if (m_shader->processor_type == PIPE_SHADER_VERTEX && m_shader->ninput > 0)
      r600_bytecode_add_cfinst(m_bc, CF_OP_CALL_FS);
}
161
/* Patch up the last CF instruction so the shader terminates correctly. */
void
AssamblerVisitor::finalize()
{
   const struct cf_op_info *last = nullptr;

   if (m_bc->cf_last)
      last = r600_isa_cf(m_bc->cf_last->op);

   /* alu clause instructions don't have EOP bit, so add NOP */
   if (m_shader->bc.gfx_level < CAYMAN &&
       (!last || last->flags & CF_ALU || m_bc->cf_last->op == CF_OP_LOOP_END ||
        m_bc->cf_last->op == CF_OP_POP))
      r600_bytecode_add_cfinst(m_bc, CF_OP_NOP);

   /* A fetch shader only can't be EOP (results in hang), but we can replace
    * it by a NOP */
   else if (last && m_bc->cf_last->op == CF_OP_CALL_FS)
      m_bc->cf_last->op = CF_OP_NOP;

   /* Cayman terminates with an explicit CF_END instruction instead of
    * setting the EOP bit on the last instruction. */
   if (m_shader->bc.gfx_level != CAYMAN)
      m_bc->cf_last->end_of_program = 1;
   else
      cm_bytecode_add_cf_end(m_bc);
}
186
187 extern const std::map<EAluOp, int> opcode_map;
188
189 void
visit(const AluInstr & ai)190 AssamblerVisitor::visit(const AluInstr& ai)
191 {
192 assert(vtx_fetch_results.empty());
193 assert(tex_fetch_results.empty());
194
195 if (unlikely(ai.has_alu_flag(alu_is_lds)))
196 emit_lds_op(ai);
197 else
198 emit_alu_op(ai);
199 }
200
/* Emit an ALU instruction that addresses the LDS.  The IR opcode is
 * remapped to the hardware LDS_OP{1,2,3} encoding, and for ops that
 * return a value the CF-level count of pending LDS queue reads is
 * incremented so later OQ reads can be balanced. */
void
AssamblerVisitor::emit_lds_op(const AluInstr& lds)
{
   struct r600_bytecode_alu alu;
   memset(&alu, 0, sizeof(alu));

   alu.is_lds_idx_op = true;
   alu.op = lds.lds_opcode();

   bool has_lds_fetch = false;
   switch (alu.op) {
   case LDS_WRITE:
      alu.op = LDS_OP2_LDS_WRITE;
      break;
   case LDS_WRITE_REL:
      alu.op = LDS_OP3_LDS_WRITE_REL;
      /* WRITE_REL uses the lds_idx field as the relative-offset flag. */
      alu.lds_idx = 1;
      break;
   case DS_OP_READ_RET:
      alu.op = LDS_OP1_LDS_READ_RET;
      FALLTHROUGH;
   case LDS_ADD_RET:
   case LDS_AND_RET:
   case LDS_OR_RET:
   case LDS_MAX_INT_RET:
   case LDS_MAX_UINT_RET:
   case LDS_MIN_INT_RET:
   case LDS_MIN_UINT_RET:
   case LDS_XOR_RET:
   case LDS_XCHG_RET:
   case LDS_CMP_XCHG_RET:
      /* All *_RET ops push a result into the LDS output queue. */
      has_lds_fetch = true;
      break;
   case LDS_ADD:
   case LDS_AND:
   case LDS_OR:
   case LDS_MAX_INT:
   case LDS_MAX_UINT:
   case LDS_MIN_INT:
   case LDS_MIN_UINT:
   case LDS_XOR:
      break;
   default:
      std::cerr << "\n R600: error op: " << lds << "\n";
      unreachable("Unhandled LDS op");
   }

   copy_src(alu.src[0], lds.src(0));

   /* Unused sources are tied to the constant zero. */
   if (lds.n_sources() > 1)
      copy_src(alu.src[1], lds.src(1));
   else
      alu.src[1].sel = V_SQ_ALU_SRC_0;

   if (lds.n_sources() > 2)
      copy_src(alu.src[2], lds.src(2));
   else
      alu.src[2].sel = V_SQ_ALU_SRC_0;

   alu.last = lds.has_alu_flag(alu_last_instr);

   int r = r600_bytecode_add_alu(m_bc, &alu);
   if (has_lds_fetch)
      m_bc->cf_last->nlds_read++;

   if (r)
      m_result = false;
}
269
translate_for_mathrules(EAluOp op)270 auto AssamblerVisitor::translate_for_mathrules(EAluOp op) -> EAluOp
271 {
272 switch (op) {
273 case op2_dot_ieee: return op2_dot;
274 case op2_dot4_ieee: return op2_dot4;
275 case op2_mul_ieee: return op2_mul;
276 case op3_muladd_ieee : return op2_mul_ieee;
277 default:
278 return op;
279 }
280 }
281
/* Emit one regular (non-LDS) ALU instruction.  Handles opcode mapping,
 * destination/source translation, kcache index modes, the CF clause type,
 * and bookkeeping for AR and the CF index registers. */
void
AssamblerVisitor::emit_alu_op(const AluInstr& ai)
{
   sfn_log << SfnLog::assembly << "Emit ALU op " << ai << "\n";

   struct r600_bytecode_alu alu;
   memset(&alu, 0, sizeof(alu));

   auto opcode = ai.opcode();

   /* Remember which register feeds MOVA so later instructions can tell
    * whether AR already holds the right address.
    * NOTE(review): alu was just zeroed, so "alu.dst.sel == 0" is always
    * true here; presumably the intent was to test the *instruction's*
    * destination (cf. the Cayman index-register handling below) --
    * confirm against upstream. */
   if (unlikely(ai.opcode() == op1_mova_int &&
                (m_bc->gfx_level < CAYMAN || alu.dst.sel == 0))) {
      m_last_addr = ai.psrc(0);
      m_bc->ar_chan = m_last_addr->chan();
      m_bc->ar_reg = m_last_addr->sel();
   }

   if (m_legacy_math_rules)
      opcode = translate_for_mathrules(opcode);

   auto hw_opcode = opcode_map.find(opcode);

   if (hw_opcode == opcode_map.end()) {
      std::cerr << "Opcode not handled for " << ai << "\n";
      m_result = false;
      return;
   }

   // skip multiple barriers
   if (m_last_op_was_barrier && opcode == op0_group_barrier)
      return;

   m_last_op_was_barrier = opcode == op0_group_barrier;

   alu.op = hw_opcode->second;

   auto dst = ai.dest();
   if (dst) {
      if (ai.opcode() != op1_mova_int) {
         if (!copy_dst(alu.dst, *dst, ai.has_alu_flag(alu_write))) {
            m_result = false;
            return;
         }

         alu.dst.write = ai.has_alu_flag(alu_write);
         alu.dst.clamp = ai.has_alu_flag(alu_dst_clamp);
         alu.dst.rel = dst->addr() ? 1 : 0;
      } else if (m_bc->gfx_level == CAYMAN && ai.dest()->sel() > 0) {
         /* On Cayman MOVA targets one of the CF index registers, encoded
          * via an offset destination sel. */
         alu.dst.sel = ai.dest()->sel() + 1;
      }
   }

   alu.is_op3 = ai.n_sources() == 3;

   EBufferIndexMode kcache_index_mode = bim_none;
   PVirtualValue buffer_offset = nullptr;

   for (unsigned i = 0; i < ai.n_sources(); ++i) {
      buffer_offset = copy_src(alu.src[i], ai.src(i));
      alu.src[i].neg = ai.has_source_mod(i, AluInstr::mod_neg);
      /* Three-source ALU ops have no per-source abs modifier bits. */
      if (!alu.is_op3)
         alu.src[i].abs = ai.has_source_mod(i, AluInstr::mod_abs);

      /* The first source that needs an indirect kcache access decides
       * the index mode for the instruction. */
      if (buffer_offset && kcache_index_mode == bim_none) {
         auto idx_reg = buffer_offset->as_register();
         if (idx_reg && idx_reg->has_flag(Register::addr_or_idx)) {
            switch (idx_reg->sel()) {
            case 1: kcache_index_mode = bim_zero; break;
            case 2: kcache_index_mode = bim_one; break;
            default:
               unreachable("Unsupported index mode");
            }
         } else {
            kcache_index_mode = bim_zero;
         }
         alu.src[i].kc_rel = kcache_index_mode;
      }

      /* Balance the count of pending LDS output-queue reads. */
      if (ai.has_lds_queue_read()) {
         assert(m_bc->cf_last->nlds_read > 0);
         m_bc->cf_last->nlds_read--;
      }
   }

   if (ai.bank_swizzle() != alu_vec_unknown)
      alu.bank_swizzle_force = ai.bank_swizzle();

   alu.last = ai.has_alu_flag(alu_last_instr);
   alu.execute_mask = ai.has_alu_flag(alu_update_exec);

   /* If the destination register is equal to the last loaded address register
    * then clear the latter one, because the values will no longer be
    * identical */
   if (m_last_addr)
      sfn_log << SfnLog::assembly << "  Current address register is " << *m_last_addr
              << "\n";

   if (dst)
      sfn_log << SfnLog::assembly << "  Current dst register is " << *dst << "\n";

   /* Map the IR clause type to the hardware CF opcode. */
   auto cf_op = ai.cf_type();

   unsigned type = 0;
   switch (cf_op) {
   case cf_alu:
      type = CF_OP_ALU;
      break;
   case cf_alu_push_before:
      type = CF_OP_ALU_PUSH_BEFORE;
      break;
   case cf_alu_pop_after:
      type = CF_OP_ALU_POP_AFTER;
      break;
   case cf_alu_pop2_after:
      type = CF_OP_ALU_POP2_AFTER;
      break;
   case cf_alu_break:
      type = CF_OP_ALU_BREAK;
      break;
   case cf_alu_else_after:
      type = CF_OP_ALU_ELSE_AFTER;
      break;
   case cf_alu_continue:
      type = CF_OP_ALU_CONTINUE;
      break;
   case cf_alu_extended:
      type = CF_OP_ALU_EXT;
      break;
   default:
      assert(0 && "cf_alu_undefined should have been replaced");
   }

   /* Literal slots are shared per group; reset at group end. */
   if (alu.last)
      m_nliterals_in_group.clear();

   m_result = !r600_bytecode_add_alu_type(m_bc, &alu, type);

   /* Record that AR resp. a CF index register now holds a value. */
   if (unlikely(ai.opcode() == op1_mova_int)) {
      if (m_bc->gfx_level < CAYMAN || alu.dst.sel == 0) {
         m_bc->ar_loaded = 1;
      } else if (m_bc->gfx_level == CAYMAN) {
         int idx = alu.dst.sel - 2;
         m_bc->index_loaded[idx] = 1;
         m_bc->index_reg[idx] = -1;
      }
   }

   /* Track writes to clause-local registers in the current CF clause. */
   if (alu.dst.sel >= g_clause_local_start && alu.dst.sel < g_clause_local_end) {
      int clidx = 4 * (alu.dst.sel - g_clause_local_start) + alu.dst.chan;
      m_bc->cf_last->clause_local_written |= 1 << clidx;
   }

   if (ai.opcode() == op1_set_cf_idx0) {
      m_bc->index_loaded[0] = 1;
      m_bc->index_reg[0] = -1;
   }

   if (ai.opcode() == op1_set_cf_idx1) {
      m_bc->index_loaded[1] = 1;
      m_bc->index_reg[1] = -1;
   }
}
444
/* Emit a co-scheduled group of ALU instructions.  First check that the
 * group still fits into the current ALU clause (it must not be split),
 * then load AR or a CF index register if the group addresses registers
 * indirectly, and finally emit the individual slots. */
void
AssamblerVisitor::visit(const AluGroup& group)
{
   clear_states(sf_vtx | sf_tex);

   if (group.slots() == 0)
      return;

   /* Maximum number of dwords in one ALU clause. */
   static const unsigned slot_limit = 256;

   if (m_bc->cf_last && !m_bc->force_add_cf) {
      if (group.has_lds_group_start()) {
         /* An LDS group must not be split across clauses; starting a new
          * CF here would break it, hence the hard assert. */
         if (m_bc->cf_last->ndw + 2 * (*group.begin())->required_slots() > slot_limit) {
            assert(m_bc->cf_last->nlds_read == 0);
            assert(0 && "Not allowed to start new alu group here");
            m_bc->force_add_cf = 1;
            m_last_addr = nullptr;
         }
      } else {
         if (m_bc->cf_last->ndw + 2 * group.slots() > slot_limit) {
            std::cerr << "m_bc->cf_last->ndw = " << m_bc->cf_last->ndw
                      << " group.slots() = " << group.slots()
                      << " -> " << m_bc->cf_last->ndw + 2 * group.slots()
                      << "> slot_limit = " << slot_limit << "\n";
            assert(m_bc->cf_last->nlds_read == 0);
            assert(0 && "Not allowed to start new alu group here");
            m_bc->force_add_cf = 1;
            m_last_addr = nullptr;
         } else {
            /* A group barrier needs extra headroom in the clause. */
            auto instr = *group.begin();
            if (instr && !instr->has_alu_flag(alu_is_lds) &&
                instr->opcode() == op0_group_barrier && m_bc->cf_last->ndw + 14 > slot_limit) {
               assert(0 && "Not allowed to start new alu group here");
               assert(m_bc->cf_last->nlds_read == 0);
               m_bc->force_add_cf = 1;
               m_last_addr = nullptr;
            }
         }
      }
   }

   auto [addr, is_index] = group.addr();

   if (addr) {
      if (!addr->has_flag(Register::addr_or_idx)) {
         if (is_index) {
            emit_index_reg(*addr, 0);
         } else {
            auto reg = addr->as_register();
            assert(reg);
            /* Reload AR only if it doesn't already hold this register. */
            if (!m_last_addr || !m_bc->ar_loaded || !m_last_addr->equal_to(*reg)) {
               m_last_addr = reg;
               m_bc->ar_reg = reg->sel();
               m_bc->ar_chan = reg->chan();
               m_bc->ar_loaded = 0;
               r600_load_ar(m_bc, group.addr_for_src());
            }
         }
      }
   }

   for (auto& i : group) {
      if (i)
         i->accept(*this);
   }
}
511
/* Emit a texture instruction.  If the source GPR is the destination of an
 * in-flight TEX fetch, a new CF clause is forced so the fetch completes
 * before it is consumed. */
void
AssamblerVisitor::visit(const TexInstr& tex_instr)
{
   clear_states(sf_vtx | sf_alu);

   if (tex_fetch_results.find(tex_instr.src().sel()) != tex_fetch_results.end()) {
      m_bc->force_add_cf = 1;
      tex_fetch_results.clear();
   }

   r600_bytecode_tex tex;
   memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   tex.op = tex_instr.opcode();
   tex.sampler_id = tex_instr.sampler_id();
   tex.resource_id = tex_instr.resource_id();
   tex.src_gpr = tex_instr.src().sel();
   tex.dst_gpr = tex_instr.dst().sel();
   tex.dst_sel_x = tex_instr.dest_swizzle(0);
   tex.dst_sel_y = tex_instr.dest_swizzle(1);
   tex.dst_sel_z = tex_instr.dest_swizzle(2);
   tex.dst_sel_w = tex_instr.dest_swizzle(3);
   tex.src_sel_x = tex_instr.src()[0]->chan();
   tex.src_sel_y = tex_instr.src()[1]->chan();
   tex.src_sel_z = tex_instr.src()[2]->chan();
   tex.src_sel_w = tex_instr.src()[3]->chan();
   tex.coord_type_x = !tex_instr.has_tex_flag(TexInstr::x_unnormalized);
   tex.coord_type_y = !tex_instr.has_tex_flag(TexInstr::y_unnormalized);
   tex.coord_type_z = !tex_instr.has_tex_flag(TexInstr::z_unnormalized);
   tex.coord_type_w = !tex_instr.has_tex_flag(TexInstr::w_unnormalized);
   tex.offset_x = tex_instr.get_offset(0);
   tex.offset_y = tex_instr.get_offset(1);
   tex.offset_z = tex_instr.get_offset(2);
   tex.resource_index_mode = tex_instr.resource_index_mode();
   tex.sampler_index_mode = tex_instr.sampler_index_mode();

   /* dst_sel < 4 means a real channel is written (not masked), so record
    * the destination GPR as a pending fetch result. */
   if (tex.dst_sel_x < 4 && tex.dst_sel_y < 4 && tex.dst_sel_z < 4 && tex.dst_sel_w < 4)
      tex_fetch_results.insert(tex.dst_gpr);

   if (tex_instr.opcode() == TexInstr::get_gradient_h ||
       tex_instr.opcode() == TexInstr::get_gradient_v)
      tex.inst_mod = tex_instr.has_tex_flag(TexInstr::grad_fine) ? 1 : 0;
   else
      tex.inst_mod = tex_instr.inst_mode();
   if (r600_bytecode_add_tex(m_bc, &tex)) {
      R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
      m_result = false;
   }
}
560
/* Emit an export CF instruction (pixel, position or parameter). */
void
AssamblerVisitor::visit(const ExportInstr& exi)
{
   const auto& value = exi.value();

   r600_bytecode_output output;
   memset(&output, 0, sizeof(output));

   output.gpr = value.sel();
   output.elem_size = 3;
   output.swizzle_x = value[0]->chan();
   output.swizzle_y = value[1]->chan();
   output.swizzle_z = value[2]->chan();
   output.burst_count = 1;
   output.op = exi.is_last_export() ? CF_OP_EXPORT_DONE : CF_OP_EXPORT;
   output.type = exi.export_type();

   clear_states(sf_all);
   switch (exi.export_type()) {
   case ExportInstr::pixel:
      /* swizzle 5 selects the constant 1.0 for alpha-to-one. */
      output.swizzle_w = ps_alpha_to_one ? 5 : exi.value()[3]->chan();
      output.array_base = exi.location();
      break;
   case ExportInstr::pos:
      /* Position exports live at array base 60+. */
      output.swizzle_w = exi.value()[3]->chan();
      output.array_base = 60 + exi.location();
      break;
   case ExportInstr::param:
      output.swizzle_w = exi.value()[3]->chan();
      output.array_base = exi.location();
      break;
   default:
      /* NOTE(review): after flagging the error we still fall through to
       * r600_bytecode_add_output below -- confirm this is intended. */
      R600_ASM_ERR("shader_from_nir: export %d type not yet supported\n",
                   exi.export_type());
      m_result = false;
   }

   /* If all register elements pinned to fixed values
    * we can override the gpr (the register allocator doesn't see
    * this because it doesn't take these channels into account. */
   if (output.swizzle_x > 3 && output.swizzle_y > 3 && output.swizzle_z > 3 &&
       output.swizzle_w > 3)
      output.gpr = 0;

   int r = 0;
   if ((r = r600_bytecode_add_output(m_bc, &output))) {
      R600_ASM_ERR("Error adding export at location %d : err: %d\n", exi.location(), r);
      m_result = false;
   }
}
611
612 void
visit(const ScratchIOInstr & instr)613 AssamblerVisitor::visit(const ScratchIOInstr& instr)
614 {
615 clear_states(sf_all);
616
617 struct r600_bytecode_output cf;
618
619 memset(&cf, 0, sizeof(struct r600_bytecode_output));
620
621 cf.op = CF_OP_MEM_SCRATCH;
622 cf.elem_size = 3;
623 cf.gpr = instr.value().sel();
624 cf.mark = !instr.is_read();
625 cf.comp_mask = instr.is_read() ? 0xf : instr.write_mask();
626 cf.swizzle_x = 0;
627 cf.swizzle_y = 1;
628 cf.swizzle_z = 2;
629 cf.swizzle_w = 3;
630 cf.burst_count = 1;
631
632 assert(!instr.is_read() || m_bc->gfx_level < R700);
633
634 if (instr.address()) {
635 cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 3 : 1;
636 cf.index_gpr = instr.address()->sel();
637
638 /* The docu seems to be wrong here: In indirect addressing the
639 * address_base seems to be the array_size */
640 cf.array_size = instr.array_size();
641 } else {
642 cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 2 : 0;
643 cf.array_base = instr.location();
644 }
645
646 if (r600_bytecode_add_output(m_bc, &cf)) {
647 R600_ASM_ERR("shader_from_nir: Error creating SCRATCH_WR assembly instruction\n");
648 m_result = false;
649 }
650 }
651
652 void
visit(const StreamOutInstr & instr)653 AssamblerVisitor::visit(const StreamOutInstr& instr)
654 {
655 struct r600_bytecode_output output;
656 memset(&output, 0, sizeof(struct r600_bytecode_output));
657
658 output.gpr = instr.value().sel();
659 output.elem_size = instr.element_size();
660 output.array_base = instr.array_base();
661 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
662 output.burst_count = instr.burst_count();
663 output.array_size = instr.array_size();
664 output.comp_mask = instr.comp_mask();
665 output.op = instr.op(m_shader->bc.gfx_level);
666
667 if (r600_bytecode_add_output(m_bc, &output)) {
668 R600_ASM_ERR("shader_from_nir: Error creating stream output instruction\n");
669 m_result = false;
670 }
671 }
672
673 void
visit(const MemRingOutInstr & instr)674 AssamblerVisitor::visit(const MemRingOutInstr& instr)
675 {
676 struct r600_bytecode_output output;
677 memset(&output, 0, sizeof(struct r600_bytecode_output));
678
679 output.gpr = instr.value().sel();
680 output.type = instr.type();
681 output.elem_size = 3;
682 output.comp_mask = 0xf;
683 output.burst_count = 1;
684 output.op = instr.op();
685 if (instr.type() == MemRingOutInstr::mem_write_ind ||
686 instr.type() == MemRingOutInstr::mem_write_ind_ack) {
687 output.index_gpr = instr.index_reg();
688 output.array_size = 0xfff;
689 }
690 output.array_base = instr.array_base();
691
692 if (r600_bytecode_add_output(m_bc, &output)) {
693 R600_ASM_ERR("shader_from_nir: Error creating mem ring write instruction\n");
694 m_result = false;
695 }
696 }
697
698 void
visit(const EmitVertexInstr & instr)699 AssamblerVisitor::visit(const EmitVertexInstr& instr)
700 {
701 int r = r600_bytecode_add_cfinst(m_bc, instr.op());
702 if (!r)
703 m_bc->cf_last->count = instr.stream();
704 else
705 m_result = false;
706 assert(m_bc->cf_last->count < 4);
707 }
708
/* Emit a vertex-fetch instruction, routed either through the texture
 * cache (TC) or the vertex cache (Cayman has no vertex cache, so TC is
 * forced there).  Reading a GPR that is the destination of an in-flight
 * fetch forces a new CF clause first. */
void
AssamblerVisitor::visit(const FetchInstr& fetch_instr)
{
   bool use_tc =
      fetch_instr.has_fetch_flag(FetchInstr::use_tc) || (m_bc->gfx_level == CAYMAN);

   auto clear_flags = use_tc ? sf_vtx : sf_tex;

   clear_states(clear_flags | sf_alu);

   if (fetch_instr.has_fetch_flag(FetchInstr::wait_ack))
      emit_wait_ack();


   /* Force a clause break if the source is still being written by a
    * pending fetch on the same path. */
   if (!use_tc &&
       vtx_fetch_results.find(fetch_instr.src().sel()) != vtx_fetch_results.end()) {
      m_bc->force_add_cf = 1;
      vtx_fetch_results.clear();
   }

   if (fetch_instr.has_fetch_flag(FetchInstr::use_tc) &&
       tex_fetch_results.find(fetch_instr.src().sel()) != tex_fetch_results.end()) {
      m_bc->force_add_cf = 1;
      tex_fetch_results.clear();
   }

   /* Record this fetch's destination as pending on its cache path. */
   if (use_tc)
      tex_fetch_results.insert(fetch_instr.dst().sel());
   else
      vtx_fetch_results.insert(fetch_instr.dst().sel());

   struct r600_bytecode_vtx vtx;
   memset(&vtx, 0, sizeof(vtx));
   vtx.op = fetch_instr.opcode();
   vtx.buffer_id = fetch_instr.resource_id();
   vtx.fetch_type = fetch_instr.fetch_type();
   vtx.src_gpr = fetch_instr.src().sel();
   vtx.src_sel_x = fetch_instr.src().chan();
   vtx.mega_fetch_count = fetch_instr.mega_fetch_count();
   vtx.dst_gpr = fetch_instr.dst().sel();
   vtx.dst_sel_x = fetch_instr.dest_swizzle(0); /* SEL_X */
   vtx.dst_sel_y = fetch_instr.dest_swizzle(1); /* SEL_Y */
   vtx.dst_sel_z = fetch_instr.dest_swizzle(2); /* SEL_Z */
   vtx.dst_sel_w = fetch_instr.dest_swizzle(3); /* SEL_W */
   vtx.use_const_fields = fetch_instr.has_fetch_flag(FetchInstr::use_const_field);
   vtx.data_format = fetch_instr.data_format();
   vtx.num_format_all = fetch_instr.num_format(); /* NUM_FORMAT_SCALED */
   vtx.format_comp_all = fetch_instr.has_fetch_flag(FetchInstr::format_comp_signed);
   vtx.endian = fetch_instr.endian_swap();
   vtx.buffer_index_mode = fetch_instr.resource_index_mode();
   vtx.offset = fetch_instr.src_offset();
   vtx.indexed = fetch_instr.has_fetch_flag(FetchInstr::indexed);
   vtx.uncached = fetch_instr.has_fetch_flag(FetchInstr::uncached);
   vtx.elem_size = fetch_instr.elm_size();
   vtx.array_base = fetch_instr.array_base();
   vtx.array_size = fetch_instr.array_size();
   vtx.srf_mode_all = fetch_instr.has_fetch_flag(FetchInstr::srf_mode);

   if (fetch_instr.has_fetch_flag(FetchInstr::use_tc)) {
      if ((r600_bytecode_add_vtx_tc(m_bc, &vtx))) {
         R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
         m_result = false;
      }

   } else {
      if ((r600_bytecode_add_vtx(m_bc, &vtx))) {
         R600_ASM_ERR("shader_from_nir: Error creating tex assembly instruction\n");
         m_result = false;
      }
   }

   m_bc->cf_last->vpm =
      (m_bc->type == PIPE_SHADER_FRAGMENT) && fetch_instr.has_fetch_flag(FetchInstr::vpm);
   m_bc->cf_last->barrier = 1;
}
784
785 void
visit(const WriteTFInstr & instr)786 AssamblerVisitor::visit(const WriteTFInstr& instr)
787 {
788 struct r600_bytecode_gds gds;
789
790 auto& value = instr.value();
791
792 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
793 gds.src_gpr = value.sel();
794 gds.src_sel_x = value[0]->chan();
795 gds.src_sel_y = value[1]->chan();
796 gds.src_sel_z = 4;
797 gds.dst_sel_x = 7;
798 gds.dst_sel_y = 7;
799 gds.dst_sel_z = 7;
800 gds.dst_sel_w = 7;
801 gds.op = FETCH_OP_TF_WRITE;
802
803 if (r600_bytecode_add_gds(m_bc, &gds) != 0) {
804 m_result = false;
805 return;
806 }
807
808 if (value[2]->chan() != 7) {
809 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
810 gds.src_gpr = value.sel();
811 gds.src_sel_x = value[2]->chan();
812 gds.src_sel_y = value[3]->chan();
813 gds.src_sel_z = 4;
814 gds.dst_sel_x = 7;
815 gds.dst_sel_y = 7;
816 gds.dst_sel_z = 7;
817 gds.dst_sel_w = 7;
818 gds.op = FETCH_OP_TF_WRITE;
819
820 if (r600_bytecode_add_gds(m_bc, &gds)) {
821 m_result = false;
822 return;
823 }
824 }
825 }
826
/* Emit a RAT (random access target, i.e. image/SSBO) CF instruction. */
void
AssamblerVisitor::visit(const RatInstr& instr)
{
   struct r600_bytecode_gds gds;

   /* The instruction writes to the retuen buffer location, and
    * the value will actually be read back, so make sure all previous writes
    * have been finished */
   if (m_ack_suggested /*&& instr.has_instr_flag(Instr::ack_rat_return_write)*/)
      emit_wait_ack();

   int rat_idx = instr.resource_id();

   memset(&gds, 0, sizeof(struct r600_bytecode_gds));

   r600_bytecode_add_cfinst(m_bc, instr.cf_opcode());
   auto cf = m_bc->cf_last;
   cf->rat.id = rat_idx + m_shader->rat_base;
   cf->rat.inst = instr.rat_op();
   cf->rat.index_mode = instr.resource_index_mode();
   /* type 3 requests an ack for the write, type 1 is fire-and-forget. */
   cf->output.type = instr.need_ack() ? 3 : 1;
   cf->output.gpr = instr.data_gpr();
   cf->output.index_gpr = instr.index_gpr();
   cf->output.comp_mask = instr.comp_mask();
   cf->output.burst_count = instr.burst_count();
   /* Data must come pre-swizzled; only STORE_TYPED may deviate in yz. */
   assert(instr.data_swz(0) == PIPE_SWIZZLE_X);
   if (cf->rat.inst != RatInstr::STORE_TYPED) {
      assert(instr.data_swz(1) == PIPE_SWIZZLE_Y ||
             instr.data_swz(1) == PIPE_SWIZZLE_MAX);
      assert(instr.data_swz(2) == PIPE_SWIZZLE_Z ||
             instr.data_swz(2) == PIPE_SWIZZLE_MAX);
   }

   cf->vpm = m_bc->type == PIPE_SHADER_FRAGMENT;
   cf->barrier = 1;
   cf->mark = instr.need_ack();
   cf->output.elem_size = instr.elm_size();

   /* Remember that an acked write is outstanding. */
   m_ack_suggested |= instr.need_ack();
}
867
868 void
clear_states(const uint32_t & states)869 AssamblerVisitor::clear_states(const uint32_t& states)
870 {
871 if (states & sf_vtx)
872 vtx_fetch_results.clear();
873
874 if (states & sf_tex)
875 tex_fetch_results.clear();
876
877 if (states & sf_alu) {
878 m_last_op_was_barrier = false;
879 m_last_addr = nullptr;
880 }
881 }
882
883 void
visit(const Block & block)884 AssamblerVisitor::visit(const Block& block)
885 {
886 if (block.empty())
887 return;
888
889 if (block.has_instr_flag(Instr::force_cf)) {
890 m_bc->force_add_cf = 1;
891 m_bc->ar_loaded = 0;
892 m_last_addr = nullptr;
893 }
894 sfn_log << SfnLog::assembly << "Translate block size: " << block.size()
895 << " new_cf:" << m_bc->force_add_cf << "\n";
896
897 for (const auto& i : block) {
898 sfn_log << SfnLog::assembly << "Translate " << *i << " ";
899 i->accept(*this);
900 sfn_log << SfnLog::assembly << (m_result ? "good" : "fail") << "\n";
901
902 if (!m_result)
903 break;
904 }
905 }
906
/* Emit an if: push the stack, work around hardware push quirks, emit the
 * predicate ALU op and a JUMP that the jump tracker patches up later. */
void
AssamblerVisitor::visit(const IfInstr& instr)
{
   int elems = m_callstack.push(FC_PUSH_VPM);
   bool needs_workaround = false;

   /* Cayman needs an explicit PUSH inside nested loops. */
   if (m_bc->gfx_level == CAYMAN && m_bc->stack.loop > 1)
      needs_workaround = true;

   /* Some Evergreen parts mis-handle pushes that land exactly on a
    * stack entry boundary. */
   if (m_bc->gfx_level == EVERGREEN && m_bc->family != CHIP_HEMLOCK &&
       m_bc->family != CHIP_CYPRESS && m_bc->family != CHIP_JUNIPER) {
      unsigned dmod1 = (elems - 1) % m_bc->stack.entry_size;
      unsigned dmod2 = (elems) % m_bc->stack.entry_size;

      if (elems && (!dmod1 || !dmod2))
         needs_workaround = true;
   }

   auto pred = instr.predicate();
   auto [addr, dummy0, dummy1] = pred->indirect_addr();
   /* NOTE(review): empty block looks vestigial -- confirm upstream. */
   {
   }
   assert(!dummy1);
   if (addr) {
      /* Load AR if it doesn't already hold the predicate's address. */
      if (!m_last_addr || !m_bc->ar_loaded || !m_last_addr->equal_to(*addr)) {
         m_bc->ar_reg = addr->sel();
         m_bc->ar_chan = addr->chan();
         m_last_addr = addr;
         m_bc->ar_loaded = 0;

         r600_load_ar(m_bc, true);
      }
   }

   if (needs_workaround) {
      /* Emit the PUSH separately and downgrade the predicate clause to a
       * plain ALU clause. */
      r600_bytecode_add_cfinst(m_bc, CF_OP_PUSH);
      m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
      r600_bytecode_add_cfinst(m_bc, CF_OP_ALU);
      pred->set_cf_type(cf_alu);
   }

   clear_states(sf_tex | sf_vtx);
   pred->accept(*this);

   r600_bytecode_add_cfinst(m_bc, CF_OP_JUMP);
   clear_states(sf_all);

   /* The jump target is patched when the matching else/endif arrives. */
   m_jump_tracker.push(m_bc->cf_last, jt_if);
}
956
957 void
visit(const ControlFlowInstr & instr)958 AssamblerVisitor::visit(const ControlFlowInstr& instr)
959 {
960 clear_states(sf_all);
961 switch (instr.cf_type()) {
962 case ControlFlowInstr::cf_else:
963 emit_else();
964 break;
965 case ControlFlowInstr::cf_endif:
966 emit_endif();
967 break;
968 case ControlFlowInstr::cf_loop_begin: {
969 bool use_vpm = m_shader->processor_type == PIPE_SHADER_FRAGMENT &&
970 instr.has_instr_flag(Instr::vpm) &&
971 !instr.has_instr_flag(Instr::helper);
972 emit_loop_begin(use_vpm);
973 break;
974 }
975 case ControlFlowInstr::cf_loop_end:
976 emit_loop_end();
977 break;
978 case ControlFlowInstr::cf_loop_break:
979 emit_loop_break();
980 break;
981 case ControlFlowInstr::cf_loop_continue:
982 emit_loop_cont();
983 break;
984 case ControlFlowInstr::cf_wait_ack: {
985 int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
986 if (!r) {
987 m_bc->cf_last->cf_addr = 0;
988 m_bc->cf_last->barrier = 1;
989 m_ack_suggested = false;
990 } else {
991 m_result = false;
992 }
993 } break;
994 default:
995 unreachable("Unknown CF instruction type");
996 }
997 }
998
/* Emit a GDS (global data share) atomic instruction, e.g. for atomic
 * counters. */
void
AssamblerVisitor::visit(const GDSInstr& instr)
{
   struct r600_bytecode_gds gds;

   memset(&gds, 0, sizeof(struct r600_bytecode_gds));

   gds.op = ds_opcode_map.at(instr.opcode());
   gds.uav_id = instr.resource_id();
   gds.uav_index_mode = instr.resource_index_mode();
   gds.src_gpr = instr.src().sel();

   /* Channel 7 marks an unused source; sel 4 selects the constant 0. */
   gds.src_sel_x = instr.src()[0]->chan() < 7 ? instr.src()[0]->chan() : 4;
   gds.src_sel_y = instr.src()[1]->chan() < 7 ? instr.src()[1]->chan() : 4;
   gds.src_sel_z = instr.src()[2]->chan() < 7 ? instr.src()[2]->chan() : 4;

   /* Mask all destination channels by default (7 == masked)... */
   gds.dst_sel_x = 7;
   gds.dst_sel_y = 7;
   gds.dst_sel_z = 7;
   gds.dst_sel_w = 7;

   /* ...then route the result to the one channel the dest uses. */
   if (instr.dest()) {
      gds.dst_gpr = instr.dest()->sel();
      switch (instr.dest()->chan()) {
      case 0:
         gds.dst_sel_x = 0;
         break;
      case 1:
         gds.dst_sel_y = 0;
         break;
      case 2:
         gds.dst_sel_z = 0;
         break;
      case 3:
         gds.dst_sel_w = 0;
      }
   }

   gds.src_gpr2 = 0;
   gds.alloc_consume = m_bc->gfx_level < CAYMAN ? 1 : 0; // Not Cayman

   int r = r600_bytecode_add_gds(m_bc, &gds);
   if (r) {
      m_result = false;
      return;
   }
   m_bc->cf_last->vpm = PIPE_SHADER_FRAGMENT == m_bc->type;
   m_bc->cf_last->barrier = 1;
}
1048
/* LDS atomics never reach the assembler directly; an earlier pass
 * lowers them to ALU instructions with the alu_is_lds flag. */
void
AssamblerVisitor::visit(const LDSAtomicInstr& instr)
{
   (void)instr;
   unreachable("LDSAtomicInstr must be lowered to ALUInstr");
}
1055
/* LDS reads never reach the assembler: an earlier lowering pass turns
 * them into ALU instructions, so hitting this visitor is a compiler bug. */
void
AssamblerVisitor::visit(const LDSReadInstr& instr)
{
   (void)instr; /* silence unused-parameter warning */
   unreachable("LDSReadInstr must be lowered to ALUInstr");
}
1062
/* Load 'addr' into CF index register 'idx' (0 or 1) and return the buffer
 * index mode (bim_zero / bim_one) that selects it.
 *
 * The load is skipped when the tracked bytecode state says the register
 * already holds this value; inside a loop (m_loop_nesting != 0) it is
 * always re-emitted, since the tracked state need not hold on the back
 * edge.  Returns bim_invalid if adding the ALU instructions failed. */
EBufferIndexMode
AssamblerVisitor::emit_index_reg(const VirtualValue& addr, unsigned idx)
{
   assert(idx < 2);

   if (!m_bc->index_loaded[idx] || m_loop_nesting ||
       m_bc->index_reg[idx] != (unsigned)addr.sel() ||
       m_bc->index_reg_chan[idx] != (unsigned)addr.chan()) {
      struct r600_bytecode_alu alu;

      // Make sure MOVA is not last instr in clause

      if (!m_bc->cf_last || (m_bc->cf_last->ndw >> 1) >= 110)
         m_bc->force_add_cf = 1;

      if (m_bc->gfx_level != CAYMAN) {
         /* Pre-Cayman: MOVA_INT loads AR, then SET_CF_IDXn transfers AR
          * into the CF index register. */
         EAluOp idxop = idx ? op1_set_cf_idx1 : op1_set_cf_idx0;

         memset(&alu, 0, sizeof(alu));
         alu.op = opcode_map.at(op1_mova_int);
         alu.dst.chan = 0;
         alu.src[0].sel = addr.sel();
         alu.src[0].chan = addr.chan();
         alu.last = 1;
         sfn_log << SfnLog::assembly << " mova_int, ";
         int r = r600_bytecode_add_alu(m_bc, &alu);
         if (r)
            return bim_invalid;

         /* 'alu' is deliberately reused without a fresh memset: only op and
          * the first source change, everything else keeps its zeroed value. */
         alu.op = opcode_map.at(idxop);
         alu.dst.chan = 0;
         alu.src[0].sel = 0;
         alu.src[0].chan = 0;
         alu.last = 1;
         sfn_log << SfnLog::assembly << "op1_set_cf_idx" << idx;
         r = r600_bytecode_add_alu(m_bc, &alu);
         if (r)
            return bim_invalid;
      } else {
         /* Cayman: MOVA_INT can target the CF index register directly. */
         memset(&alu, 0, sizeof(alu));
         alu.op = opcode_map.at(op1_mova_int);
         alu.dst.sel = idx == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
         alu.dst.chan = 0;
         alu.src[0].sel = addr.sel();
         alu.src[0].chan = addr.chan();
         alu.last = 1;
         sfn_log << SfnLog::assembly << " mova_int, ";
         int r = r600_bytecode_add_alu(m_bc, &alu);
         if (r)
            return bim_invalid;
      }

      /* Record what the index register now holds so redundant reloads can
       * be skipped, and force a new CF clause for the consumer. */
      m_bc->ar_loaded = 0;
      m_bc->index_reg[idx] = addr.sel();
      m_bc->index_reg_chan[idx] = addr.chan();
      m_bc->index_loaded[idx] = true;
      m_bc->force_add_cf = 1;
      sfn_log << SfnLog::assembly << "\n";
   }
   return idx == 0 ? bim_zero : bim_one;
}
1125
1126 void
emit_else()1127 AssamblerVisitor::emit_else()
1128 {
1129 r600_bytecode_add_cfinst(m_bc, CF_OP_ELSE);
1130 m_bc->cf_last->pop_count = 1;
1131 m_result &= m_jump_tracker.add_mid(m_bc->cf_last, jt_if);
1132 }
1133
/* Close the innermost IF: pop the callstack entry and emit the stack pop,
 * either folded into the preceding ALU clause (by turning CF_OP_ALU into
 * CF_OP_ALU_POP_AFTER) or as an explicit CF_OP_POP instruction. */
void
AssamblerVisitor::emit_endif()
{
   m_callstack.pop(FC_PUSH_VPM);

   unsigned force_pop = m_bc->force_add_cf;
   if (!force_pop) {
      /* alu_pop encodes how many pops the last CF instruction already
       * performs: 0 for a plain ALU clause, 1 for ALU_POP_AFTER, and 3
       * (a sentinel) when the last CF op is not an ALU clause at all. */
      int alu_pop = 3;
      if (m_bc->cf_last) {
         if (m_bc->cf_last->op == CF_OP_ALU)
            alu_pop = 0;
         else if (m_bc->cf_last->op == CF_OP_ALU_POP_AFTER)
            alu_pop = 1;
      }
      alu_pop += 1;
      if (alu_pop == 1) {
         /* Last CF op is a plain ALU clause: fold the pop into it and make
          * sure the next instruction starts a fresh clause. */
         m_bc->cf_last->op = CF_OP_ALU_POP_AFTER;
         m_bc->force_add_cf = 1;
      } else {
         /* Already popping (or not an ALU clause): need an explicit POP. */
         force_pop = 1;
      }
   }

   if (force_pop) {
      r600_bytecode_add_cfinst(m_bc, CF_OP_POP);
      m_bc->cf_last->pop_count = 1;
      m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
   }

   m_result &= m_jump_tracker.pop(m_bc->cf_last, jt_if);
}
1165
1166 void
emit_loop_begin(bool vpm)1167 AssamblerVisitor::emit_loop_begin(bool vpm)
1168 {
1169 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_START_DX10);
1170 m_bc->cf_last->vpm = vpm && m_bc->type == PIPE_SHADER_FRAGMENT;
1171 m_jump_tracker.push(m_bc->cf_last, jt_loop);
1172 m_callstack.push(FC_LOOP);
1173 ++m_loop_nesting;
1174 }
1175
1176 void
emit_loop_end()1177 AssamblerVisitor::emit_loop_end()
1178 {
1179 if (m_ack_suggested) {
1180 emit_wait_ack();
1181 m_ack_suggested = false;
1182 }
1183
1184 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_END);
1185 m_callstack.pop(FC_LOOP);
1186 assert(m_loop_nesting);
1187 --m_loop_nesting;
1188 m_result |= m_jump_tracker.pop(m_bc->cf_last, jt_loop);
1189 }
1190
1191 void
emit_loop_break()1192 AssamblerVisitor::emit_loop_break()
1193 {
1194 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_BREAK);
1195 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
1196 }
1197
1198 void
emit_loop_cont()1199 AssamblerVisitor::emit_loop_cont()
1200 {
1201 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_CONTINUE);
1202 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
1203 }
1204
1205 bool
copy_dst(r600_bytecode_alu_dst & dst,const Register & d,bool write)1206 AssamblerVisitor::copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write)
1207 {
1208 if (write && d.sel() > g_clause_local_end) {
1209 R600_ASM_ERR("shader_from_nir: Don't support more then 123 GPRs + 4 clause "
1210 "local, but try using %d\n",
1211 d.sel());
1212 m_result = false;
1213 return false;
1214 }
1215
1216 dst.sel = d.sel();
1217 dst.chan = d.chan();
1218
1219 if (m_last_addr && m_last_addr->equal_to(d))
1220 m_last_addr = nullptr;
1221
1222 for (int i = 0; i < 2; ++i) {
1223 /* Force emitting index register, if we didn't emit it yet, because
1224 * the register value will change now */
1225 if (dst.sel == m_bc->index_reg[i] && dst.chan == m_bc->index_reg_chan[i])
1226 m_bc->index_loaded[i] = false;
1227 }
1228
1229 return true;
1230 }
1231
1232 void
emit_wait_ack()1233 AssamblerVisitor::emit_wait_ack()
1234 {
1235 int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
1236 if (!r) {
1237 m_bc->cf_last->cf_addr = 0;
1238 m_bc->cf_last->barrier = 1;
1239 m_ack_suggested = false;
1240 } else
1241 m_result = false;
1242 }
1243
/* Register visitor that fills in the type-specific fields of an ALU source
 * operand (rel bit, kcache bank, literal value).  sel/chan are set by the
 * caller (AssamblerVisitor::copy_src) before the visitor is applied. */
class EncodeSourceVisitor : public ConstRegisterVisitor {
public:
   EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc);
   void visit(const Register& value) override;
   void visit(const LocalArray& value) override;
   void visit(const LocalArrayValue& value) override;
   void visit(const UniformValue& value) override;
   void visit(const LiteralConstant& value) override;
   void visit(const InlineConstant& value) override;

   r600_bytecode_alu_src& src;
   r600_bytecode *m_bc;
   /* Buffer offset of an indirectly addressed uniform (if any); handed
    * back to the caller of copy_src. */
   PVirtualValue m_buffer_offset{nullptr};
};
1258
1259 PVirtualValue
copy_src(r600_bytecode_alu_src & src,const VirtualValue & s)1260 AssamblerVisitor::copy_src(r600_bytecode_alu_src& src, const VirtualValue& s)
1261 {
1262
1263 EncodeSourceVisitor visitor(src, m_bc);
1264 src.sel = s.sel();
1265 src.chan = s.chan();
1266
1267 if (s.sel() >= g_clause_local_start && s.sel() < g_clause_local_end ) {
1268 assert(m_bc->cf_last);
1269 int clidx = 4 * (s.sel() - g_clause_local_start) + s.chan();
1270 /* Ensure that the clause local register was already written */
1271 assert(m_bc->cf_last->clause_local_written & (1 << clidx));
1272 }
1273
1274 s.accept(visitor);
1275 return visitor.m_buffer_offset;
1276 }
1277
/* Store references to the ALU source being encoded and the bytecode state;
 * no other work happens here. */
EncodeSourceVisitor::EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc):
    src(s),
    m_bc(bc)
{
}
1283
1284 void
visit(const Register & value)1285 EncodeSourceVisitor::visit(const Register& value)
1286 {
1287 assert(value.sel() < g_clause_local_end && "Only have 123 reisters + 4 clause local");
1288 }
1289
/* A whole array is never a valid ALU source operand; only individual
 * LocalArrayValue elements are, so reaching this is a compiler bug. */
void
EncodeSourceVisitor::visit(const LocalArray& value)
{
   (void)value; /* silence unused-parameter warning */
   unreachable("An array can't be a source register");
}
1296
1297 void
visit(const LocalArrayValue & value)1298 EncodeSourceVisitor::visit(const LocalArrayValue& value)
1299 {
1300 src.rel = value.addr() ? 1 : 0;
1301 }
1302
1303 void
visit(const UniformValue & value)1304 EncodeSourceVisitor::visit(const UniformValue& value)
1305 {
1306 assert(value.sel() >= 512 && "Uniform values must have a sel >= 512");
1307 m_buffer_offset = value.buf_addr();
1308 src.kc_bank = value.kcache_bank();
1309 }
1310
/* Inline literal source: copy the 32-bit constant into the operand. */
void
EncodeSourceVisitor::visit(const LiteralConstant& value)
{
   src.value = value.value();
}
1316
/* Hardware inline constant (e.g. ALU_SRC_*): the sel/chan already copied
 * by copy_src fully identify it, so nothing more to encode here. */
void
EncodeSourceVisitor::visit(const InlineConstant& value)
{
   (void)value; /* silence unused-parameter warning */
}
1322
/* Translation table from the internal EAluOp opcodes to the hardware ALU
 * opcodes used by the r600 bytecode emitter (r600_bytecode_add_alu).
 * NOTE: a few keys appear twice with identical values (op1v_flt64_to_flt32
 * and the op2_pred_set{gt,ge,ne}_int entries); std::map keeps only the
 * first occurrence, so the duplicates are harmless but redundant. */
const std::map<EAluOp, int> opcode_map = {

   {op2_add,                       ALU_OP2_ADD                       },
   {op2_mul,                       ALU_OP2_MUL                       },
   {op2_mul_ieee,                  ALU_OP2_MUL_IEEE                  },
   {op2_max,                       ALU_OP2_MAX                       },
   {op2_min,                       ALU_OP2_MIN                       },
   {op2_max_dx10,                  ALU_OP2_MAX_DX10                  },
   {op2_min_dx10,                  ALU_OP2_MIN_DX10                  },
   {op2_sete,                      ALU_OP2_SETE                      },
   {op2_setgt,                     ALU_OP2_SETGT                     },
   {op2_setge,                     ALU_OP2_SETGE                     },
   {op2_setne,                     ALU_OP2_SETNE                     },
   {op2_sete_dx10,                 ALU_OP2_SETE_DX10                 },
   {op2_setgt_dx10,                ALU_OP2_SETGT_DX10                },
   {op2_setge_dx10,                ALU_OP2_SETGE_DX10                },
   {op2_setne_dx10,                ALU_OP2_SETNE_DX10                },
   {op1_fract,                     ALU_OP1_FRACT                     },
   {op1_trunc,                     ALU_OP1_TRUNC                     },
   {op1_ceil,                      ALU_OP1_CEIL                      },
   {op1_rndne,                     ALU_OP1_RNDNE                     },
   {op1_floor,                     ALU_OP1_FLOOR                     },
   {op2_ashr_int,                  ALU_OP2_ASHR_INT                  },
   {op2_lshr_int,                  ALU_OP2_LSHR_INT                  },
   {op2_lshl_int,                  ALU_OP2_LSHL_INT                  },
   {op1_mov,                       ALU_OP1_MOV                       },
   {op0_nop,                       ALU_OP0_NOP                       },
   {op2_mul_64,                    ALU_OP2_MUL_64                    },
   {op1v_flt64_to_flt32,           ALU_OP1_FLT64_TO_FLT32            },
   {op1v_flt32_to_flt64,           ALU_OP1_FLT32_TO_FLT64            },
   {op2_prede_int,                 ALU_OP2_PRED_SETE_INT             },
   {op2_pred_setne_int,            ALU_OP2_PRED_SETNE_INT            },
   {op2_pred_setge_int,            ALU_OP2_PRED_SETGE_INT            },
   {op2_pred_setgt_int,            ALU_OP2_PRED_SETGT_INT            },
   {op2_pred_setgt_uint,           ALU_OP2_PRED_SETGT_UINT           },
   {op2_pred_setge_uint,           ALU_OP2_PRED_SETGE_UINT           },
   {op2_pred_sete,                 ALU_OP2_PRED_SETE                 },
   {op2_pred_setgt,                ALU_OP2_PRED_SETGT                },
   {op2_pred_setge,                ALU_OP2_PRED_SETGE                },
   {op2_pred_setne,                ALU_OP2_PRED_SETNE                },
   {op0_pred_set_clr,              ALU_OP0_PRED_SET_CLR              },
   {op1_pred_set_restore,          ALU_OP1_PRED_SET_RESTORE          },
   {op2_pred_sete_push,            ALU_OP2_PRED_SETE_PUSH            },
   {op2_pred_setgt_push,           ALU_OP2_PRED_SETGT_PUSH           },
   {op2_pred_setge_push,           ALU_OP2_PRED_SETGE_PUSH           },
   {op2_pred_setne_push,           ALU_OP2_PRED_SETNE_PUSH           },
   {op2_kille,                     ALU_OP2_KILLE                     },
   {op2_killgt,                    ALU_OP2_KILLGT                    },
   {op2_killge,                    ALU_OP2_KILLGE                    },
   {op2_killne,                    ALU_OP2_KILLNE                    },
   {op2_and_int,                   ALU_OP2_AND_INT                   },
   {op2_or_int,                    ALU_OP2_OR_INT                    },
   {op2_xor_int,                   ALU_OP2_XOR_INT                   },
   {op1_not_int,                   ALU_OP1_NOT_INT                   },
   {op2_add_int,                   ALU_OP2_ADD_INT                   },
   {op2_sub_int,                   ALU_OP2_SUB_INT                   },
   {op2_max_int,                   ALU_OP2_MAX_INT                   },
   {op2_min_int,                   ALU_OP2_MIN_INT                   },
   {op2_max_uint,                  ALU_OP2_MAX_UINT                  },
   {op2_min_uint,                  ALU_OP2_MIN_UINT                  },
   {op2_sete_int,                  ALU_OP2_SETE_INT                  },
   {op2_setgt_int,                 ALU_OP2_SETGT_INT                 },
   {op2_setge_int,                 ALU_OP2_SETGE_INT                 },
   {op2_setne_int,                 ALU_OP2_SETNE_INT                 },
   {op2_setgt_uint,                ALU_OP2_SETGT_UINT                },
   {op2_setge_uint,                ALU_OP2_SETGE_UINT                },
   {op2_killgt_uint,               ALU_OP2_KILLGT_UINT               },
   {op2_killge_uint,               ALU_OP2_KILLGE_UINT               },
   /* NOTE: the next three entries duplicate keys listed above with the
    * same values; std::map keeps the first, so they are inert. */
   {op2_pred_setgt_int,            ALU_OP2_PRED_SETGT_INT            },
   {op2_pred_setge_int,            ALU_OP2_PRED_SETGE_INT            },
   {op2_pred_setne_int,            ALU_OP2_PRED_SETNE_INT            },
   {op2_kille_int,                 ALU_OP2_KILLE_INT                 },
   {op2_killgt_int,                ALU_OP2_KILLGT_INT                },
   {op2_killge_int,                ALU_OP2_KILLGE_INT                },
   {op2_killne_int,                ALU_OP2_KILLNE_INT                },
   {op2_pred_sete_push_int,        ALU_OP2_PRED_SETE_PUSH_INT        },
   {op2_pred_setgt_push_int,       ALU_OP2_PRED_SETGT_PUSH_INT       },
   {op2_pred_setge_push_int,       ALU_OP2_PRED_SETGE_PUSH_INT       },
   {op2_pred_setne_push_int,       ALU_OP2_PRED_SETNE_PUSH_INT       },
   {op2_pred_setlt_push_int,       ALU_OP2_PRED_SETLT_PUSH_INT       },
   {op2_pred_setle_push_int,       ALU_OP2_PRED_SETLE_PUSH_INT       },
   {op1_flt_to_int,                ALU_OP1_FLT_TO_INT                },
   {op1_bfrev_int,                 ALU_OP1_BFREV_INT                 },
   {op2_addc_uint,                 ALU_OP2_ADDC_UINT                 },
   {op2_subb_uint,                 ALU_OP2_SUBB_UINT                 },
   {op0_group_barrier,             ALU_OP0_GROUP_BARRIER             },
   {op0_group_seq_begin,           ALU_OP0_GROUP_SEQ_BEGIN           },
   {op0_group_seq_end,             ALU_OP0_GROUP_SEQ_END             },
   {op2_set_mode,                  ALU_OP2_SET_MODE                  },
   {op1_set_cf_idx0,               ALU_OP0_SET_CF_IDX0               },
   {op1_set_cf_idx1,               ALU_OP0_SET_CF_IDX1               },
   {op2_set_lds_size,              ALU_OP2_SET_LDS_SIZE              },
   {op1_exp_ieee,                  ALU_OP1_EXP_IEEE                  },
   {op1_log_clamped,               ALU_OP1_LOG_CLAMPED               },
   {op1_log_ieee,                  ALU_OP1_LOG_IEEE                  },
   {op1_recip_clamped,             ALU_OP1_RECIP_CLAMPED             },
   {op1_recip_ff,                  ALU_OP1_RECIP_FF                  },
   {op1_recip_ieee,                ALU_OP1_RECIP_IEEE                },
   {op1_recipsqrt_clamped,         ALU_OP1_RECIPSQRT_CLAMPED         },
   {op1_recipsqrt_ff,              ALU_OP1_RECIPSQRT_FF              },
   {op1_recipsqrt_ieee1,           ALU_OP1_RECIPSQRT_IEEE            },
   {op1_sqrt_ieee,                 ALU_OP1_SQRT_IEEE                 },
   {op1_sin,                       ALU_OP1_SIN                       },
   {op1_cos,                       ALU_OP1_COS                       },
   {op2_mullo_int,                 ALU_OP2_MULLO_INT                 },
   {op2_mulhi_int,                 ALU_OP2_MULHI_INT                 },
   {op2_mullo_uint,                ALU_OP2_MULLO_UINT                },
   {op2_mulhi_uint,                ALU_OP2_MULHI_UINT                },
   {op1_recip_int,                 ALU_OP1_RECIP_INT                 },
   {op1_recip_uint,                ALU_OP1_RECIP_UINT                },
   /* 64-bit ops below map op1_* keys to OP2 hardware opcodes because the
    * hardware encodes them as two-operand instructions. */
   {op1_recip_64,                  ALU_OP2_RECIP_64                  },
   {op1_recip_clamped_64,          ALU_OP2_RECIP_CLAMPED_64          },
   {op1_recipsqrt_64,              ALU_OP2_RECIPSQRT_64              },
   {op1_recipsqrt_clamped_64,      ALU_OP2_RECIPSQRT_CLAMPED_64      },
   {op1_sqrt_64,                   ALU_OP2_SQRT_64                   },
   {op1_flt_to_uint,               ALU_OP1_FLT_TO_UINT               },
   {op1_int_to_flt,                ALU_OP1_INT_TO_FLT                },
   {op1_uint_to_flt,               ALU_OP1_UINT_TO_FLT               },
   {op2_bfm_int,                   ALU_OP2_BFM_INT                   },
   {op1_flt32_to_flt16,            ALU_OP1_FLT32_TO_FLT16            },
   {op1_flt16_to_flt32,            ALU_OP1_FLT16_TO_FLT32            },
   {op1_ubyte0_flt,                ALU_OP1_UBYTE0_FLT                },
   {op1_ubyte1_flt,                ALU_OP1_UBYTE1_FLT                },
   {op1_ubyte2_flt,                ALU_OP1_UBYTE2_FLT                },
   {op1_ubyte3_flt,                ALU_OP1_UBYTE3_FLT                },
   {op1_bcnt_int,                  ALU_OP1_BCNT_INT                  },
   {op1_ffbh_uint,                 ALU_OP1_FFBH_UINT                 },
   {op1_ffbl_int,                  ALU_OP1_FFBL_INT                  },
   {op1_ffbh_int,                  ALU_OP1_FFBH_INT                  },
   {op1_flt_to_uint4,              ALU_OP1_FLT_TO_UINT4              },
   {op2_dot_ieee,                  ALU_OP2_DOT_IEEE                  },
   {op1_flt_to_int_rpi,            ALU_OP1_FLT_TO_INT_RPI            },
   {op1_flt_to_int_floor,          ALU_OP1_FLT_TO_INT_FLOOR          },
   {op2_mulhi_uint24,              ALU_OP2_MULHI_UINT24              },
   {op1_mbcnt_32hi_int,            ALU_OP1_MBCNT_32HI_INT            },
   {op1_offset_to_flt,             ALU_OP1_OFFSET_TO_FLT             },
   {op2_mul_uint24,                ALU_OP2_MUL_UINT24                },
   {op1_bcnt_accum_prev_int,       ALU_OP1_BCNT_ACCUM_PREV_INT       },
   {op1_mbcnt_32lo_accum_prev_int, ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT},
   {op2_sete_64,                   ALU_OP2_SETE_64                   },
   {op2_setne_64,                  ALU_OP2_SETNE_64                  },
   {op2_setgt_64,                  ALU_OP2_SETGT_64                  },
   {op2_setge_64,                  ALU_OP2_SETGE_64                  },
   {op2_min_64,                    ALU_OP2_MIN_64                    },
   {op2_max_64,                    ALU_OP2_MAX_64                    },
   {op2_dot4,                      ALU_OP2_DOT4                      },
   {op2_dot4_ieee,                 ALU_OP2_DOT4_IEEE                 },
   {op2_cube,                      ALU_OP2_CUBE                      },
   {op1_max4,                      ALU_OP1_MAX4                      },
   {op1_frexp_64,                  ALU_OP1_FREXP_64                  },
   {op1_ldexp_64,                  ALU_OP2_LDEXP_64                  },
   {op1_fract_64,                  ALU_OP1_FRACT_64                  },
   {op2_pred_setgt_64,             ALU_OP2_PRED_SETGT_64             },
   {op2_pred_sete_64,              ALU_OP2_PRED_SETE_64              },
   {op2_pred_setge_64,             ALU_OP2_PRED_SETGE_64             },
   {op2_add_64,                    ALU_OP2_ADD_64                    },
   {op1_mova_int,                  ALU_OP1_MOVA_INT                  },
   /* NOTE: duplicate of the op1v_flt64_to_flt32 entry above (same value). */
   {op1v_flt64_to_flt32,           ALU_OP1_FLT64_TO_FLT32            },
   {op1_flt32_to_flt64,            ALU_OP1_FLT32_TO_FLT64            },
   {op2_sad_accum_prev_uint,       ALU_OP2_SAD_ACCUM_PREV_UINT       },
   {op2_dot,                       ALU_OP2_DOT                       },
   {op1_mul_prev,                  ALU_OP1_MUL_PREV                  },
   {op1_mul_ieee_prev,             ALU_OP1_MUL_IEEE_PREV             },
   {op1_add_prev,                  ALU_OP1_ADD_PREV                  },
   {op2_muladd_prev,               ALU_OP2_MULADD_PREV               },
   {op2_muladd_ieee_prev,          ALU_OP2_MULADD_IEEE_PREV          },
   {op2_interp_xy,                 ALU_OP2_INTERP_XY                 },
   {op2_interp_zw,                 ALU_OP2_INTERP_ZW                 },
   {op2_interp_x,                  ALU_OP2_INTERP_X                  },
   {op2_interp_z,                  ALU_OP2_INTERP_Z                  },
   {op0_store_flags,               ALU_OP1_STORE_FLAGS               },
   {op1_load_store_flags,          ALU_OP1_LOAD_STORE_FLAGS          },
   {op0_lds_1a,                    ALU_OP2_LDS_1A                    },
   {op0_lds_1a1d,                  ALU_OP2_LDS_1A1D                  },
   {op0_lds_2a,                    ALU_OP2_LDS_2A                    },
   {op1_interp_load_p0,            ALU_OP1_INTERP_LOAD_P0            },
   {op1_interp_load_p10,           ALU_OP1_INTERP_LOAD_P10           },
   {op1_interp_load_p20,           ALU_OP1_INTERP_LOAD_P20           },
   {op3_bfe_uint,                  ALU_OP3_BFE_UINT                  },
   {op3_bfe_int,                   ALU_OP3_BFE_INT                   },
   {op3_bfi_int,                   ALU_OP3_BFI_INT                   },
   {op3_fma,                       ALU_OP3_FMA                       },
   {op3_cndne_64,                  ALU_OP3_CNDNE_64                  },
   {op3_fma_64,                    ALU_OP3_FMA_64                    },
   {op3_lerp_uint,                 ALU_OP3_LERP_UINT                 },
   {op3_bit_align_int,             ALU_OP3_BIT_ALIGN_INT             },
   {op3_byte_align_int,            ALU_OP3_BYTE_ALIGN_INT            },
   {op3_sad_accum_uint,            ALU_OP3_SAD_ACCUM_UINT            },
   {op3_sad_accum_hi_uint,         ALU_OP3_SAD_ACCUM_HI_UINT         },
   {op3_muladd_uint24,             ALU_OP3_MULADD_UINT24             },
   {op3_lds_idx_op,                ALU_OP3_LDS_IDX_OP                },
   {op3_muladd,                    ALU_OP3_MULADD                    },
   {op3_muladd_m2,                 ALU_OP3_MULADD_M2                 },
   {op3_muladd_m4,                 ALU_OP3_MULADD_M4                 },
   {op3_muladd_d2,                 ALU_OP3_MULADD_D2                 },
   {op3_muladd_ieee,               ALU_OP3_MULADD_IEEE               },
   {op3_cnde,                      ALU_OP3_CNDE                      },
   {op3_cndgt,                     ALU_OP3_CNDGT                     },
   {op3_cndge,                     ALU_OP3_CNDGE                     },
   {op3_cnde_int,                  ALU_OP3_CNDE_INT                  },
   {op3_cndgt_int,                 ALU_OP3_CNDGT_INT                 },
   {op3_cndge_int,                 ALU_OP3_CNDGE_INT                 },
   {op3_mul_lit,                   ALU_OP3_MUL_LIT                   },
};
1527
/* Translation table from the internal ESDOp data-share opcodes to the
 * hardware GDS fetch opcodes (see visit(GDSInstr)).  The *_RET variants
 * return the pre-op value; DS_OP_INVALID maps to 0 as a sentinel. */
const std::map<ESDOp, int> ds_opcode_map = {
   {DS_OP_ADD,                      FETCH_OP_GDS_ADD                 },
   {DS_OP_SUB,                      FETCH_OP_GDS_SUB                 },
   {DS_OP_RSUB,                     FETCH_OP_GDS_RSUB                },
   {DS_OP_INC,                      FETCH_OP_GDS_INC                 },
   {DS_OP_DEC,                      FETCH_OP_GDS_DEC                 },
   {DS_OP_MIN_INT,                  FETCH_OP_GDS_MIN_INT             },
   {DS_OP_MAX_INT,                  FETCH_OP_GDS_MAX_INT             },
   {DS_OP_MIN_UINT,                 FETCH_OP_GDS_MIN_UINT            },
   {DS_OP_MAX_UINT,                 FETCH_OP_GDS_MAX_UINT            },
   {DS_OP_AND,                      FETCH_OP_GDS_AND                 },
   {DS_OP_OR,                       FETCH_OP_GDS_OR                  },
   {DS_OP_XOR,                      FETCH_OP_GDS_XOR                 },
   {DS_OP_MSKOR,                    FETCH_OP_GDS_MSKOR               },
   {DS_OP_WRITE,                    FETCH_OP_GDS_WRITE               },
   {DS_OP_WRITE_REL,                FETCH_OP_GDS_WRITE_REL           },
   {DS_OP_WRITE2,                   FETCH_OP_GDS_WRITE2              },
   {DS_OP_CMP_STORE,                FETCH_OP_GDS_CMP_STORE           },
   {DS_OP_CMP_STORE_SPF,            FETCH_OP_GDS_CMP_STORE_SPF       },
   {DS_OP_BYTE_WRITE,               FETCH_OP_GDS_BYTE_WRITE          },
   {DS_OP_SHORT_WRITE,              FETCH_OP_GDS_SHORT_WRITE         },
   {DS_OP_ADD_RET,                  FETCH_OP_GDS_ADD_RET             },
   {DS_OP_SUB_RET,                  FETCH_OP_GDS_SUB_RET             },
   {DS_OP_RSUB_RET,                 FETCH_OP_GDS_RSUB_RET            },
   {DS_OP_INC_RET,                  FETCH_OP_GDS_INC_RET             },
   {DS_OP_DEC_RET,                  FETCH_OP_GDS_DEC_RET             },
   {DS_OP_MIN_INT_RET,              FETCH_OP_GDS_MIN_INT_RET         },
   {DS_OP_MAX_INT_RET,              FETCH_OP_GDS_MAX_INT_RET         },
   {DS_OP_MIN_UINT_RET,             FETCH_OP_GDS_MIN_UINT_RET        },
   {DS_OP_MAX_UINT_RET,             FETCH_OP_GDS_MAX_UINT_RET        },
   {DS_OP_AND_RET,                  FETCH_OP_GDS_AND_RET             },
   {DS_OP_OR_RET,                   FETCH_OP_GDS_OR_RET              },
   {DS_OP_XOR_RET,                  FETCH_OP_GDS_XOR_RET             },
   {DS_OP_MSKOR_RET,                FETCH_OP_GDS_MSKOR_RET           },
   {DS_OP_XCHG_RET,                 FETCH_OP_GDS_XCHG_RET            },
   {DS_OP_XCHG_REL_RET,             FETCH_OP_GDS_XCHG_REL_RET        },
   {DS_OP_XCHG2_RET,                FETCH_OP_GDS_XCHG2_RET           },
   {DS_OP_CMP_XCHG_RET,             FETCH_OP_GDS_CMP_XCHG_RET        },
   {DS_OP_CMP_XCHG_SPF_RET,         FETCH_OP_GDS_CMP_XCHG_SPF_RET    },
   {DS_OP_READ_RET,                 FETCH_OP_GDS_READ_RET            },
   {DS_OP_READ_REL_RET,             FETCH_OP_GDS_READ_REL_RET        },
   {DS_OP_READ2_RET,                FETCH_OP_GDS_READ2_RET           },
   {DS_OP_READWRITE_RET,            FETCH_OP_GDS_READWRITE_RET       },
   {DS_OP_BYTE_READ_RET,            FETCH_OP_GDS_BYTE_READ_RET       },
   {DS_OP_UBYTE_READ_RET,           FETCH_OP_GDS_UBYTE_READ_RET      },
   {DS_OP_SHORT_READ_RET,           FETCH_OP_GDS_SHORT_READ_RET      },
   {DS_OP_USHORT_READ_RET,          FETCH_OP_GDS_USHORT_READ_RET     },
   {DS_OP_ATOMIC_ORDERED_ALLOC_RET, FETCH_OP_GDS_ATOMIC_ORDERED_ALLOC},
   {DS_OP_INVALID,                  0                                },
};
1578
1579 } // namespace r600
1580