1 /* -*- mesa-c++ -*-
2 *
3 * Copyright (c) 2022 Collabora LTD
4 *
5 * Author: Gert Wollny <gert.wollny@collabora.com>
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * on the rights to use, copy, modify, merge, publish, distribute, sub
11 * license, and/or sell copies of the Software, and to permit persons to whom
12 * the Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
22 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
23 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
24 * USE OR OTHER DEALINGS IN THE SOFTWARE.
25 */
26
27 #include "sfn_assembler.h"
28 #include "sfn_debug.h"
29 #include "sfn_instr_alugroup.h"
30 #include "sfn_instr_controlflow.h"
31 #include "sfn_instr_fetch.h"
32 #include "sfn_instr_export.h"
33 #include "sfn_instr_mem.h"
34 #include "sfn_instr_tex.h"
35
36 #include "sfn_conditionaljumptracker.h"
37 #include "sfn_callstack.h"
38
39 #include "../eg_sq.h"
40
41 namespace r600 {
/* Public entry object for bytecode emission: `sh` receives the generated
 * bytecode, `key` selects shader-variant specific behavior (e.g. the
 * fragment shader's alpha-to-one setting). */
Assembler::Assembler(r600_shader *sh, const r600_shader_key& key):
   m_sh(sh), m_key(key)
{
}
46
extern const std::map<ESDOp, int> ds_opcode_map;

/* Visitor that walks the IR instruction stream and emits r600-family
 * bytecode into the shader's bytecode stream.  (The class name keeps its
 * historical misspelling.)  Emission errors are latched into m_result
 * instead of being thrown. */
class AssamblerVisitor : public ConstInstrVisitor {
public:
   AssamblerVisitor(r600_shader *sh, const r600_shader_key& key);

   void visit(const AluInstr& instr) override;
   void visit(const AluGroup& instr) override;
   void visit(const TexInstr& instr) override;
   void visit(const ExportInstr& instr) override;
   void visit(const FetchInstr& instr) override;
   void visit(const Block& instr) override;
   void visit(const IfInstr& instr) override;
   void visit(const ControlFlowInstr& instr) override;
   void visit(const ScratchIOInstr& instr) override;
   void visit(const StreamOutInstr& instr) override;
   void visit(const MemRingOutInstr& instr) override;
   void visit(const EmitVertexInstr& instr) override;
   void visit(const GDSInstr& instr) override;
   void visit(const WriteTFInstr& instr) override;
   void visit(const LDSAtomicInstr& instr) override;
   void visit(const LDSReadInstr& instr) override;
   void visit(const RatInstr& instr) override;

   /* Emit the end-of-program epilogue (EOP bit or CF_END on Cayman). */
   void finalize();

   /* Bitmask values for clear_states(). */
   const uint32_t sf_vtx = 1;
   const uint32_t sf_tex = 2;
   const uint32_t sf_alu = 4;
   const uint32_t sf_addr_register = 8;
   const uint32_t sf_all = 0xf;

   void clear_states(const uint32_t& states);
   bool copy_dst(r600_bytecode_alu_dst& dst, const Register& d, bool write);
   PVirtualValue copy_src(r600_bytecode_alu_src& src, const VirtualValue& s);

   /* Load `addr` into CF index register `idx` (0 or 1) if needed. */
   EBufferIndexMode
   emit_index_reg(const VirtualValue& addr, unsigned idx);

   void emit_endif();
   void emit_else();
   void emit_loop_begin(bool vpm);
   void emit_loop_end();
   void emit_loop_break();
   void emit_loop_cont();

   void emit_alu_op(const AluInstr& ai);
   void emit_lds_op(const AluInstr& lds);

   void emit_wait_ack();

   /* Start initialized in constructor */
   const r600_shader_key& m_key;
   r600_shader *m_shader;
   r600_bytecode *m_bc;

   ConditionalJumpTracker m_jump_tracker;
   CallStack m_callstack;
   bool ps_alpha_to_one;
   /* End initialized in constructor */

   /* Literals used by the current ALU group; cleared when the group's
    * last instruction is emitted — presumably tracks literal slot usage,
    * TODO confirm against copy_src(). */
   std::set<uint32_t> m_nliterals_in_group;
   /* GPRs holding results of not-yet-waited-on vertex/texture fetches; a
    * read from one of these forces a new CF clause. */
   std::set<int> vtx_fetch_results;
   std::set<int> tex_fetch_results;

   /* Register whose value is currently loaded into the AR register. */
   PRegister m_last_addr{nullptr};

   unsigned m_max_color_exports{0};
   int m_loop_nesting{0};

   /* True after a RAT op requested an ACK; dependent ops must wait. */
   bool m_ack_suggested{false};
   bool m_has_param_output{false};
   bool m_has_pos_output{false};
   bool m_last_op_was_barrier{false};
   /* Latched success flag; cleared on the first emission error. */
   bool m_result{true};
};
123
lower(Shader * shader)124 bool Assembler::lower(Shader *shader)
125 {
126 AssamblerVisitor ass(m_sh, m_key);
127
128 auto& blocks = shader->func();
129 for (auto b : blocks) {
130 b->accept(ass);
131 if (!ass.m_result)
132 return false;
133 }
134
135 ass.finalize();
136
137 return ass.m_result;
138
139 }
140
/* Set up per-shader state: color export limit for fragment shaders, and
 * the initial CALL_FS for vertex shaders that have inputs (the fetch
 * shader is called as a subroutine). */
AssamblerVisitor::AssamblerVisitor(r600_shader *sh, const r600_shader_key& key):
   m_key(key),
   m_shader(sh),

   m_bc(&sh->bc),
   m_callstack(sh->bc),
   ps_alpha_to_one(key.ps.alpha_to_one)
{
   /* At least one color export is always emitted for fragment shaders. */
   if (m_shader->processor_type == PIPE_SHADER_FRAGMENT)
      m_max_color_exports = MAX2(m_key.ps.nr_cbufs, 1);

   if (m_shader->processor_type == PIPE_SHADER_VERTEX &&
       m_shader->ninput > 0)
      r600_bytecode_add_cfinst(m_bc, CF_OP_CALL_FS);
}
156
/* Emit the shader epilogue: ensure the last CF instruction can carry the
 * end-of-program bit, or append the explicit CF_END on Cayman. */
void AssamblerVisitor::finalize()
{
   const struct cf_op_info *last = nullptr;

   if (m_bc->cf_last)
      last = r600_isa_cf(m_bc->cf_last->op);

   /* alu clause instructions don't have EOP bit, so add NOP */
   if (m_shader->bc.gfx_level < CAYMAN &&
       (!last || last->flags & CF_ALU || m_bc->cf_last->op == CF_OP_LOOP_END
        || m_bc->cf_last->op == CF_OP_POP))
      r600_bytecode_add_cfinst(m_bc, CF_OP_NOP);

   /* A fetch shader only can't be EOP (results in hang), but we can replace it
    * by a NOP */
   else if (last && m_bc->cf_last->op == CF_OP_CALL_FS)
      m_bc->cf_last->op = CF_OP_NOP;

   if (m_shader->bc.gfx_level != CAYMAN)
      m_bc->cf_last->end_of_program = 1;
   else
      cm_bytecode_add_cf_end(m_bc);
}
180
181 extern const std::map<EAluOp, int> opcode_map;
182
visit(const AluInstr & ai)183 void AssamblerVisitor::visit(const AluInstr& ai)
184 {
185 assert(vtx_fetch_results.empty());
186 assert(tex_fetch_results.empty());
187
188 if (unlikely(ai.has_alu_flag(alu_is_lds)))
189 emit_lds_op(ai);
190 else
191 emit_alu_op(ai);
192 }
193
/* Translate an LDS (local data share) ALU instruction.  The generic IR
 * opcode is mapped to the hardware LDS_OP{1,2,3} encoding; ops with a
 * "_RET" suffix push a value onto the LDS read queue, which is accounted
 * for in the CF block's nlds_read counter. */
void AssamblerVisitor::emit_lds_op(const AluInstr& lds)
{
   struct r600_bytecode_alu alu;
   memset(&alu, 0, sizeof(alu));

   alu.is_lds_idx_op = true;
   alu.op = lds.lds_opcode();

   bool has_lds_fetch = false;
   switch (alu.op) {
   case LDS_WRITE:
      alu.op = LDS_OP2_LDS_WRITE;
      break;
   case LDS_WRITE_REL:
      alu.op = LDS_OP3_LDS_WRITE_REL;
      alu.lds_idx = 1;
      break;
   case DS_OP_READ_RET:
      alu.op = LDS_OP1_LDS_READ_RET;
      FALLTHROUGH;
   /* All *_RET ops return a value through the LDS read queue. */
   case LDS_ADD_RET:
   case LDS_AND_RET:
   case LDS_OR_RET:
   case LDS_MAX_INT_RET:
   case LDS_MAX_UINT_RET:
   case LDS_MIN_INT_RET:
   case LDS_MIN_UINT_RET:
   case LDS_XOR_RET:
   case LDS_XCHG_RET:
   case LDS_CMP_XCHG_RET:
      has_lds_fetch = true;
      break;
   /* Pure atomics without a return value: opcode passes through. */
   case LDS_ADD:
   case LDS_AND:
   case LDS_OR:
   case LDS_MAX_INT:
   case LDS_MAX_UINT:
   case LDS_MIN_INT:
   case LDS_MIN_UINT:
   case LDS_XOR:
      break;
   default:
      std::cerr << "\n R600: error op: " << lds << "\n";
      unreachable("Unhandled LDS op");
   }

   copy_src(alu.src[0], lds.src(0));

   /* Unused source slots are filled with the constant zero select. */
   if (lds.n_sources() > 1)
      copy_src(alu.src[1], lds.src(1));
   else
      alu.src[1].sel = V_SQ_ALU_SRC_0;

   if (lds.n_sources() > 2)
      copy_src(alu.src[2], lds.src(2));
   else
      alu.src[2].sel = V_SQ_ALU_SRC_0;

   alu.last = lds.has_alu_flag(alu_last_instr);

   int r = r600_bytecode_add_alu(m_bc, &alu);
   /* Track the pending queue entry so later reads can be balanced. */
   if (has_lds_fetch)
      m_bc->cf_last->nlds_read++;

   if (r)
      m_result = false;
}
261
/* Translate a regular (non-LDS) ALU instruction into bytecode, including
 * destination/source modifiers, kcache indexing and the CF clause type. */
void AssamblerVisitor::emit_alu_op(const AluInstr& ai)
{
   struct r600_bytecode_alu alu;
   memset(&alu, 0, sizeof(alu));

   if (opcode_map.find(ai.opcode()) == opcode_map.end()) {
      std::cerr << "Opcode not handled for " << ai <<"\n";
      m_result = false;
      return;
   }

   // skip multiple barriers
   if (m_last_op_was_barrier && ai.opcode() == op0_group_barrier)
      return;

   m_last_op_was_barrier = ai.opcode() == op0_group_barrier;

   alu.op = opcode_map.at(ai.opcode());

   auto dst = ai.dest();
   if (dst) {
      if (!copy_dst(alu.dst, *dst, ai.has_alu_flag(alu_write))) {
         m_result = false;
         return;
      }

      alu.dst.write = ai.has_alu_flag(alu_write);
      alu.dst.clamp = ai.has_alu_flag(alu_dst_clamp);
      /* Relative (AR-indexed) destination addressing. */
      alu.dst.rel = dst->addr() ? 1 : 0;
   } else {
      /* No destination register: the channel still matters for slot
       * assignment within the ALU group. */
      alu.dst.chan = ai.dest_chan();
   }

   alu.is_op3 = ai.n_sources() == 3;

   EBufferIndexMode kcache_index_mode = bim_none;
   PVirtualValue buffer_offset = nullptr;

   for (unsigned i = 0; i < ai.n_sources(); ++i) {
      buffer_offset = copy_src(alu.src[i], ai.src(i));
      alu.src[i].neg = ai.has_alu_flag(AluInstr::src_neg_flags[i]);
      /* Three-source ops have no source-absolute modifier bits. */
      if (!alu.is_op3)
         alu.src[i].abs = ai.has_alu_flag(AluInstr::src_abs_flags[i]);

      /* Indirectly addressed constant buffer: route through kcache bank 1
       * with relative addressing. */
      if (buffer_offset && kcache_index_mode == bim_none) {
         kcache_index_mode = bim_zero;
         alu.src[i].kc_bank = 1;
         alu.src[i].kc_rel = 1;
      }

      /* Reading from the LDS return queue consumes one pending entry. */
      if (ai.has_lds_queue_read()) {
         assert(m_bc->cf_last->nlds_read > 0);
         m_bc->cf_last->nlds_read--;
      }
   }

   if (ai.bank_swizzle() != alu_vec_unknown)
      alu.bank_swizzle_force = ai.bank_swizzle();

   alu.last = ai.has_alu_flag(alu_last_instr);
   alu.execute_mask = ai.has_alu_flag(alu_update_exec);

   /* If the destination register is equal to the last loaded address register
    * then clear the latter one, because the values will no longer be identical */
   if (m_last_addr)
      sfn_log << SfnLog::assembly << " Current address register is " << *m_last_addr << "\n";

   if (dst)
      sfn_log << SfnLog::assembly << " Current dst register is " << *dst << "\n";

   if (dst && m_last_addr && *dst == *m_last_addr) {
      sfn_log << SfnLog::assembly << " Clear address register (was " << *m_last_addr << "\n";
      m_last_addr = nullptr;
   }

   auto cf_op = ai.cf_type();

   /* Map the IR clause type to the hardware CF opcode. */
   unsigned type = 0;
   switch (cf_op) {
   case cf_alu: type = CF_OP_ALU; break;
   case cf_alu_push_before: type = CF_OP_ALU_PUSH_BEFORE; break;
   case cf_alu_pop_after: type = CF_OP_ALU_POP_AFTER; break;
   case cf_alu_pop2_after: type = CF_OP_ALU_POP2_AFTER; break;
   case cf_alu_break: type = CF_OP_ALU_BREAK; break;
   case cf_alu_else_after: type = CF_OP_ALU_ELSE_AFTER; break;
   case cf_alu_continue: type = CF_OP_ALU_CONTINUE; break;
   case cf_alu_extended: type = CF_OP_ALU_EXT; break;
   default:
      assert(0 && "cf_alu_undefined should have been replaced");
   }

   /* Literal tracking is per ALU group; reset at the group's last slot. */
   if (alu.last)
      m_nliterals_in_group.clear();


   m_result = !r600_bytecode_add_alu_type(m_bc, &alu, type);

   /* MOVA invalidates the loaded AR value. */
   if (ai.opcode() == op1_mova_int)
      m_bc->ar_loaded = 0;

   if (ai.opcode() == op1_set_cf_idx0)
      m_bc->index_loaded[0] = 1;

   if (ai.opcode() == op1_set_cf_idx1)
      m_bc->index_loaded[1] = 1;

   /* KILL and SET_CF_IDX must end the current CF clause. */
   m_bc->force_add_cf |= (ai.opcode() == op2_kille ||
                          ai.opcode() == op2_killne_int ||
                          ai.opcode() == op1_set_cf_idx0 ||
                          ai.opcode() == op1_set_cf_idx1);
}
373
/* Emit a co-issued ALU group: first decide whether the current CF clause
 * has room (clause size is limited in dwords), then load AR or a CF index
 * register if the group addresses indirectly, then emit each slot. */
void AssamblerVisitor::visit(const AluGroup& group)
{
   clear_states(sf_vtx | sf_tex);

   if (group.slots() == 0)
      return;

   /* NOTE(review): 220/240 appear to be clause dword budgets with some
    * headroom; 2 dwords per slot.  Confirm against the ISA clause limits. */
   if (group.has_lds_group_start()) {
      /* An LDS group must fit into one clause as a whole. */
      if (m_bc->cf_last->ndw + 2 * (*group.begin())->required_slots() > 220) {
         assert(m_bc->cf_last->nlds_read == 0);
         m_bc->force_add_cf = 1;
         m_last_addr = nullptr;
      }
   } else if (m_bc->cf_last) {
      if (m_bc->cf_last->ndw + 2 * group.slots() > 240) {
         assert(m_bc->cf_last->nlds_read == 0);
         m_bc->force_add_cf = 1;
         m_last_addr = nullptr;
      } else {
         /* A group barrier may force the scheduler to emit a full-width
          * group; keep extra headroom for that case. */
         auto instr = *group.begin();
         if (instr &&
             !instr->has_alu_flag(alu_is_lds) &&
             instr->opcode() == op0_group_barrier &&
             m_bc->cf_last->ndw + 14 > 240) {
            assert(m_bc->cf_last->nlds_read == 0);
            m_bc->force_add_cf = 1;
            m_last_addr = nullptr;
         }
      }
   }

   auto addr = group.addr();

   if (addr.first) {
      if (!addr.second) {
         /* Plain AR addressing: reload only if a different register (or
          * no register) is currently held in AR. */
         if (!m_last_addr || !m_bc->ar_loaded ||
             !m_last_addr->equal_to(*addr.first)) {
            m_bc->ar_reg = addr.first->sel();
            m_bc->ar_chan = addr.first->chan();
            m_last_addr = addr.first;
            m_bc->ar_loaded = 0;

            r600_load_ar(m_bc, group.addr_for_src());
         }
      } else {
         /* Indirect resource/buffer access goes through CF index reg 0. */
         emit_index_reg(*addr.first, 0);
      }
   }

   for (auto& i : group) {
      if (i)
         i->accept(*this);
   }
}
428
/* Emit a texture fetch instruction. */
void AssamblerVisitor::visit(const TexInstr& tex_instr)
{
   clear_states(sf_vtx | sf_alu);

   /* NOTE(review): sampler_offset is never changed from 0 here; dynamic
    * sampler/resource offsets are handled via the index register below. */
   int sampler_offset = 0;
   auto addr = tex_instr.sampler_offset();
   EBufferIndexMode index_mode = bim_none;

   if (addr)
      index_mode = emit_index_reg(*addr, 1);

   /* Reading a register that still holds a pending tex-fetch result must
    * start a new CF clause so the earlier fetch has completed. */
   if (tex_fetch_results.find(tex_instr.src().sel()) !=
       tex_fetch_results.end()) {
      m_bc->force_add_cf = 1;
      tex_fetch_results.clear();
   }

   r600_bytecode_tex tex;
   memset(&tex, 0, sizeof(struct r600_bytecode_tex));
   tex.op = tex_instr.opcode();
   tex.sampler_id = tex_instr.sampler_id() + sampler_offset;
   tex.resource_id = tex_instr.resource_id() + sampler_offset;
   tex.src_gpr = tex_instr.src().sel();
   tex.dst_gpr = tex_instr.dst().sel();
   tex.dst_sel_x = tex_instr.dest_swizzle(0);
   tex.dst_sel_y = tex_instr.dest_swizzle(1);
   tex.dst_sel_z = tex_instr.dest_swizzle(2);
   tex.dst_sel_w = tex_instr.dest_swizzle(3);
   tex.src_sel_x = tex_instr.src()[0]->chan();
   tex.src_sel_y = tex_instr.src()[1]->chan();
   tex.src_sel_z = tex_instr.src()[2]->chan();
   tex.src_sel_w = tex_instr.src()[3]->chan();
   tex.coord_type_x = !tex_instr.has_tex_flag(TexInstr::x_unnormalized);
   tex.coord_type_y = !tex_instr.has_tex_flag(TexInstr::y_unnormalized);
   tex.coord_type_z = !tex_instr.has_tex_flag(TexInstr::z_unnormalized);
   tex.coord_type_w = !tex_instr.has_tex_flag(TexInstr::w_unnormalized);
   tex.offset_x = tex_instr.get_offset(0);
   tex.offset_y = tex_instr.get_offset(1);
   tex.offset_z = tex_instr.get_offset(2);
   tex.resource_index_mode = index_mode;
   tex.sampler_index_mode = index_mode;

   /* Destination swizzles >= 4 are constants, i.e. nothing is actually
    * written; only record a pending result if a real channel is written. */
   if (tex.dst_sel_x < 4 &&
       tex.dst_sel_y < 4 &&
       tex.dst_sel_z < 4 &&
       tex.dst_sel_w < 4)
      tex_fetch_results.insert(tex.dst_gpr);

   if (tex_instr.opcode() == TexInstr::get_gradient_h ||
       tex_instr.opcode() == TexInstr::get_gradient_v)
      tex.inst_mod = tex_instr.has_tex_flag(TexInstr::grad_fine) ? 1 : 0;
   else
      tex.inst_mod = tex_instr.inst_mode();
   if (r600_bytecode_add_tex(m_bc, &tex)) {
      R600_ERR("shader_from_nir: Error creating tex assembly instruction\n");
      m_result = false;
   }
}
487
/* Emit an export CF instruction (pixel color, position or parameter). */
void AssamblerVisitor::visit(const ExportInstr& exi)
{
   const auto& value = exi.value();

   r600_bytecode_output output;
   memset(&output, 0, sizeof(output));

   output.gpr = value.sel();
   output.elem_size = 3;
   output.swizzle_x = value[0]->chan();
   output.swizzle_y = value[1]->chan();
   output.swizzle_z = value[2]->chan();
   output.burst_count = 1;
   output.op = exi.is_last_export() ? CF_OP_EXPORT_DONE: CF_OP_EXPORT;
   output.type = exi.export_type();


   clear_states(sf_all);
   switch (exi.export_type()) {
   case ExportInstr::pixel:
      /* With alpha-to-one the W swizzle is forced to 5 — presumably the
       * constant-one select; confirm against the export swizzle encoding. */
      output.swizzle_w = ps_alpha_to_one ? 5 : exi.value()[3]->chan();
      output.array_base = exi.location();
      break;
   case ExportInstr::pos:
      /* Position exports live at array base 60+. */
      output.swizzle_w = exi.value()[3]->chan();
      output.array_base = 60 + exi.location();
      break;
   case ExportInstr::param:
      output.swizzle_w = exi.value()[3]->chan();
      output.array_base = exi.location();
      break;
   default:
      R600_ERR("shader_from_nir: export %d type not yet supported\n", exi.export_type());
      m_result = false;
   }

   /* If all register elements pinned to fixed values
    * we can override the gpr (the register allocator doesn't see
    * this because it doesn't take these channels into account. */
   if (output.swizzle_x > 3 && output.swizzle_y > 3 &&
       output.swizzle_z > 3 && output.swizzle_w > 3)
      output.gpr = 0;

   int r = 0;
   if ((r =r600_bytecode_add_output(m_bc, &output))) {
      R600_ERR("Error adding export at location %d : err: %d\n", exi.location(), r);
      m_result = false;
   }
}
537
/* Emit a scratch memory read or write as a MEM_SCRATCH CF instruction. */
void AssamblerVisitor::visit(const ScratchIOInstr& instr)
{
   clear_states(sf_all);

   struct r600_bytecode_output cf;

   memset(&cf, 0, sizeof(struct r600_bytecode_output));

   cf.op = CF_OP_MEM_SCRATCH;
   cf.elem_size = 3;
   cf.gpr = instr.value().sel();
   /* Writes are marked so a later WAIT_ACK can reference them. */
   cf.mark = !instr.is_read();
   cf.comp_mask = instr.is_read() ? 0xf : instr.write_mask();
   cf.swizzle_x = 0;
   cf.swizzle_y = 1;
   cf.swizzle_z = 2;
   cf.swizzle_w = 3;
   cf.burst_count = 1;

   /* Scratch reads via this path only exist on pre-R700 hardware. */
   assert(!instr.is_read() || m_bc->gfx_level < R700);

   if (instr.address()) {
      /* Indirect addressing; type value differs between R600 and later
       * chips — NOTE(review): confirm type encodings against the ISA. */
      cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 3 : 1;
      cf.index_gpr = instr.address()->sel();

      /* The docu seems to be wrong here: In indirect addressing the
       * address_base seems to be the array_size */
      cf.array_size = instr.array_size();
   } else {
      cf.type = instr.is_read() || m_bc->gfx_level > R600 ? 2 : 0;
      cf.array_base = instr.location();
   }

   if (r600_bytecode_add_output(m_bc, &cf)){
      R600_ERR("shader_from_nir: Error creating SCRATCH_WR assembly instruction\n");
      m_result = false;
   }
}
576
visit(const StreamOutInstr & instr)577 void AssamblerVisitor::visit(const StreamOutInstr& instr)
578 {
579 struct r600_bytecode_output output;
580 memset(&output, 0, sizeof(struct r600_bytecode_output));
581
582 output.gpr = instr.value().sel();
583 output.elem_size = instr.element_size();
584 output.array_base = instr.array_base();
585 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
586 output.burst_count = instr.burst_count();
587 output.array_size = instr.array_size();
588 output.comp_mask = instr.comp_mask();
589 output.op = instr.op(m_shader->bc.gfx_level);
590
591
592 if (r600_bytecode_add_output(m_bc, &output)) {
593 R600_ERR("shader_from_nir: Error creating stream output instruction\n");
594 m_result = false;
595 }
596 }
597
/* Emit a ring buffer write (used e.g. for geometry shader output). */
void AssamblerVisitor::visit(const MemRingOutInstr& instr)
{
   struct r600_bytecode_output output;
   memset(&output, 0, sizeof(struct r600_bytecode_output));

   output.gpr = instr.value().sel();
   output.type = instr.type();
   output.elem_size = 3;
   output.comp_mask = 0xf;
   output.burst_count = 1;
   output.op = instr.op();
   /* Indexed ring writes take the ring offset from a GPR; the 0xfff
    * array size effectively disables bounds clamping. */
   if (instr.type() == MemRingOutInstr::mem_write_ind ||
       instr.type() == MemRingOutInstr::mem_write_ind_ack) {
      output.index_gpr = instr.index_reg();
      output.array_size = 0xfff;
   }
   output.array_base = instr.array_base();

   if (r600_bytecode_add_output(m_bc, &output)) {
      R600_ERR("shader_from_nir: Error creating mem ring write instruction\n");
      m_result = false;
   }
}
621
visit(const EmitVertexInstr & instr)622 void AssamblerVisitor::visit(const EmitVertexInstr& instr)
623 {
624 int r = r600_bytecode_add_cfinst(m_bc, instr.op());
625 if (!r)
626 m_bc->cf_last->count = instr.stream();
627 else
628 m_result = false;
629 assert(m_bc->cf_last->count < 4);
630 }
631
/* Emit a vertex/buffer fetch, either through the vertex cache or — on
 * hardware that routes these through the texture cache (Cayman, or when
 * explicitly requested) — as a TC fetch. */
void AssamblerVisitor::visit(const FetchInstr& fetch_instr)
{
   clear_states(sf_tex | sf_alu);

   auto buffer_offset = fetch_instr.resource_offset();
   EBufferIndexMode rat_index_mode = bim_none;

   if (buffer_offset)
      rat_index_mode = emit_index_reg(*buffer_offset, 0);

   if (fetch_instr.has_fetch_flag(FetchInstr::wait_ack))
      emit_wait_ack();

   /* Cayman has no vertex cache; everything goes through the TC path. */
   bool use_tc = fetch_instr.has_fetch_flag(FetchInstr::use_tc) ||
                 (m_bc->gfx_level == CAYMAN);
   /* Reading a register with a pending fetch result forces a new CF
    * clause so that the earlier fetch has completed. */
   if (!use_tc &&
       vtx_fetch_results.find(fetch_instr.src().sel()) !=
       vtx_fetch_results.end()) {
      m_bc->force_add_cf = 1;
      vtx_fetch_results.clear();
   }

   if (fetch_instr.has_fetch_flag(FetchInstr::use_tc) &&
       tex_fetch_results.find(fetch_instr.src().sel()) !=
       tex_fetch_results.end()) {
      m_bc->force_add_cf = 1;
      tex_fetch_results.clear();
   }

   /* Record the destination as a pending fetch result. */
   if (use_tc)
      tex_fetch_results.insert(fetch_instr.dst().sel());
   else
      vtx_fetch_results.insert(fetch_instr.dst().sel());

   struct r600_bytecode_vtx vtx;
   memset(&vtx, 0, sizeof(vtx));
   vtx.op = fetch_instr.opcode();
   vtx.buffer_id = fetch_instr.resource_id();
   vtx.fetch_type = fetch_instr.fetch_type();
   vtx.src_gpr = fetch_instr.src().sel();
   vtx.src_sel_x = fetch_instr.src().chan();
   vtx.mega_fetch_count = fetch_instr.mega_fetch_count();
   vtx.dst_gpr = fetch_instr.dst().sel();
   vtx.dst_sel_x = fetch_instr.dest_swizzle(0); /* SEL_X */
   vtx.dst_sel_y = fetch_instr.dest_swizzle(1); /* SEL_Y */
   vtx.dst_sel_z = fetch_instr.dest_swizzle(2); /* SEL_Z */
   vtx.dst_sel_w = fetch_instr.dest_swizzle(3); /* SEL_W */
   vtx.use_const_fields = fetch_instr.has_fetch_flag(FetchInstr::use_const_field);
   vtx.data_format = fetch_instr.data_format();
   vtx.num_format_all = fetch_instr.num_format(); /* NUM_FORMAT_SCALED */
   vtx.format_comp_all = fetch_instr.has_fetch_flag(FetchInstr::format_comp_signed);
   vtx.endian = fetch_instr.endian_swap();
   vtx.buffer_index_mode = rat_index_mode;
   vtx.offset = fetch_instr.src_offset();
   vtx.indexed = fetch_instr.has_fetch_flag(FetchInstr::indexed);
   vtx.uncached = fetch_instr.has_fetch_flag(FetchInstr::uncached);
   vtx.elem_size = fetch_instr.elm_size();
   vtx.array_base = fetch_instr.array_base();
   vtx.array_size = fetch_instr.array_size();
   vtx.srf_mode_all = fetch_instr.has_fetch_flag(FetchInstr::srf_mode);

   if (fetch_instr.has_fetch_flag(FetchInstr::use_tc)) {
      if ((r600_bytecode_add_vtx_tc(m_bc, &vtx))) {
         R600_ERR("shader_from_nir: Error creating tex assembly instruction\n");
         m_result = false;
      }

   } else {
      if ((r600_bytecode_add_vtx(m_bc, &vtx))) {
         R600_ERR("shader_from_nir: Error creating tex assembly instruction\n");
         m_result = false;
      }
   }

   m_bc->cf_last->vpm = (m_bc->type == PIPE_SHADER_FRAGMENT) &&
                        fetch_instr.has_fetch_flag(FetchInstr::vpm);
   m_bc->cf_last->barrier = 1;
}
710
visit(const WriteTFInstr & instr)711 void AssamblerVisitor::visit(const WriteTFInstr& instr)
712 {
713 struct r600_bytecode_gds gds;
714
715 auto& value = instr.value();
716
717 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
718 gds.src_gpr = value.sel();
719 gds.src_sel_x = value[0]->chan();
720 gds.src_sel_y = value[1]->chan();
721 gds.src_sel_z = 4;
722 gds.dst_sel_x = 7;
723 gds.dst_sel_y = 7;
724 gds.dst_sel_z = 7;
725 gds.dst_sel_w = 7;
726 gds.op = FETCH_OP_TF_WRITE;
727
728 if (r600_bytecode_add_gds(m_bc, &gds) != 0) {
729 m_result = false;
730 return;
731 }
732
733 if (value[2]->chan() != 7) {
734 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
735 gds.src_gpr = value.sel();
736 gds.src_sel_x = value[2]->chan();
737 gds.src_sel_y = value[3]->chan();
738 gds.src_sel_z = 4;
739 gds.dst_sel_x = 7;
740 gds.dst_sel_y = 7;
741 gds.dst_sel_z = 7;
742 gds.dst_sel_w = 7;
743 gds.op = FETCH_OP_TF_WRITE;
744
745 if (r600_bytecode_add_gds(m_bc, &gds)) {
746 m_result = false;
747 return;
748 }
749 }
750 }
751
visit(const RatInstr & instr)752 void AssamblerVisitor::visit(const RatInstr& instr)
753 {
754 struct r600_bytecode_gds gds;
755
756 /* The instruction writes to the retuen buffer loaction, and
757 * the value will actually be read bach, so make sure all previous writes
758 * have been finished */
759 if (m_ack_suggested /*&& instr.has_instr_flag(Instr::ack_rat_return_write)*/)
760 emit_wait_ack();
761
762 int rat_idx = instr.rat_id();
763 EBufferIndexMode rat_index_mode = bim_none;
764 auto addr = instr.rat_id_offset();
765
766 if (addr)
767 rat_index_mode = emit_index_reg(*addr, 1);
768
769 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
770
771 r600_bytecode_add_cfinst(m_bc, instr.cf_opcode());
772 auto cf = m_bc->cf_last;
773 cf->rat.id = rat_idx + m_shader->rat_base;
774 cf->rat.inst = instr.rat_op();
775 cf->rat.index_mode = rat_index_mode;
776 cf->output.type = instr.need_ack() ? 3 : 1;
777 cf->output.gpr = instr.data_gpr();
778 cf->output.index_gpr = instr.index_gpr();
779 cf->output.comp_mask = instr.comp_mask();
780 cf->output.burst_count = instr.burst_count();
781 assert(instr.data_swz(0) == PIPE_SWIZZLE_X);
782 if (cf->rat.inst != RatInstr::STORE_TYPED) {
783 assert(instr.data_swz(1) == PIPE_SWIZZLE_Y ||
784 instr.data_swz(1) == PIPE_SWIZZLE_MAX) ;
785 assert(instr.data_swz(2) == PIPE_SWIZZLE_Z ||
786 instr.data_swz(2) == PIPE_SWIZZLE_MAX) ;
787 }
788
789 cf->vpm = m_bc->type == PIPE_SHADER_FRAGMENT;
790 cf->barrier = 1;
791 cf->mark = instr.need_ack();
792 cf->output.elem_size = instr.elm_size();
793
794 m_ack_suggested |= instr.need_ack();
795 }
796
797
/* Reset the tracking state selected by the sf_* bits in `states`:
 * pending fetch results and/or the ALU barrier/AR-register tracking. */
void AssamblerVisitor::clear_states(const uint32_t& states)
{
   if (states & sf_vtx)
      vtx_fetch_results.clear();

   if (states & sf_tex)
      tex_fetch_results.clear();

   if (states & sf_alu) {
      m_last_op_was_barrier = false;
      m_last_addr = nullptr;
   }

}
812
813
visit(const Block & block)814 void AssamblerVisitor::visit(const Block& block)
815 {
816 if (block.empty())
817 return;
818
819 m_bc->force_add_cf = block.has_instr_flag(Instr::force_cf);
820 sfn_log << SfnLog::assembly << "Translate block size: " << block.size() << " new_cf:" << m_bc->force_add_cf << "\n";
821
822 for (const auto& i : block) {
823 sfn_log << SfnLog::assembly << "Translate " << *i << " ";
824 i->accept(*this);
825 sfn_log << SfnLog::assembly << (m_result ? "good" : "fail") << "\n";
826
827 if (!m_result)
828 break;
829 }
830 }
831
/* Open a conditional: emit the predicate ALU op and the JUMP whose
 * target is resolved later by the jump tracker, applying hardware
 * stack-size workarounds where required. */
void AssamblerVisitor::visit(const IfInstr& instr)
{
   int elems = m_callstack.push(FC_PUSH_VPM);
   bool needs_workaround = false;

   /* Cayman with nested loops needs an explicit PUSH. */
   if (m_bc->gfx_level == CAYMAN && m_bc->stack.loop > 1)
      needs_workaround = true;

   /* Evergreen stack-entry boundary workaround — NOTE(review): applies to
    * all EG chips except the listed ones; confirm the erratum reference. */
   if (m_bc->gfx_level == EVERGREEN &&
       m_bc->family != CHIP_HEMLOCK &&
       m_bc->family != CHIP_CYPRESS &&
       m_bc->family != CHIP_JUNIPER) {
      unsigned dmod1 = (elems - 1) % m_bc->stack.entry_size;
      unsigned dmod2 = (elems) % m_bc->stack.entry_size;

      if (elems && (!dmod1 || !dmod2))
         needs_workaround = true;
   }

   auto pred = instr.predicate();
   /* Predicates with an indirect address need AR loaded first. */
   auto [addr, dummy0, dummy1] = pred->indirect_addr();
   if (addr) {
      if (!m_last_addr || !m_bc->ar_loaded ||
          !m_last_addr->equal_to(*addr)) {
         m_bc->ar_reg = addr->sel();
         m_bc->ar_chan = addr->chan();
         m_last_addr = addr;
         m_bc->ar_loaded = 0;

         r600_load_ar(m_bc, true);
      }
   }

   if (needs_workaround) {
      /* Emit an explicit PUSH instead of ALU_PUSH_BEFORE. */
      r600_bytecode_add_cfinst(m_bc, CF_OP_PUSH);
      m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
      pred->set_cf_type(cf_alu);
   }

   clear_states(sf_tex|sf_vtx);
   pred->accept(*this);

   /* Jump target is patched when the matching ELSE/ENDIF is emitted. */
   r600_bytecode_add_cfinst(m_bc, CF_OP_JUMP);
   clear_states(sf_all);

   m_jump_tracker.push(m_bc->cf_last, jt_if);
}
879
/* Dispatch structured control flow markers to the dedicated emitters. */
void AssamblerVisitor::visit(const ControlFlowInstr& instr)
{
   clear_states(sf_all);
   switch (instr.cf_type()) {
   case ControlFlowInstr::cf_else:
      emit_else();
      break;
   case ControlFlowInstr::cf_endif:
      emit_endif();
      break;
   case ControlFlowInstr::cf_loop_begin:
      emit_loop_begin(instr.has_instr_flag(Instr::vpm));
      break;
   case ControlFlowInstr::cf_loop_end:
      emit_loop_end();
      break;
   case ControlFlowInstr::cf_loop_break:
      emit_loop_break();
      break;
   case ControlFlowInstr::cf_loop_continue:
      emit_loop_cont();
      break;
   case ControlFlowInstr::cf_wait_ack:
      {
         /* Wait for outstanding acked memory writes to complete. */
         int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
         if (!r) {
            m_bc->cf_last->cf_addr = 0;
            m_bc->cf_last->barrier = 1;
            m_ack_suggested = false;
         } else {
            m_result = false;
         }
      }
      break;
   default:
      unreachable("Unknown CF instruction type");
   }
}
918
/* Emit a GDS (global data share) operation, e.g. an atomic counter op. */
void AssamblerVisitor::visit(const GDSInstr& instr)
{
   struct r600_bytecode_gds gds;

   bool indirect = false;
   auto addr = instr.uav_id();

   /* A dynamic UAV id goes through CF index register 1. */
   if (addr) {
      indirect = true;
      emit_index_reg(*addr, 1);
   }

   memset(&gds, 0, sizeof(struct r600_bytecode_gds));

   gds.op = ds_opcode_map.at(instr.opcode());
   gds.dst_gpr = instr.dest()->sel();
   gds.uav_id = instr.uav_base();
   gds.uav_index_mode = indirect ? bim_one : bim_none;
   gds.src_gpr = instr.src().sel();

   /* Channel 7 marks an unused source component; fall back to select 4. */
   gds.src_sel_x = instr.src()[0]->chan() < 7 ? instr.src()[0]->chan() : 4;
   gds.src_sel_y = instr.src()[1]->chan();
   gds.src_sel_z = instr.src()[2]->chan() < 7 ? instr.src()[2]->chan() : 4;

   /* Mask all destination channels (7), then unmask only the one the
    * result is routed to. */
   gds.dst_sel_x = 7;
   gds.dst_sel_y = 7;
   gds.dst_sel_z = 7;
   gds.dst_sel_w = 7;

   switch (instr.dest()->chan()) {
   case 0: gds.dst_sel_x = 0;break;
   case 1: gds.dst_sel_y = 0;break;
   case 2: gds.dst_sel_z = 0;break;
   case 3: gds.dst_sel_w = 0;
   }

   gds.src_gpr2 = 0;
   gds.alloc_consume = m_bc->gfx_level < CAYMAN ? 1 : 0; // Not Cayman

   int r = r600_bytecode_add_gds(m_bc, &gds);
   if (r) {
      m_result = false;
      return;
   }
   m_bc->cf_last->vpm = PIPE_SHADER_FRAGMENT == m_bc->type;
   m_bc->cf_last->barrier = 1;
}
966
/* LDS atomics never reach the assembler; an earlier pass lowers them to
 * ALU instructions (see emit_lds_op). */
void AssamblerVisitor::visit(const LDSAtomicInstr& instr)
{
   (void)instr;
   unreachable("LDSAtomicInstr must be lowered to ALUInstr");
}
972
/* LDS reads never reach the assembler; an earlier pass lowers them to
 * ALU instructions (see emit_lds_op). */
void AssamblerVisitor::visit(const LDSReadInstr& instr)
{
   (void)instr;
   unreachable("LDSReadInstr must be lowered to ALUInstr");
}
978
/* Load `addr` into CF index register `idx` (0 or 1), unless that value
 * is already loaded (reload is always forced inside loops, where the
 * tracked state may be stale).  Pre-Cayman this takes a MOVA_INT plus a
 * SET_CF_IDX op; Cayman writes the index register directly via MOVA's
 * destination select.  Returns the index mode for the consumer, or
 * bim_invalid on failure. */
EBufferIndexMode
AssamblerVisitor::emit_index_reg(const VirtualValue& addr, unsigned idx)
{
   assert(idx < 2);

   if (!m_bc->index_loaded[idx] || m_loop_nesting ||
       m_bc->index_reg[idx] != (unsigned)addr.sel()
       || m_bc->index_reg_chan[idx] != (unsigned)addr.chan()) {
      struct r600_bytecode_alu alu;

      // Make sure MOVA is not last instr in clause

      if (!m_bc->cf_last || (m_bc->cf_last->ndw>>1) >= 110)
         m_bc->force_add_cf = 1;

      if (m_bc->gfx_level != CAYMAN) {

         EAluOp idxop = idx ? op1_set_cf_idx1 : op1_set_cf_idx0;

         /* Step 1: MOVA_INT loads the address into AR. */
         memset(&alu, 0, sizeof(alu));
         alu.op = opcode_map.at(op1_mova_int);
         alu.dst.chan = 0;
         alu.src[0].sel = addr.sel();
         alu.src[0].chan = addr.chan();
         alu.last = 1;
         sfn_log << SfnLog::assembly << "   mova_int, ";
         int r = r600_bytecode_add_alu(m_bc, &alu);
         if (r)
            return bim_invalid;

         /* Step 2: SET_CF_IDX copies AR into the CF index register. */
         alu.op = opcode_map.at(idxop);
         alu.dst.chan = 0;
         alu.src[0].sel = 0;
         alu.src[0].chan = 0;
         alu.last = 1;
         sfn_log << SfnLog::assembly << "op1_set_cf_idx" << idx;
         r = r600_bytecode_add_alu(m_bc, &alu);
         if (r)
            return bim_invalid;
      } else {
         /* Cayman: MOVA can target the CF index register directly. */
         memset(&alu, 0, sizeof(alu));
         alu.op = opcode_map.at(op1_mova_int);
         alu.dst.sel = idx == 0 ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
         alu.dst.chan = 0;
         alu.src[0].sel = addr.sel();
         alu.src[0].chan = addr.chan();
         alu.last = 1;
         sfn_log << SfnLog::assembly << "   mova_int, ";
         int r = r600_bytecode_add_alu(m_bc, &alu);
         if (r)
            return bim_invalid;
      }

      /* MOVA clobbered AR; record what the index register now holds. */
      m_bc->ar_loaded = 0;
      m_bc->index_reg[idx] = addr.sel();
      m_bc->index_reg_chan[idx] = addr.chan();
      m_bc->index_loaded[idx] = true;
      m_bc->force_add_cf = 1;
      sfn_log << SfnLog::assembly << "\n";
   }
   return idx == 0 ? bim_zero : bim_one;
}
1041
/* Emit the CF ELSE instruction for the current IF construct.  The ELSE pops
 * the predicate pushed by the IF and is registered with the jump tracker so
 * its jump target can be patched once the matching ENDIF address is known. */
void AssamblerVisitor::emit_else()
{
   r600_bytecode_add_cfinst(m_bc, CF_OP_ELSE);
   m_bc->cf_last->pop_count = 1;
   /* &= latches a tracker failure into the overall assembly result */
   m_result &= m_jump_tracker.add_mid(m_bc->cf_last, jt_if);
}
1048
/* Close an IF (or IF/ELSE) construct.  If the immediately preceding CF
 * instruction is a plain ALU clause, the required stack pop is folded into
 * it by rewriting its op to CF_OP_ALU_POP_AFTER; otherwise an explicit
 * CF_OP_POP is emitted.  Finally the jump tracker resolves the pending
 * branch targets for this IF. */
void AssamblerVisitor::emit_endif()
{
   m_callstack.pop(FC_PUSH_VPM);

   unsigned force_pop = m_bc->force_add_cf;
   if (!force_pop) {
      /* alu_pop before the +1: 0 = last CF is CF_OP_ALU (can absorb one
       * pop), 1 = it already pops once, 3 = not an ALU clause.  Only the
       * first case ends up as 1 after the increment and gets folded. */
      int alu_pop = 3;
      if (m_bc->cf_last) {
         if (m_bc->cf_last->op == CF_OP_ALU)
            alu_pop = 0;
         else if (m_bc->cf_last->op == CF_OP_ALU_POP_AFTER)
            alu_pop = 1;
      }
      alu_pop += 1;
      if (alu_pop == 1) {
         m_bc->cf_last->op = CF_OP_ALU_POP_AFTER;
         m_bc->force_add_cf = 1;
      } else {
         force_pop = 1;
      }
   }

   if (force_pop) {
      r600_bytecode_add_cfinst(m_bc, CF_OP_POP);
      m_bc->cf_last->pop_count = 1;
      /* POP resumes execution right after itself */
      m_bc->cf_last->cf_addr = m_bc->cf_last->id + 2;
   }

   m_result &= m_jump_tracker.pop(m_bc->cf_last, jt_if);
}
1079
/* Open a loop: emit CF_OP_LOOP_START_DX10, push the loop onto the jump
 * tracker and the callstack, and track nesting depth (used e.g. to force
 * CF index reloads inside loops).  @vpm enables valid-pixel-mode on the
 * loop start, but only for fragment shaders. */
void AssamblerVisitor::emit_loop_begin(bool vpm)
{
   r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_START_DX10);
   m_bc->cf_last->vpm = vpm && m_bc->type == PIPE_SHADER_FRAGMENT;
   m_jump_tracker.push(m_bc->cf_last, jt_loop);
   m_callstack.push(FC_LOOP);
   ++m_loop_nesting;
}
1088
emit_loop_end()1089 void AssamblerVisitor::emit_loop_end()
1090 {
1091 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_END);
1092 m_callstack.pop(FC_LOOP);
1093 assert(m_loop_nesting);
1094 --m_loop_nesting;
1095 m_result |= m_jump_tracker.pop(m_bc->cf_last, jt_loop);
1096 }
1097
emit_loop_break()1098 void AssamblerVisitor::emit_loop_break()
1099 {
1100 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_BREAK);
1101 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
1102 }
1103
emit_loop_cont()1104 void AssamblerVisitor::emit_loop_cont()
1105 {
1106 r600_bytecode_add_cfinst(m_bc, CF_OP_LOOP_CONTINUE);
1107 m_result |= m_jump_tracker.add_mid(m_bc->cf_last, jt_loop);
1108 }
1109
copy_dst(r600_bytecode_alu_dst & dst,const Register & d,bool write)1110 bool AssamblerVisitor::copy_dst(r600_bytecode_alu_dst& dst,
1111 const Register& d, bool write)
1112 {
1113 if (write && d.sel() > 124) {
1114 R600_ERR("shader_from_nir: Don't support more then 124 GPRs, but try using %d\n",
1115 d.sel());
1116 m_result = false;
1117 return false;
1118 }
1119
1120 dst.sel = d.sel();
1121 dst.chan = d.chan();
1122
1123 if (m_bc->index_reg[1] == dst.sel &&
1124 m_bc->index_reg_chan[1] == dst.chan)
1125 m_bc->index_loaded[1] = false;
1126
1127 if (m_bc->index_reg[0] == dst.sel &&
1128 m_bc->index_reg_chan[0] == dst.chan)
1129 m_bc->index_loaded[0] = false;
1130
1131 return true;
1132 }
1133
emit_wait_ack()1134 void AssamblerVisitor::emit_wait_ack()
1135 {
1136 int r = r600_bytecode_add_cfinst(m_bc, CF_OP_WAIT_ACK);
1137 if (!r) {
1138 m_bc->cf_last->cf_addr = 0;
1139 m_bc->cf_last->barrier = 1;
1140 m_ack_suggested = false;
1141 } else
1142 m_result = false;
1143 }
1144
/* Visitor that encodes the per-source-kind fields of an r600_bytecode_alu_src
 * (rel flag, kcache bank, literal value); sel/chan are filled in by the
 * caller (copy_src).  For uniforms with a dynamic buffer address the offset
 * value is reported back through m_buffer_offset (null otherwise). */
class EncodeSourceVisitor : public ConstRegisterVisitor {
public:

   EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc);
   void visit(const Register& value) override;
   void visit(const LocalArray& value) override;
   void visit(const LocalArrayValue& value) override;
   void visit(const UniformValue& value) override;
   void visit(const LiteralConstant& value) override;
   void visit(const InlineConstant& value) override;

   r600_bytecode_alu_src& src;   /* destination encoding, owned by caller */
   r600_bytecode *m_bc;          /* bytecode context (not modified here) */
   PVirtualValue m_buffer_offset{nullptr};
};
1160
copy_src(r600_bytecode_alu_src & src,const VirtualValue & s)1161 PVirtualValue AssamblerVisitor::copy_src(r600_bytecode_alu_src& src, const VirtualValue& s)
1162 {
1163
1164 EncodeSourceVisitor visitor(src, m_bc);
1165 src.sel = s.sel();
1166 src.chan = s.chan();
1167
1168 s.accept(visitor);
1169 return visitor.m_buffer_offset;
1170 }
1171
/* Bind the ALU source encoding target and the bytecode context; reference
 * members require initialization in the init list. */
EncodeSourceVisitor::EncodeSourceVisitor(r600_bytecode_alu_src& s, r600_bytecode *bc):
   src(s), m_bc(bc)
{
}
1176
/* Plain registers need no extra encoding beyond sel/chan (set by the
 * caller); only sanity-check the hardware GPR limit. */
void EncodeSourceVisitor::visit(const Register& value)
{
   assert(value.sel() <= 124 && "Only have 124 registers");
}
1181
/* A whole array is never a valid ALU source operand — only individual
 * LocalArrayValue elements are; reaching this is a bug upstream. */
void EncodeSourceVisitor::visit(const LocalArray& value)
{
   (void)value;
   unreachable("An array can't be a source register");
}
1187
/* Array elements accessed through a dynamic address need relative
 * addressing; a null addr means the element offset is static. */
void EncodeSourceVisitor::visit(const LocalArrayValue& value)
{
   src.rel = value.addr() ? 1 : 0;
}
1192
/* Uniforms are read through the kcache; record the bank and report any
 * dynamic buffer address back to the caller via m_buffer_offset. */
void EncodeSourceVisitor::visit(const UniformValue& value)
{
   assert(value.sel() >= 512 && "Uniform values must have a sel >= 512");
   m_buffer_offset = value.buf_addr();
   src.kc_bank = value.kcache_bank();
}
1199
/* Literal constants carry their bit pattern in the src value field. */
void EncodeSourceVisitor::visit(const LiteralConstant& value)
{
   src.value = value.value();
}
1204
/* Inline constants are fully identified by their sel/chan (already set by
 * the caller), so nothing extra to encode here. */
void EncodeSourceVisitor::visit(const InlineConstant& value)
{
   (void)value;
}
1209
1210
1211
/* Translation table from the sfn internal ALU opcode enum (EAluOp) to the
 * hardware opcode constants used by the r600 bytecode builder.
 *
 * NOTE(review): a few keys appear more than once (op1v_flt64_to_flt32,
 * op2_pred_setgt_int, op2_pred_setge_int, op2_pred_setne_int); std::map
 * keeps the first occurrence and the values are identical, so this is
 * harmless but the duplicates could be dropped.
 * NOTE(review): some entries map across arity prefixes (e.g. op1_set_cf_idx*
 * -> ALU_OP0_SET_CF_IDX*, op0_store_flags -> ALU_OP1_STORE_FLAGS,
 * op1_recip_64 -> ALU_OP2_RECIP_64) — presumably intentional differences
 * between the sfn operand model and the hardware encoding; confirm against
 * the ISA docs before "fixing". */
const std::map<EAluOp, int> opcode_map = {

   {op2_add, ALU_OP2_ADD},
   {op2_mul, ALU_OP2_MUL},
   {op2_mul_ieee, ALU_OP2_MUL_IEEE},
   {op2_max, ALU_OP2_MAX},
   {op2_min, ALU_OP2_MIN},
   {op2_max_dx10, ALU_OP2_MAX_DX10},
   {op2_min_dx10, ALU_OP2_MIN_DX10},
   {op2_sete, ALU_OP2_SETE},
   {op2_setgt, ALU_OP2_SETGT},
   {op2_setge, ALU_OP2_SETGE},
   {op2_setne, ALU_OP2_SETNE},
   {op2_sete_dx10, ALU_OP2_SETE_DX10},
   {op2_setgt_dx10, ALU_OP2_SETGT_DX10},
   {op2_setge_dx10, ALU_OP2_SETGE_DX10},
   {op2_setne_dx10, ALU_OP2_SETNE_DX10},
   {op1_fract, ALU_OP1_FRACT},
   {op1_trunc, ALU_OP1_TRUNC},
   {op1_ceil, ALU_OP1_CEIL},
   {op1_rndne, ALU_OP1_RNDNE},
   {op1_floor, ALU_OP1_FLOOR},
   {op2_ashr_int, ALU_OP2_ASHR_INT},
   {op2_lshr_int, ALU_OP2_LSHR_INT},
   {op2_lshl_int, ALU_OP2_LSHL_INT},
   {op1_mov, ALU_OP1_MOV},
   {op0_nop, ALU_OP0_NOP},
   {op2_mul_64, ALU_OP2_MUL_64},
   {op1v_flt64_to_flt32, ALU_OP1_FLT64_TO_FLT32},
   {op1v_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64},
   {op2_prede_int, ALU_OP2_PRED_SETE_INT},
   {op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT},
   {op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT},
   {op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT},
   {op2_pred_setgt_uint, ALU_OP2_PRED_SETGT_UINT},
   {op2_pred_setge_uint, ALU_OP2_PRED_SETGE_UINT},
   {op2_pred_sete, ALU_OP2_PRED_SETE},
   {op2_pred_setgt, ALU_OP2_PRED_SETGT},
   {op2_pred_setge, ALU_OP2_PRED_SETGE},
   {op2_pred_setne, ALU_OP2_PRED_SETNE},
   {op0_pred_set_clr, ALU_OP0_PRED_SET_CLR},
   {op1_pred_set_restore, ALU_OP1_PRED_SET_RESTORE},
   {op2_pred_sete_push, ALU_OP2_PRED_SETE_PUSH},
   {op2_pred_setgt_push, ALU_OP2_PRED_SETGT_PUSH},
   {op2_pred_setge_push, ALU_OP2_PRED_SETGE_PUSH},
   {op2_pred_setne_push, ALU_OP2_PRED_SETNE_PUSH},
   {op2_kille, ALU_OP2_KILLE},
   {op2_killgt, ALU_OP2_KILLGT},
   {op2_killge, ALU_OP2_KILLGE},
   {op2_killne, ALU_OP2_KILLNE},
   {op2_and_int, ALU_OP2_AND_INT},
   {op2_or_int, ALU_OP2_OR_INT},
   {op2_xor_int, ALU_OP2_XOR_INT},
   {op1_not_int, ALU_OP1_NOT_INT},
   {op2_add_int, ALU_OP2_ADD_INT},
   {op2_sub_int, ALU_OP2_SUB_INT},
   {op2_max_int, ALU_OP2_MAX_INT},
   {op2_min_int, ALU_OP2_MIN_INT},
   {op2_max_uint, ALU_OP2_MAX_UINT},
   {op2_min_uint, ALU_OP2_MIN_UINT},
   {op2_sete_int, ALU_OP2_SETE_INT},
   {op2_setgt_int, ALU_OP2_SETGT_INT},
   {op2_setge_int, ALU_OP2_SETGE_INT},
   {op2_setne_int, ALU_OP2_SETNE_INT},
   {op2_setgt_uint, ALU_OP2_SETGT_UINT},
   {op2_setge_uint, ALU_OP2_SETGE_UINT},
   {op2_killgt_uint, ALU_OP2_KILLGT_UINT},
   {op2_killge_uint, ALU_OP2_KILLGE_UINT},
   {op2_pred_setgt_int, ALU_OP2_PRED_SETGT_INT},
   {op2_pred_setge_int, ALU_OP2_PRED_SETGE_INT},
   {op2_pred_setne_int, ALU_OP2_PRED_SETNE_INT},
   {op2_kille_int, ALU_OP2_KILLE_INT},
   {op2_killgt_int, ALU_OP2_KILLGT_INT},
   {op2_killge_int, ALU_OP2_KILLGE_INT},
   {op2_killne_int, ALU_OP2_KILLNE_INT},
   {op2_pred_sete_push_int, ALU_OP2_PRED_SETE_PUSH_INT},
   {op2_pred_setgt_push_int, ALU_OP2_PRED_SETGT_PUSH_INT},
   {op2_pred_setge_push_int, ALU_OP2_PRED_SETGE_PUSH_INT},
   {op2_pred_setne_push_int, ALU_OP2_PRED_SETNE_PUSH_INT},
   {op2_pred_setlt_push_int, ALU_OP2_PRED_SETLT_PUSH_INT},
   {op2_pred_setle_push_int, ALU_OP2_PRED_SETLE_PUSH_INT},
   {op1_flt_to_int, ALU_OP1_FLT_TO_INT},
   {op1_bfrev_int, ALU_OP1_BFREV_INT},
   {op2_addc_uint, ALU_OP2_ADDC_UINT},
   {op2_subb_uint, ALU_OP2_SUBB_UINT},
   {op0_group_barrier, ALU_OP0_GROUP_BARRIER},
   {op0_group_seq_begin, ALU_OP0_GROUP_SEQ_BEGIN},
   {op0_group_seq_end, ALU_OP0_GROUP_SEQ_END},
   {op2_set_mode, ALU_OP2_SET_MODE},
   {op1_set_cf_idx0, ALU_OP0_SET_CF_IDX0},
   {op1_set_cf_idx1, ALU_OP0_SET_CF_IDX1},
   {op2_set_lds_size, ALU_OP2_SET_LDS_SIZE},
   {op1_exp_ieee, ALU_OP1_EXP_IEEE},
   {op1_log_clamped, ALU_OP1_LOG_CLAMPED},
   {op1_log_ieee, ALU_OP1_LOG_IEEE},
   {op1_recip_clamped, ALU_OP1_RECIP_CLAMPED},
   {op1_recip_ff, ALU_OP1_RECIP_FF},
   {op1_recip_ieee, ALU_OP1_RECIP_IEEE},
   {op1_recipsqrt_clamped, ALU_OP1_RECIPSQRT_CLAMPED},
   {op1_recipsqrt_ff, ALU_OP1_RECIPSQRT_FF},
   {op1_recipsqrt_ieee1, ALU_OP1_RECIPSQRT_IEEE},
   {op1_sqrt_ieee, ALU_OP1_SQRT_IEEE},
   {op1_sin, ALU_OP1_SIN},
   {op1_cos, ALU_OP1_COS},
   {op2_mullo_int, ALU_OP2_MULLO_INT},
   {op2_mulhi_int, ALU_OP2_MULHI_INT},
   {op2_mullo_uint, ALU_OP2_MULLO_UINT},
   {op2_mulhi_uint, ALU_OP2_MULHI_UINT},
   {op1_recip_int, ALU_OP1_RECIP_INT},
   {op1_recip_uint, ALU_OP1_RECIP_UINT},
   {op1_recip_64, ALU_OP2_RECIP_64},
   {op1_recip_clamped_64, ALU_OP2_RECIP_CLAMPED_64},
   {op1_recipsqrt_64, ALU_OP2_RECIPSQRT_64},
   {op1_recipsqrt_clamped_64, ALU_OP2_RECIPSQRT_CLAMPED_64},
   {op1_sqrt_64, ALU_OP2_SQRT_64},
   {op1_flt_to_uint, ALU_OP1_FLT_TO_UINT},
   {op1_int_to_flt, ALU_OP1_INT_TO_FLT},
   {op1_uint_to_flt, ALU_OP1_UINT_TO_FLT},
   {op2_bfm_int, ALU_OP2_BFM_INT},
   {op1_flt32_to_flt16, ALU_OP1_FLT32_TO_FLT16},
   {op1_flt16_to_flt32, ALU_OP1_FLT16_TO_FLT32},
   {op1_ubyte0_flt, ALU_OP1_UBYTE0_FLT},
   {op1_ubyte1_flt, ALU_OP1_UBYTE1_FLT},
   {op1_ubyte2_flt, ALU_OP1_UBYTE2_FLT},
   {op1_ubyte3_flt, ALU_OP1_UBYTE3_FLT},
   {op1_bcnt_int, ALU_OP1_BCNT_INT},
   {op1_ffbh_uint, ALU_OP1_FFBH_UINT},
   {op1_ffbl_int, ALU_OP1_FFBL_INT},
   {op1_ffbh_int, ALU_OP1_FFBH_INT},
   {op1_flt_to_uint4, ALU_OP1_FLT_TO_UINT4},
   {op2_dot_ieee, ALU_OP2_DOT_IEEE},
   {op1_flt_to_int_rpi, ALU_OP1_FLT_TO_INT_RPI},
   {op1_flt_to_int_floor, ALU_OP1_FLT_TO_INT_FLOOR},
   {op2_mulhi_uint24, ALU_OP2_MULHI_UINT24},
   {op1_mbcnt_32hi_int, ALU_OP1_MBCNT_32HI_INT},
   {op1_offset_to_flt, ALU_OP1_OFFSET_TO_FLT},
   {op2_mul_uint24, ALU_OP2_MUL_UINT24},
   {op1_bcnt_accum_prev_int, ALU_OP1_BCNT_ACCUM_PREV_INT},
   {op1_mbcnt_32lo_accum_prev_int, ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT},
   {op2_sete_64, ALU_OP2_SETE_64},
   {op2_setne_64, ALU_OP2_SETNE_64},
   {op2_setgt_64, ALU_OP2_SETGT_64},
   {op2_setge_64, ALU_OP2_SETGE_64},
   {op2_min_64, ALU_OP2_MIN_64},
   {op2_max_64, ALU_OP2_MAX_64},
   {op2_dot4, ALU_OP2_DOT4},
   {op2_dot4_ieee, ALU_OP2_DOT4_IEEE},
   {op2_cube, ALU_OP2_CUBE},
   {op1_max4, ALU_OP1_MAX4},
   {op1_frexp_64, ALU_OP1_FREXP_64},
   {op1_ldexp_64, ALU_OP2_LDEXP_64},
   {op1_fract_64, ALU_OP1_FRACT_64},
   {op2_pred_setgt_64, ALU_OP2_PRED_SETGT_64},
   {op2_pred_sete_64, ALU_OP2_PRED_SETE_64},
   {op2_pred_setge_64, ALU_OP2_PRED_SETGE_64},
   {op2_add_64, ALU_OP2_ADD_64},
   {op1_mova_int, ALU_OP1_MOVA_INT},
   {op1v_flt64_to_flt32, ALU_OP1_FLT64_TO_FLT32},
   {op1_flt32_to_flt64, ALU_OP1_FLT32_TO_FLT64},
   {op2_sad_accum_prev_uint, ALU_OP2_SAD_ACCUM_PREV_UINT},
   {op2_dot, ALU_OP2_DOT},
   {op1_mul_prev, ALU_OP1_MUL_PREV},
   {op1_mul_ieee_prev, ALU_OP1_MUL_IEEE_PREV},
   {op1_add_prev, ALU_OP1_ADD_PREV},
   {op2_muladd_prev, ALU_OP2_MULADD_PREV},
   {op2_muladd_ieee_prev, ALU_OP2_MULADD_IEEE_PREV},
   {op2_interp_xy, ALU_OP2_INTERP_XY},
   {op2_interp_zw, ALU_OP2_INTERP_ZW},
   {op2_interp_x, ALU_OP2_INTERP_X},
   {op2_interp_z, ALU_OP2_INTERP_Z},
   {op0_store_flags, ALU_OP1_STORE_FLAGS},
   {op1_load_store_flags, ALU_OP1_LOAD_STORE_FLAGS},
   {op0_lds_1a, ALU_OP2_LDS_1A},
   {op0_lds_1a1d, ALU_OP2_LDS_1A1D},
   {op0_lds_2a, ALU_OP2_LDS_2A},
   {op1_interp_load_p0, ALU_OP1_INTERP_LOAD_P0},
   {op1_interp_load_p10, ALU_OP1_INTERP_LOAD_P10},
   {op1_interp_load_p20, ALU_OP1_INTERP_LOAD_P20},
   {op3_bfe_uint, ALU_OP3_BFE_UINT},
   {op3_bfe_int, ALU_OP3_BFE_INT},
   {op3_bfi_int, ALU_OP3_BFI_INT},
   {op3_fma, ALU_OP3_FMA},
   {op3_cndne_64, ALU_OP3_CNDNE_64},
   {op3_fma_64, ALU_OP3_FMA_64},
   {op3_lerp_uint, ALU_OP3_LERP_UINT},
   {op3_bit_align_int, ALU_OP3_BIT_ALIGN_INT},
   {op3_byte_align_int, ALU_OP3_BYTE_ALIGN_INT},
   {op3_sad_accum_uint, ALU_OP3_SAD_ACCUM_UINT},
   {op3_sad_accum_hi_uint, ALU_OP3_SAD_ACCUM_HI_UINT},
   {op3_muladd_uint24, ALU_OP3_MULADD_UINT24},
   {op3_lds_idx_op, ALU_OP3_LDS_IDX_OP},
   {op3_muladd, ALU_OP3_MULADD},
   {op3_muladd_m2, ALU_OP3_MULADD_M2},
   {op3_muladd_m4, ALU_OP3_MULADD_M4},
   {op3_muladd_d2, ALU_OP3_MULADD_D2},
   {op3_muladd_ieee, ALU_OP3_MULADD_IEEE},
   {op3_cnde, ALU_OP3_CNDE},
   {op3_cndgt, ALU_OP3_CNDGT},
   {op3_cndge, ALU_OP3_CNDGE},
   {op3_cnde_int, ALU_OP3_CNDE_INT},
   {op3_cndgt_int, ALU_OP3_CNDGT_INT},
   {op3_cndge_int, ALU_OP3_CNDGE_INT},
   {op3_mul_lit, ALU_OP3_MUL_LIT},
};
1416
/* Translation table from the sfn GDS/LDS data-share opcode enum (ESDOp) to
 * the hardware FETCH_OP_GDS_* constants.  DS_OP_INVALID maps to 0 as a
 * sentinel.
 * NOTE(review): DS_OP_ATOMIC_ORDERED_ALLOC_RET maps to
 * FETCH_OP_GDS_ATOMIC_ORDERED_ALLOC (no _RET suffix) — presumably the only
 * hardware encoding for that op; confirm against the ISA docs. */
const std::map<ESDOp, int> ds_opcode_map = {
   {DS_OP_ADD, FETCH_OP_GDS_ADD},
   {DS_OP_SUB, FETCH_OP_GDS_SUB},
   {DS_OP_RSUB, FETCH_OP_GDS_RSUB},
   {DS_OP_INC, FETCH_OP_GDS_INC},
   {DS_OP_DEC, FETCH_OP_GDS_DEC},
   {DS_OP_MIN_INT, FETCH_OP_GDS_MIN_INT},
   {DS_OP_MAX_INT, FETCH_OP_GDS_MAX_INT},
   {DS_OP_MIN_UINT, FETCH_OP_GDS_MIN_UINT},
   {DS_OP_MAX_UINT, FETCH_OP_GDS_MAX_UINT},
   {DS_OP_AND, FETCH_OP_GDS_AND},
   {DS_OP_OR, FETCH_OP_GDS_OR},
   {DS_OP_XOR, FETCH_OP_GDS_XOR},
   {DS_OP_MSKOR, FETCH_OP_GDS_MSKOR},
   {DS_OP_WRITE, FETCH_OP_GDS_WRITE},
   {DS_OP_WRITE_REL, FETCH_OP_GDS_WRITE_REL},
   {DS_OP_WRITE2, FETCH_OP_GDS_WRITE2},
   {DS_OP_CMP_STORE, FETCH_OP_GDS_CMP_STORE},
   {DS_OP_CMP_STORE_SPF, FETCH_OP_GDS_CMP_STORE_SPF},
   {DS_OP_BYTE_WRITE, FETCH_OP_GDS_BYTE_WRITE},
   {DS_OP_SHORT_WRITE, FETCH_OP_GDS_SHORT_WRITE},
   {DS_OP_ADD_RET, FETCH_OP_GDS_ADD_RET},
   {DS_OP_SUB_RET, FETCH_OP_GDS_SUB_RET},
   {DS_OP_RSUB_RET, FETCH_OP_GDS_RSUB_RET},
   {DS_OP_INC_RET, FETCH_OP_GDS_INC_RET},
   {DS_OP_DEC_RET, FETCH_OP_GDS_DEC_RET},
   {DS_OP_MIN_INT_RET, FETCH_OP_GDS_MIN_INT_RET},
   {DS_OP_MAX_INT_RET, FETCH_OP_GDS_MAX_INT_RET},
   {DS_OP_MIN_UINT_RET, FETCH_OP_GDS_MIN_UINT_RET},
   {DS_OP_MAX_UINT_RET, FETCH_OP_GDS_MAX_UINT_RET},
   {DS_OP_AND_RET, FETCH_OP_GDS_AND_RET},
   {DS_OP_OR_RET, FETCH_OP_GDS_OR_RET},
   {DS_OP_XOR_RET, FETCH_OP_GDS_XOR_RET},
   {DS_OP_MSKOR_RET, FETCH_OP_GDS_MSKOR_RET},
   {DS_OP_XCHG_RET, FETCH_OP_GDS_XCHG_RET},
   {DS_OP_XCHG_REL_RET, FETCH_OP_GDS_XCHG_REL_RET},
   {DS_OP_XCHG2_RET, FETCH_OP_GDS_XCHG2_RET},
   {DS_OP_CMP_XCHG_RET, FETCH_OP_GDS_CMP_XCHG_RET},
   {DS_OP_CMP_XCHG_SPF_RET, FETCH_OP_GDS_CMP_XCHG_SPF_RET},
   {DS_OP_READ_RET, FETCH_OP_GDS_READ_RET},
   {DS_OP_READ_REL_RET, FETCH_OP_GDS_READ_REL_RET},
   {DS_OP_READ2_RET, FETCH_OP_GDS_READ2_RET},
   {DS_OP_READWRITE_RET, FETCH_OP_GDS_READWRITE_RET},
   {DS_OP_BYTE_READ_RET, FETCH_OP_GDS_BYTE_READ_RET},
   {DS_OP_UBYTE_READ_RET, FETCH_OP_GDS_UBYTE_READ_RET},
   {DS_OP_SHORT_READ_RET, FETCH_OP_GDS_SHORT_READ_RET},
   {DS_OP_USHORT_READ_RET, FETCH_OP_GDS_USHORT_READ_RET},
   {DS_OP_ATOMIC_ORDERED_ALLOC_RET, FETCH_OP_GDS_ATOMIC_ORDERED_ALLOC},
   {DS_OP_INVALID, 0},
};
1467
1468 }
1469