1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_builder.h"
26 #include "aco_ir.h"
27 
28 #include "common/sid.h"
29 
30 #include "util/memstream.h"
31 
32 #include <algorithm>
33 #include <map>
34 #include <vector>
35 
36 namespace aco {
37 
38 struct constaddr_info {
39    unsigned getpc_end;
40    unsigned add_literal;
41 };
42 
43 struct asm_context {
44    Program* program;
45    enum chip_class chip_class;
46    std::vector<std::pair<int, SOPP_instruction*>> branches;
47    std::map<unsigned, constaddr_info> constaddrs;
48    const int16_t* opcode;
49    // TODO: keep track of branch instructions referring to blocks
50    // and, when emitting the block, correct the offset in instr
51    asm_context(Program* program_) : program(program_), chip_class(program->chip_class)
52    {
53       if (chip_class <= GFX7)
54          opcode = &instr_info.opcode_gfx7[0];
55       else if (chip_class <= GFX9)
56          opcode = &instr_info.opcode_gfx9[0];
57       else if (chip_class >= GFX10)
58          opcode = &instr_info.opcode_gfx10[0];
59    }
60 
61    int subvector_begin_pos = -1;
62 };
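/* opcode maps each aco_opcode to the hardware opcode of the selected GPU generation;
 * entries of -1 mark instructions that don't exist on that generation and are caught
 * as "Unsupported opcode" in emit_instruction(). */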
63 
64 unsigned
65 get_mimg_nsa_dwords(const Instruction* instr)
66 {
67    unsigned addr_dwords = instr->operands.size() - 3;
68    for (unsigned i = 1; i < addr_dwords; i++) {
69       if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
70          return DIV_ROUND_UP(addr_dwords - 1, 4);
71    }
72    return 0;
73 }
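/* Worked example: an MIMG instruction with 5 address operands (operands[3..7]) whose
 * registers are not one contiguous VGPR range uses the NSA encoding: the first address
 * stays in VADDR and the remaining 4 are packed one byte each into
 * DIV_ROUND_UP(4, 4) = 1 extra NSA dword (see the MIMG case in emit_instruction()). */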
74 
75 void
76 emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
77 {
78    /* lower remaining pseudo-instructions */
79    if (instr->opcode == aco_opcode::p_constaddr_getpc) {
80       ctx.constaddrs[instr->operands[0].constantValue()].getpc_end = out.size() + 1;
81 
82       instr->opcode = aco_opcode::s_getpc_b64;
83       instr->operands.pop_back();
84    } else if (instr->opcode == aco_opcode::p_constaddr_addlo) {
85       ctx.constaddrs[instr->operands[1].constantValue()].add_literal = out.size() + 1;
86 
87       instr->opcode = aco_opcode::s_add_u32;
88       instr->operands[1] = Operand::zero();
89       instr->operands[1].setFixed(PhysReg(255));
90    }
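   /* getpc_end records the dword index right after the s_getpc_b64 (the PC value it
    * returns) and add_literal records the index of the s_add_u32's literal dword;
    * fix_constaddrs() patches that literal once the final code size is known. */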
91 
92    uint32_t opcode = ctx.opcode[(int)instr->opcode];
93    if (opcode == (uint32_t)-1) {
94       char* outmem;
95       size_t outsize;
96       struct u_memstream mem;
97       u_memstream_open(&mem, &outmem, &outsize);
98       FILE* const memf = u_memstream_get(&mem);
99 
100       fprintf(memf, "Unsupported opcode: ");
101       aco_print_instr(instr, memf);
102       u_memstream_close(&mem);
103 
104       aco_err(ctx.program, outmem);
105       free(outmem);
106 
107       abort();
108    }
109 
110    switch (instr->format) {
111    case Format::SOP2: {
112       uint32_t encoding = (0b10 << 30);
113       encoding |= opcode << 23;
114       encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
115       encoding |= instr->operands.size() >= 2 ? instr->operands[1].physReg() << 8 : 0;
116       encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
117       out.push_back(encoding);
118       break;
119    }
120    case Format::SOPK: {
121       SOPK_instruction& sopk = instr->sopk();
122 
123       if (instr->opcode == aco_opcode::s_subvector_loop_begin) {
124          assert(ctx.chip_class >= GFX10);
125          assert(ctx.subvector_begin_pos == -1);
126          ctx.subvector_begin_pos = out.size();
127       } else if (instr->opcode == aco_opcode::s_subvector_loop_end) {
128          assert(ctx.chip_class >= GFX10);
129          assert(ctx.subvector_begin_pos != -1);
130          /* Patch the s_subvector_loop_begin instruction to branch to the address after the end */
131          out[ctx.subvector_begin_pos] |= (out.size() - ctx.subvector_begin_pos);
132          /* Patch the s_subvector_loop_end instruction to branch to the address after the beginning */
133          sopk.imm = (uint16_t)(ctx.subvector_begin_pos - (int)out.size());
134          ctx.subvector_begin_pos = -1;
135       }
136 
137       uint32_t encoding = (0b1011 << 28);
138       encoding |= opcode << 23;
139       encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc)
140                      ? instr->definitions[0].physReg() << 16
141                   : !instr->operands.empty() && instr->operands[0].physReg() <= 127
142                      ? instr->operands[0].physReg() << 16
143                      : 0;
144       encoding |= sopk.imm;
145       out.push_back(encoding);
146       break;
147    }
148    case Format::SOP1: {
149       uint32_t encoding = (0b101111101 << 23);
150       if (opcode >= 55 && ctx.chip_class <= GFX9) {
151          assert(ctx.chip_class == GFX9 && opcode < 60);
152          opcode = opcode - 4;
153       }
154       encoding |= !instr->definitions.empty() ? instr->definitions[0].physReg() << 16 : 0;
155       encoding |= opcode << 8;
156       encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
157       out.push_back(encoding);
158       break;
159    }
160    case Format::SOPC: {
161       uint32_t encoding = (0b101111110 << 23);
162       encoding |= opcode << 16;
163       encoding |= instr->operands.size() == 2 ? instr->operands[1].physReg() << 8 : 0;
164       encoding |= !instr->operands.empty() ? instr->operands[0].physReg() : 0;
165       out.push_back(encoding);
166       break;
167    }
168    case Format::SOPP: {
169       SOPP_instruction& sopp = instr->sopp();
170       uint32_t encoding = (0b101111111 << 23);
171       encoding |= opcode << 16;
172       encoding |= (uint16_t)sopp.imm;
173       if (sopp.block != -1) {
174          sopp.pass_flags = 0;
175          ctx.branches.emplace_back(out.size(), &sopp);
176       }
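      /* The target block's final offset is not known yet, so the position of this
       * branch is recorded and its SIMM16 field is patched later in fix_branches(). */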
177       out.push_back(encoding);
178       break;
179    }
180    case Format::SMEM: {
181       SMEM_instruction& smem = instr->smem();
182       bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
183       bool is_load = !instr->definitions.empty();
184       uint32_t encoding = 0;
185 
186       if (ctx.chip_class <= GFX7) {
187          encoding = (0b11000 << 27);
188          encoding |= opcode << 22;
189          encoding |= instr->definitions.size() ? instr->definitions[0].physReg() << 15 : 0;
190          encoding |= instr->operands.size() ? (instr->operands[0].physReg() >> 1) << 9 : 0;
191          if (instr->operands.size() >= 2) {
192             if (!instr->operands[1].isConstant()) {
193                encoding |= instr->operands[1].physReg().reg();
194             } else if (instr->operands[1].constantValue() >= 1024) {
195                encoding |= 255; /* SQ_SRC_LITERAL */
196             } else {
197                encoding |= instr->operands[1].constantValue() >> 2;
198                encoding |= 1 << 8;
199             }
200          }
201          out.push_back(encoding);
202          /* SMRD instructions can take a literal on GFX7 */
203          if (instr->operands.size() >= 2 && instr->operands[1].isConstant() &&
204              instr->operands[1].constantValue() >= 1024)
205             out.push_back(instr->operands[1].constantValue() >> 2);
206          return;
207       }
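      /* Example for the GFX7 SMRD path above: a 4096-byte constant offset exceeds the
       * 8-bit dword OFFSET field, so OFFSET is set to 255 (SQ_SRC_LITERAL) and the
       * dword offset 4096 >> 2 = 1024 follows as a literal. */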
208 
209       if (ctx.chip_class <= GFX9) {
210          encoding = (0b110000 << 26);
211          assert(!smem.dlc); /* Device-level coherent is not supported on GFX9 and lower */
212          encoding |= smem.nv ? 1 << 15 : 0;
213       } else {
214          encoding = (0b111101 << 26);
215          assert(!smem.nv); /* Non-volatile is not supported on GFX10 */
216          encoding |= smem.dlc ? 1 << 14 : 0;
217       }
218 
219       encoding |= opcode << 18;
220       encoding |= smem.glc ? 1 << 16 : 0;
221 
222       if (ctx.chip_class <= GFX9) {
223          if (instr->operands.size() >= 2)
224             encoding |= instr->operands[1].isConstant() ? 1 << 17 : 0; /* IMM - immediate enable */
225       }
226       if (ctx.chip_class == GFX9) {
227          encoding |= soe ? 1 << 14 : 0;
228       }
229 
230       if (is_load || instr->operands.size() >= 3) { /* SDATA */
231          encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg())
232                      << 6;
233       }
234       if (instr->operands.size() >= 1) { /* SBASE */
235          encoding |= instr->operands[0].physReg() >> 1;
236       }
237 
238       out.push_back(encoding);
239       encoding = 0;
240 
241       int32_t offset = 0;
242       uint32_t soffset = ctx.chip_class >= GFX10
243                             ? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
244                             : 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on
245                                     GFX8 and below) */
246       if (instr->operands.size() >= 2) {
247          const Operand& op_off1 = instr->operands[1];
248          if (ctx.chip_class <= GFX9) {
249             offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();
250          } else {
251             /* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an
252              * SGPR */
253             if (op_off1.isConstant()) {
254                offset = op_off1.constantValue();
255             } else {
256                soffset = op_off1.physReg();
257                assert(!soe); /* There is no place to put the other SGPR offset, if any */
258             }
259          }
260 
261          if (soe) {
262             const Operand& op_off2 = instr->operands.back();
263             assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant
264                                                and an SGPR at the same time */
265             assert(!op_off2.isConstant());
266             soffset = op_off2.physReg();
267          }
268       }
269       encoding |= offset;
270       encoding |= soffset << 25;
271 
272       out.push_back(encoding);
273       return;
274    }
275    case Format::VOP2: {
276       uint32_t encoding = 0;
277       encoding |= opcode << 25;
278       encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
279       encoding |= (0xFF & instr->operands[1].physReg()) << 9;
280       encoding |= instr->operands[0].physReg();
281       out.push_back(encoding);
282       break;
283    }
284    case Format::VOP1: {
285       uint32_t encoding = (0b0111111 << 25);
286       if (!instr->definitions.empty())
287          encoding |= (0xFF & instr->definitions[0].physReg()) << 17;
288       encoding |= opcode << 9;
289       if (!instr->operands.empty())
290          encoding |= instr->operands[0].physReg();
291       out.push_back(encoding);
292       break;
293    }
294    case Format::VOPC: {
295       uint32_t encoding = (0b0111110 << 25);
296       encoding |= opcode << 17;
297       encoding |= (0xFF & instr->operands[1].physReg()) << 9;
298       encoding |= instr->operands[0].physReg();
299       out.push_back(encoding);
300       break;
301    }
302    case Format::VINTRP: {
303       Interp_instruction& interp = instr->vintrp();
304       uint32_t encoding = 0;
305 
306       if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
307           instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
308           instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
309           instr->opcode == aco_opcode::v_interp_p2_f16) {
310          if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
311             encoding = (0b110100 << 26);
312          } else if (ctx.chip_class >= GFX10) {
313             encoding = (0b110101 << 26);
314          } else {
315             unreachable("Unknown chip_class.");
316          }
317 
318          encoding |= opcode << 16;
319          encoding |= (0xFF & instr->definitions[0].physReg());
320          out.push_back(encoding);
321 
322          encoding = 0;
323          encoding |= interp.attribute;
324          encoding |= interp.component << 6;
325          encoding |= instr->operands[0].physReg() << 9;
326          if (instr->opcode == aco_opcode::v_interp_p2_f16 ||
327              instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
328              instr->opcode == aco_opcode::v_interp_p1lv_f16) {
329             encoding |= instr->operands[2].physReg() << 18;
330          }
331          out.push_back(encoding);
332       } else {
333          if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
334             encoding = (0b110101 << 26); /* Vega ISA doc says 110010 but it's wrong */
335          } else {
336             encoding = (0b110010 << 26);
337          }
338 
339          assert(encoding);
340          encoding |= (0xFF & instr->definitions[0].physReg()) << 18;
341          encoding |= opcode << 16;
342          encoding |= interp.attribute << 10;
343          encoding |= interp.component << 8;
344          if (instr->opcode == aco_opcode::v_interp_mov_f32)
345             encoding |= (0x3 & instr->operands[0].constantValue());
346          else
347             encoding |= (0xFF & instr->operands[0].physReg());
348          out.push_back(encoding);
349       }
350       break;
351    }
352    case Format::DS: {
353       DS_instruction& ds = instr->ds();
354       uint32_t encoding = (0b110110 << 26);
355       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
356          encoding |= opcode << 17;
357          encoding |= (ds.gds ? 1 : 0) << 16;
358       } else {
359          encoding |= opcode << 18;
360          encoding |= (ds.gds ? 1 : 0) << 17;
361       }
362       encoding |= ((0xFF & ds.offset1) << 8);
363       encoding |= (0xFFFF & ds.offset0);
364       out.push_back(encoding);
365       encoding = 0;
366       unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
367       encoding |= (0xFF & reg) << 24;
368       reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)
369                ? instr->operands[2].physReg()
370                : 0;
371       encoding |= (0xFF & reg) << 16;
372       reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0)
373                ? instr->operands[1].physReg()
374                : 0;
375       encoding |= (0xFF & reg) << 8;
376       encoding |= (0xFF & instr->operands[0].physReg());
377       out.push_back(encoding);
378       break;
379    }
380    case Format::MUBUF: {
381       MUBUF_instruction& mubuf = instr->mubuf();
382       uint32_t encoding = (0b111000 << 26);
383       encoding |= opcode << 18;
384       encoding |= (mubuf.lds ? 1 : 0) << 16;
385       encoding |= (mubuf.glc ? 1 : 0) << 14;
386       encoding |= (mubuf.idxen ? 1 : 0) << 13;
387       assert(!mubuf.addr64 || ctx.chip_class <= GFX7);
388       if (ctx.chip_class == GFX6 || ctx.chip_class == GFX7)
389          encoding |= (mubuf.addr64 ? 1 : 0) << 15;
390       encoding |= (mubuf.offen ? 1 : 0) << 12;
391       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
392          assert(!mubuf.dlc); /* Device-level coherent is not supported on GFX9 and lower */
393          encoding |= (mubuf.slc ? 1 : 0) << 17;
394       } else if (ctx.chip_class >= GFX10) {
395          encoding |= (mubuf.dlc ? 1 : 0) << 15;
396       }
397       encoding |= 0x0FFF & mubuf.offset;
398       out.push_back(encoding);
399       encoding = 0;
400       if (ctx.chip_class <= GFX7 || ctx.chip_class >= GFX10) {
401          encoding |= (mubuf.slc ? 1 : 0) << 22;
402       }
403       encoding |= instr->operands[2].physReg() << 24;
404       encoding |= (mubuf.tfe ? 1 : 0) << 23;
405       encoding |= (instr->operands[0].physReg() >> 2) << 16;
406       unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
407                                                 : instr->definitions[0].physReg();
408       encoding |= (0xFF & reg) << 8;
409       encoding |= (0xFF & instr->operands[1].physReg());
410       out.push_back(encoding);
411       break;
412    }
413    case Format::MTBUF: {
414       MTBUF_instruction& mtbuf = instr->mtbuf();
415 
416       uint32_t img_format = ac_get_tbuffer_format(ctx.chip_class, mtbuf.dfmt, mtbuf.nfmt);
417       uint32_t encoding = (0b111010 << 26);
418       assert(img_format <= 0x7F);
419       assert(!mtbuf.dlc || ctx.chip_class >= GFX10);
420       encoding |= (mtbuf.dlc ? 1 : 0) << 15; /* DLC bit replaces one bit of the OPCODE on GFX10 */
421       encoding |= (mtbuf.glc ? 1 : 0) << 14;
422       encoding |= (mtbuf.idxen ? 1 : 0) << 13;
423       encoding |= (mtbuf.offen ? 1 : 0) << 12;
424       encoding |= 0x0FFF & mtbuf.offset;
425       encoding |= (img_format << 19); /* Handles both the GFX10 FORMAT and the old NFMT+DFMT */
426 
427       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
428          encoding |= opcode << 15;
429       } else {
430          encoding |= (opcode & 0x07) << 16; /* 3 LSBs of 4-bit OPCODE */
431       }
432 
433       out.push_back(encoding);
434       encoding = 0;
435 
436       encoding |= instr->operands[2].physReg() << 24;
437       encoding |= (mtbuf.tfe ? 1 : 0) << 23;
438       encoding |= (mtbuf.slc ? 1 : 0) << 22;
439       encoding |= (instr->operands[0].physReg() >> 2) << 16;
440       unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
441                                                 : instr->definitions[0].physReg();
442       encoding |= (0xFF & reg) << 8;
443       encoding |= (0xFF & instr->operands[1].physReg());
444 
445       if (ctx.chip_class >= GFX10) {
446          encoding |= (((opcode & 0x08) >> 3) << 21); /* MSB of 4-bit OPCODE */
447       }
448 
449       out.push_back(encoding);
450       break;
451    }
452    case Format::MIMG: {
453       unsigned nsa_dwords = get_mimg_nsa_dwords(instr);
454       assert(!nsa_dwords || ctx.chip_class >= GFX10);
455 
456       MIMG_instruction& mimg = instr->mimg();
457       uint32_t encoding = (0b111100 << 26);
458       encoding |= mimg.slc ? 1 << 25 : 0;
459       encoding |= (opcode & 0x7f) << 18;
460       encoding |= (opcode >> 7) & 1;
461       encoding |= mimg.lwe ? 1 << 17 : 0;
462       encoding |= mimg.tfe ? 1 << 16 : 0;
463       encoding |= mimg.glc ? 1 << 13 : 0;
464       encoding |= mimg.unrm ? 1 << 12 : 0;
465       if (ctx.chip_class <= GFX9) {
466          assert(!mimg.dlc); /* Device-level coherent is not supported on GFX9 and lower */
467          assert(!mimg.r128);
468          encoding |= mimg.a16 ? 1 << 15 : 0;
469          encoding |= mimg.da ? 1 << 14 : 0;
470       } else {
471          encoding |= mimg.r128 ? 1 << 15
472                                : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
473          encoding |= nsa_dwords << 1;
474          encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
475          encoding |= mimg.dlc ? 1 << 7 : 0;
476       }
477       encoding |= (0xF & mimg.dmask) << 8;
478       out.push_back(encoding);
479       encoding = (0xFF & instr->operands[3].physReg()); /* VADDR */
480       if (!instr->definitions.empty()) {
481          encoding |= (0xFF & instr->definitions[0].physReg()) << 8; /* VDATA */
482       } else if (!instr->operands[2].isUndefined()) {
483          encoding |= (0xFF & instr->operands[2].physReg()) << 8; /* VDATA */
484       }
485       encoding |= (0x1F & (instr->operands[0].physReg() >> 2)) << 16; /* T# (resource) */
486       if (!instr->operands[1].isUndefined())
487          encoding |= (0x1F & (instr->operands[1].physReg() >> 2)) << 21; /* sampler */
488 
489       assert(!mimg.d16 || ctx.chip_class >= GFX9);
490       encoding |= mimg.d16 ? 1 << 31 : 0;
491       if (ctx.chip_class >= GFX10) {
492          /* GFX10: A16 still exists, but is in a different place */
493          encoding |= mimg.a16 ? 1 << 30 : 0;
494       }
495 
496       out.push_back(encoding);
497 
498       if (nsa_dwords) {
499          out.resize(out.size() + nsa_dwords);
500          std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);
501          for (unsigned i = 0; i < instr->operands.size() - 4u; i++)
502             nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);
503       }
504       break;
505    }
506    case Format::FLAT:
507    case Format::SCRATCH:
508    case Format::GLOBAL: {
509       FLAT_instruction& flat = instr->flatlike();
510       uint32_t encoding = (0b110111 << 26);
511       encoding |= opcode << 18;
512       if (ctx.chip_class <= GFX9) {
513          assert(flat.offset <= 0x1fff);
514          encoding |= flat.offset & 0x1fff;
515       } else if (instr->isFlat()) {
516          /* GFX10 has a 12-bit immediate OFFSET field, but a hardware bug
517           * (FlatSegmentOffsetBug) causes FLAT instructions to ignore it, so it must be zero.
518           */
519          assert(flat.offset == 0);
520       } else {
521          assert(flat.offset <= 0xfff);
522          encoding |= flat.offset & 0xfff;
523       }
524       if (instr->isScratch())
525          encoding |= 1 << 14;
526       else if (instr->isGlobal())
527          encoding |= 2 << 14;
528       encoding |= flat.lds ? 1 << 13 : 0;
529       encoding |= flat.glc ? 1 << 16 : 0;
530       encoding |= flat.slc ? 1 << 17 : 0;
531       if (ctx.chip_class >= GFX10) {
532          assert(!flat.nv);
533          encoding |= flat.dlc ? 1 << 12 : 0;
534       } else {
535          assert(!flat.dlc);
536       }
537       out.push_back(encoding);
538       encoding = (0xFF & instr->operands[0].physReg());
539       if (!instr->definitions.empty())
540          encoding |= (0xFF & instr->definitions[0].physReg()) << 24;
541       if (instr->operands.size() >= 3)
542          encoding |= (0xFF & instr->operands[2].physReg()) << 8;
543       if (!instr->operands[1].isUndefined()) {
544          assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);
545          assert(instr->format != Format::FLAT);
546          encoding |= instr->operands[1].physReg() << 16;
547       } else if (instr->format != Format::FLAT ||
548                  ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
549          if (ctx.chip_class <= GFX9)
550             encoding |= 0x7F << 16;
551          else
552             encoding |= sgpr_null << 16;
553       }
554       encoding |= flat.nv ? 1 << 23 : 0;
555       out.push_back(encoding);
556       break;
557    }
558    case Format::EXP: {
559       Export_instruction& exp = instr->exp();
560       uint32_t encoding;
561       if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9) {
562          encoding = (0b110001 << 26);
563       } else {
564          encoding = (0b111110 << 26);
565       }
566 
567       encoding |= exp.valid_mask ? 0b1 << 12 : 0;
568       encoding |= exp.done ? 0b1 << 11 : 0;
569       encoding |= exp.compressed ? 0b1 << 10 : 0;
570       encoding |= exp.dest << 4;
571       encoding |= exp.enabled_mask;
572       out.push_back(encoding);
573       encoding = 0xFF & exp.operands[0].physReg();
574       encoding |= (0xFF & exp.operands[1].physReg()) << 8;
575       encoding |= (0xFF & exp.operands[2].physReg()) << 16;
576       encoding |= (0xFF & exp.operands[3].physReg()) << 24;
577       out.push_back(encoding);
578       break;
579    }
580    case Format::PSEUDO:
581    case Format::PSEUDO_BARRIER:
582       if (instr->opcode != aco_opcode::p_unit_test)
583          unreachable("Pseudo instructions should be lowered before assembly.");
584       break;
585    default:
586       if (instr->isVOP3()) {
587          VOP3_instruction& vop3 = instr->vop3();
588 
589          if (instr->isVOP2()) {
590             opcode = opcode + 0x100;
591          } else if (instr->isVOP1()) {
592             if (ctx.chip_class == GFX8 || ctx.chip_class == GFX9)
593                opcode = opcode + 0x140;
594             else
595                opcode = opcode + 0x180;
596          } else if (instr->isVOPC()) {
597             opcode = opcode + 0x0;
598          } else if (instr->isVINTRP()) {
599             opcode = opcode + 0x270;
600          }
601 
602          uint32_t encoding;
603          if (ctx.chip_class <= GFX9) {
604             encoding = (0b110100 << 26);
605          } else if (ctx.chip_class >= GFX10) {
606             encoding = (0b110101 << 26);
607          } else {
608             unreachable("Unknown chip_class.");
609          }
610 
611          if (ctx.chip_class <= GFX7) {
612             encoding |= opcode << 17;
613             encoding |= (vop3.clamp ? 1 : 0) << 11;
614          } else {
615             encoding |= opcode << 16;
616             encoding |= (vop3.clamp ? 1 : 0) << 15;
617          }
618          encoding |= vop3.opsel << 11;
619          for (unsigned i = 0; i < 3; i++)
620             encoding |= vop3.abs[i] << (8 + i);
621          if (instr->definitions.size() == 2)
622             encoding |= instr->definitions[1].physReg() << 8;
623          encoding |= (0xFF & instr->definitions[0].physReg());
624          out.push_back(encoding);
625          encoding = 0;
626          if (instr->opcode == aco_opcode::v_interp_mov_f32) {
627             encoding = 0x3 & instr->operands[0].constantValue();
628          } else {
629             for (unsigned i = 0; i < instr->operands.size(); i++)
630                encoding |= instr->operands[i].physReg() << (i * 9);
631          }
632          encoding |= vop3.omod << 27;
633          for (unsigned i = 0; i < 3; i++)
634             encoding |= vop3.neg[i] << (29 + i);
635          out.push_back(encoding);
636 
637       } else if (instr->isVOP3P()) {
638          VOP3P_instruction& vop3 = instr->vop3p();
639 
640          uint32_t encoding;
641          if (ctx.chip_class == GFX9) {
642             encoding = (0b110100111 << 23);
643          } else if (ctx.chip_class >= GFX10) {
644             encoding = (0b110011 << 26);
645          } else {
646             unreachable("Unknown chip_class.");
647          }
648 
649          encoding |= opcode << 16;
650          encoding |= (vop3.clamp ? 1 : 0) << 15;
651          encoding |= vop3.opsel_lo << 11;
652          encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14;
653          for (unsigned i = 0; i < 3; i++)
654             encoding |= vop3.neg_hi[i] << (8 + i);
655          encoding |= (0xFF & instr->definitions[0].physReg());
656          out.push_back(encoding);
657          encoding = 0;
658          for (unsigned i = 0; i < instr->operands.size(); i++)
659             encoding |= instr->operands[i].physReg() << (i * 9);
660          encoding |= (vop3.opsel_hi & 0x3) << 27;
661          for (unsigned i = 0; i < 3; i++)
662             encoding |= vop3.neg_lo[i] << (29 + i);
663          out.push_back(encoding);
664 
665       } else if (instr->isDPP()) {
666          assert(ctx.chip_class >= GFX8);
667          DPP_instruction& dpp = instr->dpp();
668 
669          /* first emit the instruction without the DPP operand */
670          Operand dpp_op = instr->operands[0];
671          instr->operands[0] = Operand(PhysReg{250}, v1);
672          instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP);
673          emit_instruction(ctx, out, instr);
674          uint32_t encoding = (0xF & dpp.row_mask) << 28;
675          encoding |= (0xF & dpp.bank_mask) << 24;
676          encoding |= dpp.abs[1] << 23;
677          encoding |= dpp.neg[1] << 22;
678          encoding |= dpp.abs[0] << 21;
679          encoding |= dpp.neg[0] << 20;
680          if (ctx.chip_class >= GFX10)
681             encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */
682          encoding |= dpp.bound_ctrl << 19;
683          encoding |= dpp.dpp_ctrl << 8;
684          encoding |= (0xFF) & dpp_op.physReg();
685          out.push_back(encoding);
686          return;
687       } else if (instr->isSDWA()) {
688          SDWA_instruction& sdwa = instr->sdwa();
689 
690          /* first emit the instruction without the SDWA operand */
691          Operand sdwa_op = instr->operands[0];
692          instr->operands[0] = Operand(PhysReg{249}, v1);
693          instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA);
694          emit_instruction(ctx, out, instr);
695 
696          uint32_t encoding = 0;
697 
698          if (instr->isVOPC()) {
699             if (instr->definitions[0].physReg() != vcc) {
700                encoding |= instr->definitions[0].physReg() << 8;
701                encoding |= 1 << 15;
702             }
703             encoding |= (sdwa.clamp ? 1 : 0) << 13;
704          } else {
705             encoding |= sdwa.dst_sel.to_sdwa_sel(instr->definitions[0].physReg().byte()) << 8;
706             uint32_t dst_u = sdwa.dst_sel.sign_extend() ? 1 : 0;
707             if (instr->definitions[0].bytes() < 4) /* dst_preserve */
708                dst_u = 2;
709             encoding |= dst_u << 11;
710             encoding |= (sdwa.clamp ? 1 : 0) << 13;
711             encoding |= sdwa.omod << 14;
712          }
713 
714          encoding |= sdwa.sel[0].to_sdwa_sel(sdwa_op.physReg().byte()) << 16;
715          encoding |= sdwa.sel[0].sign_extend() ? 1 << 19 : 0;
716          encoding |= sdwa.abs[0] << 21;
717          encoding |= sdwa.neg[0] << 20;
718 
719          if (instr->operands.size() >= 2) {
720             encoding |= sdwa.sel[1].to_sdwa_sel(instr->operands[1].physReg().byte()) << 24;
721             encoding |= sdwa.sel[1].sign_extend() ? 1 << 27 : 0;
722             encoding |= sdwa.abs[1] << 29;
723             encoding |= sdwa.neg[1] << 28;
724          }
725 
726          encoding |= 0xFF & sdwa_op.physReg();
727          encoding |= (sdwa_op.physReg() < 256) << 23;
728          if (instr->operands.size() >= 2)
729             encoding |= (instr->operands[1].physReg() < 256) << 31;
730          out.push_back(encoding);
731       } else {
732          unreachable("unimplemented instruction format");
733       }
734       break;
735    }
736 
737    /* append literal dword */
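   /* An instruction can carry at most one 32-bit literal, so only the first literal
    * operand is appended and the loop stops there. */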
738    for (const Operand& op : instr->operands) {
739       if (op.isLiteral()) {
740          out.push_back(op.constantValue());
741          break;
742       }
743    }
744 }
745 
746 void
747 emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
748 {
749    for (aco_ptr<Instruction>& instr : block.instructions) {
750 #if 0
751       int start_idx = out.size();
752       std::cerr << "Encoding:\t" << std::endl;
753       aco_print_instr(&*instr, stderr);
754       std::cerr << std::endl;
755 #endif
756       emit_instruction(ctx, out, instr.get());
757 #if 0
758       for (int i = start_idx; i < out.size(); i++)
759          std::cerr << "encoding: " << "0x" << std::setfill('0') << std::setw(8) << std::hex << out[i] << std::endl;
760 #endif
761    }
762 }
763 
764 void
765 fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
766 {
767    bool exported = false;
768    for (Block& block : program->blocks) {
769       if (!(block.kind & block_kind_export_end))
770          continue;
771       std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
772       while (it != block.instructions.rend()) {
773          if ((*it)->isEXP()) {
774             Export_instruction& exp = (*it)->exp();
775             if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) {
776                if (exp.dest >= V_008DFC_SQ_EXP_POS && exp.dest <= (V_008DFC_SQ_EXP_POS + 3)) {
777                   exp.done = true;
778                   exported = true;
779                   break;
780                }
781             } else {
782                exp.done = true;
783                exp.valid_mask = true;
784                exported = true;
785                break;
786             }
787          } else if ((*it)->definitions.size() && (*it)->definitions[0].physReg() == exec)
788             break;
789          ++it;
790       }
791    }
792 
793    if (!exported) {
794       /* Abort in order to avoid a GPU hang. */
795       bool is_vertex_or_ngg =
796          (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
797       aco_err(program,
798               "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
799       aco_print_program(program, stderr);
800       abort();
801    }
802 }
803 
804 static void
805 insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
806             unsigned insert_count, const uint32_t* insert_data)
807 {
808    out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);
809 
810    /* Update the offset of each affected block */
811    for (Block& block : ctx.program->blocks) {
812       if (block.offset >= insert_before)
813          block.offset += insert_count;
814    }
815 
816    /* Find first branch after the inserted code */
817    auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(),
818                                  [insert_before](const auto& branch) -> bool
819                                  { return (unsigned)branch.first >= insert_before; });
820 
821    /* Update the locations of branches */
822    for (; branch_it != ctx.branches.end(); ++branch_it)
823       branch_it->first += insert_count;
824 
825    /* Update the locations of p_constaddr instructions */
826    for (auto& constaddr : ctx.constaddrs) {
827       constaddr_info& info = constaddr.second;
828       if (info.getpc_end >= insert_before)
829          info.getpc_end += insert_count;
830       if (info.add_literal >= insert_before)
831          info.add_literal += insert_count;
832    }
833 }
834 
835 static void
836 fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
837 {
838    /* Branches with an offset of 0x3f are buggy on GFX10,
839     * so we work around this by inserting NOPs where needed.
840     */
841    bool gfx10_3f_bug = false;
842 
843    do {
844       auto buggy_branch_it = std::find_if(
845          ctx.branches.begin(), ctx.branches.end(),
846          [&ctx](const auto& branch) -> bool {
847             return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) ==
848                    0x3f;
849          });
850 
851       gfx10_3f_bug = buggy_branch_it != ctx.branches.end();
852 
853       if (gfx10_3f_bug) {
854          /* Insert an s_nop after the branch */
855          constexpr uint32_t s_nop_0 = 0xbf800000u;
856          insert_code(ctx, out, buggy_branch_it->first + 1, 1, &s_nop_0);
857       }
858    } while (gfx10_3f_bug);
859 }
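/* Example of the workaround above: a branch at dword i whose target block starts at
 * dword i + 0x40 would encode an offset of 0x3f; inserting one s_nop directly after
 * the branch shifts the target (and all later offsets) by one dword, so the branch is
 * later re-encoded with offset 0x40 in fix_branches(). */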
860 
861 void
862 emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
863                std::vector<uint32_t>& out)
864 {
865    Builder bld(ctx.program);
866 
867    Definition def_tmp_lo(branch->definitions[0].physReg(), s1);
868    Operand op_tmp_lo(branch->definitions[0].physReg(), s1);
869    Definition def_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
870    Operand op_tmp_hi(branch->definitions[0].physReg().advance(4), s1);
871 
872    aco_ptr<Instruction> instr;
873 
874    if (branch->opcode != aco_opcode::s_branch) {
875       /* for conditional branches, skip the long jump if the condition is false */
876       aco_opcode inv;
877       switch (branch->opcode) {
878       case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break;
879       case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break;
880       case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break;
881       case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break;
882       case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break;
883       case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
884       default: unreachable("Unhandled long jump.");
885       }
886       instr.reset(bld.sopp(inv, -1, 7));
887       emit_instruction(ctx, out, instr.get());
888    }
889 
890    /* create the new PC and stash SCC in the LSB */
891    instr.reset(bld.sop1(aco_opcode::s_getpc_b64, branch->definitions[0]).instr);
892    emit_instruction(ctx, out, instr.get());
893 
894    instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);
895    instr->operands[1].setFixed(PhysReg{255}); /* this operand has to be a literal */
896    emit_instruction(ctx, out, instr.get());
897    branch->pass_flags = out.size();
898 
899    instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi,
900                         Operand::c32(backwards ? UINT32_MAX : 0u))
901                   .instr);
902    emit_instruction(ctx, out, instr.get());
903 
904    /* restore SCC and clear the LSB of the new PC */
905    instr.reset(bld.sopc(aco_opcode::s_bitcmp1_b32, def_tmp_lo, op_tmp_lo, Operand::zero()).instr);
906    emit_instruction(ctx, out, instr.get());
907    instr.reset(bld.sop1(aco_opcode::s_bitset0_b32, def_tmp_lo, Operand::zero()).instr);
908    emit_instruction(ctx, out, instr.get());
909 
910    /* create the s_setpc_b64 to jump */
911    instr.reset(
912       bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
913    emit_instruction(ctx, out, instr.get());
914 }
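/* The sequence emitted above:
 *   s_cbranch_<inverted> skip    ; conditional branches only: skip the 7 dwords below
 *   s_getpc_b64   tmp            ; PC of the instruction following the s_getpc_b64
 *   s_addc_u32    tmp_lo, tmp_lo, <literal>  ; literal patched in fix_branches();
 *                                            ; the SCC carry-in stashes SCC in bit 0
 *   s_addc_u32    tmp_hi, tmp_hi, -1 or 0    ; sign-extend the 32-bit byte offset
 *   s_bitcmp1_b32 tmp_lo, 0      ; restore SCC from the stashed bit
 *   s_bitset0_b32 tmp_lo, 0      ; clear the stashed bit
 *   s_setpc_b64   tmp            ; jump to the computed address */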
915 
916 void
917 fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
918 {
919    bool repeat = false;
920    do {
921       repeat = false;
922 
923       if (ctx.chip_class == GFX10)
924          fix_branches_gfx10(ctx, out);
925 
926       for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) {
927          int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
928          if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
929             std::vector<uint32_t> long_jump;
930             bool backwards =
931                ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
932             emit_long_jump(ctx, branch.second, backwards, long_jump);
933 
934             out[branch.first] = long_jump[0];
935             insert_code(ctx, out, branch.first + 1, long_jump.size() - 1, long_jump.data() + 1);
936 
937             repeat = true;
938             break;
939          }
940 
941          if (branch.second->pass_flags) {
942             int after_getpc = branch.first + branch.second->pass_flags - 2;
943             offset = (int)ctx.program->blocks[branch.second->block].offset - after_getpc;
944             out[branch.first + branch.second->pass_flags - 1] = offset * 4;
945          } else {
946             out[branch.first] &= 0xffff0000u;
947             out[branch.first] |= (uint16_t)offset;
948          }
949       }
950    } while (repeat);
951 }
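/* Branch SIMM16 offsets are signed dword counts relative to the instruction after the
 * branch, hence the "- 1" above. A branch whose target is out of SIMM16 range is
 * rewritten into the long-jump sequence, which shifts later code and therefore
 * restarts the whole fixup loop. */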
952 
953 void
954 fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
955 {
956    for (auto& constaddr : ctx.constaddrs) {
957       constaddr_info& info = constaddr.second;
958       out[info.add_literal] += (out.size() - info.getpc_end) * 4u;
959    }
960 }
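/* add_literal is the literal dword of the s_add_u32 emitted for p_constaddr_addlo;
 * the patch above adds the byte distance from the end of the s_getpc_b64 to the end
 * of the emitted code, which is where emit_program() appends the constant data. */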
961 
962 unsigned
963 emit_program(Program* program, std::vector<uint32_t>& code)
964 {
965    asm_context ctx(program);
966 
967    if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS ||
968        program->stage.hw == HWStage::NGG)
969       fix_exports(ctx, code, program);
970 
971    for (Block& block : program->blocks) {
972       block.offset = code.size();
973       emit_block(ctx, code, block);
974    }
975 
976    fix_branches(ctx, code);
977 
978    unsigned exec_size = code.size() * sizeof(uint32_t);
979 
980    if (program->chip_class >= GFX10) {
981       /* Pad output with s_code_end so instruction prefetching doesn't cause
982        * page faults */
983       unsigned final_size = align(code.size() + 3 * 16, 16);
984       while (code.size() < final_size)
985          code.push_back(0xbf9f0000u);
986    }
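   /* 0xbf9f0000 is s_code_end; at least 48 dwords (three 64-byte cache lines) of it
    * are appended and the final size is aligned to 16 dwords. */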
987 
988    fix_constaddrs(ctx, code);
989 
990    while (program->constant_data.size() % 4u)
991       program->constant_data.push_back(0);
992    /* Copy constant data */
993    code.insert(code.end(), (uint32_t*)program->constant_data.data(),
994                (uint32_t*)(program->constant_data.data() + program->constant_data.size()));
995 
996    return exec_size;
997 }
998 
999 } // namespace aco
1000