• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "aco_ir.h"
26 
27 #include "util/memstream.h"
28 #include "util/ralloc.h"
29 
30 #include <array>
31 #include <map>
32 #include <set>
33 #include <vector>
34 
35 namespace aco {
36 
37 static void
aco_log(Program * program,enum aco_compiler_debug_level level,const char * prefix,const char * file,unsigned line,const char * fmt,va_list args)38 aco_log(Program* program, enum aco_compiler_debug_level level, const char* prefix, const char* file,
39         unsigned line, const char* fmt, va_list args)
40 {
41    char* msg;
42 
43    if (program->debug.shorten_messages) {
44       msg = ralloc_vasprintf(NULL, fmt, args);
45    } else {
46       msg = ralloc_strdup(NULL, prefix);
47       ralloc_asprintf_append(&msg, "    In file %s:%u\n", file, line);
48       ralloc_asprintf_append(&msg, "    ");
49       ralloc_vasprintf_append(&msg, fmt, args);
50    }
51 
52    if (program->debug.func)
53       program->debug.func(program->debug.private_data, level, msg);
54 
55    fprintf(program->debug.output, "%s\n", msg);
56 
57    ralloc_free(msg);
58 }
59 
60 void
_aco_perfwarn(Program * program,const char * file,unsigned line,const char * fmt,...)61 _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...)
62 {
63    va_list args;
64 
65    va_start(args, fmt);
66    aco_log(program, ACO_COMPILER_DEBUG_LEVEL_PERFWARN, "ACO PERFWARN:\n", file, line, fmt, args);
67    va_end(args);
68 }
69 
70 void
_aco_err(Program * program,const char * file,unsigned line,const char * fmt,...)71 _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...)
72 {
73    va_list args;
74 
75    va_start(args, fmt);
76    aco_log(program, ACO_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args);
77    va_end(args);
78 }
79 
80 bool
validate_ir(Program * program)81 validate_ir(Program* program)
82 {
83    bool is_valid = true;
84    auto check = [&program, &is_valid](bool success, const char* msg,
85                                       aco::Instruction* instr) -> void
86    {
87       if (!success) {
88          char* out;
89          size_t outsize;
90          struct u_memstream mem;
91          u_memstream_open(&mem, &out, &outsize);
92          FILE* const memf = u_memstream_get(&mem);
93 
94          fprintf(memf, "%s: ", msg);
95          aco_print_instr(program->gfx_level, instr, memf);
96          u_memstream_close(&mem);
97 
98          aco_err(program, "%s", out);
99          free(out);
100 
101          is_valid = false;
102       }
103    };
104 
105    for (Block& block : program->blocks) {
106       for (aco_ptr<Instruction>& instr : block.instructions) {
107 
108          unsigned pck_defs = instr_info.definitions[(int)instr->opcode];
109          unsigned pck_ops = instr_info.operands[(int)instr->opcode];
110 
111          if (pck_defs != 0) {
112             /* Before GFX10 v_cmpx also writes VCC. */
113             if (instr->isVOPC() && program->gfx_level < GFX10 && pck_defs == exec_hi)
114                pck_defs = vcc | (exec_hi << 8);
115 
116             for (unsigned i = 0; i < 4; i++) {
117                uint32_t def = (pck_defs >> (i * 8)) & 0xff;
118                if (def == 0) {
119                   check(i == instr->definitions.size(), "Too many definitions", instr.get());
120                   break;
121                } else {
122                   check(i < instr->definitions.size(), "Too few definitions", instr.get());
123                   if (i >= instr->definitions.size())
124                      break;
125                }
126 
127                if (def == m0) {
128                   check(instr->definitions[i].isFixed() && instr->definitions[i].physReg() == m0,
129                         "Definition needs m0", instr.get());
130                } else if (def == scc) {
131                   check(instr->definitions[i].isFixed() && instr->definitions[i].physReg() == scc,
132                         "Definition needs scc", instr.get());
133                } else if (def == exec_hi) {
134                   RegClass rc = instr->isSALU() ? s2 : program->lane_mask;
135                   check(instr->definitions[i].isFixed() &&
136                            instr->definitions[i].physReg() == exec &&
137                            instr->definitions[i].regClass() == rc,
138                         "Definition needs exec", instr.get());
139                } else if (def == exec_lo) {
140                   check(instr->definitions[i].isFixed() &&
141                            instr->definitions[i].physReg() == exec_lo &&
142                            instr->definitions[i].regClass() == s1,
143                         "Definition needs exec_lo", instr.get());
144                } else if (def == vcc) {
145                   check(instr->definitions[i].regClass() == program->lane_mask,
146                         "Definition has to be lane mask", instr.get());
147                   check(!instr->definitions[i].isFixed() ||
148                            instr->definitions[i].physReg() == vcc || instr->isVOP3() ||
149                            instr->isSDWA(),
150                         "Definition has to be vcc", instr.get());
151                } else {
152                   check(instr->definitions[i].size() == def, "Definition has wrong size",
153                         instr.get());
154                }
155             }
156          }
157 
158          if (pck_ops != 0) {
159             for (unsigned i = 0; i < 4; i++) {
160                uint32_t op = (pck_ops >> (i * 8)) & 0xff;
161                if (op == 0) {
162                   check(i == instr->operands.size(), "Too many operands", instr.get());
163                   break;
164                } else {
165                   check(i < instr->operands.size(), "Too few operands", instr.get());
166                   if (i >= instr->operands.size())
167                      break;
168                }
169 
170                if (op == m0) {
171                   check(instr->operands[i].isFixed() && instr->operands[i].physReg() == m0,
172                         "Operand needs m0", instr.get());
173                } else if (op == scc) {
174                   check(instr->operands[i].isFixed() && instr->operands[i].physReg() == scc,
175                         "Operand needs scc", instr.get());
176                } else if (op == exec_hi) {
177                   RegClass rc = instr->isSALU() ? s2 : program->lane_mask;
178                   check(instr->operands[i].isFixed() && instr->operands[i].physReg() == exec &&
179                            instr->operands[i].hasRegClass() && instr->operands[i].regClass() == rc,
180                         "Operand needs exec", instr.get());
181                } else if (op == exec_lo) {
182                   check(instr->operands[i].isFixed() && instr->operands[i].physReg() == exec_lo &&
183                            instr->operands[i].hasRegClass() && instr->operands[i].regClass() == s1,
184                         "Operand needs exec_lo", instr.get());
185                } else if (op == vcc) {
186                   check(instr->operands[i].hasRegClass() &&
187                            instr->operands[i].regClass() == program->lane_mask,
188                         "Operand has to be lane mask", instr.get());
189                   check(!instr->operands[i].isFixed() || instr->operands[i].physReg() == vcc ||
190                            instr->isVOP3(),
191                         "Operand has to be vcc", instr.get());
192                } else {
193                   check(instr->operands[i].size() == op ||
194                            (instr->operands[i].isFixed() && instr->operands[i].physReg() >= 128 &&
195                             instr->operands[i].physReg() < 256),
196                         "Operand has wrong size", instr.get());
197                }
198             }
199          }
200 
201          /* check base format */
202          Format base_format = instr->format;
203          base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::SDWA);
204          base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP16);
205          base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP8);
206          if ((uint32_t)base_format & (uint32_t)Format::VOP1)
207             base_format = Format::VOP1;
208          else if ((uint32_t)base_format & (uint32_t)Format::VOP2)
209             base_format = Format::VOP2;
210          else if ((uint32_t)base_format & (uint32_t)Format::VOPC)
211             base_format = Format::VOPC;
212          else if (base_format == Format::VINTRP) {
213             if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
214                 instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
215                 instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
216                 instr->opcode == aco_opcode::v_interp_p2_f16) {
217                /* v_interp_*_fp16 are considered VINTRP by the compiler but
218                 * they are emitted as VOP3.
219                 */
220                base_format = Format::VOP3;
221             } else {
222                base_format = Format::VINTRP;
223             }
224          }
225          check(base_format == instr_info.format[(int)instr->opcode],
226                "Wrong base format for instruction", instr.get());
227 
228          /* check VOP3 modifiers */
229          if (instr->isVOP3() && withoutDPP(instr->format) != Format::VOP3) {
230             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
231                      base_format == Format::VOPC || base_format == Format::VINTRP,
232                   "Format cannot have VOP3/VOP3B applied", instr.get());
233          }
234 
235          if (instr->isDPP()) {
236             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
237                      base_format == Format::VOPC || base_format == Format::VOP3 ||
238                      base_format == Format::VOP3P,
239                   "Format cannot have DPP applied", instr.get());
240             check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
241                   "VOP3+DPP is GFX11+ only", instr.get());
242 
243             bool fi =
244                instr->isDPP8() ? instr->dpp8().fetch_inactive : instr->dpp16().fetch_inactive;
245             check(!fi || program->gfx_level >= GFX10, "DPP Fetch-Inactive is GFX10+ only",
246                   instr.get());
247          }
248 
249          /* check SDWA */
250          if (instr->isSDWA()) {
251             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
252                      base_format == Format::VOPC,
253                   "Format cannot have SDWA applied", instr.get());
254 
255             check(program->gfx_level >= GFX8, "SDWA is GFX8 to GFX10.3 only", instr.get());
256             check(program->gfx_level < GFX11, "SDWA is GFX8 to GFX10.3 only", instr.get());
257 
258             SDWA_instruction& sdwa = instr->sdwa();
259             check(sdwa.omod == 0 || program->gfx_level >= GFX9, "SDWA omod only supported on GFX9+",
260                   instr.get());
261             if (base_format == Format::VOPC) {
262                check(sdwa.clamp == false || program->gfx_level == GFX8,
263                      "SDWA VOPC clamp only supported on GFX8", instr.get());
264                check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
265                         program->gfx_level >= GFX9,
266                      "SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
267             } else {
268                const Definition& def = instr->definitions[0];
269                check(def.bytes() <= 4, "SDWA definitions must not be larger than 4 bytes",
270                      instr.get());
271                check(def.bytes() >= sdwa.dst_sel.size() + sdwa.dst_sel.offset(),
272                      "SDWA definition selection size must be at most definition size", instr.get());
273                check(
274                   sdwa.dst_sel.size() == 1 || sdwa.dst_sel.size() == 2 || sdwa.dst_sel.size() == 4,
275                   "SDWA definition selection size must be 1, 2 or 4 bytes", instr.get());
276                check(sdwa.dst_sel.offset() % sdwa.dst_sel.size() == 0, "Invalid selection offset",
277                      instr.get());
278                check(def.bytes() == 4 || def.bytes() == sdwa.dst_sel.size(),
279                      "SDWA dst_sel size must be definition size for subdword definitions",
280                      instr.get());
281                check(def.bytes() == 4 || sdwa.dst_sel.offset() == 0,
282                      "SDWA dst_sel offset must be 0 for subdword definitions", instr.get());
283             }
284 
285             for (unsigned i = 0; i < std::min<unsigned>(2, instr->operands.size()); i++) {
286                const Operand& op = instr->operands[i];
287                check(op.bytes() <= 4, "SDWA operands must not be larger than 4 bytes", instr.get());
288                check(op.bytes() >= sdwa.sel[i].size() + sdwa.sel[i].offset(),
289                      "SDWA operand selection size must be at most operand size", instr.get());
290                check(sdwa.sel[i].size() == 1 || sdwa.sel[i].size() == 2 || sdwa.sel[i].size() == 4,
291                      "SDWA operand selection size must be 1, 2 or 4 bytes", instr.get());
292                check(sdwa.sel[i].offset() % sdwa.sel[i].size() == 0, "Invalid selection offset",
293                      instr.get());
294             }
295             if (instr->operands.size() >= 3) {
296                check(instr->operands[2].isFixed() && instr->operands[2].physReg() == vcc,
297                      "3rd operand must be fixed to vcc with SDWA", instr.get());
298             }
299             if (instr->definitions.size() >= 2) {
300                check(instr->definitions[1].isFixed() && instr->definitions[1].physReg() == vcc,
301                      "2nd definition must be fixed to vcc with SDWA", instr.get());
302             }
303 
304             const bool sdwa_opcodes =
305                instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 &&
306                instr->opcode != aco_opcode::v_fmamk_f32 &&
307                instr->opcode != aco_opcode::v_fmaak_f32 &&
308                instr->opcode != aco_opcode::v_fmamk_f16 &&
309                instr->opcode != aco_opcode::v_fmaak_f16 &&
310                instr->opcode != aco_opcode::v_madmk_f32 &&
311                instr->opcode != aco_opcode::v_madak_f32 &&
312                instr->opcode != aco_opcode::v_madmk_f16 &&
313                instr->opcode != aco_opcode::v_madak_f16 &&
314                instr->opcode != aco_opcode::v_readfirstlane_b32 &&
315                instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
316 
317             const bool feature_mac =
318                program->gfx_level == GFX8 &&
319                (instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16);
320 
321             check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
322          }
323 
324          /* check opsel */
325          if (instr->opcode == aco_opcode::v_permlane16_b32 ||
326              instr->opcode == aco_opcode::v_permlanex16_b32) {
327             check(instr->valu().opsel <= 0x3, "Unexpected opsel for permlane", instr.get());
328          } else if (instr->isVOP3() || instr->isVOP1() || instr->isVOP2() || instr->isVOPC()) {
329             VALU_instruction& valu = instr->valu();
330             check(valu.opsel == 0 || program->gfx_level >= GFX9, "Opsel is only supported on GFX9+",
331                   instr.get());
332             check(valu.opsel == 0 || instr->format == Format::VOP3 || program->gfx_level >= GFX11,
333                   "Opsel is only supported for VOP3 before GFX11", instr.get());
334 
335             for (unsigned i = 0; i < 3; i++) {
336                if (i >= instr->operands.size() ||
337                    (!instr->isVOP3() && !instr->operands[i].isOfType(RegType::vgpr)) ||
338                    (instr->operands[i].hasRegClass() &&
339                     instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
340                   check(!valu.opsel[i], "Unexpected opsel for operand", instr.get());
341             }
342             if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed())
343                check(!valu.opsel[3], "Unexpected opsel for sub-dword definition", instr.get());
344          } else if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
345                     instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
346                     instr->opcode == aco_opcode::v_fma_mix_f32) {
347             check(instr->definitions[0].regClass() ==
348                      (instr->opcode == aco_opcode::v_fma_mix_f32 ? v1 : v2b),
349                   "v_fma_mix_f32/v_fma_mix_f16 must have v1/v2b definition", instr.get());
350          } else if (instr->isVOP3P()) {
351             VALU_instruction& vop3p = instr->valu();
352             for (unsigned i = 0; i < instr->operands.size(); i++) {
353                if (instr->operands[i].hasRegClass() &&
354                    instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())
355                   check(!vop3p.opsel_lo[i] && !vop3p.opsel_hi[i],
356                         "Unexpected opsel for subdword operand", instr.get());
357             }
358             check(instr->definitions[0].regClass() == v1 ||
359                      instr_info.classes[(int)instr->opcode] == instr_class::wmma,
360                   "VOP3P must have v1 definition", instr.get());
361          }
362 
363          /* check for undefs */
364          for (unsigned i = 0; i < instr->operands.size(); i++) {
365             if (instr->operands[i].isUndefined()) {
366                bool flat = instr->isFlatLike();
367                bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
368                                    instr->opcode == aco_opcode::p_create_vector ||
369                                    instr->opcode == aco_opcode::p_jump_to_epilog ||
370                                    instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
371                                    instr->opcode == aco_opcode::p_end_with_regs ||
372                                    (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
373                                    (instr->opcode == aco_opcode::p_bpermute_permlane && i == 0) ||
374                                    (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
375                                    ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
376                                    (instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||
377                                    (instr->opcode == aco_opcode::p_init_scratch && i == 0);
378                check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
379             } else {
380                check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
381                         instr->operands[i].isConstant(),
382                      "Uninitialized Operand", instr.get());
383             }
384          }
385 
386          /* check subdword definitions */
387          for (unsigned i = 0; i < instr->definitions.size(); i++) {
388             if (instr->definitions[i].regClass().is_subdword())
389                check(instr->definitions[i].bytes() <= 4 || instr->isPseudo() || instr->isVMEM(),
390                      "Only Pseudo and VMEM instructions can write subdword registers > 4 bytes",
391                      instr.get());
392          }
393 
394          if ((instr->isSALU() && instr->opcode != aco_opcode::p_constaddr_addlo &&
395               instr->opcode != aco_opcode::p_resumeaddr_addlo) ||
396              instr->isVALU()) {
397             /* check literals */
398             Operand literal(s1);
399             for (unsigned i = 0; i < instr->operands.size(); i++) {
400                Operand op = instr->operands[i];
401                if (!op.isLiteral())
402                   continue;
403 
404                check(!instr->isDPP() && !instr->isSDWA() &&
405                         (!instr->isVOP3() || program->gfx_level >= GFX10) &&
406                         (!instr->isVOP3P() || program->gfx_level >= GFX10),
407                      "Literal applied on wrong instruction format", instr.get());
408 
409                check(literal.isUndefined() || (literal.size() == op.size() &&
410                                                literal.constantValue() == op.constantValue()),
411                      "Only 1 Literal allowed", instr.get());
412                literal = op;
413                check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2,
414                      "Wrong source position for Literal argument", instr.get());
415             }
416 
417             /* check num sgprs for VALU */
418             if (instr->isVALU()) {
419                bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
420                                  instr->opcode == aco_opcode::v_lshrrev_b64 ||
421                                  instr->opcode == aco_opcode::v_ashrrev_i64;
422                unsigned const_bus_limit = 1;
423                if (program->gfx_level >= GFX10 && !is_shift64)
424                   const_bus_limit = 2;
425 
426                uint32_t scalar_mask =
427                   instr->isVOP3() || instr->isVOP3P() || instr->isVINTERP_INREG() ? 0x7 : 0x5;
428                if (instr->isSDWA())
429                   scalar_mask = program->gfx_level >= GFX9 ? 0x7 : 0x4;
430                else if (instr->isDPP())
431                   scalar_mask = 0x4;
432 
433                if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
434                    instr->opcode == aco_opcode::v_readlane_b32 ||
435                    instr->opcode == aco_opcode::v_readlane_b32_e64) {
436                   check(instr->definitions[0].regClass().type() == RegType::sgpr,
437                         "Wrong Definition type for VALU instruction", instr.get());
438                } else {
439                   check(instr->definitions[0].regClass().type() == RegType::vgpr,
440                         "Wrong Definition type for VALU instruction", instr.get());
441                }
442 
443                unsigned num_sgprs = 0;
444                unsigned sgpr[] = {0, 0};
445                for (unsigned i = 0; i < instr->operands.size(); i++) {
446                   Operand op = instr->operands[i];
447                   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
448                       instr->opcode == aco_opcode::v_readlane_b32 ||
449                       instr->opcode == aco_opcode::v_readlane_b32_e64) {
450                      check(i != 1 || op.isOfType(RegType::sgpr) || op.isConstant(),
451                            "Must be a SGPR or a constant", instr.get());
452                      check(i == 1 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
453                            "Wrong Operand type for VALU instruction", instr.get());
454                      continue;
455                   }
456                   if (instr->opcode == aco_opcode::v_permlane16_b32 ||
457                       instr->opcode == aco_opcode::v_permlanex16_b32 ||
458                       instr->opcode == aco_opcode::v_permlane64_b32) {
459                      check(i != 0 || op.isOfType(RegType::vgpr),
460                            "Operand 0 of v_permlane must be VGPR", instr.get());
461                      check(i == 0 || op.isOfType(RegType::sgpr) || op.isConstant(),
462                            "Lane select operands of v_permlane must be SGPR or constant",
463                            instr.get());
464                   }
465 
466                   if (instr->opcode == aco_opcode::v_writelane_b32 ||
467                       instr->opcode == aco_opcode::v_writelane_b32_e64) {
468                      check(i != 2 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
469                            "Wrong Operand type for VALU instruction", instr.get());
470                      check(i == 2 || op.isOfType(RegType::sgpr) || op.isConstant(),
471                            "Must be a SGPR or a constant", instr.get());
472                      continue;
473                   }
474                   if (op.isOfType(RegType::sgpr)) {
475                      check(scalar_mask & (1 << i), "Wrong source position for SGPR argument",
476                            instr.get());
477 
478                      if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
479                         if (num_sgprs < 2)
480                            sgpr[num_sgprs++] = op.tempId();
481                      }
482                   }
483 
484                   if (op.isConstant() && !op.isLiteral())
485                      check(scalar_mask & (1 << i), "Wrong source position for constant argument",
486                            instr.get());
487                }
488                check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit,
489                      "Too many SGPRs/literals", instr.get());
490 
491                /* Validate modifiers. */
492                check(!instr->valu().opsel || instr->isVOP3() || instr->isVOP1() ||
493                         instr->isVOP2() || instr->isVOPC() || instr->isVINTERP_INREG(),
494                      "OPSEL set for unsupported instruction format", instr.get());
495                check(!instr->valu().opsel_lo || instr->isVOP3P(),
496                      "OPSEL_LO set for unsupported instruction format", instr.get());
497                check(!instr->valu().opsel_hi || instr->isVOP3P(),
498                      "OPSEL_HI set for unsupported instruction format", instr.get());
499                check(!instr->valu().omod || instr->isVOP3() || instr->isSDWA(),
500                      "OMOD set for unsupported instruction format", instr.get());
501                check(!instr->valu().clamp || instr->isVOP3() || instr->isVOP3P() ||
502                         instr->isSDWA() || instr->isVINTERP_INREG(),
503                      "CLAMP set for unsupported instruction format", instr.get());
504 
505                for (bool abs : instr->valu().abs) {
506                   check(!abs || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
507                            instr->isDPP16(),
508                         "ABS/NEG_HI set for unsupported instruction format", instr.get());
509                }
510                for (bool neg : instr->valu().neg) {
511                   check(!neg || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
512                            instr->isDPP16() || instr->isVINTERP_INREG(),
513                         "NEG/NEG_LO set for unsupported instruction format", instr.get());
514                }
515             }
516 
517             if (instr->isSOP1() || instr->isSOP2()) {
518                if (!instr->definitions.empty())
519                   check(instr->definitions[0].regClass().type() == RegType::sgpr,
520                         "Wrong Definition type for SALU instruction", instr.get());
521                for (const Operand& op : instr->operands) {
522                   check(op.isConstant() || op.isOfType(RegType::sgpr),
523                         "Wrong Operand type for SALU instruction", instr.get());
524                }
525             }
526          }
527 
528          switch (instr->format) {
529          case Format::PSEUDO: {
530             if (instr->opcode == aco_opcode::p_create_vector) {
531                unsigned size = 0;
532                for (const Operand& op : instr->operands) {
533                   check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
534                   size += op.bytes();
535                }
536                check(size == instr->definitions[0].bytes(),
537                      "Definition size does not match operand sizes", instr.get());
538                if (instr->definitions[0].regClass().type() == RegType::sgpr) {
539                   for (const Operand& op : instr->operands) {
540                      check(op.isConstant() || op.regClass().type() == RegType::sgpr,
541                            "Wrong Operand type for scalar vector", instr.get());
542                   }
543                }
544             } else if (instr->opcode == aco_opcode::p_extract_vector) {
545                check(!instr->operands[0].isConstant() && instr->operands[1].isConstant(),
546                      "Wrong Operand types", instr.get());
547                check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <=
548                         instr->operands[0].bytes(),
549                      "Index out of range", instr.get());
550                check(instr->definitions[0].regClass().type() == RegType::vgpr ||
551                         instr->operands[0].regClass().type() == RegType::sgpr,
552                      "Cannot extract SGPR value from VGPR vector", instr.get());
553                check(program->gfx_level >= GFX9 ||
554                         !instr->definitions[0].regClass().is_subdword() ||
555                         instr->operands[0].regClass().type() == RegType::vgpr,
556                      "Cannot extract subdword from SGPR before GFX9+", instr.get());
557             } else if (instr->opcode == aco_opcode::p_split_vector) {
558                check(!instr->operands[0].isConstant(), "Operand must not be constant", instr.get());
559                unsigned size = 0;
560                for (const Definition& def : instr->definitions) {
561                   size += def.bytes();
562                }
563                check(size == instr->operands[0].bytes(),
564                      "Operand size does not match definition sizes", instr.get());
565                if (instr->operands[0].isOfType(RegType::vgpr)) {
566                   for (const Definition& def : instr->definitions)
567                      check(def.regClass().type() == RegType::vgpr,
568                            "Wrong Definition type for VGPR split_vector", instr.get());
569                } else {
570                   for (const Definition& def : instr->definitions)
571                      check(program->gfx_level >= GFX9 || !def.regClass().is_subdword(),
572                            "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
573                }
574             } else if (instr->opcode == aco_opcode::p_parallelcopy) {
575                check(instr->definitions.size() == instr->operands.size(),
576                      "Number of Operands does not match number of Definitions", instr.get());
577                for (unsigned i = 0; i < instr->operands.size(); i++) {
578                   check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
579                         "Operand and Definition size must match", instr.get());
580                   if (instr->operands[i].hasRegClass()) {
581                      check((instr->definitions[i].regClass().type() ==
582                             instr->operands[i].regClass().type()) ||
583                               (instr->definitions[i].regClass().type() == RegType::vgpr &&
584                                instr->operands[i].regClass().type() == RegType::sgpr),
585                            "Operand and Definition types do not match", instr.get());
586                      check(instr->definitions[i].regClass().is_linear_vgpr() ==
587                               instr->operands[i].regClass().is_linear_vgpr(),
588                            "Operand and Definition types do not match", instr.get());
589                   } else {
590                      check(!instr->definitions[i].regClass().is_linear_vgpr(),
591                            "Can only copy linear VGPRs into linear VGPRs, not constant/undef",
592                            instr.get());
593                   }
594                }
595             } else if (instr->opcode == aco_opcode::p_phi) {
596                check(instr->operands.size() == block.logical_preds.size(),
597                      "Number of Operands does not match number of predecessors", instr.get());
598                check(instr->definitions[0].regClass().type() == RegType::vgpr,
599                      "Logical Phi Definition must be vgpr", instr.get());
600                for (const Operand& op : instr->operands)
601                   check(instr->definitions[0].size() == op.size(),
602                         "Operand sizes must match Definition size", instr.get());
603             } else if (instr->opcode == aco_opcode::p_linear_phi) {
604                for (const Operand& op : instr->operands) {
605                   check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type",
606                         instr.get());
607                   check(instr->definitions[0].size() == op.size(),
608                         "Operand sizes must match Definition size", instr.get());
609                }
610                check(instr->operands.size() == block.linear_preds.size(),
611                      "Number of Operands does not match number of predecessors", instr.get());
612             } else if (instr->opcode == aco_opcode::p_extract ||
613                        instr->opcode == aco_opcode::p_insert) {
614                check(!instr->operands[0].isConstant(), "Data operand must not be constant",
615                      instr.get());
616                check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
617                if (instr->opcode == aco_opcode::p_extract)
618                   check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
619                         instr.get());
620 
621                check(instr->definitions[0].regClass().type() != RegType::sgpr ||
622                         instr->operands[0].regClass().type() == RegType::sgpr,
623                      "Can't extract/insert VGPR to SGPR", instr.get());
624 
625                if (instr->opcode == aco_opcode::p_insert)
626                   check(instr->operands[0].bytes() == instr->definitions[0].bytes(),
627                         "Sizes of p_insert data operand and definition must match", instr.get());
628 
629                if (instr->definitions[0].regClass().type() == RegType::sgpr)
630                   check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
631                            instr->definitions[1].physReg() == scc,
632                         "SGPR extract/insert needs an SCC definition", instr.get());
633 
634                unsigned data_bits = instr->operands[0].bytes() * 8u;
635                unsigned op_bits = instr->operands[2].constantValue();
636 
637                if (instr->opcode == aco_opcode::p_insert) {
638                   check(op_bits == 8 || op_bits == 16, "Size must be 8 or 16", instr.get());
639                   check(op_bits < data_bits, "Size must be smaller than source", instr.get());
640                } else if (instr->opcode == aco_opcode::p_extract) {
641                   check(op_bits == 8 || op_bits == 16 || op_bits == 32,
642                         "Size must be 8 or 16 or 32", instr.get());
643                   check(data_bits >= op_bits, "Can't extract more bits than what the data has.",
644                         instr.get());
645                }
646 
647                unsigned comp = data_bits / MAX2(op_bits, 1);
648                check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
649                      instr.get());
650             } else if (instr->opcode == aco_opcode::p_jump_to_epilog) {
651                check(instr->definitions.size() == 0, "p_jump_to_epilog must have 0 definitions",
652                      instr.get());
653                check(instr->operands.size() > 0 && instr->operands[0].isOfType(RegType::sgpr) &&
654                         instr->operands[0].size() == 2,
655                      "First operand of p_jump_to_epilog must be a SGPR", instr.get());
656                for (unsigned i = 1; i < instr->operands.size(); i++) {
657                   check(instr->operands[i].isOfType(RegType::vgpr) ||
658                            instr->operands[i].isOfType(RegType::sgpr) ||
659                            instr->operands[i].isUndefined(),
660                         "Other operands of p_jump_to_epilog must be VGPRs, SGPRs or undef",
661                         instr.get());
662                }
663             } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) {
664                check(instr->definitions.size() == 6,
665                      "p_dual_src_export_gfx11 must have 6 definitions", instr.get());
666                check(instr->definitions[2].regClass() == program->lane_mask,
667                      "Third definition of p_dual_src_export_gfx11 must be a lane mask",
668                      instr.get());
669                check(instr->definitions[3].regClass() == program->lane_mask,
670                      "Fourth definition of p_dual_src_export_gfx11 must be a lane mask",
671                      instr.get());
672                check(instr->definitions[4].physReg() == vcc,
673                      "Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get());
674                check(instr->definitions[5].physReg() == scc,
675                      "Sixth definition of p_dual_src_export_gfx11 must be scc", instr.get());
676                check(instr->operands.size() == 8, "p_dual_src_export_gfx11 must have 8 operands",
677                      instr.get());
678                for (unsigned i = 0; i < instr->operands.size(); i++) {
679                   check(
680                      instr->operands[i].isOfType(RegType::vgpr) || instr->operands[i].isUndefined(),
681                      "Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
682                }
683             } else if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
684                check(instr->definitions.size() == 1, "Must have one definition", instr.get());
685                check(instr->operands.size() <= 1, "Must have one or zero operands", instr.get());
686                if (!instr->definitions.empty())
687                   check(instr->definitions[0].regClass().is_linear_vgpr(),
688                         "Definition must be linear VGPR", instr.get());
689                if (!instr->definitions.empty() && !instr->operands.empty())
690                   check(instr->definitions[0].bytes() == instr->operands[0].bytes(),
691                         "Operand size must match definition", instr.get());
692             }
693             break;
694          }
695          case Format::PSEUDO_REDUCTION: {
696             for (const Operand& op : instr->operands)
697                check(op.regClass().type() == RegType::vgpr,
698                      "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
699                      instr.get());
700 
701             if (instr->opcode == aco_opcode::p_reduce &&
702                 instr->reduction().cluster_size == program->wave_size)
703                check(instr->definitions[0].regClass().type() == RegType::sgpr ||
704                         program->wave_size == 32,
705                      "The result of unclustered reductions must go into an SGPR.", instr.get());
706             else
707                check(instr->definitions[0].regClass().type() == RegType::vgpr,
708                      "The result of scans and clustered reductions must go into a VGPR.",
709                      instr.get());
710 
711             break;
712          }
713          case Format::SMEM: {
714             if (instr->operands.size() >= 1)
715                check(instr->operands[0].isOfType(RegType::sgpr), "SMEM operands must be sgpr",
716                      instr.get());
717             if (instr->operands.size() >= 2)
718                check(instr->operands[1].isConstant() || instr->operands[1].isOfType(RegType::sgpr),
719                      "SMEM offset must be constant or sgpr", instr.get());
720             if (!instr->definitions.empty())
721                check(instr->definitions[0].regClass().type() == RegType::sgpr,
722                      "SMEM result must be sgpr", instr.get());
723             break;
724          }
725          case Format::MTBUF:
726          case Format::MUBUF: {
727             check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
728                   instr.get());
729             check(instr->operands[1].isOfType(RegType::vgpr),
730                   "VADDR must be in vgpr for VMEM instructions", instr.get());
731             check(instr->operands[0].isOfType(RegType::sgpr), "VMEM resource constant must be sgpr",
732                   instr.get());
733             check(instr->operands.size() < 4 || instr->operands[3].isOfType(RegType::vgpr),
734                   "VMEM write data must be vgpr", instr.get());
735 
736             const bool d16 =
737                instr->opcode ==
738                   aco_opcode::buffer_load_dword || // FIXME: used to spill subdword variables
739                instr->opcode == aco_opcode::buffer_load_ubyte ||
740                instr->opcode == aco_opcode::buffer_load_sbyte ||
741                instr->opcode == aco_opcode::buffer_load_ushort ||
742                instr->opcode == aco_opcode::buffer_load_sshort ||
743                instr->opcode == aco_opcode::buffer_load_ubyte_d16 ||
744                instr->opcode == aco_opcode::buffer_load_ubyte_d16_hi ||
745                instr->opcode == aco_opcode::buffer_load_sbyte_d16 ||
746                instr->opcode == aco_opcode::buffer_load_sbyte_d16_hi ||
747                instr->opcode == aco_opcode::buffer_load_short_d16 ||
748                instr->opcode == aco_opcode::buffer_load_short_d16_hi ||
749                instr->opcode == aco_opcode::buffer_load_format_d16_x ||
750                instr->opcode == aco_opcode::buffer_load_format_d16_hi_x ||
751                instr->opcode == aco_opcode::buffer_load_format_d16_xy ||
752                instr->opcode == aco_opcode::buffer_load_format_d16_xyz ||
753                instr->opcode == aco_opcode::buffer_load_format_d16_xyzw ||
754                instr->opcode == aco_opcode::tbuffer_load_format_d16_x ||
755                instr->opcode == aco_opcode::tbuffer_load_format_d16_xy ||
756                instr->opcode == aco_opcode::tbuffer_load_format_d16_xyz ||
757                instr->opcode == aco_opcode::tbuffer_load_format_d16_xyzw;
758             if (instr->definitions.size()) {
759                check(instr->definitions[0].regClass().type() == RegType::vgpr,
760                      "VMEM definitions[0] (VDATA) must be VGPR", instr.get());
761                check(d16 || !instr->definitions[0].regClass().is_subdword(),
762                      "Only D16 opcodes can load subdword values.", instr.get());
763                check(instr->definitions[0].bytes() <= 8 || !d16,
764                      "D16 opcodes can only load up to 8 bytes.", instr.get());
765             }
766             break;
767          }
768          case Format::MIMG: {
769             check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
770                   instr.get());
771             check(instr->operands[0].hasRegClass() &&
772                      (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
773                   "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
774             if (instr->operands[1].hasRegClass())
775                check(instr->operands[1].regClass() == s4,
776                      "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
777             if (!instr->operands[2].isUndefined()) {
778                bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
779                                  instr->opcode == aco_opcode::image_atomic_fcmpswap;
780                check(instr->definitions.empty() ||
781                         (instr->definitions[0].regClass() == instr->operands[2].regClass() ||
782                          is_cmpswap),
783                      "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
784                      "TFE/LWE loads",
785                      instr.get());
786             }
787 
788             if (instr->mimg().strict_wqm) {
789                check(instr->operands[3].hasRegClass() &&
790                         instr->operands[3].regClass().is_linear_vgpr(),
791                      "MIMG operands[3] must be temp linear VGPR.", instr.get());
792 
793                unsigned total_size = 0;
794                for (unsigned i = 4; i < instr->operands.size(); i++) {
795                   check(instr->operands[i].hasRegClass() && instr->operands[i].regClass() == v1,
796                         "MIMG operands[4+] (VADDR) must be v1", instr.get());
797                   total_size += instr->operands[i].bytes();
798                }
799                check(total_size <= instr->operands[3].bytes(),
800                      "MIMG operands[4+] must fit within operands[3].", instr.get());
801             } else {
802                check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
803                      "NSA is only supported on GFX10+", instr.get());
804                for (unsigned i = 3; i < instr->operands.size(); i++) {
805                   check(instr->operands[i].hasRegClass() &&
806                            instr->operands[i].regClass().type() == RegType::vgpr,
807                         "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
808                   if (instr->operands.size() > 4) {
809                      if (program->gfx_level < GFX11) {
810                         check(instr->operands[i].regClass() == v1,
811                               "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
812                      } else {
813                         if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
814                             instr->opcode != aco_opcode::image_bvh64_intersect_ray && i < 7) {
815                            check(instr->operands[i].regClass() == v1,
816                                  "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
817                         }
818                      }
819                   }
820                }
821             }
822 
823             if (instr->definitions.size()) {
824                check(instr->definitions[0].regClass().type() == RegType::vgpr,
825                      "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
826                check(instr->mimg().d16 || !instr->definitions[0].regClass().is_subdword(),
827                      "Only D16 MIMG instructions can load subdword values.", instr.get());
828                check(instr->definitions[0].bytes() <= 8 || !instr->mimg().d16,
829                      "D16 MIMG instructions can only load up to 8 bytes.", instr.get());
830             }
831             break;
832          }
833          case Format::DS: {
834             for (const Operand& op : instr->operands) {
835                check(op.isOfType(RegType::vgpr) || op.physReg() == m0 || op.isUndefined(),
836                      "Only VGPRs are valid DS instruction operands", instr.get());
837             }
838             if (!instr->definitions.empty())
839                check(instr->definitions[0].regClass().type() == RegType::vgpr,
840                      "DS instruction must return VGPR", instr.get());
841             break;
842          }
843          case Format::EXP: {
844             for (unsigned i = 0; i < 4; i++)
845                check(instr->operands[i].isOfType(RegType::vgpr),
846                      "Only VGPRs are valid Export arguments", instr.get());
847             break;
848          }
849          case Format::FLAT:
850             check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
851                   instr.get());
852             FALLTHROUGH;
853          case Format::GLOBAL:
854             check(instr->operands[0].isOfType(RegType::vgpr), "FLAT/GLOBAL address must be vgpr",
855                   instr.get());
856             FALLTHROUGH;
857          case Format::SCRATCH: {
858             check(instr->operands[0].isOfType(RegType::vgpr),
859                   "FLAT/GLOBAL/SCRATCH address must be undefined or vgpr", instr.get());
860             check(instr->operands[1].isOfType(RegType::sgpr),
861                   "FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
862             if (instr->format == Format::SCRATCH && program->gfx_level < GFX10_3)
863                check(!instr->operands[0].isUndefined() || !instr->operands[1].isUndefined(),
864                      "SCRATCH must have either SADDR or ADDR operand", instr.get());
865             if (!instr->definitions.empty())
866                check(instr->definitions[0].regClass().type() == RegType::vgpr,
867                      "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
868             else
869                check(instr->operands[2].isOfType(RegType::vgpr),
870                      "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
871             break;
872          }
873          case Format::LDSDIR: {
874             check(instr->definitions.size() == 1 && instr->definitions[0].regClass() == v1,
875                   "LDSDIR must have an v1 definition", instr.get());
876             check(instr->operands.size() == 1, "LDSDIR must have an operand", instr.get());
877             if (!instr->operands.empty()) {
878                check(instr->operands[0].regClass() == s1, "LDSDIR must have an s1 operand",
879                      instr.get());
880                check(instr->operands[0].isFixed() && instr->operands[0].physReg() == m0,
881                      "LDSDIR must have an operand fixed to m0", instr.get());
882             }
883             break;
884          }
885          default: break;
886          }
887       }
888    }
889 
890    return is_valid;
891 }
892 
893 bool
validate_cfg(Program * program)894 validate_cfg(Program* program)
895 {
896    if (!(debug_flags & DEBUG_VALIDATE_IR))
897       return true;
898 
899    bool is_valid = true;
900    auto check_block = [&program, &is_valid](bool success, const char* msg,
901                                             aco::Block* block) -> void
902    {
903       if (!success) {
904          aco_err(program, "%s: BB%u", msg, block->index);
905          is_valid = false;
906       }
907    };
908 
909    /* validate CFG */
910    for (unsigned i = 0; i < program->blocks.size(); i++) {
911       Block& block = program->blocks[i];
912       check_block(block.index == i, "block.index must match actual index", &block);
913 
914       /* predecessors/successors should be sorted */
915       for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
916          check_block(block.linear_preds[j] < block.linear_preds[j + 1],
917                      "linear predecessors must be sorted", &block);
918       for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
919          check_block(block.logical_preds[j] < block.logical_preds[j + 1],
920                      "logical predecessors must be sorted", &block);
921       for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
922          check_block(block.linear_succs[j] < block.linear_succs[j + 1],
923                      "linear successors must be sorted", &block);
924       for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
925          check_block(block.logical_succs[j] < block.logical_succs[j + 1],
926                      "logical successors must be sorted", &block);
927 
928       /* critical edges are not allowed */
929       if (block.linear_preds.size() > 1) {
930          for (unsigned pred : block.linear_preds)
931             check_block(program->blocks[pred].linear_succs.size() == 1,
932                         "linear critical edges are not allowed", &program->blocks[pred]);
933          for (unsigned pred : block.logical_preds)
934             check_block(program->blocks[pred].logical_succs.size() == 1,
935                         "logical critical edges are not allowed", &program->blocks[pred]);
936       }
937    }
938 
939    return is_valid;
940 }
941 
942 /* RA validation */
943 namespace {
944 
945 struct Location {
Locationaco::__anon191fc7c00311::Location946    Location() : block(NULL), instr(NULL) {}
947 
948    Block* block;
949    Instruction* instr; // NULL if it's the block's live-in
950 };
951 
952 struct Assignment {
953    Location defloc;
954    Location firstloc;
955    PhysReg reg;
956    bool valid;
957 };
958 
959 bool
ra_fail(Program * program,Location loc,Location loc2,const char * fmt,...)960 ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
961 {
962    va_list args;
963    va_start(args, fmt);
964    char msg[1024];
965    vsprintf(msg, fmt, args);
966    va_end(args);
967 
968    char* out;
969    size_t outsize;
970    struct u_memstream mem;
971    u_memstream_open(&mem, &out, &outsize);
972    FILE* const memf = u_memstream_get(&mem);
973 
974    fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
975    if (loc.instr) {
976       aco_print_instr(program->gfx_level, loc.instr, memf);
977       fprintf(memf, "\n%s", msg);
978    } else {
979       fprintf(memf, "%s", msg);
980    }
981    if (loc2.block) {
982       fprintf(memf, " in BB%d:\n", loc2.block->index);
983       aco_print_instr(program->gfx_level, loc2.instr, memf);
984    }
985    fprintf(memf, "\n\n");
986    u_memstream_close(&mem);
987 
988    aco_err(program, "%s", out);
989    free(out);
990 
991    return true;
992 }
993 
994 bool
validate_subdword_operand(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr,unsigned index)995 validate_subdword_operand(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
996                           unsigned index)
997 {
998    Operand op = instr->operands[index];
999    unsigned byte = op.physReg().byte();
1000 
1001    if (instr->opcode == aco_opcode::p_as_uniform)
1002       return byte == 0;
1003    if (instr->isPseudo() && gfx_level >= GFX8)
1004       return true;
1005    if (instr->isSDWA())
1006       return byte + instr->sdwa().sel[index].offset() + instr->sdwa().sel[index].size() <= 4 &&
1007              byte % instr->sdwa().sel[index].size() == 0;
1008    if (instr->isVOP3P()) {
1009       bool fma_mix = instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
1010                      instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
1011                      instr->opcode == aco_opcode::v_fma_mix_f32;
1012       return instr->valu().opsel_lo[index] == (byte >> 1) &&
1013              instr->valu().opsel_hi[index] == (fma_mix || (byte >> 1));
1014    }
1015    if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, index))
1016       return true;
1017 
1018    switch (instr->opcode) {
1019    case aco_opcode::v_cvt_f32_ubyte1:
1020       if (byte == 1)
1021          return true;
1022       break;
1023    case aco_opcode::v_cvt_f32_ubyte2:
1024       if (byte == 2)
1025          return true;
1026       break;
1027    case aco_opcode::v_cvt_f32_ubyte3:
1028       if (byte == 3)
1029          return true;
1030       break;
1031    case aco_opcode::ds_write_b8_d16_hi:
1032    case aco_opcode::ds_write_b16_d16_hi:
1033       if (byte == 2 && index == 1)
1034          return true;
1035       break;
1036    case aco_opcode::buffer_store_byte_d16_hi:
1037    case aco_opcode::buffer_store_short_d16_hi:
1038    case aco_opcode::buffer_store_format_d16_hi_x:
1039       if (byte == 2 && index == 3)
1040          return true;
1041       break;
1042    case aco_opcode::flat_store_byte_d16_hi:
1043    case aco_opcode::flat_store_short_d16_hi:
1044    case aco_opcode::scratch_store_byte_d16_hi:
1045    case aco_opcode::scratch_store_short_d16_hi:
1046    case aco_opcode::global_store_byte_d16_hi:
1047    case aco_opcode::global_store_short_d16_hi:
1048       if (byte == 2 && index == 2)
1049          return true;
1050       break;
1051    default: break;
1052    }
1053 
1054    return byte == 0;
1055 }
1056 
1057 bool
validate_subdword_definition(amd_gfx_level gfx_level,const aco_ptr<Instruction> & instr)1058 validate_subdword_definition(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr)
1059 {
1060    Definition def = instr->definitions[0];
1061    unsigned byte = def.physReg().byte();
1062 
1063    if (instr->isPseudo() && gfx_level >= GFX8)
1064       return true;
1065    if (instr->isSDWA())
1066       return byte + instr->sdwa().dst_sel.offset() + instr->sdwa().dst_sel.size() <= 4 &&
1067              byte % instr->sdwa().dst_sel.size() == 0;
1068    if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, -1))
1069       return true;
1070 
1071    switch (instr->opcode) {
1072    case aco_opcode::v_fma_mixhi_f16:
1073    case aco_opcode::buffer_load_ubyte_d16_hi:
1074    case aco_opcode::buffer_load_sbyte_d16_hi:
1075    case aco_opcode::buffer_load_short_d16_hi:
1076    case aco_opcode::buffer_load_format_d16_hi_x:
1077    case aco_opcode::flat_load_ubyte_d16_hi:
1078    case aco_opcode::flat_load_short_d16_hi:
1079    case aco_opcode::scratch_load_ubyte_d16_hi:
1080    case aco_opcode::scratch_load_short_d16_hi:
1081    case aco_opcode::global_load_ubyte_d16_hi:
1082    case aco_opcode::global_load_short_d16_hi:
1083    case aco_opcode::ds_read_u8_d16_hi:
1084    case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
1085    default: break;
1086    }
1087 
1088    return byte == 0;
1089 }
1090 
1091 unsigned
get_subdword_bytes_written(Program * program,const aco_ptr<Instruction> & instr,unsigned index)1092 get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, unsigned index)
1093 {
1094    amd_gfx_level gfx_level = program->gfx_level;
1095    Definition def = instr->definitions[index];
1096 
1097    if (instr->isPseudo())
1098       return gfx_level >= GFX8 ? def.bytes() : def.size() * 4u;
1099    if (instr->isVALU()) {
1100       assert(def.bytes() <= 2);
1101       if (instr->isSDWA())
1102          return instr->sdwa().dst_sel.size();
1103 
1104       if (instr_is_16bit(gfx_level, instr->opcode))
1105          return 2;
1106 
1107       return 4;
1108    }
1109 
1110    if (instr->isMIMG()) {
1111       assert(instr->mimg().d16);
1112       return program->dev.sram_ecc_enabled ? def.size() * 4u : def.bytes();
1113    }
1114 
1115    switch (instr->opcode) {
1116    case aco_opcode::buffer_load_ubyte_d16:
1117    case aco_opcode::buffer_load_sbyte_d16:
1118    case aco_opcode::buffer_load_short_d16:
1119    case aco_opcode::buffer_load_format_d16_x:
1120    case aco_opcode::tbuffer_load_format_d16_x:
1121    case aco_opcode::flat_load_ubyte_d16:
1122    case aco_opcode::flat_load_short_d16:
1123    case aco_opcode::scratch_load_ubyte_d16:
1124    case aco_opcode::scratch_load_short_d16:
1125    case aco_opcode::global_load_ubyte_d16:
1126    case aco_opcode::global_load_short_d16:
1127    case aco_opcode::ds_read_u8_d16:
1128    case aco_opcode::ds_read_u16_d16:
1129    case aco_opcode::buffer_load_ubyte_d16_hi:
1130    case aco_opcode::buffer_load_sbyte_d16_hi:
1131    case aco_opcode::buffer_load_short_d16_hi:
1132    case aco_opcode::buffer_load_format_d16_hi_x:
1133    case aco_opcode::flat_load_ubyte_d16_hi:
1134    case aco_opcode::flat_load_short_d16_hi:
1135    case aco_opcode::scratch_load_ubyte_d16_hi:
1136    case aco_opcode::scratch_load_short_d16_hi:
1137    case aco_opcode::global_load_ubyte_d16_hi:
1138    case aco_opcode::global_load_short_d16_hi:
1139    case aco_opcode::ds_read_u8_d16_hi:
1140    case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
1141    case aco_opcode::buffer_load_format_d16_xyz:
1142    case aco_opcode::tbuffer_load_format_d16_xyz: return program->dev.sram_ecc_enabled ? 8 : 6;
1143    default: return def.size() * 4;
1144    }
1145 }
1146 
1147 bool
validate_instr_defs(Program * program,std::array<unsigned,2048> & regs,const std::vector<Assignment> & assignments,const Location & loc,aco_ptr<Instruction> & instr)1148 validate_instr_defs(Program* program, std::array<unsigned, 2048>& regs,
1149                     const std::vector<Assignment>& assignments, const Location& loc,
1150                     aco_ptr<Instruction>& instr)
1151 {
1152    bool err = false;
1153 
1154    for (unsigned i = 0; i < instr->definitions.size(); i++) {
1155       Definition& def = instr->definitions[i];
1156       if (!def.isTemp())
1157          continue;
1158       Temp tmp = def.getTemp();
1159       PhysReg reg = assignments[tmp.id()].reg;
1160       for (unsigned j = 0; j < tmp.bytes(); j++) {
1161          if (regs[reg.reg_b + j])
1162             err |=
1163                ra_fail(program, loc, assignments[regs[reg.reg_b + j]].defloc,
1164                        "Assignment of element %d of %%%d already taken by %%%d from instruction", i,
1165                        tmp.id(), regs[reg.reg_b + j]);
1166          regs[reg.reg_b + j] = tmp.id();
1167       }
1168       if (def.regClass().is_subdword() && def.bytes() < 4) {
1169          unsigned written = get_subdword_bytes_written(program, instr, i);
1170          /* If written=4, the instruction still might write the upper half. In that case, it's
1171           * the lower half that isn't preserved */
1172          for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
1173             unsigned written_reg = reg.reg() * 4u + j;
1174             if (regs[written_reg] && regs[written_reg] != def.tempId())
1175                err |= ra_fail(program, loc, assignments[regs[written_reg]].defloc,
1176                               "Assignment of element %d of %%%d overwrites the full register "
1177                               "taken by %%%d from instruction",
1178                               i, tmp.id(), regs[written_reg]);
1179          }
1180       }
1181    }
1182 
1183    for (const Definition& def : instr->definitions) {
1184       if (!def.isTemp())
1185          continue;
1186       if (def.isKill()) {
1187          for (unsigned j = 0; j < def.getTemp().bytes(); j++)
1188             regs[def.physReg().reg_b + j] = 0;
1189       }
1190    }
1191 
1192    return err;
1193 }
1194 
1195 } /* end namespace */
1196 
1197 bool
validate_ra(Program * program)1198 validate_ra(Program* program)
1199 {
1200    if (!(debug_flags & DEBUG_VALIDATE_RA))
1201       return false;
1202 
1203    bool err = false;
1204    aco::live live_vars = aco::live_var_analysis(program);
1205    std::vector<std::vector<Temp>> phi_sgpr_ops(program->blocks.size());
1206    uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves);
1207 
1208    std::vector<Assignment> assignments(program->peekAllocationId());
1209    for (Block& block : program->blocks) {
1210       Location loc;
1211       loc.block = &block;
1212       for (aco_ptr<Instruction>& instr : block.instructions) {
1213          if (instr->opcode == aco_opcode::p_phi) {
1214             for (unsigned i = 0; i < instr->operands.size(); i++) {
1215                if (instr->operands[i].isTemp() &&
1216                    instr->operands[i].getTemp().type() == RegType::sgpr &&
1217                    instr->operands[i].isFirstKill())
1218                   phi_sgpr_ops[block.logical_preds[i]].emplace_back(instr->operands[i].getTemp());
1219             }
1220          }
1221 
1222          loc.instr = instr.get();
1223          for (unsigned i = 0; i < instr->operands.size(); i++) {
1224             Operand& op = instr->operands[i];
1225             if (!op.isTemp())
1226                continue;
1227             if (!op.isFixed())
1228                err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
1229             if (assignments[op.tempId()].valid && assignments[op.tempId()].reg != op.physReg())
1230                err |=
1231                   ra_fail(program, loc, assignments[op.tempId()].firstloc,
1232                           "Operand %d has an inconsistent register assignment with instruction", i);
1233             if ((op.getTemp().type() == RegType::vgpr &&
1234                  op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
1235                 (op.getTemp().type() == RegType::sgpr &&
1236                  op.physReg() + op.size() > program->config->num_sgprs &&
1237                  op.physReg() < sgpr_limit))
1238                err |= ra_fail(program, loc, assignments[op.tempId()].firstloc,
1239                               "Operand %d has an out-of-bounds register assignment", i);
1240             if (op.physReg() == vcc && !program->needs_vcc)
1241                err |= ra_fail(program, loc, Location(),
1242                               "Operand %d fixed to vcc but needs_vcc=false", i);
1243             if (op.regClass().is_subdword() &&
1244                 !validate_subdword_operand(program->gfx_level, instr, i))
1245                err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
1246             if (!assignments[op.tempId()].firstloc.block)
1247                assignments[op.tempId()].firstloc = loc;
1248             if (!assignments[op.tempId()].defloc.block) {
1249                assignments[op.tempId()].reg = op.physReg();
1250                assignments[op.tempId()].valid = true;
1251             }
1252          }
1253 
1254          for (unsigned i = 0; i < instr->definitions.size(); i++) {
1255             Definition& def = instr->definitions[i];
1256             if (!def.isTemp())
1257                continue;
1258             if (!def.isFixed())
1259                err |=
1260                   ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
1261             if (assignments[def.tempId()].defloc.block)
1262                err |= ra_fail(program, loc, assignments[def.tempId()].defloc,
1263                               "Temporary %%%d also defined by instruction", def.tempId());
1264             if ((def.getTemp().type() == RegType::vgpr &&
1265                  def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
1266                 (def.getTemp().type() == RegType::sgpr &&
1267                  def.physReg() + def.size() > program->config->num_sgprs &&
1268                  def.physReg() < sgpr_limit))
1269                err |= ra_fail(program, loc, assignments[def.tempId()].firstloc,
1270                               "Definition %d has an out-of-bounds register assignment", i);
1271             if (def.physReg() == vcc && !program->needs_vcc)
1272                err |= ra_fail(program, loc, Location(),
1273                               "Definition %d fixed to vcc but needs_vcc=false", i);
1274             if (def.regClass().is_subdword() &&
1275                 !validate_subdword_definition(program->gfx_level, instr))
1276                err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
1277             if (!assignments[def.tempId()].firstloc.block)
1278                assignments[def.tempId()].firstloc = loc;
1279             assignments[def.tempId()].defloc = loc;
1280             assignments[def.tempId()].reg = def.physReg();
1281             assignments[def.tempId()].valid = true;
1282          }
1283       }
1284    }
1285 
1286    for (Block& block : program->blocks) {
1287       Location loc;
1288       loc.block = &block;
1289 
1290       std::array<unsigned, 2048> regs; /* register file in bytes */
1291       regs.fill(0);
1292 
1293       IDSet live = live_vars.live_out[block.index];
1294       /* remove killed p_phi sgpr operands */
1295       for (Temp tmp : phi_sgpr_ops[block.index])
1296          live.erase(tmp.id());
1297 
1298       /* check live out */
1299       for (unsigned id : live) {
1300          Temp tmp(id, program->temp_rc[id]);
1301          PhysReg reg = assignments[id].reg;
1302          for (unsigned i = 0; i < tmp.bytes(); i++) {
1303             if (regs[reg.reg_b + i]) {
1304                err |= ra_fail(program, loc, Location(),
1305                               "Assignment of element %d of %%%d already taken by %%%d in live-out",
1306                               i, id, regs[reg.reg_b + i]);
1307             }
1308             regs[reg.reg_b + i] = id;
1309          }
1310       }
1311       regs.fill(0);
1312 
1313       for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); ++it) {
1314          aco_ptr<Instruction>& instr = *it;
1315 
1316          /* check killed p_phi sgpr operands */
1317          if (instr->opcode == aco_opcode::p_logical_end) {
1318             for (Temp tmp : phi_sgpr_ops[block.index]) {
1319                PhysReg reg = assignments[tmp.id()].reg;
1320                for (unsigned i = 0; i < tmp.bytes(); i++) {
1321                   if (regs[reg.reg_b + i])
1322                      err |= ra_fail(
1323                         program, loc, Location(),
1324                         "Assignment of element %d of %%%d already taken by %%%d in live-out", i,
1325                         tmp.id(), regs[reg.reg_b + i]);
1326                }
1327                live.insert(tmp.id());
1328             }
1329          }
1330 
1331          for (const Definition& def : instr->definitions) {
1332             if (!def.isTemp())
1333                continue;
1334             live.erase(def.tempId());
1335          }
1336 
1337          /* don't count phi operands as live-in, since they are actually
1338           * killed when they are copied at the predecessor */
1339          if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) {
1340             for (const Operand& op : instr->operands) {
1341                if (!op.isTemp())
1342                   continue;
1343                live.insert(op.tempId());
1344             }
1345          }
1346       }
1347 
1348       for (unsigned id : live) {
1349          Temp tmp(id, program->temp_rc[id]);
1350          PhysReg reg = assignments[id].reg;
1351          for (unsigned i = 0; i < tmp.bytes(); i++)
1352             regs[reg.reg_b + i] = id;
1353       }
1354 
1355       for (aco_ptr<Instruction>& instr : block.instructions) {
1356          loc.instr = instr.get();
1357 
1358          /* remove killed p_phi operands from regs */
1359          if (instr->opcode == aco_opcode::p_logical_end) {
1360             for (Temp tmp : phi_sgpr_ops[block.index]) {
1361                PhysReg reg = assignments[tmp.id()].reg;
1362                for (unsigned i = 0; i < tmp.bytes(); i++)
1363                   regs[reg.reg_b + i] = 0;
1364             }
1365          }
1366 
1367          if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) {
1368             for (const Operand& op : instr->operands) {
1369                if (!op.isTemp())
1370                   continue;
1371                if (op.isFirstKillBeforeDef()) {
1372                   for (unsigned j = 0; j < op.getTemp().bytes(); j++)
1373                      regs[op.physReg().reg_b + j] = 0;
1374                }
1375             }
1376          }
1377 
1378          if (!instr->isBranch() || block.linear_succs.size() != 1)
1379             err |= validate_instr_defs(program, regs, assignments, loc, instr);
1380 
1381          if (!is_phi(instr)) {
1382             for (const Operand& op : instr->operands) {
1383                if (!op.isTemp())
1384                   continue;
1385                if (op.isLateKill() && op.isFirstKill()) {
1386                   for (unsigned j = 0; j < op.getTemp().bytes(); j++)
1387                      regs[op.physReg().reg_b + j] = 0;
1388                }
1389             }
1390          } else if (block.linear_preds.size() != 1 ||
1391                     program->blocks[block.linear_preds[0]].linear_succs.size() == 1) {
1392             for (unsigned pred : block.linear_preds) {
1393                aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
1394                assert(br->isBranch());
1395                err |= validate_instr_defs(program, regs, assignments, loc, br);
1396             }
1397          }
1398       }
1399    }
1400 
1401    return err;
1402 }
1403 } // namespace aco
1404