1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "aco_ir.h"
8
9 #include "util/memstream.h"
10 #include "util/ralloc.h"
11
12 #include <array>
13 #include <map>
14 #include <set>
15 #include <vector>
16
17 namespace aco {
18
19 static void
20 aco_log(Program* program, enum aco_compiler_debug_level level, const char* prefix, const char* file,
21 unsigned line, const char* fmt, va_list args)
22 {
23 char* msg;
24
25 if (program->debug.shorten_messages) {
26 msg = ralloc_vasprintf(NULL, fmt, args);
27 } else {
28 msg = ralloc_strdup(NULL, prefix);
29 ralloc_asprintf_append(&msg, " In file %s:%u\n", file, line);
30 ralloc_asprintf_append(&msg, " ");
31 ralloc_vasprintf_append(&msg, fmt, args);
32 }
33
34 if (program->debug.func)
35 program->debug.func(program->debug.private_data, level, msg);
36
37 fprintf(program->debug.output, "%s\n", msg);
38
39 ralloc_free(msg);
40 }
41
42 void
43 _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...)
44 {
45 va_list args;
46
47 va_start(args, fmt);
48 aco_log(program, ACO_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args);
49 va_end(args);
50 }
51
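/* Validates structural IR rules: SSA dominance, register-class consistency, and
 * per-format restrictions on operands, definitions and modifiers. */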
52 bool
53 validate_ir(Program* program)
54 {
55 bool is_valid = true;
56 auto check = [&program, &is_valid](bool success, const char* msg,
57 aco::Instruction* instr) -> void
58 {
59 if (!success) {
60 char* out;
61 size_t outsize;
62 struct u_memstream mem;
63 u_memstream_open(&mem, &out, &outsize);
64 FILE* const memf = u_memstream_get(&mem);
65
66 fprintf(memf, "%s: ", msg);
67 aco_print_instr(program->gfx_level, instr, memf);
68 u_memstream_close(&mem);
69
70 aco_err(program, "%s", out);
71 free(out);
72
73 is_valid = false;
74 }
75 };
76
77 /* check reachability */
78 if (program->progress < CompilationProgress::after_lower_to_hw) {
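/* Maps each temp id to the index of its defining block, plus a flag that is set
 * once the defining instruction has been passed during the second walk
 * (used for same-block dominance checks). */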
79 std::map<uint32_t, std::pair<uint32_t, bool>> def_blocks;
80 for (Block& block : program->blocks) {
81 for (aco_ptr<Instruction>& instr : block.instructions) {
82 for (Definition def : instr->definitions) {
83 if (!def.isTemp())
84 continue;
85 check(!def_blocks.count(def.tempId()), "Temporary defined twice", instr.get());
86 def_blocks[def.tempId()] = std::make_pair(block.index, false);
87 }
88 }
89 }
90
91 for (Block& block : program->blocks) {
92 for (aco_ptr<Instruction>& instr : block.instructions) {
93 for (unsigned i = 0; i < instr->operands.size(); i++) {
94 Operand op = instr->operands[i];
95 if (!op.isTemp())
96 continue;
97
98 uint32_t use_block_idx = block.index;
99 if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_boolean_phi)
100 use_block_idx = block.logical_preds[i];
101 else if (instr->opcode == aco_opcode::p_linear_phi)
102 use_block_idx = block.linear_preds[i];
103
104 auto it = def_blocks.find(op.tempId());
105 if (it != def_blocks.end()) {
106 Block& def_block = program->blocks[it->second.first];
107 Block& use_block = program->blocks[use_block_idx];
108 bool dominates =
109 def_block.index == use_block_idx
110 ? (use_block_idx == block.index ? it->second.second : true)
111 : (op.regClass().is_linear() ? dominates_linear(def_block, use_block)
112 : dominates_logical(def_block, use_block));
113 if (!dominates) {
114 char msg[256];
115 snprintf(msg, sizeof(msg), "Definition of %%%u does not dominate use",
116 op.tempId());
117 check(false, msg, instr.get());
118 }
119 } else {
120 char msg[256];
121 snprintf(msg, sizeof(msg), "%%%u never defined", op.tempId());
122 check(false, msg, instr.get());
123 }
124 }
125
126 for (Definition def : instr->definitions) {
127 if (def.isTemp())
128 def_blocks[def.tempId()].second = true;
129 }
130 }
131 }
132 }
133
134 for (Block& block : program->blocks) {
135 for (aco_ptr<Instruction>& instr : block.instructions) {
136
137 if (program->progress < CompilationProgress::after_lower_to_hw) {
138 for (const Operand& op : instr->operands)
139 check(!op.isTemp() || op.regClass() == program->temp_rc[op.tempId()],
140 "Operand RC not consistent.", instr.get());
141
142 for (const Definition& def : instr->definitions)
143 check(!def.isTemp() || def.regClass() == program->temp_rc[def.tempId()],
144 "Definition RC not consistent.", instr.get());
145 }
146
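/* instr_info.definitions/operands pack up to four per-byte descriptors: either a
 * fixed register (m0, scc, exec, vcc) or the expected size in dwords; a zero byte
 * terminates the list. */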
147 unsigned pck_defs = instr_info.definitions[(int)instr->opcode];
148 unsigned pck_ops = instr_info.operands[(int)instr->opcode];
149
150 if (pck_defs != 0) {
151 /* Before GFX10 v_cmpx also writes VCC. */
152 if (instr->isVOPC() && program->gfx_level < GFX10 && pck_defs == exec_hi)
153 pck_defs = vcc | (exec_hi << 8);
154
155 for (unsigned i = 0; i < 4; i++) {
156 uint32_t def = (pck_defs >> (i * 8)) & 0xff;
157 if (def == 0) {
158 check(i == instr->definitions.size(), "Too many definitions", instr.get());
159 break;
160 } else {
161 check(i < instr->definitions.size(), "Too few definitions", instr.get());
162 if (i >= instr->definitions.size())
163 break;
164 }
165
166 if (def == m0) {
167 check(instr->definitions[i].isFixed() && instr->definitions[i].physReg() == m0,
168 "Definition needs m0", instr.get());
169 } else if (def == scc) {
170 check(instr->definitions[i].isFixed() && instr->definitions[i].physReg() == scc,
171 "Definition needs scc", instr.get());
172 } else if (def == exec_hi) {
173 RegClass rc = instr->isSALU() ? s2 : program->lane_mask;
174 check(instr->definitions[i].isFixed() &&
175 instr->definitions[i].physReg() == exec &&
176 instr->definitions[i].regClass() == rc,
177 "Definition needs exec", instr.get());
178 } else if (def == exec_lo) {
179 check(instr->definitions[i].isFixed() &&
180 instr->definitions[i].physReg() == exec_lo &&
181 instr->definitions[i].regClass() == s1,
182 "Definition needs exec_lo", instr.get());
183 } else if (def == vcc) {
184 check(instr->definitions[i].regClass() == program->lane_mask,
185 "Definition has to be lane mask", instr.get());
186 check(!instr->definitions[i].isFixed() ||
187 instr->definitions[i].physReg() == vcc || instr->isVOP3() ||
188 instr->isSDWA(),
189 "Definition has to be vcc", instr.get());
190 } else {
191 check(instr->definitions[i].size() == def, "Definition has wrong size",
192 instr.get());
193 }
194 }
195 }
196
197 if (pck_ops != 0) {
198 for (unsigned i = 0; i < 4; i++) {
199 uint32_t op = (pck_ops >> (i * 8)) & 0xff;
200 if (op == 0) {
201 check(i == instr->operands.size(), "Too many operands", instr.get());
202 break;
203 } else {
204 check(i < instr->operands.size(), "Too few operands", instr.get());
205 if (i >= instr->operands.size())
206 break;
207 }
208
209 if (op == m0) {
210 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == m0,
211 "Operand needs m0", instr.get());
212 } else if (op == scc) {
213 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == scc,
214 "Operand needs scc", instr.get());
215 } else if (op == exec_hi) {
216 RegClass rc = instr->isSALU() ? s2 : program->lane_mask;
217 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == exec &&
218 instr->operands[i].hasRegClass() && instr->operands[i].regClass() == rc,
219 "Operand needs exec", instr.get());
220 } else if (op == exec_lo) {
221 check(instr->operands[i].isFixed() && instr->operands[i].physReg() == exec_lo &&
222 instr->operands[i].hasRegClass() && instr->operands[i].regClass() == s1,
223 "Operand needs exec_lo", instr.get());
224 } else if (op == vcc) {
225 check(instr->operands[i].hasRegClass() &&
226 instr->operands[i].regClass() == program->lane_mask,
227 "Operand has to be lane mask", instr.get());
228 check(!instr->operands[i].isFixed() || instr->operands[i].physReg() == vcc ||
229 instr->isVOP3(),
230 "Operand has to be vcc", instr.get());
231 } else {
232 check(instr->operands[i].size() == op ||
233 (instr->operands[i].isFixed() && instr->operands[i].physReg() >= 128 &&
234 instr->operands[i].physReg() < 256),
235 "Operand has wrong size", instr.get());
236 }
237 }
238 }
239
240 /* check base format */
241 Format base_format = instr->format;
242 base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::SDWA);
243 base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP16);
244 base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP8);
245 if ((uint32_t)base_format & (uint32_t)Format::VOP1)
246 base_format = Format::VOP1;
247 else if ((uint32_t)base_format & (uint32_t)Format::VOP2)
248 base_format = Format::VOP2;
249 else if ((uint32_t)base_format & (uint32_t)Format::VOPC)
250 base_format = Format::VOPC;
251 else if (base_format == Format::VINTRP) {
252 if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
253 instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
254 instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
255 instr->opcode == aco_opcode::v_interp_p2_f16 ||
256 instr->opcode == aco_opcode::v_interp_p2_hi_f16) {
257 /* v_interp_*_f16 are considered VINTRP by the compiler but
258 * they are emitted as VOP3.
259 */
260 base_format = Format::VOP3;
261 } else {
262 base_format = Format::VINTRP;
263 }
264 }
265 check(base_format == instr_info.format[(int)instr->opcode],
266 "Wrong base format for instruction", instr.get());
267
268 /* check VOP3 modifiers */
269 if (instr->isVOP3() && withoutDPP(instr->format) != Format::VOP3) {
270 check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
271 base_format == Format::VOPC || base_format == Format::VINTRP,
272 "Format cannot have VOP3/VOP3B applied", instr.get());
273 }
274
275 if (instr->isDPP()) {
276 check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
277 base_format == Format::VOPC || base_format == Format::VOP3 ||
278 base_format == Format::VOP3P,
279 "Format cannot have DPP applied", instr.get());
280 check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
281 "VOP3+DPP is GFX11+ only", instr.get());
282
283 bool fi =
284 instr->isDPP8() ? instr->dpp8().fetch_inactive : instr->dpp16().fetch_inactive;
285 check(!fi || program->gfx_level >= GFX10, "DPP Fetch-Inactive is GFX10+ only",
286 instr.get());
287 }
288
289 /* check SDWA */
290 if (instr->isSDWA()) {
291 check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
292 base_format == Format::VOPC,
293 "Format cannot have SDWA applied", instr.get());
294
295 check(program->gfx_level >= GFX8, "SDWA is GFX8 to GFX10.3 only", instr.get());
296 check(program->gfx_level < GFX11, "SDWA is GFX8 to GFX10.3 only", instr.get());
297
298 SDWA_instruction& sdwa = instr->sdwa();
299 check(sdwa.omod == 0 || program->gfx_level >= GFX9, "SDWA omod only supported on GFX9+",
300 instr.get());
301 if (base_format == Format::VOPC) {
302 check(sdwa.clamp == false || program->gfx_level == GFX8,
303 "SDWA VOPC clamp only supported on GFX8", instr.get());
304 check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
305 program->gfx_level >= GFX9,
306 "SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
307 } else {
308 const Definition& def = instr->definitions[0];
309 check(def.bytes() <= 4, "SDWA definitions must not be larger than 4 bytes",
310 instr.get());
311 check(def.bytes() >= sdwa.dst_sel.size() + sdwa.dst_sel.offset(),
312 "SDWA definition selection size must be at most definition size", instr.get());
313 check(
314 sdwa.dst_sel.size() == 1 || sdwa.dst_sel.size() == 2 || sdwa.dst_sel.size() == 4,
315 "SDWA definition selection size must be 1, 2 or 4 bytes", instr.get());
316 check(sdwa.dst_sel.offset() % sdwa.dst_sel.size() == 0, "Invalid selection offset",
317 instr.get());
318 check(def.bytes() == 4 || def.bytes() == sdwa.dst_sel.size(),
319 "SDWA dst_sel size must be definition size for subdword definitions",
320 instr.get());
321 check(def.bytes() == 4 || sdwa.dst_sel.offset() == 0,
322 "SDWA dst_sel offset must be 0 for subdword definitions", instr.get());
323 }
324
325 for (unsigned i = 0; i < std::min<unsigned>(2, instr->operands.size()); i++) {
326 const Operand& op = instr->operands[i];
327 check(op.bytes() <= 4, "SDWA operands must not be larger than 4 bytes", instr.get());
328 check(op.bytes() >= sdwa.sel[i].size() + sdwa.sel[i].offset(),
329 "SDWA operand selection size must be at most operand size", instr.get());
330 check(sdwa.sel[i].size() == 1 || sdwa.sel[i].size() == 2 || sdwa.sel[i].size() == 4,
331 "SDWA operand selection size must be 1, 2 or 4 bytes", instr.get());
332 check(sdwa.sel[i].offset() % sdwa.sel[i].size() == 0, "Invalid selection offset",
333 instr.get());
334 }
335 if (instr->operands.size() >= 3) {
336 check(instr->operands[2].isFixed() && instr->operands[2].physReg() == vcc,
337 "3rd operand must be fixed to vcc with SDWA", instr.get());
338 }
339 if (instr->definitions.size() >= 2) {
340 check(instr->definitions[1].isFixed() && instr->definitions[1].physReg() == vcc,
341 "2nd definition must be fixed to vcc with SDWA", instr.get());
342 }
343
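/* Reject opcodes that have no SDWA encoding. */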
344 const bool sdwa_opcodes =
345 instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 &&
346 instr->opcode != aco_opcode::v_fmamk_f32 &&
347 instr->opcode != aco_opcode::v_fmaak_f32 &&
348 instr->opcode != aco_opcode::v_fmamk_f16 &&
349 instr->opcode != aco_opcode::v_fmaak_f16 &&
350 instr->opcode != aco_opcode::v_madmk_f32 &&
351 instr->opcode != aco_opcode::v_madak_f32 &&
352 instr->opcode != aco_opcode::v_madmk_f16 &&
353 instr->opcode != aco_opcode::v_madak_f16 &&
354 instr->opcode != aco_opcode::v_readfirstlane_b32 &&
355 instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
356
357 const bool feature_mac =
358 program->gfx_level == GFX8 &&
359 (instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16);
360
361 check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
362 }
363
364 /* check opsel */
365 if (instr->opcode == aco_opcode::v_permlane16_b32 ||
366 instr->opcode == aco_opcode::v_permlanex16_b32) {
367 check(instr->valu().opsel <= 0x3, "Unexpected opsel for permlane", instr.get());
368 } else if (instr->isVOP3() || instr->isVOP1() || instr->isVOP2() || instr->isVOPC()) {
369 VALU_instruction& valu = instr->valu();
370 check(valu.opsel == 0 || program->gfx_level >= GFX9, "Opsel is only supported on GFX9+",
371 instr.get());
372 check(valu.opsel == 0 || instr->format == Format::VOP3 || program->gfx_level >= GFX11,
373 "Opsel is only supported for VOP3 before GFX11", instr.get());
374
375 for (unsigned i = 0; i < 3; i++) {
376 if (i >= instr->operands.size() ||
377 (!instr->isVOP3() && !instr->operands[i].isOfType(RegType::vgpr)) ||
378 (instr->operands[i].hasRegClass() &&
379 instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
380 check(!valu.opsel[i], "Unexpected opsel for operand", instr.get());
381 }
382 if (!instr->definitions.empty() && instr->definitions[0].regClass().is_subdword() &&
383 !instr->definitions[0].isFixed())
384 check(!valu.opsel[3], "Unexpected opsel for sub-dword definition", instr.get());
385 } else if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
386 instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
387 instr->opcode == aco_opcode::v_fma_mix_f32) {
388 check(instr->definitions[0].regClass() ==
389 (instr->opcode == aco_opcode::v_fma_mix_f32 ? v1 : v2b),
390 "v_fma_mix_f32/v_fma_mix_f16 must have v1/v2b definition", instr.get());
391 } else if (instr->isVOP3P()) {
392 VALU_instruction& vop3p = instr->valu();
393 for (unsigned i = 0; i < instr->operands.size(); i++) {
394 if (instr->operands[i].hasRegClass() &&
395 instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())
396 check(!vop3p.opsel_lo[i] && !vop3p.opsel_hi[i],
397 "Unexpected opsel for subdword operand", instr.get());
398 }
399 check(instr->definitions[0].regClass() == v1 ||
400 instr_info.classes[(int)instr->opcode] == instr_class::wmma,
401 "VOP3P must have v1 definition", instr.get());
402 }
403
404 /* check for undefs */
405 for (unsigned i = 0; i < instr->operands.size(); i++) {
406 if (instr->operands[i].isUndefined()) {
407 bool flat = instr->isFlatLike();
408 bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
409 instr->opcode == aco_opcode::p_create_vector ||
410 instr->opcode == aco_opcode::p_start_linear_vgpr ||
411 instr->opcode == aco_opcode::p_jump_to_epilog ||
412 instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
413 instr->opcode == aco_opcode::p_end_with_regs ||
414 (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
415 (instr->opcode == aco_opcode::p_bpermute_permlane && i == 0) ||
416 (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
417 ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
418 (instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||
419 (instr->opcode == aco_opcode::p_init_scratch && i == 0);
420 check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
421 } else {
422 check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
423 instr->operands[i].isConstant(),
424 "Uninitialized Operand", instr.get());
425 }
426 }
427
428 for (Operand& op : instr->operands) {
429 if (op.isFixed() || !op.hasRegClass() || !op.regClass().is_linear_vgpr() ||
430 op.isUndefined())
431 continue;
432
433 /* Only kill linear VGPRs in top-level blocks. Otherwise, we might have to move linear
434 * VGPRs to make space for normal ones and that isn't possible inside control flow. */
435 if (op.isKill()) {
436 check(block.kind & block_kind_top_level,
437 "Linear VGPR operands must only be killed at top-level blocks", instr.get());
438 }
439 }
440
441 /* check subdword definitions */
442 for (unsigned i = 0; i < instr->definitions.size(); i++) {
443 if (instr->definitions[i].regClass().is_subdword())
444 check(instr->definitions[i].bytes() <= 4 || instr->isPseudo() || instr->isVMEM(),
445 "Only Pseudo and VMEM instructions can write subdword registers > 4 bytes",
446 instr.get());
447 }
448
449 if ((instr->isSALU() && instr->opcode != aco_opcode::p_constaddr_addlo &&
450 instr->opcode != aco_opcode::p_resumeaddr_addlo) ||
451 instr->isVALU()) {
452 /* check literals */
453 Operand literal(s1);
454 for (unsigned i = 0; i < instr->operands.size(); i++) {
455 Operand op = instr->operands[i];
456 if (!op.isLiteral())
457 continue;
458
459 check(!instr->isDPP() && !instr->isSDWA() &&
460 (!instr->isVOP3() || program->gfx_level >= GFX10) &&
461 (!instr->isVOP3P() || program->gfx_level >= GFX10),
462 "Literal applied on wrong instruction format", instr.get());
463
464 check(literal.isUndefined() || (literal.size() == op.size() &&
465 literal.constantValue() == op.constantValue()),
466 "Only 1 Literal allowed", instr.get());
467 literal = op;
468 check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2,
469 "Wrong source position for Literal argument", instr.get());
470 }
471
472 /* check num sgprs for VALU */
473 if (instr->isVALU()) {
474 bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64_e64 ||
475 instr->opcode == aco_opcode::v_lshlrev_b64 ||
476 instr->opcode == aco_opcode::v_lshrrev_b64 ||
477 instr->opcode == aco_opcode::v_ashrrev_i64;
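/* GFX10+ can read two distinct scalar (SGPR/literal) values on most VALU
 * instructions; 64-bit shifts and older GPUs are limited to one. */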
478 unsigned const_bus_limit = 1;
479 if (program->gfx_level >= GFX10 && !is_shift64)
480 const_bus_limit = 2;
481
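/* Bit i of scalar_mask is set if source i may read an SGPR or an inline
 * constant in this encoding. */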
482 uint32_t scalar_mask;
483 if (instr->isVOP3() || instr->isVOP3P() || instr->isVINTERP_INREG())
484 scalar_mask = 0x7;
485 else if (instr->isSDWA())
486 scalar_mask = program->gfx_level >= GFX9 ? 0x7 : 0x4;
487 else if (instr->isDPP())
488 scalar_mask = 0x4;
489 else if (instr->opcode == aco_opcode::v_movrels_b32 ||
490 instr->opcode == aco_opcode::v_movrelsd_b32 ||
491 instr->opcode == aco_opcode::v_movrelsd_2_b32)
492 scalar_mask = 0x2;
493 else
494 scalar_mask = 0x5;
495
496 if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
497 instr->opcode == aco_opcode::v_readlane_b32 ||
498 instr->opcode == aco_opcode::v_readlane_b32_e64 ||
499 instr_info.classes[(int)instr->opcode] ==
500 instr_class::valu_pseudo_scalar_trans) {
501 check(instr->definitions[0].regClass().type() == RegType::sgpr,
502 "Wrong Definition type for VALU instruction", instr.get());
503 } else {
504 if (!instr->definitions.empty())
505 check(instr->definitions[0].regClass().type() == RegType::vgpr,
506 "Wrong Definition type for VALU instruction", instr.get());
507 }
508
509 unsigned num_sgprs = 0;
510 unsigned sgpr[] = {0, 0};
511 for (unsigned i = 0; i < instr->operands.size(); i++) {
512 Operand op = instr->operands[i];
513 if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
514 instr->opcode == aco_opcode::v_readlane_b32 ||
515 instr->opcode == aco_opcode::v_readlane_b32_e64) {
516 check(i != 1 || op.isOfType(RegType::sgpr) || op.isConstant(),
517 "Must be a SGPR or a constant", instr.get());
518 check(i == 1 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
519 "Wrong Operand type for VALU instruction", instr.get());
520 continue;
521 }
522 if (instr->opcode == aco_opcode::v_permlane16_b32 ||
523 instr->opcode == aco_opcode::v_permlanex16_b32 ||
524 instr->opcode == aco_opcode::v_permlane64_b32) {
525 check(i != 0 || op.isOfType(RegType::vgpr),
526 "Operand 0 of v_permlane must be VGPR", instr.get());
527 check(i == 0 || op.isOfType(RegType::sgpr) || op.isConstant(),
528 "Lane select operands of v_permlane must be SGPR or constant",
529 instr.get());
530 }
531
532 if (instr->opcode == aco_opcode::v_writelane_b32 ||
533 instr->opcode == aco_opcode::v_writelane_b32_e64) {
534 check(i != 2 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
535 "Wrong Operand type for VALU instruction", instr.get());
536 check(i == 2 || op.isOfType(RegType::sgpr) || op.isConstant(),
537 "Must be a SGPR or a constant", instr.get());
538 continue;
539 }
540 if (op.isOfType(RegType::sgpr)) {
541 check(scalar_mask & (1 << i), "Wrong source position for SGPR argument",
542 instr.get());
543
544 if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
545 if (num_sgprs < 2)
546 sgpr[num_sgprs++] = op.tempId();
547 }
548 }
549
550 if (op.isConstant() && !op.isLiteral())
551 check(scalar_mask & (1 << i), "Wrong source position for constant argument",
552 instr.get());
553 }
554 check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit,
555 "Too many SGPRs/literals", instr.get());
556
557 /* Validate modifiers. */
558 check(!instr->valu().opsel || instr->isVOP3() || instr->isVOP1() ||
559 instr->isVOP2() || instr->isVOPC() || instr->isVINTERP_INREG(),
560 "OPSEL set for unsupported instruction format", instr.get());
561 check(!instr->valu().opsel_lo || instr->isVOP3P(),
562 "OPSEL_LO set for unsupported instruction format", instr.get());
563 check(!instr->valu().opsel_hi || instr->isVOP3P(),
564 "OPSEL_HI set for unsupported instruction format", instr.get());
565 check(!instr->valu().omod || instr->isVOP3() || instr->isSDWA(),
566 "OMOD set for unsupported instruction format", instr.get());
567 check(!instr->valu().clamp || instr->isVOP3() || instr->isVOP3P() ||
568 instr->isSDWA() || instr->isVINTERP_INREG(),
569 "CLAMP set for unsupported instruction format", instr.get());
570
571 for (bool abs : instr->valu().abs) {
572 check(!abs || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
573 instr->isDPP16(),
574 "ABS/NEG_HI set for unsupported instruction format", instr.get());
575 }
576 for (bool neg : instr->valu().neg) {
577 check(!neg || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
578 instr->isDPP16() || instr->isVINTERP_INREG(),
579 "NEG/NEG_LO set for unsupported instruction format", instr.get());
580 }
581 }
582
583 if (instr->isSOP1() || instr->isSOP2()) {
584 if (!instr->definitions.empty())
585 check(instr->definitions[0].regClass().type() == RegType::sgpr,
586 "Wrong Definition type for SALU instruction", instr.get());
587 for (const Operand& op : instr->operands) {
588 check(op.isConstant() || op.isOfType(RegType::sgpr),
589 "Wrong Operand type for SALU instruction", instr.get());
590 }
591 }
592 }
593
594 switch (instr->format) {
595 case Format::PSEUDO: {
596 if (instr->opcode == aco_opcode::p_create_vector ||
597 instr->opcode == aco_opcode::p_start_linear_vgpr) {
598 unsigned size = 0;
599 for (const Operand& op : instr->operands) {
600 check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
601 size += op.bytes();
602 }
603 if (!instr->operands.empty() || instr->opcode == aco_opcode::p_create_vector) {
604 check(size == instr->definitions[0].bytes(),
605 "Definition size does not match operand sizes", instr.get());
606 }
607 if (instr->definitions[0].regClass().type() == RegType::sgpr) {
608 for (const Operand& op : instr->operands) {
609 check(op.isConstant() || op.regClass().type() == RegType::sgpr,
610 "Wrong Operand type for scalar vector", instr.get());
611 }
612 }
613 if (instr->opcode == aco_opcode::p_start_linear_vgpr)
614 check(instr->definitions[0].regClass().is_linear_vgpr(),
615 "Definition must be linear VGPR", instr.get());
616 } else if (instr->opcode == aco_opcode::p_extract_vector) {
617 check(!instr->operands[0].isConstant() && instr->operands[1].isConstant(),
618 "Wrong Operand types", instr.get());
619 check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <=
620 instr->operands[0].bytes(),
621 "Index out of range", instr.get());
622 check(instr->definitions[0].regClass().type() == RegType::vgpr ||
623 instr->operands[0].regClass().type() == RegType::sgpr,
624 "Cannot extract SGPR value from VGPR vector", instr.get());
625 check(program->gfx_level >= GFX9 ||
626 !instr->definitions[0].regClass().is_subdword() ||
627 instr->operands[0].regClass().type() == RegType::vgpr,
628 "Cannot extract subdword from SGPR before GFX9+", instr.get());
629 } else if (instr->opcode == aco_opcode::p_split_vector) {
630 check(!instr->operands[0].isConstant(), "Operand must not be constant", instr.get());
631 unsigned size = 0;
632 for (const Definition& def : instr->definitions) {
633 size += def.bytes();
634 }
635 check(size == instr->operands[0].bytes(),
636 "Operand size does not match definition sizes", instr.get());
637 if (instr->operands[0].isOfType(RegType::vgpr)) {
638 for (const Definition& def : instr->definitions)
639 check(def.regClass().type() == RegType::vgpr,
640 "Wrong Definition type for VGPR split_vector", instr.get());
641 } else {
642 for (const Definition& def : instr->definitions)
643 check(program->gfx_level >= GFX9 || !def.regClass().is_subdword(),
644 "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
645 }
646 } else if (instr->opcode == aco_opcode::p_parallelcopy) {
647 check(instr->definitions.size() == instr->operands.size(),
648 "Number of Operands does not match number of Definitions", instr.get());
649 for (unsigned i = 0; i < instr->operands.size(); i++) {
650 check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
651 "Operand and Definition size must match", instr.get());
652 if (instr->operands[i].hasRegClass()) {
653 check((instr->definitions[i].regClass().type() ==
654 instr->operands[i].regClass().type()) ||
655 (instr->definitions[i].regClass().type() == RegType::vgpr &&
656 instr->operands[i].regClass().type() == RegType::sgpr),
657 "Operand and Definition types do not match", instr.get());
658 check(instr->definitions[i].regClass().is_linear_vgpr() ==
659 instr->operands[i].regClass().is_linear_vgpr(),
660 "Operand and Definition types do not match", instr.get());
661 } else {
662 check(!instr->definitions[i].regClass().is_linear_vgpr(),
663 "Can only copy linear VGPRs into linear VGPRs, not constant/undef",
664 instr.get());
665 }
666 }
667 } else if (instr->opcode == aco_opcode::p_phi) {
668 check(instr->operands.size() == block.logical_preds.size(),
669 "Number of Operands does not match number of predecessors", instr.get());
670 check(instr->definitions[0].regClass().type() == RegType::vgpr,
671 "Logical Phi Definition must be vgpr", instr.get());
672 for (const Operand& op : instr->operands)
673 check(instr->definitions[0].size() == op.size(),
674 "Operand sizes must match Definition size", instr.get());
675 } else if (instr->opcode == aco_opcode::p_linear_phi) {
676 for (const Operand& op : instr->operands) {
677 check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type",
678 instr.get());
679 check(instr->definitions[0].size() == op.size(),
680 "Operand sizes must match Definition size", instr.get());
681 }
682 check(instr->operands.size() == block.linear_preds.size(),
683 "Number of Operands does not match number of predecessors", instr.get());
684 } else if (instr->opcode == aco_opcode::p_extract ||
685 instr->opcode == aco_opcode::p_insert) {
686 check(!instr->operands[0].isConstant(), "Data operand must not be constant",
687 instr.get());
688 check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
689 if (instr->opcode == aco_opcode::p_extract)
690 check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
691 instr.get());
692
693 check(instr->definitions[0].regClass().type() != RegType::sgpr ||
694 instr->operands[0].regClass().type() == RegType::sgpr,
695 "Can't extract/insert VGPR to SGPR", instr.get());
696
697 if (instr->opcode == aco_opcode::p_insert)
698 check(instr->operands[0].bytes() == instr->definitions[0].bytes(),
699 "Sizes of p_insert data operand and definition must match", instr.get());
700
701 if (instr->definitions[0].regClass().type() == RegType::sgpr)
702 check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
703 instr->definitions[1].physReg() == scc,
704 "SGPR extract/insert needs an SCC definition", instr.get());
705
706 unsigned data_bits = instr->operands[0].bytes() * 8u;
707 unsigned op_bits = instr->operands[2].constantValue();
708
709 check(op_bits == 8 || op_bits == 16, "Size must be 8 or 16", instr.get());
710 if (instr->opcode == aco_opcode::p_insert) {
711 check(op_bits < data_bits, "Size must be smaller than source", instr.get());
712 } else if (instr->opcode == aco_opcode::p_extract) {
713 check(data_bits >= op_bits, "Can't extract more bits than what the data has.",
714 instr.get());
715 }
716
717 unsigned comp = data_bits / MAX2(op_bits, 1);
718 check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
719 instr.get());
720
721 check(program->gfx_level >= GFX9 ||
722 !instr->definitions[0].regClass().is_subdword() ||
723 instr->operands[0].regClass().type() == RegType::vgpr,
724 "Cannot extract/insert to subdword definition from SGPR before GFX9+",
725 instr.get());
726 } else if (instr->opcode == aco_opcode::p_jump_to_epilog) {
727 check(instr->definitions.size() == 0, "p_jump_to_epilog must have 0 definitions",
728 instr.get());
729 check(instr->operands.size() > 0 && instr->operands[0].isOfType(RegType::sgpr) &&
730 instr->operands[0].size() == 2,
731 "First operand of p_jump_to_epilog must be a SGPR", instr.get());
732 for (unsigned i = 1; i < instr->operands.size(); i++) {
733 check(instr->operands[i].isOfType(RegType::vgpr) ||
734 instr->operands[i].isOfType(RegType::sgpr) ||
735 instr->operands[i].isUndefined(),
736 "Other operands of p_jump_to_epilog must be VGPRs, SGPRs or undef",
737 instr.get());
738 }
739 } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) {
740 check(instr->definitions.size() == 6,
741 "p_dual_src_export_gfx11 must have 6 definitions", instr.get());
742 check(instr->definitions[2].regClass() == program->lane_mask,
743 "Third definition of p_dual_src_export_gfx11 must be a lane mask",
744 instr.get());
745 check(instr->definitions[3].regClass() == program->lane_mask,
746 "Fourth definition of p_dual_src_export_gfx11 must be a lane mask",
747 instr.get());
748 check(instr->definitions[4].physReg() == vcc,
749 "Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get());
750 check(instr->definitions[5].physReg() == scc,
751 "Sixth definition of p_dual_src_export_gfx11 must be scc", instr.get());
752 check(instr->operands.size() == 8, "p_dual_src_export_gfx11 must have 8 operands",
753 instr.get());
754 for (unsigned i = 0; i < instr->operands.size(); i++) {
755 check(
756 instr->operands[i].isOfType(RegType::vgpr) || instr->operands[i].isUndefined(),
757 "Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
758 }
759 }
760 break;
761 }
762 case Format::PSEUDO_REDUCTION: {
763 for (const Operand& op : instr->operands)
764 check(op.regClass().type() == RegType::vgpr,
765 "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
766 instr.get());
767
768 if (instr->opcode == aco_opcode::p_reduce &&
769 instr->reduction().cluster_size == program->wave_size)
770 check(instr->definitions[0].regClass().type() == RegType::sgpr ||
771 program->wave_size == 32,
772 "The result of unclustered reductions must go into an SGPR.", instr.get());
773 else
774 check(instr->definitions[0].regClass().type() == RegType::vgpr,
775 "The result of scans and clustered reductions must go into a VGPR.",
776 instr.get());
777
778 break;
779 }
780 case Format::SMEM: {
781 if (instr->operands.size() >= 1)
782 check(instr->operands[0].isOfType(RegType::sgpr), "SMEM operands must be sgpr",
783 instr.get());
784 if (instr->operands.size() >= 2)
785 check(instr->operands[1].isConstant() || instr->operands[1].isOfType(RegType::sgpr),
786 "SMEM offset must be constant or sgpr", instr.get());
787 if (!instr->definitions.empty())
788 check(instr->definitions[0].regClass().type() == RegType::sgpr,
789 "SMEM result must be sgpr", instr.get());
790 break;
791 }
792 case Format::MTBUF:
793 case Format::MUBUF: {
794 check(instr->operands.size() > 1, "VMEM instructions must have at least 2 operands",
795 instr.get());
796 check(instr->operands[1].isOfType(RegType::vgpr),
797 "VADDR must be in vgpr for VMEM instructions", instr.get());
798 check(instr->operands[0].isOfType(RegType::sgpr), "VMEM resource constant must be sgpr",
799 instr.get());
800 check(instr->operands.size() < 4 || instr->operands[3].isOfType(RegType::vgpr),
801 "VMEM write data must be vgpr", instr.get());
802 if (instr->operands.size() >= 3 && instr->operands[2].isConstant())
803 check(program->gfx_level < GFX12 || instr->operands[2].constantValue() == 0,
804 "VMEM SOFFSET must not be non-zero constant on GFX12+", instr.get());
805
806 const bool d16 =
807 instr->opcode ==
808 aco_opcode::buffer_load_dword || // FIXME: used to spill subdword variables
809 instr->opcode == aco_opcode::buffer_load_ubyte ||
810 instr->opcode == aco_opcode::buffer_load_sbyte ||
811 instr->opcode == aco_opcode::buffer_load_ushort ||
812 instr->opcode == aco_opcode::buffer_load_sshort ||
813 instr->opcode == aco_opcode::buffer_load_ubyte_d16 ||
814 instr->opcode == aco_opcode::buffer_load_ubyte_d16_hi ||
815 instr->opcode == aco_opcode::buffer_load_sbyte_d16 ||
816 instr->opcode == aco_opcode::buffer_load_sbyte_d16_hi ||
817 instr->opcode == aco_opcode::buffer_load_short_d16 ||
818 instr->opcode == aco_opcode::buffer_load_short_d16_hi ||
819 instr->opcode == aco_opcode::buffer_load_format_d16_x ||
820 instr->opcode == aco_opcode::buffer_load_format_d16_hi_x ||
821 instr->opcode == aco_opcode::buffer_load_format_d16_xy ||
822 instr->opcode == aco_opcode::buffer_load_format_d16_xyz ||
823 instr->opcode == aco_opcode::buffer_load_format_d16_xyzw ||
824 instr->opcode == aco_opcode::tbuffer_load_format_d16_x ||
825 instr->opcode == aco_opcode::tbuffer_load_format_d16_xy ||
826 instr->opcode == aco_opcode::tbuffer_load_format_d16_xyz ||
827 instr->opcode == aco_opcode::tbuffer_load_format_d16_xyzw;
828 if (instr->definitions.size()) {
829 check(instr->definitions[0].regClass().type() == RegType::vgpr,
830 "VMEM definitions[0] (VDATA) must be VGPR", instr.get());
831 check(d16 || !instr->definitions[0].regClass().is_subdword(),
832 "Only D16 opcodes can load subdword values.", instr.get());
833 check(instr->definitions[0].bytes() <= 8 || !d16,
834 "D16 opcodes can only load up to 8 bytes.", instr.get());
835 }
836 break;
837 }
838 case Format::MIMG: {
839 check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
840 instr.get());
841 check(instr->operands[0].hasRegClass() &&
842 (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
843 "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
844 if (instr->operands[1].hasRegClass())
845 check(instr->operands[1].regClass() == s4,
846 "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
847 if (!instr->operands[2].isUndefined()) {
848 bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
849 instr->opcode == aco_opcode::image_atomic_fcmpswap;
850 check(instr->definitions.empty() ||
851 (instr->definitions[0].regClass() == instr->operands[2].regClass() ||
852 is_cmpswap),
853 "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
854 "TFE/LWE loads",
855 instr.get());
856 }
857
858 if (instr->mimg().strict_wqm) {
859 check(instr->operands[3].hasRegClass() &&
860 instr->operands[3].regClass().is_linear_vgpr(),
861 "MIMG operands[3] must be temp linear VGPR.", instr.get());
862
863 unsigned total_size = 0;
864 for (unsigned i = 4; i < instr->operands.size(); i++) {
865 check(instr->operands[i].hasRegClass() && instr->operands[i].regClass() == v1,
866 "MIMG operands[4+] (VADDR) must be v1", instr.get());
867 total_size += instr->operands[i].bytes();
868 }
869 check(total_size <= instr->operands[3].bytes(),
870 "MIMG operands[4+] must fit within operands[3].", instr.get());
871 } else {
872 check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
873 "NSA is only supported on GFX10+", instr.get());
874 for (unsigned i = 3; i < instr->operands.size(); i++) {
875 check(instr->operands[i].hasRegClass() &&
876 instr->operands[i].regClass().type() == RegType::vgpr,
877 "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
878 if (instr->operands.size() > 4) {
879 if (program->gfx_level < GFX11) {
880 check(instr->operands[i].regClass() == v1,
881 "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
882 } else {
883 unsigned num_scalar =
884 program->gfx_level >= GFX12 ? (instr->operands.size() - 4) : 4;
885 if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
886 instr->opcode != aco_opcode::image_bvh64_intersect_ray &&
887 i < 3 + num_scalar) {
888 check(instr->operands[i].regClass() == v1,
889 "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
890 }
891 }
892 }
893 }
894 }
895
896 if (instr->definitions.size()) {
897 check(instr->definitions[0].regClass().type() == RegType::vgpr,
898 "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
899 check(instr->mimg().d16 || !instr->definitions[0].regClass().is_subdword(),
900 "Only D16 MIMG instructions can load subdword values.", instr.get());
901 check(instr->definitions[0].bytes() <= 8 || !instr->mimg().d16,
902 "D16 MIMG instructions can only load up to 8 bytes.", instr.get());
903 }
904 break;
905 }
906 case Format::DS: {
907 for (const Operand& op : instr->operands) {
908 check(op.isOfType(RegType::vgpr) || op.physReg() == m0 || op.isUndefined(),
909 "Only VGPRs are valid DS instruction operands", instr.get());
910 }
911 if (!instr->definitions.empty())
912 check(instr->definitions[0].regClass().type() == RegType::vgpr,
913 "DS instruction must return VGPR", instr.get());
914 break;
915 }
916 case Format::EXP: {
917 for (unsigned i = 0; i < 4; i++)
918 check(instr->operands[i].isOfType(RegType::vgpr),
919 "Only VGPRs are valid Export arguments", instr.get());
920 break;
921 }
922 case Format::FLAT:
923 check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
924 instr.get());
925 FALLTHROUGH;
926 case Format::GLOBAL:
927 check(instr->operands[0].isOfType(RegType::vgpr), "FLAT/GLOBAL address must be vgpr",
928 instr.get());
929 FALLTHROUGH;
930 case Format::SCRATCH: {
931 check(instr->operands[0].isOfType(RegType::vgpr),
932 "FLAT/GLOBAL/SCRATCH address must be undefined or vgpr", instr.get());
933 check(instr->operands[1].isOfType(RegType::sgpr),
934 "FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
935 if (instr->format == Format::SCRATCH && program->gfx_level < GFX10_3)
936 check(!instr->operands[0].isUndefined() || !instr->operands[1].isUndefined(),
937 "SCRATCH must have either SADDR or ADDR operand", instr.get());
938 if (!instr->definitions.empty())
939 check(instr->definitions[0].regClass().type() == RegType::vgpr,
940 "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
941 else
942 check(instr->operands[2].isOfType(RegType::vgpr),
943 "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
944 break;
945 }
946 case Format::LDSDIR: {
947 check(instr->definitions.size() == 1 && instr->definitions[0].regClass() == v1,
948 "LDSDIR must have an v1 definition", instr.get());
949 check(instr->operands.size() == 1, "LDSDIR must have an operand", instr.get());
950 if (!instr->operands.empty()) {
951 check(instr->operands[0].regClass() == s1, "LDSDIR must have an s1 operand",
952 instr.get());
953 check(instr->operands[0].isFixed() && instr->operands[0].physReg() == m0,
954 "LDSDIR must have an operand fixed to m0", instr.get());
955 }
956 break;
957 }
958 default: break;
959 }
960 }
961 }
962
963 return is_valid;
964 }
965
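/* Validates the control-flow graph: block indices, sorted predecessor/successor
 * lists, and absence of critical edges. */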
966 bool
967 validate_cfg(Program* program)
968 {
969 if (!(debug_flags & DEBUG_VALIDATE_IR))
970 return true;
971
972 bool is_valid = true;
973 auto check_block = [&program, &is_valid](bool success, const char* msg,
974 aco::Block* block) -> void
975 {
976 if (!success) {
977 aco_err(program, "%s: BB%u", msg, block->index);
978 is_valid = false;
979 }
980 };
981
982 /* validate CFG */
983 for (unsigned i = 0; i < program->blocks.size(); i++) {
984 Block& block = program->blocks[i];
985 check_block(block.index == i, "block.index must match actual index", &block);
986
987 /* predecessors/successors should be sorted */
988 for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
989 check_block(block.linear_preds[j] < block.linear_preds[j + 1],
990 "linear predecessors must be sorted", &block);
991 for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
992 check_block(block.logical_preds[j] < block.logical_preds[j + 1],
993 "logical predecessors must be sorted", &block);
994 for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
995 check_block(block.linear_succs[j] < block.linear_succs[j + 1],
996 "linear successors must be sorted", &block);
997 for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
998 check_block(block.logical_succs[j] < block.logical_succs[j + 1],
999 "logical successors must be sorted", &block);
1000
1001 /* critical edges are not allowed */
1002 if (block.linear_preds.size() > 1) {
1003 for (unsigned pred : block.linear_preds)
1004 check_block(program->blocks[pred].linear_succs.size() == 1,
1005 "linear critical edges are not allowed", &program->blocks[pred]);
1006 for (unsigned pred : block.logical_preds)
1007 check_block(program->blocks[pred].logical_succs.size() == 1,
1008 "logical critical edges are not allowed", &program->blocks[pred]);
1009 }
1010 }
1011
1012 return is_valid;
1013 }
1014
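/* Recomputes liveness and register demand from scratch and compares the results
 * with the values stored on the program, reporting any mismatch. */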
1015 bool
1016 validate_live_vars(Program* program)
1017 {
1018 if (!(debug_flags & DEBUG_VALIDATE_LIVE_VARS))
1019 return true;
1020
1021 bool is_valid = true;
1022 const int prev_num_waves = program->num_waves;
1023 const monotonic_buffer_resource old_memory = std::move(program->live.memory);
1024 const std::vector<IDSet> prev_live_in = std::move(program->live.live_in);
1025 const RegisterDemand prev_max_demand = program->max_reg_demand;
1026 std::vector<RegisterDemand> block_demands(program->blocks.size());
1027 std::vector<RegisterDemand> live_in_demands(program->blocks.size());
1028 std::vector<std::vector<RegisterDemand>> register_demands(program->blocks.size());
1029
1030 for (unsigned i = 0; i < program->blocks.size(); i++) {
1031 Block& b = program->blocks[i];
1032 block_demands[i] = b.register_demand;
1033 live_in_demands[i] = b.live_in_demand;
1034 register_demands[i].reserve(b.instructions.size());
1035 for (unsigned j = 0; j < b.instructions.size(); j++)
1036 register_demands[i].emplace_back(b.instructions[j]->register_demand);
1037 }
1038
1039 aco::live_var_analysis(program);
1040
1041 /* Validate RegisterDemand calculation */
1042 for (unsigned i = 0; i < program->blocks.size(); i++) {
1043 Block& b = program->blocks[i];
1044
1045 if (!(b.register_demand == block_demands[i])) {
1046 is_valid = false;
1047 aco_err(program,
1048 "Register Demand not updated correctly for BB%d: got (%3u vgpr, %3u sgpr), but "
1049 "should be (%3u vgpr, %3u sgpr)",
1050 i, block_demands[i].vgpr, block_demands[i].sgpr, b.register_demand.vgpr,
1051 b.register_demand.sgpr);
1052 }
1053 if (!(b.live_in_demand == live_in_demands[i])) {
1054 is_valid = false;
1055 aco_err(program,
1056 "Live-in Demand not updated correctly for BB%d: got (%3u vgpr, %3u sgpr), but "
1057 "should be (%3u vgpr, %3u sgpr)",
1058 i, live_in_demands[i].vgpr, live_in_demands[i].sgpr, b.live_in_demand.vgpr,
1059 b.live_in_demand.sgpr);
1060 }
1061
1062 for (unsigned j = 0; j < b.instructions.size(); j++) {
1063 if (b.instructions[j]->register_demand == register_demands[i][j])
1064 continue;
1065
1066 char* out;
1067 size_t outsize;
1068 struct u_memstream mem;
1069 u_memstream_open(&mem, &out, &outsize);
1070 FILE* const memf = u_memstream_get(&mem);
1071
1072 fprintf(memf,
1073 "Register Demand not updated correctly: got (%3u vgpr, %3u sgpr), but should be "
1074 "(%3u vgpr, %3u sgpr): \n\t",
1075 register_demands[i][j].vgpr, register_demands[i][j].sgpr,
1076 b.instructions[j]->register_demand.vgpr, b.instructions[j]->register_demand.sgpr);
1077 aco_print_instr(program->gfx_level, b.instructions[j].get(), memf, print_kill);
1078 u_memstream_close(&mem);
1079
1080 aco_err(program, "%s", out);
1081 free(out);
1082
1083 is_valid = false;
1084 }
1085 }
1086 if (!(program->max_reg_demand == prev_max_demand) || program->num_waves != prev_num_waves) {
1087 is_valid = false;
1088 aco_err(program,
1089 "Max Register Demand and Num Waves not updated correctly: got (%3u vgpr, %3u sgpr) "
1090 "and %2u waves, but should be (%3u vgpr, %3u sgpr) and %2u waves",
1091 prev_max_demand.vgpr, prev_max_demand.sgpr, prev_num_waves,
1092 program->max_reg_demand.vgpr, program->max_reg_demand.sgpr, program->num_waves);
1093 }
1094
1095 /* Validate Live-in sets */
1096 for (unsigned i = 0; i < program->blocks.size(); i++) {
1097 if (prev_live_in[i] != program->live.live_in[i]) {
1098 char* out;
1099 size_t outsize;
1100 struct u_memstream mem;
1101 u_memstream_open(&mem, &out, &outsize);
1102 FILE* const memf = u_memstream_get(&mem);
1103
1104 fprintf(memf, "Live-in set not updated correctly for BB%d:", i);
1105 fprintf(memf, "\nMissing values: ");
1106 for (unsigned t : program->live.live_in[i]) {
1107 if (prev_live_in[i].count(t) == 0)
1108 fprintf(memf, "%%%d, ", t);
1109 }
1110 fprintf(memf, "\nAdditional values: ");
1111 for (unsigned t : prev_live_in[i]) {
1112 if (program->live.live_in[i].count(t) == 0)
1113 fprintf(memf, "%%%d, ", t);
1114 }
1115 u_memstream_close(&mem);
1116 aco_err(program, "%s", out);
1117 free(out);
1118 is_valid = false;
1119 }
1120 }
1121
1122 return is_valid;
1123 }
1124
1125 /* RA validation */
1126 namespace {
1127
1128 struct Location {
1129 Location() : block(NULL), instr(NULL) {}
1130
1131 Block* block;
1132 Instruction* instr; // NULL if it's the block's live-in
1133 };
1134
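/* Per-temporary RA bookkeeping: the location that defined it, the location where
 * it was first encountered, and the physical register it was assigned. */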
1135 struct Assignment {
1136 Location defloc;
1137 Location firstloc;
1138 PhysReg reg;
1139 bool valid;
1140 };
1141
1142 bool
1143 ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
1144 {
1145 va_list args;
1146 va_start(args, fmt);
1147 char msg[1024];
1148 vsnprintf(msg, sizeof(msg), fmt, args);
1149 va_end(args);
1150
1151 char* out;
1152 size_t outsize;
1153 struct u_memstream mem;
1154 u_memstream_open(&mem, &out, &outsize);
1155 FILE* const memf = u_memstream_get(&mem);
1156
1157 fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
1158 if (loc.instr) {
1159 aco_print_instr(program->gfx_level, loc.instr, memf);
1160 fprintf(memf, "\n%s", msg);
1161 } else {
1162 fprintf(memf, "%s", msg);
1163 }
1164 if (loc2.block) {
1165 fprintf(memf, " in BB%d:\n", loc2.block->index);
1166 aco_print_instr(program->gfx_level, loc2.instr, memf);
1167 }
1168 fprintf(memf, "\n\n");
1169 u_memstream_close(&mem);
1170
1171 aco_err(program, "%s", out);
1172 free(out);
1173
1174 return true;
1175 }
1176
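/* Returns true if the operand's sub-dword byte offset is encodable for this
 * instruction (via SDWA selects, opsel, or a dedicated *_d16_hi opcode). */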
1177 bool
1178 validate_subdword_operand(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
1179 unsigned index)
1180 {
1181 Operand op = instr->operands[index];
1182 unsigned byte = op.physReg().byte();
1183
1184 if (instr->opcode == aco_opcode::p_as_uniform)
1185 return byte == 0;
1186 if (instr->isPseudo() && gfx_level >= GFX8)
1187 return true;
1188 if (instr->isSDWA())
1189 return byte + instr->sdwa().sel[index].offset() + instr->sdwa().sel[index].size() <= 4 &&
1190 byte % instr->sdwa().sel[index].size() == 0;
1191 if (instr->isVOP3P()) {
1192 bool fma_mix = instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
1193 instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
1194 instr->opcode == aco_opcode::v_fma_mix_f32;
1195 return instr->valu().opsel_lo[index] == (byte >> 1) &&
1196 instr->valu().opsel_hi[index] == (fma_mix || (byte >> 1));
1197 }
1198 if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, index))
1199 return true;
1200
1201 switch (instr->opcode) {
1202 case aco_opcode::v_cvt_f32_ubyte1:
1203 if (byte == 1)
1204 return true;
1205 break;
1206 case aco_opcode::v_cvt_f32_ubyte2:
1207 if (byte == 2)
1208 return true;
1209 break;
1210 case aco_opcode::v_cvt_f32_ubyte3:
1211 if (byte == 3)
1212 return true;
1213 break;
1214 case aco_opcode::ds_write_b8_d16_hi:
1215 case aco_opcode::ds_write_b16_d16_hi:
1216 if (byte == 2 && index == 1)
1217 return true;
1218 break;
1219 case aco_opcode::buffer_store_byte_d16_hi:
1220 case aco_opcode::buffer_store_short_d16_hi:
1221 case aco_opcode::buffer_store_format_d16_hi_x:
1222 if (byte == 2 && index == 3)
1223 return true;
1224 break;
1225 case aco_opcode::flat_store_byte_d16_hi:
1226 case aco_opcode::flat_store_short_d16_hi:
1227 case aco_opcode::scratch_store_byte_d16_hi:
1228 case aco_opcode::scratch_store_short_d16_hi:
1229 case aco_opcode::global_store_byte_d16_hi:
1230 case aco_opcode::global_store_short_d16_hi:
1231 if (byte == 2 && index == 2)
1232 return true;
1233 break;
1234 default: break;
1235 }
1236
1237 return byte == 0;
1238 }
1239
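/* Same, but for the byte offset of the instruction's first definition. */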
1240 bool
1241 validate_subdword_definition(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr)
1242 {
1243 Definition def = instr->definitions[0];
1244 unsigned byte = def.physReg().byte();
1245
1246 if (instr->isPseudo() && gfx_level >= GFX8)
1247 return true;
1248 if (instr->isSDWA())
1249 return byte + instr->sdwa().dst_sel.offset() + instr->sdwa().dst_sel.size() <= 4 &&
1250 byte % instr->sdwa().dst_sel.size() == 0;
1251 if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, -1))
1252 return true;
1253
1254 switch (instr->opcode) {
1255 case aco_opcode::v_interp_p2_hi_f16:
1256 case aco_opcode::v_fma_mixhi_f16:
1257 case aco_opcode::buffer_load_ubyte_d16_hi:
1258 case aco_opcode::buffer_load_sbyte_d16_hi:
1259 case aco_opcode::buffer_load_short_d16_hi:
1260 case aco_opcode::buffer_load_format_d16_hi_x:
1261 case aco_opcode::flat_load_ubyte_d16_hi:
1262 case aco_opcode::flat_load_short_d16_hi:
1263 case aco_opcode::scratch_load_ubyte_d16_hi:
1264 case aco_opcode::scratch_load_short_d16_hi:
1265 case aco_opcode::global_load_ubyte_d16_hi:
1266 case aco_opcode::global_load_short_d16_hi:
1267 case aco_opcode::ds_read_u8_d16_hi:
1268 case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
1269 case aco_opcode::p_v_cvt_pk_u8_f32: return true;
1270 default: break;
1271 }
1272
1273 return byte == 0;
1274 }
1275
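/* Returns how many bytes a sub-dword definition actually clobbers; several
 * opcodes write a full dword (or more when SRAM-ECC is enabled). */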
1276 unsigned
1277 get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, unsigned index)
1278 {
1279 amd_gfx_level gfx_level = program->gfx_level;
1280 Definition def = instr->definitions[index];
1281
1282 if (instr->isPseudo())
1283 return gfx_level >= GFX8 ? def.bytes() : def.size() * 4u;
1284 if (instr->isVALU() || instr->isVINTRP()) {
1285 assert(def.bytes() <= 2);
1286 if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32)
1287 return 1;
1288
1289 if (instr->isSDWA())
1290 return instr->sdwa().dst_sel.size();
1291
1292 if (instr_is_16bit(gfx_level, instr->opcode))
1293 return 2;
1294
1295 return 4;
1296 }
1297
1298 if (instr->isMIMG()) {
1299 assert(instr->mimg().d16);
1300 return program->dev.sram_ecc_enabled ? def.size() * 4u : def.bytes();
1301 }
1302
1303 switch (instr->opcode) {
1304 case aco_opcode::buffer_load_ubyte_d16:
1305 case aco_opcode::buffer_load_sbyte_d16:
1306 case aco_opcode::buffer_load_short_d16:
1307 case aco_opcode::buffer_load_format_d16_x:
1308 case aco_opcode::tbuffer_load_format_d16_x:
1309 case aco_opcode::flat_load_ubyte_d16:
1310 case aco_opcode::flat_load_short_d16:
1311 case aco_opcode::scratch_load_ubyte_d16:
1312 case aco_opcode::scratch_load_short_d16:
1313 case aco_opcode::global_load_ubyte_d16:
1314 case aco_opcode::global_load_short_d16:
1315 case aco_opcode::ds_read_u8_d16:
1316 case aco_opcode::ds_read_u16_d16:
1317 case aco_opcode::buffer_load_ubyte_d16_hi:
1318 case aco_opcode::buffer_load_sbyte_d16_hi:
1319 case aco_opcode::buffer_load_short_d16_hi:
1320 case aco_opcode::buffer_load_format_d16_hi_x:
1321 case aco_opcode::flat_load_ubyte_d16_hi:
1322 case aco_opcode::flat_load_short_d16_hi:
1323 case aco_opcode::scratch_load_ubyte_d16_hi:
1324 case aco_opcode::scratch_load_short_d16_hi:
1325 case aco_opcode::global_load_ubyte_d16_hi:
1326 case aco_opcode::global_load_short_d16_hi:
1327 case aco_opcode::ds_read_u8_d16_hi:
1328 case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
1329 case aco_opcode::buffer_load_format_d16_xyz:
1330 case aco_opcode::tbuffer_load_format_d16_xyz: return program->dev.sram_ecc_enabled ? 8 : 6;
1331 default: return def.size() * 4;
1332 }
1333 }
1334
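/* Checks each definition against the byte-granular register file 'regs' (one temp
 * id per byte), flags overlaps with live temps, then records the new assignments
 * and frees killed definitions. */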
1335 bool
1336 validate_instr_defs(Program* program, std::array<unsigned, 2048>& regs,
1337 const std::vector<Assignment>& assignments, const Location& loc,
1338 aco_ptr<Instruction>& instr)
1339 {
1340 bool err = false;
1341
1342 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1343 Definition& def = instr->definitions[i];
1344 if (!def.isTemp())
1345 continue;
1346 Temp tmp = def.getTemp();
1347 PhysReg reg = assignments[tmp.id()].reg;
1348 for (unsigned j = 0; j < tmp.bytes(); j++) {
1349 if (regs[reg.reg_b + j])
1350 err |=
1351 ra_fail(program, loc, assignments[regs[reg.reg_b + j]].defloc,
1352 "Assignment of element %d of %%%d already taken by %%%d from instruction", i,
1353 tmp.id(), regs[reg.reg_b + j]);
1354 regs[reg.reg_b + j] = tmp.id();
1355 }
1356 if (def.regClass().is_subdword() && def.bytes() < 4) {
1357 unsigned written = get_subdword_bytes_written(program, instr, i);
1358 /* If written=4, the instruction still might write the upper half. In that case, it's
1359 * the lower half that isn't preserved */
1360 for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
1361 unsigned written_reg = reg.reg() * 4u + j;
1362 if (regs[written_reg] && regs[written_reg] != def.tempId())
1363 err |= ra_fail(program, loc, assignments[regs[written_reg]].defloc,
1364 "Assignment of element %d of %%%d overwrites the full register "
1365 "taken by %%%d from instruction",
1366 i, tmp.id(), regs[written_reg]);
1367 }
1368 }
1369 }
1370
1371 for (const Definition& def : instr->definitions) {
1372 if (!def.isTemp())
1373 continue;
1374 if (def.isKill()) {
1375 for (unsigned j = 0; j < def.getTemp().bytes(); j++)
1376 regs[def.physReg().reg_b + j] = 0;
1377 }
1378 }
1379
1380 return err;
1381 }
1382
1383 } /* end namespace */
1384
1385 bool
1386 validate_ra(Program* program)
1387 {
1388 if (!(debug_flags & DEBUG_VALIDATE_RA))
1389 return false;
1390
1391 bool err = false;
1392 aco::live_var_analysis(program);
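/* First-killed SGPR operands of logical phis stay live until p_logical_end of the
 * corresponding predecessor block; collect them here so their registers can be
 * freed there. */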
1393 std::vector<std::vector<Temp>> phi_sgpr_ops(program->blocks.size());
1394 uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves);
1395
1396 std::vector<Assignment> assignments(program->peekAllocationId());
1397 for (Block& block : program->blocks) {
1398 Location loc;
1399 loc.block = █
1400 for (aco_ptr<Instruction>& instr : block.instructions) {
1401 if (instr->opcode == aco_opcode::p_phi) {
1402 for (unsigned i = 0; i < instr->operands.size(); i++) {
1403 if (instr->operands[i].isTemp() &&
1404 instr->operands[i].getTemp().type() == RegType::sgpr &&
1405 instr->operands[i].isFirstKill())
1406 phi_sgpr_ops[block.logical_preds[i]].emplace_back(instr->operands[i].getTemp());
1407 }
1408 }
1409
1410 loc.instr = instr.get();
1411 for (unsigned i = 0; i < instr->operands.size(); i++) {
1412 Operand& op = instr->operands[i];
1413 if (!op.isTemp())
1414 continue;
1415 if (!op.isFixed())
1416 err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
1417 if (assignments[op.tempId()].valid && assignments[op.tempId()].reg != op.physReg())
1418 err |=
1419 ra_fail(program, loc, assignments[op.tempId()].firstloc,
1420 "Operand %d has an inconsistent register assignment with instruction", i);
1421 if ((op.getTemp().type() == RegType::vgpr &&
1422 op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
1423 (op.getTemp().type() == RegType::sgpr &&
1424 op.physReg() + op.size() > program->config->num_sgprs &&
1425 op.physReg() < sgpr_limit))
1426 err |= ra_fail(program, loc, assignments[op.tempId()].firstloc,
1427 "Operand %d has an out-of-bounds register assignment", i);
1428 if (op.physReg() == vcc && !program->needs_vcc)
1429 err |= ra_fail(program, loc, Location(),
1430 "Operand %d fixed to vcc but needs_vcc=false", i);
1431 if (op.regClass().is_subdword() &&
1432 !validate_subdword_operand(program->gfx_level, instr, i))
1433 err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
1434 if (!assignments[op.tempId()].firstloc.block)
1435 assignments[op.tempId()].firstloc = loc;
1436 if (!assignments[op.tempId()].defloc.block) {
1437 assignments[op.tempId()].reg = op.physReg();
1438 assignments[op.tempId()].valid = true;
1439 }
1440 }
1441
1442 for (unsigned i = 0; i < instr->definitions.size(); i++) {
1443 Definition& def = instr->definitions[i];
1444 if (!def.isTemp())
1445 continue;
1446 if (!def.isFixed())
1447 err |=
1448 ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
1449 if (assignments[def.tempId()].defloc.block)
1450 err |= ra_fail(program, loc, assignments[def.tempId()].defloc,
1451 "Temporary %%%d also defined by instruction", def.tempId());
1452 if ((def.getTemp().type() == RegType::vgpr &&
1453 def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
1454 (def.getTemp().type() == RegType::sgpr &&
1455 def.physReg() + def.size() > program->config->num_sgprs &&
1456 def.physReg() < sgpr_limit))
1457 err |= ra_fail(program, loc, assignments[def.tempId()].firstloc,
1458 "Definition %d has an out-of-bounds register assignment", i);
1459 if (def.physReg() == vcc && !program->needs_vcc)
1460 err |= ra_fail(program, loc, Location(),
1461 "Definition %d fixed to vcc but needs_vcc=false", i);
1462 if (def.regClass().is_subdword() &&
1463 !validate_subdword_definition(program->gfx_level, instr))
1464 err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
1465 if (!assignments[def.tempId()].firstloc.block)
1466 assignments[def.tempId()].firstloc = loc;
1467 assignments[def.tempId()].defloc = loc;
1468 assignments[def.tempId()].reg = def.physReg();
1469 assignments[def.tempId()].valid = true;
1470 }
1471 }
1472 }
1473
1474 for (Block& block : program->blocks) {
1475 Location loc;
1476 loc.block = █
1477
1478 std::array<unsigned, 2048> regs; /* register file in bytes */
1479 regs.fill(0);
1480
1481 /* check live in */
1482 for (unsigned id : program->live.live_in[block.index]) {
1483 Temp tmp(id, program->temp_rc[id]);
1484 PhysReg reg = assignments[id].reg;
1485 for (unsigned i = 0; i < tmp.bytes(); i++) {
1486 if (regs[reg.reg_b + i]) {
1487 err |= ra_fail(program, loc, Location(),
1488 "Assignment of element %d of %%%d already taken by %%%d in live-in",
1489 i, id, regs[reg.reg_b + i]);
1490 }
1491 regs[reg.reg_b + i] = id;
1492 }
1493 }
1494
1495 for (aco_ptr<Instruction>& instr : block.instructions) {
1496 loc.instr = instr.get();
1497
1498 /* remove killed p_phi operands from regs */
1499 if (instr->opcode == aco_opcode::p_logical_end) {
1500 for (Temp tmp : phi_sgpr_ops[block.index]) {
1501 PhysReg reg = assignments[tmp.id()].reg;
1502 for (unsigned i = 0; i < tmp.bytes(); i++)
1503 regs[reg.reg_b + i] = 0;
1504 }
1505 }
1506
1507 if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) {
1508 for (const Operand& op : instr->operands) {
1509 if (!op.isTemp())
1510 continue;
1511 if (op.isFirstKillBeforeDef()) {
1512 for (unsigned j = 0; j < op.getTemp().bytes(); j++)
1513 regs[op.physReg().reg_b + j] = 0;
1514 }
1515 }
1516 }
1517
1518 err |= validate_instr_defs(program, regs, assignments, loc, instr);
1519
1520 if (!is_phi(instr)) {
1521 for (const Operand& op : instr->operands) {
1522 if (!op.isTemp())
1523 continue;
1524 if (op.isLateKill() && op.isFirstKill()) {
1525 for (unsigned j = 0; j < op.getTemp().bytes(); j++)
1526 regs[op.physReg().reg_b + j] = 0;
1527 }
1528 }
1529 }
1530 }
1531 }
1532
1533 return err;
1534 }
1535 } // namespace aco
1536