1 /* 2 * Copyright © 2020 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 #include "helpers.h" 25 26 using namespace aco; 27 28 BEGIN_TEST(regalloc.subdword_alloc.reuse_16bit_operands) 29 /* Registers of operands should be "recycled" for the output. But if the 30 * input is smaller than the output, that's not generally possible. The 31 * first v_cvt_f32_f16 instruction below uses the upper 16 bits of v0 32 * while the lower 16 bits are still live, so the output must be stored in 33 * a register other than v0. For the second v_cvt_f32_f16, the original 34 * value stored in v0 is no longer used and hence it's safe to store the 35 * result in v0. 36 */ 37 38 /* TODO: is this possible to do on GFX11? */ 39 for (amd_gfx_level cc = GFX8; cc <= GFX10_3; cc = (amd_gfx_level)((unsigned)cc + 1)) { 40 for (bool pessimistic : {false, true}) { 41 const char* subvariant = pessimistic ? "/pessimistic" : "/optimistic"; 42 43 //>> v1: %_:v[#a] = p_startpgm 44 if (!setup_cs("v1", (amd_gfx_level)cc, CHIP_UNKNOWN, subvariant)) 45 return; 46 47 //! v2b: %_:v[#a][0:16], v2b: %res1:v[#a][16:32] = p_split_vector %_:v[#a] 48 Builder::Result tmp = 49 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]); 50 51 //! v1: %_:v[#b] = v_cvt_f32_f16 %_:v[#a][16:32] dst_sel:dword src0_sel:uword1 52 //! v1: %_:v[#a] = v_cvt_f32_f16 %_:v[#a][0:16] 53 //; success = (b != a) 54 auto result1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), tmp.def(1).getTemp()); 55 auto result2 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), tmp.def(0).getTemp()); 56 writeout(0, result1); 57 writeout(1, result2); 58 59 finish_ra_test(ra_test_policy{pessimistic}); 60 } 61 } 62 END_TEST 63 64 BEGIN_TEST(regalloc._32bit_partial_write) 65 //>> v1: %_:v[0] = p_startpgm 66 if (!setup_cs("v1", GFX10)) 67 return; 68 69 /* ensure high 16 bits are occupied */ 70 //! v2b: %_:v[0][0:16], v2b: %_:v[0][16:32] = p_split_vector %_:v[0] 71 Temp hi = 72 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]).def(1).getTemp(); 73 74 /* This test checks if this instruction uses SDWA. */ 75 //! v2b: %_:v[0][0:16] = v_not_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword 76 Temp lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v2b), Operand::zero()); 77 78 //! v1: %_:v[0] = p_create_vector %_:v[0][0:16], %_:v[0][16:32] 79 bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi); 80 81 finish_ra_test(ra_test_policy()); 82 END_TEST 83 84 BEGIN_TEST(regalloc.precolor.swap) 85 //>> s2: %op0:s[0-1] = p_startpgm 86 if (!setup_cs("s2", GFX10)) 87 return; 88 89 program->dev.sgpr_limit = 4; 90 91 //! s2: %op1:s[2-3] = p_unit_test 92 Temp op1 = bld.pseudo(aco_opcode::p_unit_test, bld.def(s2)); 93 94 //! s2: %op0_2:s[2-3], s2: %op1_2:s[0-1] = p_parallelcopy %op0:s[0-1], %op1:s[2-3] 95 //! p_unit_test %op0_2:s[2-3], %op1_2:s[0-1] 96 Operand op(inputs[0]); 97 op.setFixed(PhysReg(2)); 98 bld.pseudo(aco_opcode::p_unit_test, op, op1); 99 100 finish_ra_test(ra_test_policy()); 101 END_TEST 102 103 BEGIN_TEST(regalloc.precolor.blocking_vector) 104 //>> s2: %tmp0:s[0-1], s1: %tmp1:s[2] = p_startpgm 105 if (!setup_cs("s2 s1", GFX10)) 106 return; 107 108 //! s1: %tmp1_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp1:s[2], %tmp0:s[0-1] 109 //! p_unit_test %tmp1_2:s[1] 110 Operand op(inputs[1]); 111 op.setFixed(PhysReg(1)); 112 bld.pseudo(aco_opcode::p_unit_test, op); 113 114 //! p_unit_test %tmp0_2:s[2-3] 115 bld.pseudo(aco_opcode::p_unit_test, inputs[0]); 116 117 finish_ra_test(ra_test_policy()); 118 END_TEST 119 120 BEGIN_TEST(regalloc.precolor.vector.test) 121 //>> s2: %tmp0:s[0-1], s1: %tmp1:s[2], s1: %tmp2:s[3] = p_startpgm 122 if (!setup_cs("s2 s1 s1", GFX10)) 123 return; 124 125 //! s2: %tmp0_2:s[2-3], s1: %tmp2_2:s[0] = p_parallelcopy %tmp0:s[0-1], %tmp2:s[3] 126 //! p_unit_test %tmp0_2:s[2-3] 127 Operand op(inputs[0]); 128 op.setFixed(PhysReg(2)); 129 bld.pseudo(aco_opcode::p_unit_test, op); 130 131 //! p_unit_test %tmp2_2:s[0] 132 bld.pseudo(aco_opcode::p_unit_test, inputs[2]); 133 134 finish_ra_test(ra_test_policy()); 135 END_TEST 136 137 BEGIN_TEST(regalloc.precolor.vector.collect) 138 //>> s2: %tmp0:s[0-1], s1: %tmp1:s[2], s1: %tmp2:s[3] = p_startpgm 139 if (!setup_cs("s2 s1 s1", GFX10)) 140 return; 141 142 //! s2: %tmp0_2:s[2-3], s1: %tmp1_2:s[0], s1: %tmp2_2:s[1] = p_parallelcopy %tmp0:s[0-1], %tmp1:s[2], %tmp2:s[3] 143 //! p_unit_test %tmp0_2:s[2-3] 144 Operand op(inputs[0]); 145 op.setFixed(PhysReg(2)); 146 bld.pseudo(aco_opcode::p_unit_test, op); 147 148 //! p_unit_test %tmp1_2:s[0], %tmp2_2:s[1] 149 bld.pseudo(aco_opcode::p_unit_test, inputs[1], inputs[2]); 150 151 finish_ra_test(ra_test_policy()); 152 END_TEST 153 154 BEGIN_TEST(regalloc.precolor.vgpr_move) 155 //>> v1: %tmp0:v[0], v1: %tmp1:v[1] = p_startpgm 156 if (!setup_cs("v1 v1", GFX10)) 157 return; 158 159 //! v1: %tmp1_2:v[0], v1: %tmp0_2:v[1] = p_parallelcopy %tmp1:v[1], %tmp0:v[0] 160 //! p_unit_test %tmp0_2:v[1], %tmp1_2:v[0] 161 bld.pseudo(aco_opcode::p_unit_test, inputs[0], Operand(inputs[1], PhysReg(256))); 162 163 finish_ra_test(ra_test_policy()); 164 END_TEST 165 166 BEGIN_TEST(regalloc.precolor.multiple_operands) 167 //>> v1: %tmp0:v[0], v1: %tmp1:v[1], v1: %tmp2:v[2], v1: %tmp3:v[3] = p_startpgm 168 if (!setup_cs("v1 v1 v1 v1", GFX10)) 169 return; 170 171 //! v1: %tmp3_2:v[0], v1: %tmp0_2:v[1], v1: %tmp1_2:v[2], v1: %tmp2_2:v[3] = p_parallelcopy %tmp3:v[3], %tmp0:v[0], %tmp1:v[1], %tmp2:v[2] 172 //! p_unit_test %tmp3_2:v[0], %tmp0_2:v[1], %tmp1_2:v[2], %tmp2_2:v[3] 173 bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[3], PhysReg(256 + 0)), 174 Operand(inputs[0], PhysReg(256 + 1)), Operand(inputs[1], PhysReg(256 + 2)), 175 Operand(inputs[2], PhysReg(256 + 3))); 176 177 finish_ra_test(ra_test_policy()); 178 END_TEST 179 180 BEGIN_TEST(regalloc.precolor.different_regs) 181 //>> v1: %tmp0:v[0] = p_startpgm 182 if (!setup_cs("v1", GFX10)) 183 return; 184 185 //! v1: %tmp1:v[1], v1: %tmp2:v[2] = p_parallelcopy %tmp0:v[0], %tmp0:v[0] 186 //! p_unit_test %tmp0:v[0], %tmp1:v[1], %tmp2:v[2] 187 bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[0], PhysReg(256 + 0)), 188 Operand(inputs[0], PhysReg(256 + 1)), Operand(inputs[0], PhysReg(256 + 2))); 189 190 finish_ra_test(ra_test_policy()); 191 END_TEST 192 193 BEGIN_TEST(regalloc.scratch_sgpr.create_vector) 194 if (!setup_cs("v1 s1", GFX7)) 195 return; 196 197 Temp tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), inputs[0], Operand::zero()); 198 199 //>> v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24] 200 //! s1: %0:s[1] = s_mov_b32 0x1000001 201 //! v1: %0:v[0] = v_mul_lo_u32 %0:s[1], %_:v[0][0:8] 202 bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), Operand(v3b), Operand(tmp)); 203 204 //! p_unit_test %_:s[0] 205 //! s_endpgm 206 bld.pseudo(aco_opcode::p_unit_test, inputs[1]); 207 208 finish_ra_test(ra_test_policy(), true); 209 END_TEST 210 211 BEGIN_TEST(regalloc.scratch_sgpr.create_vector_sgpr_operand) 212 if (!setup_cs("v2 s1", GFX7)) 213 return; 214 215 Temp tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), inputs[0], Operand::c32(4u)); 216 217 //>> v1: %0:v[0] = v_mov_b32 %_:s[0] 218 //! v3b: %0:v[1][0:24] = v_and_b32 0xffffff, %0:v[1][0:24] 219 //! s1: %0:s[1] = s_mov_b32 0x1000001 220 //! v1: %0:v[1] = v_mul_lo_u32 %0:s[1], %_:v[1][0:8] 221 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inputs[1], Operand(v3b), Operand(tmp)); 222 223 //! p_unit_test %_:s[0] 224 //! s_endpgm 225 bld.pseudo(aco_opcode::p_unit_test, inputs[1]); 226 227 finish_ra_test(ra_test_policy(), true); 228 END_TEST 229 230 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.fixed_def) 231 //>> p_startpgm 232 if (!setup_cs("", GFX10)) 233 return; 234 235 PhysReg reg_v0{256}; 236 237 //! lv1: %tmp1:v[0] = p_unit_test 238 Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0)); 239 240 //! lv1: %tmp2:v[1] = p_parallelcopy %tmp1:v[0] 241 //! v1: %_:v[0] = p_unit_test 242 bld.pseudo(aco_opcode::p_unit_test, Definition(reg_v0, v1)); 243 244 //! p_unit_test %tmp2:v[1] 245 bld.pseudo(aco_opcode::p_unit_test, tmp); 246 247 finish_ra_test(ra_test_policy()); 248 END_TEST 249 250 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_impl) 251 //>> p_startpgm 252 if (!setup_cs("", GFX10)) 253 return; 254 255 program->dev.vgpr_limit = 3; 256 257 PhysReg reg_v1{257}; 258 259 //! s1: %scc_tmp:scc, s1: %1:s[0] = p_unit_test 260 Temp s0_tmp = bld.tmp(s1); 261 Temp scc_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s1, scc), 262 Definition(s0_tmp.id(), PhysReg{0}, s1)); 263 264 //! lv1: %tmp1:v[1] = p_unit_test 265 Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v1)); 266 267 //! lv1: %tmp2:v[2] = p_parallelcopy %tmp1:v[1] 268 //! v2: %_:v[0-1] = p_unit_test 269 bld.pseudo(aco_opcode::p_unit_test, bld.def(v2)); 270 271 //! p_unit_test %tmp2:v[2], %scc_tmp:scc, %1:s[0] 272 bld.pseudo(aco_opcode::p_unit_test, tmp, scc_tmp, s0_tmp); 273 274 finish_ra_test(ra_test_policy()); 275 276 //>> lv1: %5:v[2] = p_parallelcopy %3:v[1] scc:1 scratch:s1 277 Pseudo_instruction& parallelcopy = program->blocks[0].instructions[3]->pseudo(); 278 aco_print_instr(program->gfx_level, ¶llelcopy, output); 279 fprintf(output, " scc:%u scratch:s%u\n", parallelcopy.tmp_in_scc, 280 parallelcopy.scratch_sgpr.reg()); 281 END_TEST 282 283 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_regs_for_copies) 284 //>> p_startpgm 285 if (!setup_cs("", GFX10)) 286 return; 287 288 program->dev.vgpr_limit = 6; 289 290 PhysReg reg_v2{258}; 291 PhysReg reg_v4{260}; 292 293 //! lv1: %lin_tmp1:v[4] = p_unit_test 294 Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v4)); 295 //! v2: %log_tmp1:v[2-3] = p_unit_test 296 Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v2, reg_v2)); 297 298 //! lv1: %lin_tmp2:v[0], v2: %log_tmp2:v[4-5] = p_parallelcopy %lin_tmp1:v[4], %log_tmp1:v[2-3] 299 //! v3: %_:v[1-3] = p_unit_test 300 bld.pseudo(aco_opcode::p_unit_test, bld.def(v3)); 301 302 //! p_unit_test %log_tmp2:v[4-5], %lin_tmp2:v[0] 303 bld.pseudo(aco_opcode::p_unit_test, log_tmp, lin_tmp); 304 305 finish_ra_test(ra_test_policy()); 306 END_TEST 307 308 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_create_vector) 309 //>> p_startpgm 310 if (!setup_cs("", GFX10)) 311 return; 312 313 program->dev.vgpr_limit = 4; 314 315 PhysReg reg_v0{256}; 316 PhysReg reg_v1{257}; 317 318 //! lv1: %lin_tmp1:v[0] = p_unit_test 319 Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0)); 320 //! v1: %log_tmp:v[1] = p_unit_test 321 Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1, reg_v1)); 322 323 //! lv1: %lin_tmp2:v[2] = p_parallelcopy %lin_tmp1:v[0] 324 //! v2: %_:v[0-1] = p_create_vector v1: undef, %log_tmp:v[1] 325 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(v1), log_tmp); 326 327 //! p_unit_test %lin_tmp2:v[2] 328 bld.pseudo(aco_opcode::p_unit_test, lin_tmp); 329 330 finish_ra_test(ra_test_policy()); 331 END_TEST 332 333 BEGIN_TEST(regalloc.branch_def_phis_at_merge_block) 334 //>> p_startpgm 335 if (!setup_cs("", GFX10)) 336 return; 337 338 //! s2: %_:s[2-3] = p_branch 339 bld.branch(aco_opcode::p_branch, bld.def(s2)); 340 341 //! BB1 342 //! /* logical preds: / linear preds: BB0, / kind: uniform, */ 343 bld.reset(program->create_and_insert_block()); 344 program->blocks[1].linear_preds.push_back(0); 345 346 //! s2: %tmp:s[0-1] = p_linear_phi 0 347 Temp tmp = bld.pseudo(aco_opcode::p_linear_phi, bld.def(s2), Operand::c64(0u)); 348 349 //! p_unit_test %tmp:s[0-1] 350 bld.pseudo(aco_opcode::p_unit_test, tmp); 351 352 finish_ra_test(ra_test_policy()); 353 END_TEST 354 355 BEGIN_TEST(regalloc.branch_def_phis_at_branch_block) 356 //>> p_startpgm 357 if (!setup_cs("", GFX10)) 358 return; 359 360 //! s2: %tmp:s[0-1] = p_unit_test 361 Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s2)); 362 363 //! s2: %_:s[2-3] = p_cbranch_z %0:scc 364 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(scc, s1)); 365 366 //! BB1 367 //! /* logical preds: / linear preds: BB0, / kind: */ 368 bld.reset(program->create_and_insert_block()); 369 program->blocks[1].linear_preds.push_back(0); 370 371 //! p_unit_test %tmp:s[0-1] 372 bld.pseudo(aco_opcode::p_unit_test, tmp); 373 bld.branch(aco_opcode::p_branch, bld.def(s2)); 374 375 bld.reset(program->create_and_insert_block()); 376 program->blocks[2].linear_preds.push_back(0); 377 378 bld.branch(aco_opcode::p_branch, bld.def(s2)); 379 380 bld.reset(program->create_and_insert_block()); 381 program->blocks[3].linear_preds.push_back(1); 382 program->blocks[3].linear_preds.push_back(2); 383 384 finish_ra_test(ra_test_policy()); 385 END_TEST 386 387 BEGIN_TEST(regalloc.vinterp_fp16) 388 //>> v1: %in0:v[0], v1: %in1:v[1], v1: %in2:v[2] = p_startpgm 389 if (!setup_cs("v1 v1 v1", GFX11)) 390 return; 391 392 //! v2b: %lo:v[3][0:16], v2b: %hi:v[3][16:32] = p_split_vector %in0:v[0] 393 Temp lo = bld.tmp(v2b); 394 Temp hi = bld.tmp(v2b); 395 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), inputs[0]); 396 397 //! v1: %tmp0:v[1] = v_interp_p10_f16_f32_inreg %lo:v[3][0:16], %in1:v[1], hi(%hi:v[3][16:32]) 398 //! p_unit_test %tmp0:v[1] 399 Temp tmp0 = 400 bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), lo, inputs[1], hi); 401 bld.pseudo(aco_opcode::p_unit_test, tmp0); 402 403 //! v2b: %tmp1:v[0][16:32] = v_interp_p2_f16_f32_inreg %in0:v[0], %in2:v[2], %tmp0:v[1] opsel_hi 404 //! v1: %tmp2:v[0] = p_create_vector 0, %tmp1:v[0][16:32] 405 //! p_unit_test %tmp2:v[0] 406 Temp tmp1 = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0], 407 inputs[2], tmp0); 408 Temp tmp2 = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), Operand::zero(2), tmp1); 409 bld.pseudo(aco_opcode::p_unit_test, tmp2); 410 411 finish_ra_test(ra_test_policy()); 412 END_TEST 413 414 BEGIN_TEST(regalloc.writelane) 415 //>> v1: %in0:v[0], s1: %in1:s[0], s1: %in2:s[1], s1: %in3:s[2] = p_startpgm 416 if (!setup_cs("v1 s1 s1 s1", GFX8)) 417 return; 418 419 //! s1: %tmp:m0 = p_parallelcopy %int3:s[2] 420 Temp tmp = bld.copy(bld.def(s1, m0), inputs[3]); 421 422 //! s1: %in1_2:m0, s1: %tmp_2:s[0] = p_parallelcopy %in1:s[0], %tmp:m0 423 //! v1: %tmp2:v[0] = v_writelane_b32_e64 %in1_2:m0, %in2:s[1], %in0:v[0] 424 Temp tmp2 = bld.writelane(bld.def(v1), inputs[1], inputs[2], inputs[0]); 425 426 //! p_unit_test %tmp_2:s[0], %tmp2:v[0] 427 bld.pseudo(aco_opcode::p_unit_test, tmp, tmp2); 428 429 finish_ra_test(ra_test_policy()); 430 END_TEST 431