1 /* 2 * Copyright © 2021 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include "helpers.h" 26 27 using namespace aco; 28 29 BEGIN_TEST(optimizer_postRA.vcmp) 30 PhysReg reg_v0(256); 31 PhysReg reg_s0(0); 32 PhysReg reg_s2(2); 33 PhysReg reg_s4(4); 34 35 //>> v1: %a:v[0] = p_startpgm 36 ASSERTED bool setup_ok = setup_cs("v1", GFX8); 37 assert(setup_ok); 38 39 auto& startpgm = bld.instructions->at(0); 40 assert(startpgm->opcode == aco_opcode::p_startpgm); 41 startpgm->definitions[0].setFixed(reg_v0); 42 43 Temp v_in = inputs[0]; 44 45 { 46 /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */ 47 48 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 49 //! s2: %e:s[2-3] = p_cbranch_z %b:vcc 50 //! p_unit_test 0, %e:s[2-3] 51 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 52 Operand(v_in, reg_v0)); 53 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 54 Operand(exec, bld.lm)); 55 auto br = 56 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 57 writeout(0, Operand(br, reg_s2)); 58 } 59 60 //; del b, e 61 62 { 63 /* When VCC is overwritten inbetween, don't optimize. */ 64 65 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 66 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 67 //! s2: %f:vcc = s_mov_b64 0 68 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 69 //! p_unit_test 1, %e:s[2-3], %f:vcc 70 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 71 Operand(v_in, reg_v0)); 72 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 73 Operand(exec, bld.lm)); 74 auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand::zero()); 75 auto br = 76 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 77 writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc)); 78 } 79 80 //; del b, c, d, e, f 81 82 { 83 /* When part of VCC is overwritten inbetween, don't optimize. */ 84 85 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 86 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 87 //! s1: %f:vcc_hi = s_mov_b32 0 88 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 89 //! p_unit_test 1, %e:s[2-3], %f:vcc_hi 90 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 91 Operand(v_in, reg_v0)); 92 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 93 Operand(exec, bld.lm)); 94 auto ovrwr = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1, vcc_hi), Operand::zero()); 95 auto br = 96 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 97 writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc_hi)); 98 } 99 100 //; del b, c, d, e, f 101 102 { 103 /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */ 104 105 //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0] 106 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec 107 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 108 //! p_unit_test 2, %e:s[2-3] 109 auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand::zero(), 110 Operand(v_in, reg_v0)); 111 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), 112 Operand(vcmp, reg_s4), Operand(exec, bld.lm)); 113 auto br = 114 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 115 writeout(2, Operand(br, reg_s2)); 116 } 117 118 //; del b, c, d, e 119 120 { 121 /* When the VCC isn't written by VOPC, don't optimize */ 122 123 //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5] 124 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 125 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 126 //! p_unit_test 2, %e:s[2-3] 127 auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand::c32(1u), 128 Operand(reg_s4, bld.lm)); 129 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), 130 Operand(salu, vcc), Operand(exec, bld.lm)); 131 auto br = 132 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 133 writeout(2, Operand(br, reg_s2)); 134 } 135 136 //; del b, c, d, e, f, x 137 138 { 139 /* When EXEC is overwritten inbetween, don't optimize. */ 140 141 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 142 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 143 //! s2: %f:exec = s_mov_b64 42 144 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 145 //! p_unit_test 4, %e:s[2-3], %f:exec 146 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 147 Operand(v_in, reg_v0)); 148 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 149 Operand(exec, bld.lm)); 150 auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand::c32(42u)); 151 auto br = 152 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 153 writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec)); 154 } 155 156 //; del b, c, d, e, f, x 157 158 finish_optimizer_postRA_test(); 159 END_TEST 160 161 BEGIN_TEST(optimizer_postRA.scc_nocmp_opt) 162 //>> s1: %a, s2: %y, s1: %z = p_startpgm 163 ASSERTED bool setup_ok = setup_cs("s1 s2 s1", GFX6); 164 assert(setup_ok); 165 166 PhysReg reg_s0{0}; 167 PhysReg reg_s2{2}; 168 PhysReg reg_s3{3}; 169 PhysReg reg_s4{4}; 170 PhysReg reg_s6{6}; 171 PhysReg reg_s8{8}; 172 173 Temp in_0 = inputs[0]; 174 Temp in_1 = inputs[1]; 175 Temp in_2 = inputs[2]; 176 Operand op_in_0(in_0); 177 op_in_0.setFixed(reg_s0); 178 Operand op_in_1(in_1); 179 op_in_1.setFixed(reg_s4); 180 Operand op_in_2(in_2); 181 op_in_2.setFixed(reg_s6); 182 183 { 184 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 185 //! s2: %f:vcc = p_cbranch_nz %e:scc 186 //! p_unit_test 0, %f:vcc 187 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 188 Operand::c32(0x40018u)); 189 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 190 Operand::zero()); 191 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 192 writeout(0, Operand(br, vcc)); 193 } 194 195 //; del d, e, f 196 197 { 198 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 199 //! s2: %f:vcc = p_cbranch_z %e:scc 200 //! p_unit_test 1, %f:vcc 201 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 202 Operand::c32(0x40018u)); 203 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), 204 Operand::zero()); 205 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 206 writeout(1, Operand(br, vcc)); 207 } 208 209 //; del d, e, f 210 211 { 212 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 213 //! s2: %f:vcc = p_cbranch_z %e:scc 214 //! p_unit_test 2, %f:vcc 215 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 216 Operand::c32(0x40018u)); 217 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 218 Operand::zero()); 219 auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); 220 writeout(2, Operand(br, vcc)); 221 } 222 223 //; del d, e, f 224 225 { 226 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 227 //! s2: %f:vcc = p_cbranch_nz %e:scc 228 //! p_unit_test 3, %f:vcc 229 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 230 Operand::c32(0x40018u)); 231 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), 232 Operand::zero()); 233 auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); 234 writeout(3, Operand(br, vcc)); 235 } 236 237 //; del d, e, f 238 239 { 240 //! s2: %d:s[2-3], s1: %e:scc = s_and_b64 %y:s[4-5], 0x12345 241 //! s2: %f:vcc = p_cbranch_z %e:scc 242 //! p_unit_test 4, %f:vcc 243 auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s2), bld.def(s1, scc), op_in_1, 244 Operand::c32(0x12345u)); 245 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u64, bld.def(s1, scc), Operand(salu, reg_s2), 246 Operand::zero(8)); 247 auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); 248 writeout(4, Operand(br, vcc)); 249 } 250 251 //; del d, e, f 252 253 { 254 /* SCC is overwritten in between, don't optimize */ 255 256 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 257 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 258 //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 259 //! s2: %f:vcc = p_cbranch_z %g:scc 260 //! p_unit_test 5, %f:vcc, %h:s[3] 261 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 262 Operand::c32(0x40018u)); 263 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 264 Operand::c32(1u)); 265 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 266 Operand::zero()); 267 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 268 writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); 269 } 270 271 //; del d, e, f, g, h, x 272 273 { 274 /* SCC is overwritten in between, optimize by pulling down */ 275 276 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 277 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 278 //! s2: %f:vcc = p_cbranch_z %g:scc 279 //! p_unit_test 5, %f:vcc, %h:s[3] 280 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 281 Operand::c32(0x40018u)); 282 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 283 Operand::c32(1u)); 284 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), 285 Operand::zero()); 286 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 287 writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); 288 } 289 290 //; del d, e, f, g, h, x 291 292 { 293 /* SCC is overwritten in between, optimize by pulling down */ 294 295 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 296 //! s2: %d:s[8-9], s1: %e:scc = s_and_b64 %b:s[4-5], 0x40018 297 //! s2: %f:vcc = p_cbranch_z %g:scc 298 //! p_unit_test 5, %f:vcc, %h:s[3] 299 auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), op_in_1, 300 Operand::c32(0x40018u)); 301 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 302 Operand::c32(1u)); 303 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s8), 304 Operand::zero()); 305 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 306 writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); 307 } 308 309 //; del d, e, f, g, h, x 310 311 { 312 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 313 //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc 314 //! p_unit_test 6, %f:s[4] 315 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 316 Operand::c32(0x40018u)); 317 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 318 Operand::zero()); 319 auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), 320 Operand(op_in_2), bld.scc(scmp)); 321 writeout(6, Operand(br, reg_s4)); 322 } 323 324 //; del d, e, f 325 326 { 327 /* SCC is overwritten in between, don't optimize */ 328 329 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 330 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 331 //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 332 //! s1: %f:s[4] = s_cselect_b32 %a:s[0], %z:s[6], %g:scc 333 //! p_unit_test 7, %f:s[4], %h:s[3] 334 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 335 Operand::c32(0x40018u)); 336 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 337 Operand::c32(1u)); 338 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 339 Operand::zero()); 340 auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), 341 Operand(op_in_2), bld.scc(scmp)); 342 writeout(7, Operand(br, reg_s4), Operand(ovrw, reg_s3)); 343 } 344 345 //; del d, e, f, g, h, x 346 347 finish_optimizer_postRA_test(); 348 END_TEST 349 350 BEGIN_TEST(optimizer_postRA.dpp) 351 //>> v1: %a:v[0], v1: %b:v[1], s2: %c:vcc, s2: %d:s[0-1] = p_startpgm 352 if (!setup_cs("v1 v1 s2 s2", GFX10_3)) 353 return; 354 355 bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256)); 356 bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257)); 357 bld.instructions->at(0)->definitions[2].setFixed(vcc); 358 bld.instructions->at(0)->definitions[3].setFixed(PhysReg(0)); 359 360 PhysReg reg_v0(256); 361 PhysReg reg_v2(258); 362 Operand a(inputs[0], PhysReg(256)); 363 Operand b(inputs[1], PhysReg(257)); 364 Operand c(inputs[2], vcc); 365 Operand d(inputs[3], PhysReg(0)); 366 367 /* basic optimization */ 368 //! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 369 //! p_unit_test 0, %res0:v[2] 370 Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 371 Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b); 372 writeout(0, Operand(res0, reg_v2)); 373 374 /* operand swapping */ 375 //! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 376 //! p_unit_test 1, %res1:v[2] 377 Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 378 Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2)); 379 writeout(1, Operand(res1, reg_v2)); 380 381 //! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 382 //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 fi 383 //! p_unit_test 2, %res2:v[2] 384 Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 385 Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2), 386 dpp_row_half_mirror); 387 writeout(2, Operand(res2, reg_v2)); 388 389 /* modifiers */ 390 //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 391 //! p_unit_test 3, %res3:v[2] 392 auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 393 tmp3->dpp16().neg[0] = true; 394 Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b); 395 writeout(3, Operand(res3, reg_v2)); 396 397 //! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 398 //! p_unit_test 4, %res4:v[2] 399 Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 400 auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b); 401 res4->valu().neg[0] = true; 402 writeout(4, Operand(res4, reg_v2)); 403 404 //! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 405 //! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp 406 //! p_unit_test 5, %res5:v[2] 407 Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 408 auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp5, reg_v2), b); 409 res5->valu().clamp = true; 410 writeout(5, Operand(res5, reg_v2)); 411 412 //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 fi 413 //! p_unit_test 6, %res6:v[2] 414 auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 415 tmp6->dpp16().neg[0] = true; 416 auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp6, reg_v2), b); 417 res6->valu().abs[0] = true; 418 writeout(6, Operand(res6, reg_v2)); 419 420 //! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1 fi 421 //! p_unit_test 7, %res7:v[2] 422 Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 423 auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2)); 424 res7->valu().abs[0] = true; 425 writeout(7, Operand(res7, reg_v2)); 426 427 //! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi 428 //! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1] 429 //! p_unit_test 12, %res12:v[2] 430 auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 431 tmp12->dpp16().neg[0] = true; 432 Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b); 433 writeout(12, Operand(res12, reg_v2)); 434 435 //! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi 436 //! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1] 437 //! p_unit_test 13, %res13:v[2] 438 auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 439 tmp13->dpp16().neg[0] = true; 440 Temp res13 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1, reg_v2), Operand(tmp13, reg_v2), b); 441 writeout(13, Operand(res13, reg_v2)); 442 443 /* vcc */ 444 //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 fi 445 //! p_unit_test 8, %res8:v[2] 446 Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 447 Temp res8 = 448 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c); 449 writeout(8, Operand(res8, reg_v2)); 450 451 //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 452 //! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1] 453 //! p_unit_test 9, %res9:v[2] 454 Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 455 Temp res9 = 456 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp9, reg_v2), b, d); 457 writeout(9, Operand(res9, reg_v2)); 458 459 /* control flow */ 460 //! BB1 461 //! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */ 462 //! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 463 //! p_unit_test 10, %res10:v[2] 464 Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 465 466 bld.reset(program->create_and_insert_block()); 467 program->blocks[0].linear_succs.push_back(1); 468 program->blocks[0].logical_succs.push_back(1); 469 program->blocks[1].linear_preds.push_back(0); 470 program->blocks[1].logical_preds.push_back(0); 471 472 Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp10, reg_v2), b); 473 writeout(10, Operand(res10, reg_v2)); 474 475 /* can't combine if the v_mov_b32's operand is modified */ 476 //! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 477 //! v1: %tmp11_2:v[0] = v_mov_b32 0 478 //! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1] 479 //! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0] 480 Temp tmp11_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 481 Temp tmp11_2 = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1, reg_v0), Operand::c32(0)); 482 Temp res11 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp11_1, reg_v2), b); 483 writeout(11, Operand(res11, reg_v2), Operand(tmp11_2, reg_v0)); 484 485 finish_optimizer_postRA_test(); 486 END_TEST 487 488 BEGIN_TEST(optimizer_postRA.dpp_across_exec) 489 for (amd_gfx_level gfx : {GFX9, GFX10}) { 490 //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm 491 if (!setup_cs("v1 v1", gfx)) 492 continue; 493 494 bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256)); 495 bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257)); 496 497 PhysReg reg_v2(258); 498 Operand a(inputs[0], PhysReg(256)); 499 Operand b(inputs[1], PhysReg(257)); 500 501 //~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 502 //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec 503 //~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1] 504 //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 505 //! p_unit_test 0, %res0:v[2] 506 Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 507 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), 508 Operand(exec, bld.lm)); 509 Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b); 510 writeout(0, Operand(res0, reg_v2)); 511 512 finish_optimizer_postRA_test(); 513 } 514 END_TEST 515 516 BEGIN_TEST(optimizer_postRA.dpp_vcmpx) 517 //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm 518 if (!setup_cs("v1 v1", GFX11)) 519 return; 520 521 bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256)); 522 bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257)); 523 524 PhysReg reg_v2(258); 525 Operand a(inputs[0], PhysReg(256)); 526 Operand b(inputs[1], PhysReg(257)); 527 528 //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 529 //! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1] 530 //! p_unit_test 0, %res0:exec 531 Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 532 Temp res0 = bld.vopc(aco_opcode::v_cmpx_lt_f32, bld.def(bld.lm, exec), Operand(tmp0, reg_v2), b); 533 writeout(0, Operand(res0, exec)); 534 535 finish_optimizer_postRA_test(); 536 END_TEST 537 538 BEGIN_TEST(optimizer_postRA.dpp_across_cf) 539 //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1] = p_startpgm 540 if (!setup_cs("v1 v1 v1 v1 s2", GFX10_3)) 541 return; 542 543 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 544 startpgm->definitions[0].setFixed(PhysReg(256)); 545 startpgm->definitions[1].setFixed(PhysReg(257)); 546 startpgm->definitions[2].setFixed(PhysReg(258)); 547 startpgm->definitions[3].setFixed(PhysReg(259)); 548 startpgm->definitions[4].setFixed(PhysReg(0)); 549 550 Operand a(inputs[0], PhysReg(256)); /* source for DPP */ 551 Operand b(inputs[1], PhysReg(257)); /* source for fadd */ 552 Operand c(inputs[2], PhysReg(258)); /* buffer store address */ 553 Operand d(inputs[3], PhysReg(259)); /* buffer store value */ 554 Operand e(inputs[4], PhysReg(0)); /* condition */ 555 PhysReg reg_v12(268); /* temporary register */ 556 557 Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); 558 559 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec 560 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 561 562 emit_divergent_if_else( 563 program.get(), bld, e, 564 [&]() -> void __anondfca8ca00102() 565 { 566 /* --- logical then --- */ 567 //! BB1 568 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 569 //! p_logical_start 570 571 //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen 572 bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true); 573 574 //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 575 //! p_unit_test 10, %res10:v[12] 576 Temp result = 577 bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); 578 writeout(10, Operand(result, reg_v12)); 579 580 //! p_logical_end 581 //! s2: %0:vcc = p_branch BB3 582 583 /* --- linear then --- */ 584 //! BB2 585 //! /* logical preds: / linear preds: BB0, / kind: */ 586 //! s2: %0:vcc = p_branch BB3 587 588 /* --- invert --- */ 589 //! BB3 590 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 591 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 592 //! s2: %0:vcc = p_cbranch_nz BB4, BB5 593 }, 594 [&]() -> void __anondfca8ca00202() 595 { 596 /* --- logical else --- */ 597 //! BB4 598 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 599 //! p_logical_start 600 //! p_logical_end 601 //! s2: %0:vcc = p_branch BB6 602 603 /* --- linear else --- */ 604 //! BB5 605 //! /* logical preds: / linear preds: BB3, / kind: */ 606 //! s2: %0:vcc = p_branch BB6 607 }); 608 609 /* --- merge block --- */ 610 //! BB6 611 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 612 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 613 614 finish_optimizer_postRA_test(); 615 END_TEST 616 617 BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten) 618 //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[2] = p_startpgm 619 if (!setup_cs("v1 v1 v1 v1 s2 s1", GFX10_3)) 620 return; 621 622 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 623 startpgm->definitions[0].setFixed(PhysReg(256)); 624 startpgm->definitions[1].setFixed(PhysReg(257)); 625 startpgm->definitions[2].setFixed(PhysReg(258)); 626 startpgm->definitions[3].setFixed(PhysReg(259)); 627 startpgm->definitions[4].setFixed(PhysReg(0)); 628 startpgm->definitions[5].setFixed(PhysReg(2)); 629 630 Operand a(inputs[0], PhysReg(256)); /* source for DPP */ 631 Operand b(inputs[1], PhysReg(257)); /* source for fadd */ 632 Operand c(inputs[2], PhysReg(258)); /* buffer store address */ 633 Operand d(inputs[3], PhysReg(259)); /* buffer store value */ 634 Operand e(inputs[4], PhysReg(0)); /* condition */ 635 Operand f(inputs[5], PhysReg(2)); /* buffer store address (scalar) */ 636 PhysReg reg_v12(268); /* temporary register */ 637 638 //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 639 Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); 640 641 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec 642 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 643 644 emit_divergent_if_else( 645 program.get(), bld, e, 646 [&]() -> void __anondfca8ca00302() 647 { 648 /* --- logical then --- */ 649 //! BB1 650 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 651 //! p_logical_start 652 653 //! v1: %addr:v[0] = p_parallelcopy %f:s[2] 654 Temp addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), f); 655 656 //! buffer_store_dword %addr:v[0], 0, %d:v[3], 0 offen 657 bld.mubuf(aco_opcode::buffer_store_dword, Operand(addr, a.physReg()), Operand::zero(), d, 658 Operand::zero(), 0, true); 659 660 //! p_logical_end 661 //! s2: %0:vcc = p_branch BB3 662 663 /* --- linear then --- */ 664 //! BB2 665 //! /* logical preds: / linear preds: BB0, / kind: */ 666 //! s2: %0:vcc = p_branch BB3 667 668 /* --- invert --- */ 669 //! BB3 670 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 671 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 672 //! s2: %0:vcc = p_cbranch_nz BB4, BB5 673 }, 674 [&]() -> void __anondfca8ca00402() 675 { 676 /* --- logical else --- */ 677 //! BB4 678 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 679 //! p_logical_start 680 //! p_logical_end 681 //! s2: %0:vcc = p_branch BB6 682 683 /* --- linear else --- */ 684 //! BB5 685 //! /* logical preds: / linear preds: BB3, / kind: */ 686 //! s2: %0:vcc = p_branch BB6 687 }); 688 689 /* --- merge block --- */ 690 //! BB6 691 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 692 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 693 694 //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1] 695 Temp result = 696 bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); 697 //! p_unit_test 10, %result:v[12] 698 writeout(10, Operand(result, reg_v12)); 699 700 finish_optimizer_postRA_test(); 701 END_TEST 702 703 BEGIN_TEST(optimizer_postRA.dpp_across_cf_linear_clobber) 704 //>> v1: %a:v[0], v1: %b:v[1], s2: %c:s[0-1] = p_startpgm 705 if (!setup_cs("v1 v1 s2", GFX10_3)) 706 return; 707 708 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 709 startpgm->definitions[0].setFixed(PhysReg(256)); 710 startpgm->definitions[1].setFixed(PhysReg(257)); 711 startpgm->definitions[2].setFixed(PhysReg(0)); 712 713 Operand a(inputs[0], PhysReg(256)); /* source for DPP */ 714 Operand b(inputs[1], PhysReg(257)); /* source for fadd */ 715 Operand c(inputs[2], PhysReg(0)); /* condition */ 716 PhysReg reg_v12(268); /* temporary register */ 717 718 //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 719 Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); 720 721 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %c:s[0-1], %0:exec 722 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 723 724 emit_divergent_if_else( 725 program.get(), bld, c, 726 [&]() -> void __anondfca8ca00502() 727 { 728 /* --- logical then --- */ 729 //! BB1 730 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 731 //! p_logical_start 732 733 //! v1: %clobber:v[0] = p_parallelcopy 0 734 Temp clobber = 735 bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), Operand::c32(0)); 736 737 //! p_unit_test 0, %clobber:v[0] 738 writeout(0, Operand(clobber, a.physReg())); 739 740 //! p_logical_end 741 //! s2: %0:vcc = p_branch BB3 742 743 /* --- linear then --- */ 744 //! BB2 745 //! /* logical preds: / linear preds: BB0, / kind: */ 746 //! s2: %0:vcc = p_branch BB3 747 748 /* --- invert --- */ 749 //! BB3 750 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 751 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 752 //! s2: %0:vcc = p_cbranch_nz BB4, BB5 753 }, 754 [&]() -> void __anondfca8ca00602() 755 { 756 /* --- logical else --- */ 757 //! BB4 758 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 759 //! p_logical_start 760 761 //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1] 762 Temp result = 763 bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); 764 //! p_unit_test 1, %result:v[12] 765 writeout(1, Operand(result, reg_v12)); 766 767 //! p_logical_end 768 //! s2: %0:vcc = p_branch BB6 769 770 /* --- linear else --- */ 771 //! BB5 772 //! /* logical preds: / linear preds: BB3, / kind: */ 773 //! s2: %0:vcc = p_branch BB6 774 }); 775 776 /* --- merge block --- */ 777 //! BB6 778 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 779 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 780 781 finish_optimizer_postRA_test(); 782 END_TEST 783 784 BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf) 785 //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1] = p_startpgm 786 if (!setup_cs("s2 v1 v1 s2", GFX10_3)) 787 return; 788 789 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 790 startpgm->definitions[0].setFixed(PhysReg(2)); 791 startpgm->definitions[1].setFixed(PhysReg(258)); 792 startpgm->definitions[2].setFixed(PhysReg(259)); 793 startpgm->definitions[3].setFixed(PhysReg(0)); 794 795 Operand a(inputs[0], PhysReg(2)); /* source for s_and */ 796 Operand c(inputs[1], PhysReg(258)); /* buffer store address */ 797 Operand d(inputs[2], PhysReg(259)); /* buffer store value */ 798 Operand e(inputs[3], PhysReg(0)); /* condition */ 799 PhysReg reg_s8(8); /* temporary register */ 800 801 auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a, 802 Operand::c32(0x40018u)); 803 804 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec 805 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 806 807 emit_divergent_if_else( 808 program.get(), bld, e, 809 [&]() -> void __anondfca8ca00702() 810 { 811 /* --- logical then --- */ 812 //! BB1 813 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 814 //! p_logical_start 815 816 //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen 817 bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true); 818 819 //! p_logical_end 820 //! s2: %0:vcc = p_branch BB3 821 822 /* --- linear then --- */ 823 //! BB2 824 //! /* logical preds: / linear preds: BB0, / kind: */ 825 //! s2: %0:vcc = p_branch BB3 826 827 /* --- invert --- */ 828 //! BB3 829 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 830 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 831 //! s2: %0:vcc = p_cbranch_nz BB4, BB5 832 }, 833 [&]() -> void __anondfca8ca00802() 834 { 835 /* --- logical else --- */ 836 //! BB4 837 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 838 //! p_logical_start 839 //! p_logical_end 840 //! s2: %0:vcc = p_branch BB6 841 842 /* --- linear else --- */ 843 //! BB5 844 //! /* logical preds: / linear preds: BB3, / kind: */ 845 //! s2: %0:vcc = p_branch BB6 846 }); 847 848 /* --- merge block --- */ 849 //! BB6 850 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 851 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 852 853 //! s2: %tmp_salu:s[8-9], s1: %br_scc:scc = s_and_b64 %a:s[2-3], 0x40018 854 //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc 855 //! p_unit_test 5, %br_vcc:vcc 856 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(tmp_salu, reg_s8), 857 Operand::zero()); 858 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 859 writeout(5, Operand(br, vcc)); 860 861 finish_optimizer_postRA_test(); 862 END_TEST 863 864 BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf_partially_overwritten) 865 //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[4] = p_startpgm 866 if (!setup_cs("s2 v1 v1 s2 s1", GFX10_3)) 867 return; 868 869 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 870 startpgm->definitions[0].setFixed(PhysReg(2)); 871 startpgm->definitions[1].setFixed(PhysReg(258)); 872 startpgm->definitions[2].setFixed(PhysReg(259)); 873 startpgm->definitions[3].setFixed(PhysReg(0)); 874 startpgm->definitions[4].setFixed(PhysReg(4)); 875 876 Operand a(inputs[0], PhysReg(2)); /* source for s_and */ 877 Operand c(inputs[1], PhysReg(258)); /* buffer store address */ 878 Operand d(inputs[2], PhysReg(259)); /* buffer store value */ 879 Operand e(inputs[3], PhysReg(0)); /* condition */ 880 Operand f(inputs[4], PhysReg(4)); /* overwrite value */ 881 PhysReg reg_s3(3); /* temporary register */ 882 PhysReg reg_s8(8); /* temporary register */ 883 884 //! s2: %tmp_salu:s[8-9], s1: %tmp_salu_scc:scc = s_and_b64 %a:s[2-3], 0x40018 885 auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a, 886 Operand::c32(0x40018u)); 887 888 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec 889 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 890 891 emit_divergent_if_else( 892 program.get(), bld, e, 893 [&]() -> void __anondfca8ca00902() 894 { 895 /* --- logical then --- */ 896 //! BB1 897 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 898 //! p_logical_start 899 900 //! s1: %ovrwr:s[3] = p_parallelcopy %f:s[4] 901 Temp s_addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1, reg_s3), f); 902 903 //! buffer_store_dword %c:v[2], %ovrwr:s[3], %d:v[3], 0 offen 904 bld.mubuf(aco_opcode::buffer_store_dword, c, Operand(s_addr, reg_s3), d, Operand::zero(), 905 0, true); 906 907 //! p_logical_end 908 //! s2: %0:vcc = p_branch BB3 909 910 /* --- linear then --- */ 911 //! BB2 912 //! /* logical preds: / linear preds: BB0, / kind: */ 913 //! s2: %0:vcc = p_branch BB3 914 915 /* --- invert --- */ 916 //! BB3 917 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 918 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 919 //! s2: %0:vcc = p_cbranch_nz BB4, BB5 920 }, 921 [&]() -> void __anondfca8ca00a02() 922 { 923 /* --- logical else --- */ 924 //! BB4 925 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 926 //! p_logical_start 927 //! p_logical_end 928 //! s2: %0:vcc = p_branch BB6 929 930 /* --- linear else --- */ 931 //! BB5 932 //! /* logical preds: / linear preds: BB3, / kind: */ 933 //! s2: %0:vcc = p_branch BB6 934 }); 935 936 /* --- merge block --- */ 937 //! BB6 938 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 939 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 940 941 //! s1: %br_scc:scc = s_cmp_lg_u32 %tmp_salu:s[8-9], 0 942 //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc 943 //! p_unit_test 5, %br_vcc:vcc 944 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(tmp_salu, reg_s8), 945 Operand::zero()); 946 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 947 writeout(5, Operand(br, vcc)); 948 949 finish_optimizer_postRA_test(); 950 END_TEST 951