• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2021 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include "helpers.h"
26 
27 using namespace aco;
28 
29 BEGIN_TEST(optimizer_postRA.vcmp)
30    PhysReg reg_v0(256);
31    PhysReg reg_s0(0);
32    PhysReg reg_s2(2);
33    PhysReg reg_s4(4);
34 
35    //>> v1: %a:v[0] = p_startpgm
36    ASSERTED bool setup_ok = setup_cs("v1", GFX8);
37    assert(setup_ok);
38 
39    auto& startpgm = bld.instructions->at(0);
40    assert(startpgm->opcode == aco_opcode::p_startpgm);
41    startpgm->definitions[0].setFixed(reg_v0);
42 
43    Temp v_in = inputs[0];
44 
45    {
46       /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */
47 
48       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
49       //! s2: %e:s[2-3] = p_cbranch_z %b:vcc
50       //! p_unit_test 0, %e:s[2-3]
51       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
52                            Operand(v_in, reg_v0));
53       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
54                            Operand(exec, bld.lm));
55       auto br =
56          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
57       writeout(0, Operand(br, reg_s2));
58    }
59 
60    //; del b, e
61 
62    {
63       /* When VCC is overwritten inbetween, don't optimize. */
64 
65       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
66       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
67       //! s2: %f:vcc = s_mov_b64 0
68       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
69       //! p_unit_test 1, %e:s[2-3], %f:vcc
70       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
71                            Operand(v_in, reg_v0));
72       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
73                            Operand(exec, bld.lm));
74       auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand::zero());
75       auto br =
76          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
77       writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc));
78    }
79 
80    //; del b, c, d, e, f
81 
82    {
83       /* When part of VCC is overwritten inbetween, don't optimize. */
84 
85       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
86       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
87       //! s1: %f:vcc_hi = s_mov_b32 0
88       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
89       //! p_unit_test 1, %e:s[2-3], %f:vcc_hi
90       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
91                            Operand(v_in, reg_v0));
92       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
93                            Operand(exec, bld.lm));
94       auto ovrwr = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1, vcc_hi), Operand::zero());
95       auto br =
96          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
97       writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc_hi));
98    }
99 
100    //; del b, c, d, e, f
101 
102    {
103       /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */
104 
105       //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0]
106       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec
107       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
108       //! p_unit_test 2, %e:s[2-3]
109       auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand::zero(),
110                                Operand(v_in, reg_v0));
111       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc),
112                            Operand(vcmp, reg_s4), Operand(exec, bld.lm));
113       auto br =
114          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
115       writeout(2, Operand(br, reg_s2));
116    }
117 
118    //; del b, c, d, e
119 
120    {
121       /* When the VCC isn't written by VOPC, don't optimize */
122 
123       //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5]
124       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
125       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
126       //! p_unit_test 2, %e:s[2-3]
127       auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand::c32(1u),
128                            Operand(reg_s4, bld.lm));
129       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc),
130                            Operand(salu, vcc), Operand(exec, bld.lm));
131       auto br =
132          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
133       writeout(2, Operand(br, reg_s2));
134    }
135 
136    //; del b, c, d, e, f, x
137 
138    {
139       /* When EXEC is overwritten inbetween, don't optimize. */
140 
141       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
142       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
143       //! s2: %f:exec = s_mov_b64 42
144       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
145       //! p_unit_test 4, %e:s[2-3], %f:exec
146       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
147                            Operand(v_in, reg_v0));
148       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
149                            Operand(exec, bld.lm));
150       auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand::c32(42u));
151       auto br =
152          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
153       writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec));
154    }
155 
156    //; del b, c, d, e, f, x
157 
158    finish_optimizer_postRA_test();
159 END_TEST
160 
161 BEGIN_TEST(optimizer_postRA.scc_nocmp_opt)
162    //>> s1: %a, s2: %y, s1: %z = p_startpgm
163    ASSERTED bool setup_ok = setup_cs("s1 s2 s1", GFX6);
164    assert(setup_ok);
165 
166    PhysReg reg_s0{0};
167    PhysReg reg_s2{2};
168    PhysReg reg_s3{3};
169    PhysReg reg_s4{4};
170    PhysReg reg_s6{6};
171    PhysReg reg_s8{8};
172 
173    Temp in_0 = inputs[0];
174    Temp in_1 = inputs[1];
175    Temp in_2 = inputs[2];
176    Operand op_in_0(in_0);
177    op_in_0.setFixed(reg_s0);
178    Operand op_in_1(in_1);
179    op_in_1.setFixed(reg_s4);
180    Operand op_in_2(in_2);
181    op_in_2.setFixed(reg_s6);
182 
183    {
184       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
185       //! s2: %f:vcc = p_cbranch_nz %e:scc
186       //! p_unit_test 0, %f:vcc
187       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
188                            Operand::c32(0x40018u));
189       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
190                            Operand::zero());
191       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
192       writeout(0, Operand(br, vcc));
193    }
194 
195    //; del d, e, f
196 
197    {
198       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
199       //! s2: %f:vcc = p_cbranch_z %e:scc
200       //! p_unit_test 1, %f:vcc
201       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
202                            Operand::c32(0x40018u));
203       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
204                            Operand::zero());
205       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
206       writeout(1, Operand(br, vcc));
207    }
208 
209    //; del d, e, f
210 
211    {
212       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
213       //! s2: %f:vcc = p_cbranch_z %e:scc
214       //! p_unit_test 2, %f:vcc
215       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
216                            Operand::c32(0x40018u));
217       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
218                            Operand::zero());
219       auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp));
220       writeout(2, Operand(br, vcc));
221    }
222 
223    //; del d, e, f
224 
225    {
226       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
227       //! s2: %f:vcc = p_cbranch_nz %e:scc
228       //! p_unit_test 3, %f:vcc
229       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
230                            Operand::c32(0x40018u));
231       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
232                            Operand::zero());
233       auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp));
234       writeout(3, Operand(br, vcc));
235    }
236 
237    //; del d, e, f
238 
239    {
240       //! s2: %d:s[2-3], s1: %e:scc = s_and_b64 %y:s[4-5], 0x12345
241       //! s2: %f:vcc = p_cbranch_z %e:scc
242       //! p_unit_test 4, %f:vcc
243       auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s2), bld.def(s1, scc), op_in_1,
244                            Operand::c32(0x12345u));
245       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u64, bld.def(s1, scc), Operand(salu, reg_s2),
246                            Operand::zero(8));
247       auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp));
248       writeout(4, Operand(br, vcc));
249    }
250 
251    //; del d, e, f
252 
253    {
254       /* SCC is overwritten in between, don't optimize */
255 
256       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
257       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
258       //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0
259       //! s2: %f:vcc = p_cbranch_z %g:scc
260       //! p_unit_test 5, %f:vcc, %h:s[3]
261       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
262                            Operand::c32(0x40018u));
263       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
264                            Operand::c32(1u));
265       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
266                            Operand::zero());
267       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
268       writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
269    }
270 
271    //; del d, e, f, g, h, x
272 
273    {
274       /* SCC is overwritten in between, optimize by pulling down */
275 
276       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
277       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
278       //! s2: %f:vcc = p_cbranch_z %g:scc
279       //! p_unit_test 5, %f:vcc, %h:s[3]
280       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
281                            Operand::c32(0x40018u));
282       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
283                            Operand::c32(1u));
284       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
285                            Operand::zero());
286       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
287       writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
288    }
289 
290    //; del d, e, f, g, h, x
291 
292    {
293       /* SCC is overwritten in between, optimize by pulling down */
294 
295       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
296       //! s2: %d:s[8-9], s1: %e:scc = s_and_b64 %b:s[4-5], 0x40018
297       //! s2: %f:vcc = p_cbranch_z %g:scc
298       //! p_unit_test 5, %f:vcc, %h:s[3]
299       auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), op_in_1,
300                            Operand::c32(0x40018u));
301       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
302                            Operand::c32(1u));
303       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s8),
304                            Operand::zero());
305       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
306       writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
307    }
308 
309    //; del d, e, f, g, h, x
310 
311    {
312       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
313       //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc
314       //! p_unit_test 6, %f:s[4]
315       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
316                            Operand::c32(0x40018u));
317       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
318                            Operand::zero());
319       auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0),
320                          Operand(op_in_2), bld.scc(scmp));
321       writeout(6, Operand(br, reg_s4));
322    }
323 
324    //; del d, e, f
325 
326    {
327       /* SCC is overwritten in between, don't optimize */
328 
329       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
330       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
331       //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0
332       //! s1: %f:s[4] = s_cselect_b32 %a:s[0], %z:s[6], %g:scc
333       //! p_unit_test 7, %f:s[4], %h:s[3]
334       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
335                            Operand::c32(0x40018u));
336       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
337                            Operand::c32(1u));
338       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
339                            Operand::zero());
340       auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0),
341                          Operand(op_in_2), bld.scc(scmp));
342       writeout(7, Operand(br, reg_s4), Operand(ovrw, reg_s3));
343    }
344 
345    //; del d, e, f, g, h, x
346 
347    finish_optimizer_postRA_test();
348 END_TEST
349 
350 BEGIN_TEST(optimizer_postRA.dpp)
351    //>> v1: %a:v[0], v1: %b:v[1], s2: %c:vcc, s2: %d:s[0-1] = p_startpgm
352    if (!setup_cs("v1 v1 s2 s2", GFX10_3))
353       return;
354 
355    bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
356    bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
357    bld.instructions->at(0)->definitions[2].setFixed(vcc);
358    bld.instructions->at(0)->definitions[3].setFixed(PhysReg(0));
359 
360    PhysReg reg_v0(256);
361    PhysReg reg_v2(258);
362    Operand a(inputs[0], PhysReg(256));
363    Operand b(inputs[1], PhysReg(257));
364    Operand c(inputs[2], vcc);
365    Operand d(inputs[3], PhysReg(0));
366 
367    /* basic optimization */
368    //! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
369    //! p_unit_test 0, %res0:v[2]
370    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
371    Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
372    writeout(0, Operand(res0, reg_v2));
373 
374    /* operand swapping */
375    //! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
376    //! p_unit_test 1, %res1:v[2]
377    Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
378    Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2));
379    writeout(1, Operand(res1, reg_v2));
380 
381    //! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
382    //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 fi
383    //! p_unit_test 2, %res2:v[2]
384    Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
385    Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2),
386                             dpp_row_half_mirror);
387    writeout(2, Operand(res2, reg_v2));
388 
389    /* modifiers */
390    //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
391    //! p_unit_test 3, %res3:v[2]
392    auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
393    tmp3->dpp16().neg[0] = true;
394    Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b);
395    writeout(3, Operand(res3, reg_v2));
396 
397    //! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
398    //! p_unit_test 4, %res4:v[2]
399    Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
400    auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b);
401    res4->valu().neg[0] = true;
402    writeout(4, Operand(res4, reg_v2));
403 
404    //! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
405    //! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp
406    //! p_unit_test 5, %res5:v[2]
407    Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
408    auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp5, reg_v2), b);
409    res5->valu().clamp = true;
410    writeout(5, Operand(res5, reg_v2));
411 
412    //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 fi
413    //! p_unit_test 6, %res6:v[2]
414    auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
415    tmp6->dpp16().neg[0] = true;
416    auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp6, reg_v2), b);
417    res6->valu().abs[0] = true;
418    writeout(6, Operand(res6, reg_v2));
419 
420    //! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1 fi
421    //! p_unit_test 7, %res7:v[2]
422    Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
423    auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2));
424    res7->valu().abs[0] = true;
425    writeout(7, Operand(res7, reg_v2));
426 
427    //! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
428    //! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1]
429    //! p_unit_test 12, %res12:v[2]
430    auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
431    tmp12->dpp16().neg[0] = true;
432    Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b);
433    writeout(12, Operand(res12, reg_v2));
434 
435    //! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
436    //! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1]
437    //! p_unit_test 13, %res13:v[2]
438    auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
439    tmp13->dpp16().neg[0] = true;
440    Temp res13 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1, reg_v2), Operand(tmp13, reg_v2), b);
441    writeout(13, Operand(res13, reg_v2));
442 
443    /* vcc */
444    //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 fi
445    //! p_unit_test 8, %res8:v[2]
446    Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
447    Temp res8 =
448       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c);
449    writeout(8, Operand(res8, reg_v2));
450 
451    //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
452    //! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1]
453    //! p_unit_test 9, %res9:v[2]
454    Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
455    Temp res9 =
456       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp9, reg_v2), b, d);
457    writeout(9, Operand(res9, reg_v2));
458 
459    /* control flow */
460    //! BB1
461    //! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
462    //! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
463    //! p_unit_test 10, %res10:v[2]
464    Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
465 
466    bld.reset(program->create_and_insert_block());
467    program->blocks[0].linear_succs.push_back(1);
468    program->blocks[0].logical_succs.push_back(1);
469    program->blocks[1].linear_preds.push_back(0);
470    program->blocks[1].logical_preds.push_back(0);
471 
472    Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp10, reg_v2), b);
473    writeout(10, Operand(res10, reg_v2));
474 
475    /* can't combine if the v_mov_b32's operand is modified */
476    //! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
477    //! v1: %tmp11_2:v[0] = v_mov_b32 0
478    //! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1]
479    //! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0]
480    Temp tmp11_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
481    Temp tmp11_2 = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1, reg_v0), Operand::c32(0));
482    Temp res11 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp11_1, reg_v2), b);
483    writeout(11, Operand(res11, reg_v2), Operand(tmp11_2, reg_v0));
484 
485    finish_optimizer_postRA_test();
486 END_TEST
487 
488 BEGIN_TEST(optimizer_postRA.dpp_across_exec)
489    for (amd_gfx_level gfx : {GFX9, GFX10}) {
490       //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
491       if (!setup_cs("v1 v1", gfx))
492          continue;
493 
494       bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
495       bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
496 
497       PhysReg reg_v2(258);
498       Operand a(inputs[0], PhysReg(256));
499       Operand b(inputs[1], PhysReg(257));
500 
501       //~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
502       //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
503       //~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1]
504       //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
505       //! p_unit_test 0, %res0:v[2]
506       Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
507       bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
508                Operand(exec, bld.lm));
509       Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
510       writeout(0, Operand(res0, reg_v2));
511 
512       finish_optimizer_postRA_test();
513    }
514 END_TEST
515 
516 BEGIN_TEST(optimizer_postRA.dpp_vcmpx)
517    //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
518    if (!setup_cs("v1 v1", GFX11))
519       return;
520 
521    bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
522    bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
523 
524    PhysReg reg_v2(258);
525    Operand a(inputs[0], PhysReg(256));
526    Operand b(inputs[1], PhysReg(257));
527 
528    //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
529    //! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1]
530    //! p_unit_test 0, %res0:exec
531    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
532    Temp res0 = bld.vopc(aco_opcode::v_cmpx_lt_f32, bld.def(bld.lm, exec), Operand(tmp0, reg_v2), b);
533    writeout(0, Operand(res0, exec));
534 
535    finish_optimizer_postRA_test();
536 END_TEST
537 
538 BEGIN_TEST(optimizer_postRA.dpp_across_cf)
539    //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1] = p_startpgm
540    if (!setup_cs("v1 v1 v1 v1 s2", GFX10_3))
541       return;
542 
543    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
544    startpgm->definitions[0].setFixed(PhysReg(256));
545    startpgm->definitions[1].setFixed(PhysReg(257));
546    startpgm->definitions[2].setFixed(PhysReg(258));
547    startpgm->definitions[3].setFixed(PhysReg(259));
548    startpgm->definitions[4].setFixed(PhysReg(0));
549 
550    Operand a(inputs[0], PhysReg(256)); /* source for DPP */
551    Operand b(inputs[1], PhysReg(257)); /* source for fadd */
552    Operand c(inputs[2], PhysReg(258)); /* buffer store address */
553    Operand d(inputs[3], PhysReg(259)); /* buffer store value */
554    Operand e(inputs[4], PhysReg(0));   /* condition */
555    PhysReg reg_v12(268);               /* temporary register */
556 
557    Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
558 
559    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
560    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
561 
562    emit_divergent_if_else(
563       program.get(), bld, e,
564       [&]() -> void
__anondfca8ca00102() 565       {
566          /* --- logical then --- */
567          //! BB1
568          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
569          //! p_logical_start
570 
571          //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen
572          bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true);
573 
574          //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
575          //! p_unit_test 10, %res10:v[12]
576          Temp result =
577             bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
578          writeout(10, Operand(result, reg_v12));
579 
580          //! p_logical_end
581          //! s2: %0:vcc = p_branch BB3
582 
583          /* --- linear then --- */
584          //! BB2
585          //! /* logical preds: / linear preds: BB0, / kind: */
586          //! s2: %0:vcc = p_branch BB3
587 
588          /* --- invert --- */
589          //! BB3
590          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
591          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
592          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
593       },
594       [&]() -> void
__anondfca8ca00202() 595       {
596          /* --- logical else --- */
597          //! BB4
598          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
599          //! p_logical_start
600          //! p_logical_end
601          //! s2: %0:vcc = p_branch BB6
602 
603          /* --- linear else --- */
604          //! BB5
605          //! /* logical preds: / linear preds: BB3, / kind: */
606          //! s2: %0:vcc = p_branch BB6
607       });
608 
609    /* --- merge block --- */
610    //! BB6
611    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
612    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
613 
614    finish_optimizer_postRA_test();
615 END_TEST
616 
617 BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten)
618    //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[2] = p_startpgm
619    if (!setup_cs("v1 v1 v1 v1 s2 s1", GFX10_3))
620       return;
621 
622    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
623    startpgm->definitions[0].setFixed(PhysReg(256));
624    startpgm->definitions[1].setFixed(PhysReg(257));
625    startpgm->definitions[2].setFixed(PhysReg(258));
626    startpgm->definitions[3].setFixed(PhysReg(259));
627    startpgm->definitions[4].setFixed(PhysReg(0));
628    startpgm->definitions[5].setFixed(PhysReg(2));
629 
630    Operand a(inputs[0], PhysReg(256)); /* source for DPP */
631    Operand b(inputs[1], PhysReg(257)); /* source for fadd */
632    Operand c(inputs[2], PhysReg(258)); /* buffer store address */
633    Operand d(inputs[3], PhysReg(259)); /* buffer store value */
634    Operand e(inputs[4], PhysReg(0));   /* condition */
635    Operand f(inputs[5], PhysReg(2));   /* buffer store address (scalar) */
636    PhysReg reg_v12(268);               /* temporary register */
637 
638    //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
639    Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
640 
641    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
642    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
643 
644    emit_divergent_if_else(
645       program.get(), bld, e,
646       [&]() -> void
__anondfca8ca00302() 647       {
648          /* --- logical then --- */
649          //! BB1
650          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
651          //! p_logical_start
652 
653          //! v1: %addr:v[0] = p_parallelcopy %f:s[2]
654          Temp addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), f);
655 
656          //! buffer_store_dword %addr:v[0], 0, %d:v[3], 0 offen
657          bld.mubuf(aco_opcode::buffer_store_dword, Operand(addr, a.physReg()), Operand::zero(), d,
658                    Operand::zero(), 0, true);
659 
660          //! p_logical_end
661          //! s2: %0:vcc = p_branch BB3
662 
663          /* --- linear then --- */
664          //! BB2
665          //! /* logical preds: / linear preds: BB0, / kind: */
666          //! s2: %0:vcc = p_branch BB3
667 
668          /* --- invert --- */
669          //! BB3
670          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
671          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
672          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
673       },
674       [&]() -> void
__anondfca8ca00402() 675       {
676          /* --- logical else --- */
677          //! BB4
678          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
679          //! p_logical_start
680          //! p_logical_end
681          //! s2: %0:vcc = p_branch BB6
682 
683          /* --- linear else --- */
684          //! BB5
685          //! /* logical preds: / linear preds: BB3, / kind: */
686          //! s2: %0:vcc = p_branch BB6
687       });
688 
689    /* --- merge block --- */
690    //! BB6
691    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
692    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
693 
694    //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1]
695    Temp result =
696       bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
697    //! p_unit_test 10, %result:v[12]
698    writeout(10, Operand(result, reg_v12));
699 
700    finish_optimizer_postRA_test();
701 END_TEST
702 
/* Post-RA optimizer test: a DPP v_mov_b32 reads v[0] and writes v[12] before a divergent
 * if/else. The logical "then" block (BB1) overwrites v[0]; the logical "else" block (BB4)
 * consumes v[12] in a v_add_f32. BB4 is not a logical successor of BB1, but its linear
 * predecessor BB3 has BB1 as a linear predecessor, so the overwrite of v[0] lies on the
 * linear path to the use. The expected output keeps the v_mov_b32 and leaves the v_add_f32
 * reading v[12], i.e. the DPP mov is not folded into the add.
 */
703 BEGIN_TEST(optimizer_postRA.dpp_across_cf_linear_clobber)
704    //>> v1: %a:v[0], v1: %b:v[1], s2: %c:s[0-1] = p_startpgm
705    if (!setup_cs("v1 v1 s2", GFX10_3))
706       return;
707 
       /* Pin the p_startpgm definitions to the registers named in the expectation above
        * (256/257 are printed as v[0]/v[1], 0 as s[0-1]). */
708    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
709    startpgm->definitions[0].setFixed(PhysReg(256));
710    startpgm->definitions[1].setFixed(PhysReg(257));
711    startpgm->definitions[2].setFixed(PhysReg(0));
712 
713    Operand a(inputs[0], PhysReg(256)); /* source for DPP */
714    Operand b(inputs[1], PhysReg(257)); /* source for fadd */
715    Operand c(inputs[2], PhysReg(0));   /* condition */
716    PhysReg reg_v12(268);               /* v[12]: holds the DPP mov result, then the fadd result */
717 
718    //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
719    Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
720 
721    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %c:s[0-1], %0:exec
722    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
723 
       /* The two lambdas emit the contents of the logical then / logical else blocks; the
        * expectation comments inside them also describe the surrounding linear and invert
        * blocks that appear in the printed program. */
724    emit_divergent_if_else(
725       program.get(), bld, c,
726       [&]() -> void
__anondfca8ca00502() 727       {
728          /* --- logical then --- */
729          //! BB1
730          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
731          //! p_logical_start
732 
             /* Overwrite v[0], the source register of the DPP mov emitted in BB0. */
733          //! v1: %clobber:v[0] = p_parallelcopy 0
734          Temp clobber =
735             bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), Operand::c32(0));
736 
             /* Use the overwritten value so the parallelcopy has a consumer. */
737          //! p_unit_test 0, %clobber:v[0]
738          writeout(0, Operand(clobber, a.physReg()));
739 
740          //! p_logical_end
741          //! s2: %0:vcc = p_branch BB3
742 
743          /* --- linear then --- */
744          //! BB2
745          //! /* logical preds: / linear preds: BB0, / kind: */
746          //! s2: %0:vcc = p_branch BB3
747 
748          /* --- invert --- */
749          //! BB3
750          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
751          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
752          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
753       },
754       [&]() -> void
__anondfca8ca00602() 755       {
756          /* --- logical else --- */
757          //! BB4
758          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
759          //! p_logical_start
760 
             /* The add must still read v[12] here: v[0] may already hold the clobber value
              * written in BB1.
              * NOTE(review): the expectation below names the operand %dpp_mov_tmp, while the
              * definition above was captured as %dpp_tmp. If the checker binds unseen %names
              * freshly, this line does not tie the operand to the mov's result -- confirm
              * whether %dpp_tmp was intended. */
761          //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1]
762          Temp result =
763             bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
764          //! p_unit_test 1, %result:v[12]
765          writeout(1, Operand(result, reg_v12));
766 
767          //! p_logical_end
768          //! s2: %0:vcc = p_branch BB6
769 
770          /* --- linear else --- */
771          //! BB5
772          //! /* logical preds: / linear preds: BB3, / kind: */
773          //! s2: %0:vcc = p_branch BB6
774       });
775 
776    /* --- merge block --- */
777    //! BB6
778    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
779    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
780 
781    finish_optimizer_postRA_test();
782 END_TEST
783 
/* Post-RA optimizer test: an s_and_b64 writes s[8-9] and SCC before a divergent if/else,
 * and after the merge block its result is compared against zero (s_cmp_lg_u32) to feed a
 * p_cbranch_z. The control-flow instructions in between (s_and_saveexec_b64, s_andn2_b64)
 * also write SCC, and the then-branch only does a buffer store that touches none of the
 * s_and_b64 operands. The expected output lists the s_and_b64 only in the merge block, in
 * place of the compare, with the branch consuming the SCC it defines.
 */
784 BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf)
785    //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1] = p_startpgm
786    if (!setup_cs("s2 v1 v1 s2", GFX10_3))
787       return;
788 
       /* Pin the p_startpgm definitions to the registers named in the expectation above. */
789    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
790    startpgm->definitions[0].setFixed(PhysReg(2));
791    startpgm->definitions[1].setFixed(PhysReg(258));
792    startpgm->definitions[2].setFixed(PhysReg(259));
793    startpgm->definitions[3].setFixed(PhysReg(0));
794 
795    Operand a(inputs[0], PhysReg(2));   /* source for s_and */
796    Operand c(inputs[1], PhysReg(258)); /* buffer store address */
797    Operand d(inputs[2], PhysReg(259)); /* buffer store value */
798    Operand e(inputs[3], PhysReg(0));   /* condition */
799    PhysReg reg_s8(8);                  /* s[8-9]: destination of the s_and */
800 
       /* Deliberately no expectation line here: in the expected output this instruction
        * shows up in the merge block instead (see the s_and_b64 line near the end). */
801    auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a,
802                             Operand::c32(0x40018u));
803 
804    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
805    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
806 
       /* The two lambdas emit the contents of the logical then / logical else blocks; the
        * expectation comments inside them also describe the surrounding linear and invert
        * blocks that appear in the printed program. */
807    emit_divergent_if_else(
808       program.get(), bld, e,
809       [&]() -> void
__anondfca8ca00702() 810       {
811          /* --- logical then --- */
812          //! BB1
813          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
814          //! p_logical_start
815 
             /* A store that reads only v[2]/v[3]; it does not write a or s[8-9]. */
816          //! buffer_store_dword %c:v[2], 0, %d:v[3], 0 offen
817          bld.mubuf(aco_opcode::buffer_store_dword, c, Operand::zero(), d, Operand::zero(), 0, true);
818 
819          //! p_logical_end
820          //! s2: %0:vcc = p_branch BB3
821 
822          /* --- linear then --- */
823          //! BB2
824          //! /* logical preds: / linear preds: BB0, / kind: */
825          //! s2: %0:vcc = p_branch BB3
826 
827          /* --- invert --- */
828          //! BB3
829          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
830          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
831          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
832       },
833       [&]() -> void
__anondfca8ca00802() 834       {
835          /* --- logical else --- */
836          //! BB4
837          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
838          //! p_logical_start
839          //! p_logical_end
840          //! s2: %0:vcc = p_branch BB6
841 
842          /* --- linear else --- */
843          //! BB5
844          //! /* logical preds: / linear preds: BB3, / kind: */
845          //! s2: %0:vcc = p_branch BB6
846       });
847 
848    /* --- merge block --- */
849    //! BB6
850    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
851    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
852 
       /* The code below emits s_cmp_lg_u32 + p_cbranch_z, but the expected output contains
        * no compare: the s_and_b64 is printed here and the branch reads its SCC directly. */
853    //! s2: %tmp_salu:s[8-9], s1: %br_scc:scc = s_and_b64 %a:s[2-3], 0x40018
854    //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc
855    //! p_unit_test 5, %br_vcc:vcc
856    auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(tmp_salu, reg_s8),
857                         Operand::zero());
858    auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
859    writeout(5, Operand(br, vcc));
860 
861    finish_optimizer_postRA_test();
862 END_TEST
863 
/* Post-RA optimizer test: same shape as an s_and_b64 followed, across a divergent if/else,
 * by s_cmp_lg_u32 + p_cbranch_z on its result -- except that the logical then-branch
 * overwrites s[3], which is one half of the s_and_b64 source operand a (s[2-3]). The
 * expected output keeps the s_and_b64 at its original position and keeps the s_cmp_lg_u32
 * in the merge block, i.e. the compare is not replaced by the s_and_b64.
 */
864 BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf_partially_overwritten)
865    //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[4] = p_startpgm
866    if (!setup_cs("s2 v1 v1 s2 s1", GFX10_3))
867       return;
868 
       /* Pin the p_startpgm definitions to the registers named in the expectation above. */
869    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
870    startpgm->definitions[0].setFixed(PhysReg(2));
871    startpgm->definitions[1].setFixed(PhysReg(258));
872    startpgm->definitions[2].setFixed(PhysReg(259));
873    startpgm->definitions[3].setFixed(PhysReg(0));
874    startpgm->definitions[4].setFixed(PhysReg(4));
875 
876    Operand a(inputs[0], PhysReg(2));   /* source for s_and */
877    Operand c(inputs[1], PhysReg(258)); /* buffer store address */
878    Operand d(inputs[2], PhysReg(259)); /* buffer store value */
879    Operand e(inputs[3], PhysReg(0));   /* condition */
880    Operand f(inputs[4], PhysReg(4));   /* overwrite value */
881    PhysReg reg_s3(3);                  /* s[3]: half of a (s[2-3]), overwritten in the then-branch */
882    PhysReg reg_s8(8);                  /* s[8-9]: destination of the s_and */
883 
884    //! s2: %tmp_salu:s[8-9], s1: %tmp_salu_scc:scc = s_and_b64 %a:s[2-3], 0x40018
885    auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a,
886                             Operand::c32(0x40018u));
887 
888    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
889    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
890 
       /* The two lambdas emit the contents of the logical then / logical else blocks; the
        * expectation comments inside them also describe the surrounding linear and invert
        * blocks that appear in the printed program. */
891    emit_divergent_if_else(
892       program.get(), bld, e,
893       [&]() -> void
__anondfca8ca00902() 894       {
895          /* --- logical then --- */
896          //! BB1
897          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
898          //! p_logical_start
899 
             /* Write s[3] from f: after this, s[2-3] no longer holds the value the
              * s_and_b64 in BB0 read. */
900          //! s1: %ovrwr:s[3] = p_parallelcopy %f:s[4]
901          Temp s_addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1, reg_s3), f);
902 
             /* Consume the copied value as an operand of the store so the copy has a use. */
903          //! buffer_store_dword %c:v[2], %ovrwr:s[3], %d:v[3], 0 offen
904          bld.mubuf(aco_opcode::buffer_store_dword, c, Operand(s_addr, reg_s3), d, Operand::zero(),
905                    0, true);
906 
907          //! p_logical_end
908          //! s2: %0:vcc = p_branch BB3
909 
910          /* --- linear then --- */
911          //! BB2
912          //! /* logical preds: / linear preds: BB0, / kind: */
913          //! s2: %0:vcc = p_branch BB3
914 
915          /* --- invert --- */
916          //! BB3
917          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
918          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
919          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
920       },
921       [&]() -> void
__anondfca8ca00a02() 922       {
923          /* --- logical else --- */
924          //! BB4
925          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
926          //! p_logical_start
927          //! p_logical_end
928          //! s2: %0:vcc = p_branch BB6
929 
930          /* --- linear else --- */
931          //! BB5
932          //! /* logical preds: / linear preds: BB3, / kind: */
933          //! s2: %0:vcc = p_branch BB6
934       });
935 
936    /* --- merge block --- */
937    //! BB6
938    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
939    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
940 
       /* Expected unchanged: the compare stays and the branch reads the compare's SCC. */
941    //! s1: %br_scc:scc = s_cmp_lg_u32 %tmp_salu:s[8-9], 0
942    //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc
943    //! p_unit_test 5, %br_vcc:vcc
944    auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(tmp_salu, reg_s8),
945                         Operand::zero());
946    auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
947    writeout(5, Operand(br, vcc));
948 
949    finish_optimizer_postRA_test();
950 END_TEST
951