Lines Matching full:v1
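/* Note on the check syntax in these hits (ACO's unit-test framework):
 * "//>>" searches forward for a matching output line, "//!" asserts the next
 * output line, a "//~variant" prefix restricts a check to one GFX level or
 * test variant, and "//;" runs embedded Python that generates checks. */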
30 //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
31 if (!setup_cs("v1 v1 s1 s1", (amd_gfx_level)i))
34 //! v1: %res0 = v_mul_f32 %a, -%b
37 writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
39 //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
40 //~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
41 //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
44 writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));
46 //! v1: %res2 = v_mul_f32 %a, %b
49 writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
51 //! v1: %res3 = v_mul_f32 |%a|, %b
54 writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
56 //! v1: %res4 = v_mul_f32 -|%a|, %b
60 writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
62 //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
64 … writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
66 //! v1: %res6 = v_subrev_f32 %a, %b
68 writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
70 //! v1: %res7 = v_sub_f32 %b, %a
72 writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
74 //! v1: %res8 = v_mul_f32 %a, -%c
76 Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
77 writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
79 // //! v1: %res9 = v_mul_f32 |%neg_a|, %b
82 writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
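/* The group above tests input-modifier folding: v_mul_f32 by -1.0 or 1.0 is
 * recognized as fneg/fabs and absorbed into the consumer's -%x / |%x| source
 * modifiers, -(-a) cancels (res2), a negated addend turns v_add_f32 into
 * v_subrev/v_sub (res6/res7), and on GFX9 a literal operand (0x123456) cannot
 * use VOP3 modifiers, so the explicit negate survives (res1). */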
89 //>> v1: %a, v1: %b = p_startpgm
90 if (!setup_cs("v1 v1", GFX9))
97 //! v1: %res0 = v_add_f32 %a, %b *0.5
99 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
100 writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
102 //! v1: %res1 = v_add_f32 %a, %b *2
104 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
105 writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
107 //! v1: %res2 = v_add_f32 %a, %b *4
109 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
110 writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
112 //! v1: %res3 = v_add_f32 %a, %b clamp
114 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
115 writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
118 //! v1: %res4 = v_add_f32 %a, %b *2 clamp
120 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
121 tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
122 writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
157 //! v1: %res10_tmp = v_add_f32 %a, %b clamp
158 //! v1: %res10 = v_mul_f32 2.0, %res10_tmp
160 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
161 tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
163 writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
167 //! v1: %res11_tmp = v_xor_b32 %a, %b
168 //! v1: %res11 = v_mul_f32 2.0, %res11_tmp
170 tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
171 writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
175 //! v1: %res12_tmp = v_add_f32 %a, %b
177 //! v1: %res12 = v_mul_f32 2.0, %res12_tmp
179 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
181 writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
183 //! v1: %res13 = v_add_f32 %a, %b
185 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
186 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
197 //! v1: %res14_tmp = v_add_f32 %a, %b
198 //! v1: %res14 = v_mul_f32 2.0, %res14_tmp
200 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
201 writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
203 //! v1: %res15 = v_add_f32 %a, %b clamp
205 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
206 writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
237 //! v1: %res18_tmp = v_add_f32 %a, %b
238 //! v1: %res18 = v_mul_f32 2.0, %res18_tmp
240 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
241 writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
242 //! v1: %res19 = v_add_f32 %a, %b clamp
244 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
245 writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
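/* Output-modifier tests: multiplying an add's result by a power-of-two
 * constant folds into VOP3 omod (*0.5, *2, *4), and v_med3_f32(0, 1.0, x)
 * becomes the clamp bit. The negative checks show the fold refused across a
 * non-float op (v_xor_b32, res11), when the unscaled value is still observed
 * (res12/res13), or when clamp was already applied before the multiply
 * (res10), since clamp-then-scale differs from scale-then-clamp. */

// A minimal standalone sketch (plain C++, independent of ACO) confirming that
// the hex literals driving these checks decode to the advertised multipliers:
#include <cassert>
#include <cstdint>
#include <cstring>
static float f32(uint32_t bits) { float f; std::memcpy(&f, &bits, sizeof f); return f; }
int main() {
    assert(f32(0x3f000000u) == 0.5f); // omod *0.5
    assert(f32(0x40000000u) == 2.0f); // omod *2
    assert(f32(0x40800000u) == 4.0f); // omod *4
    assert(f32(0x3f800000u) == 1.0f); // upper bound of the med3 clamp
}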
269 return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.def(bld.lm), op0, op1, op2); in create_subbrev_co()
274 //>> v1: %a, s1: %b, s2: %c = p_startpgm
275 if (!setup_cs("v1 s1 s2", (amd_gfx_level)i))
280 //! v1: %res0 = v_cndmask_b32 0, %a, %c
283 writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
285 //! v1: %res1 = v_cndmask_b32 0, 42, %c
288 writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));
290 //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
291 //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
292 //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
295 writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
297 //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
298 //! v1: %xor = v_xor_b32 %a, %subbrev1
299 //! v1: %res3 = v_cndmask_b32 0, %xor, %c
302 Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
303 writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
305 //! v1: %res4 = v_cndmask_b32 0, %a, %c
307 Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
309 Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
310 writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
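/* v_subbrev_co_u32(0, 0, borrow) materializes 0 or 0xffffffff from a
 * divergent boolean, so an AND with it is a select: v_and_b32 folds to
 * v_cndmask_b32 0, %x, %c (res0/res1/res4). On GFX9 the fold is skipped for
 * the SGPR operand %b (res2), presumably a VOP2 operand restriction, and the
 * mask computation survives when it has other uses (res3) even though the
 * AND itself still folds. */

// Standalone sketch of the identity behind the fold: for a borrow bit c,
// 0 - 0 - c wraps to 0 or 0xffffffff, so x & (0 - 0 - c) == (c ? x : 0).
#include <cassert>
#include <cstdint>
int main() {
    for (uint32_t c = 0; c <= 1; ++c) {
        uint32_t mask = 0u - 0u - c; // what v_subbrev_co_u32 0, 0, c computes
        assert((0xdeadbeefu & mask) == (c ? 0xdeadbeefu : 0u));
    }
}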
318 //>> s1: %a, v1: %b = p_startpgm
319 if (!setup_cs("s1 v1", (amd_gfx_level)i))
335 //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
336 //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
338 //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
339 //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
345 Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
346 writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
349 //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
350 //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
354 writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
357 //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
358 //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
363 writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
366 //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
367 //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
370 Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
374 //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
375 //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
378 writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
380 //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
381 //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
384 writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
386 //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
387 //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
392 writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
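/* shl+add fusion with a scalar shiftee: a left-shift by a constant feeding an
 * add becomes v_lshl_add_u32 on GFX9/GFX10; GFX8 lacks that opcode and keeps
 * the pair, except that provably 24-/16-bit operands allow v_mad_u32_u24 with
 * the shift rewritten as a power-of-two multiply (res6/res7). The fusion is
 * also refused when the add's carry-out is consumed (res4). */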
400 //>> v1: %a, s1: %b = p_startpgm
401 if (!setup_cs("v1 s1", (amd_gfx_level)i))
406 //! v1: %res0 = v_bcnt_u32_b32 %a, %a
408 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
409 writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
411 //! v1: %res1 = v_bcnt_u32_b32 %a, %b
413 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
414 writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
416 //! v1: %res2 = v_bcnt_u32_b32 %a, 42
418 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
419 writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
421 //! v1: %bcnt3 = v_bcnt_u32_b32 %b, 0
422 //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
423 //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
425 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
426 writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
428 //! v1: %bcnt4 = v_bcnt_u32_b32 %a, 0
429 //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
430 //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
432 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
433 Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
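/* v_bcnt_u32_b32 carries an accumulator in its second operand, so a bcnt of
 * (%x, 0) followed by an add folds the addend into the bcnt (res0..res2).
 * res3 is left unfused, and res4 keeps the separate add because its
 * carry-out is used. */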
475 if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
487 //>> v1: %a, v1: %b, v1: %c = p_startpgm
489 //! v1: %res0 = @med3 @ub, @lb, %a
491 writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
492 bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
494 //! v1: %res1 = @med3 @lb, @ub, %a
496 writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
497 bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
500 //! v1: %res2_tmp = @min @lb, %a
501 //! v1: %res2 = @max @ub, %res2_tmp
503 writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
504 bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
506 //! v1: %res3_tmp = @max @ub, %a
507 //! v1: %res3 = @min @lb, %res3_tmp
509 writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
510 bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
514 //! v1: %res4_tmp = @max @lb, %a
515 //! v1: %res4 = @min %b, %res4_tmp
517 writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
518 bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
520 //! v1: %res5_tmp = @max %b, %a
521 //! v1: %res5 = @min @ub, %res5_tmp
523 writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
524 bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
526 //! v1: %res6_tmp = @max %c, %a
527 //! v1: %res6 = @min %b, %res6_tmp
529 writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
530 bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
534 //~f(16|32)! v1: %res7 = @med3 @ub, @lb, %a
536 Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
538 Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
542 //~f(16|32)! v1: (precise)%res8_tmp = @min @ub, %a
543 //~f(16|32)! v1: %res8 = @max @lb, %res8_tmp
545 min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
547 writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
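/* Clamp-to-med3 recognition, parameterized over opcode and bounds
 * (@min/@max/@lb/@ub): min(ub, max(lb, x)) and the commuted form both fold to
 * @med3 (res0/res1). Applying the bounds to the wrong ops (res2/res3) or
 * clamping against a non-constant bound (%b/%c, res4..res6) blocks the fold,
 * and the precise variants check when exactness still allows the rewrite
 * (res7) and when it does not (res8). */

// Minimal standalone demonstration (plain C++, not ACO code) of the identity
// the rewrite relies on: med3(lb, ub, x) == min(max(x, lb), ub) for lb <= ub.
#include <algorithm>
#include <cassert>
static float med3(float a, float b, float c) {
    // median of three = max(min(a, b), min(max(a, b), c))
    return std::max(std::min(a, b), std::min(std::max(a, b), c));
}
int main() {
    const float xs[] = {-2.0f, 0.25f, 3.0f};
    for (float x : xs)
        assert(med3(0.0f, 1.0f, x) == std::min(std::max(x, 0.0f), 1.0f));
}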
555 //>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
556 if (!setup_cs("v1 v1 v2 v1", GFX9))
579 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
601 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
688 //>> v1: %a, v1: %b, v1: %c = p_startpgm
689 if (!setup_cs("v1 v1 v1", GFX9))
692 //! v1: %res0 = v_add3_u32 %a, %b, %c
694 Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
695 writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
697 //! v1: %tmp1 = v_add_u32 %b, %c clamp
698 //! v1: %res1 = v_add_u32 %a, %tmp1
700 tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
702 writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
704 //! v1: %tmp2 = v_add_u32 %b, %c
705 //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
707 tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
708 tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
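/* Chained integer adds become v_add3_u32 (res0) unless one of the adds has
 * the clamp modifier set (res1/res2): saturating an intermediate sum is not
 * the same as saturating the three-way sum, so the fusion is refused. */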
717 //>> v1: %a = p_startpgm
718 if (!setup_cs("v1", (amd_gfx_level)i))
721 //! v1: %res0 = v_max3_f32 0, -0, %a
724 Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
726 writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
728 //! v1: %res1 = v_max3_f32 0, -0, -%a
730 min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
732 writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
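/* neg(min(...)) to max3: res0 collapses a min/max chain built from sign-flip
 * xors (defined on lines not matched by this search) into v_max3_f32, and
 * res1 shows a remaining negate carried as a -%a source modifier instead. */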
740 //>> v1: %a, v1: %b, v1: %c = p_startpgm
741 if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
744 //! v1: %res0 = v_mad_u32_u24 %b, %c, %a
746 Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
747 writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));
749 //! v1: %res1_tmp = v_mul_u32_u24 %b, %c
750 //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
752 mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
753 writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());
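/* v_mul_u32_u24 feeding an add fuses into v_mad_u32_u24 when only the value
 * is needed (res0); when the add's carry-out is the result that gets written
 * (res1), the mul and the carry-producing add stay separate. */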
761 //>> v1: %a, v1: %b, s1: %c = p_startpgm
762 if (!setup_cs("v1 v1 s1", (amd_gfx_level)i))
767 //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
768 //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
769 //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
771 lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
772 writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
774 //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
775 //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
776 //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
780 lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
781 writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
783 //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
784 //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
785 //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
789 lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
790 writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
792 //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
793 //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
795 lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
796 writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
798 //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
799 //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
803 lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
804 writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
806 //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
807 //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
811 lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
812 writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));
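/* The VGPR-shiftee variant of the shl+add tests: constant shifts fuse into
 * v_lshl_add_u32 on GFX9/GFX10 (res0), while GFX8 only gets v_mad_u32_u24
 * when the shiftee is provably 24-bit (res3) or 16-bit (res4), with the
 * shift converted to a power-of-two multiplier. */

// Standalone sketch (plain C++) of that GFX8 rewrite: shifting left by n is a
// multiply by 2^n, giving the 8 and 16 seen in the v_mad_u32_u24 checks.
#include <cassert>
#include <cstdint>
int main() {
    uint32_t a = 0x123456u; // fits in 24 bits
    assert((a << 3) == a * 8u);
    assert((a << 4) == a * 16u);
}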
857 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]); in emit_denorm_srcdest()
859 return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val); in emit_denorm_srcdest()
861 return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val); in emit_denorm_srcdest()
890 if (!setup_cs("v1 s2", (amd_gfx_level)i, CHIP_UNKNOWN, subvariant))
902 //>> v1: %a, s2: %b = p_startpgm
904 //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
905 //; 'min': 'v1: %{} = v_min_f32 0, {}',
906 //; 'rcp': 'v1: %{} = v_rcp_f32 {}'}
907 //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
908 //; 'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
909 //; 'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
910 //; 'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
928 //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
936 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
950 0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));
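/* Denormal-mode tests, with checks generated by the embedded Python above:
 * a v_mul_f32 by +/-1.0 flushes denormals, so it may only be folded into a
 * neg/abs modifier on the consumer (the cndmask/min/rcp patterns) when the
 * consumer's own flushing makes the result indistinguishable. */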
958 //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm
959 if (!setup_cs("v1 v1 s2 s1", GFX10_3))
968 //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
970 Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
971 Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
975 //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
977 Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
978 Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
981 //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
982 //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
984 Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
985 Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
989 //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
991 auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
993 Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
996 //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
998 Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
999 auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
1003 //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
1004 //! v1: %res5 = v_add_f32 %tmp5, %b clamp
1006 Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
1007 auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
1011 //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
1013 auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
1015 auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
1019 //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
1021 Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
1022 auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
1027 //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
1029 Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
1030 Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
1034 //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
1035 //! v1: %res9 = v_add_f32 %tmp9, %d
1037 Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
1038 Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
1041 //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
1042 //! v1: %res10 = v_add_f32 %d, %tmp10
1044 Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
1045 Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
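/* DPP propagation (GFX10.3): a v_mov_b32 carrying a DPP swizzle folds into
 * its single VALU user, commuting v_sub into v_subrev when the swizzled value
 * must sit in src0 (res1/res7). The fold is refused when the user already has
 * DPP (res2), carries a modifier DPP cannot encode (clamp, res5), or reads an
 * SGPR (res9/res10); neg/abs input modifiers (res3/res4/res6) and cndmask's
 * vcc operand (res8) are compatible. */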
1052 //>> v1: %a, s1: %b = p_startpgm
1053 if (!setup_cs("v1 s1", GFX10))
1056 //! v1: %one = p_parallelcopy 1
1057 //! v1: %res0 = v_mul_f32 1, %a
1059 Temp one = bld.copy(bld.def(v1), Operand::c32(1));
1060 writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
1062 //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
1064 writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
1066 //! v1: %res2 = v_mul_f32 0x12345678, %a
1068 Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
1069 … writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
1071 //! v1: %literal2 = p_parallelcopy 0x12345679
1072 //! v1: %res3 = v_mul_f32 %a, %literal2 row_shl:1 bound_ctrl:1
1074 Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
1075 … writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
1077 //! v1: %b_v = p_parallelcopy %b
1078 //! v1: %res4 = v_mul_f32 %b, %a
1080 Temp b_v = bld.copy(bld.def(v1), inputs[1]);
1081 writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
1083 //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
1085 writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
1087 //! v1: %res6 = v_rcp_f32 %b
1089 writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
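/* DPP with non-VGPR sources (GFX10): constants and SGPRs must be copied into
 * a VGPR to be DPP operands at all. Where the copy propagates, the row_shl of
 * a uniform value is a no-op and the DPP is dropped entirely (res0, res2,
 * res4, res6); where the copy has to stay in src1 (literals and SGPRs are not
 * legal there alongside DPP), the swizzle remains on the VGPR source
 * (res1, res3, res5). */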
1095 //>> v1: %a, v2b: %a16 = p_startpgm
1096 if (!setup_cs("v1 v2b", GFX10_3))
1104 //! v1: %res0_tmp = v_mul_f32 -1.0, %a
1110 //! v1: %res1 = v_mul_f32 %res1_tmp, %a
1114 //! v1: %res2_tmp = v_mul_f32 -1.0, %a16
1117 …writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u),…
1119 //! v1: %res3_tmp = v_mul_f32 %a, %a
1125 //! v1: %res4 = v_med3_f32 0, 1.0, %res4_tmp
1129 //! v1: %res5_tmp = v_mul_f32 %a, %a
1135 //! v1: %res6 = v_mul_f32 2.0, %res6_tmp
1137 …writeout(6, fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000))));
1139 //! v1: %res7_tmp = v_mul_f32 %a, %a
1145 //! v1: %res8 = v_add_f32 %res8_tmp, %a
1149 //! v1: %res9_tmp = v_mul_f32 %a, %a
1155 //! v1: %res10 = v_mul_f32 -1.0, %res10_tmp
1157 …writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_un…
1164 //>> v1: %a, v2b: %a16 = p_startpgm
1165 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1173 //! v1: %res0 = v_fma_mix_f32 %a, lo(%a16), -0
1177 //! v1: %res1 = v_fma_mix_f32 1.0, %a, lo(%a16)
1181 //! v1: %res2 = v_fma_mix_f32 1.0, lo(%a16), %a
1185 //! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16)
1189 //! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16)
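/* v_fma_mix_f32 formation: an f32 multiply/add with an operand arriving
 * through an f16-to-f32 conversion fuses into v_fma_mix_f32, where lo()/hi()
 * select the 16-bit half; a pure multiply encodes as fma with a -0 addend
 * (res0), and pure adds use a 1.0 multiplicand (res1/res2). */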
1199 //>> v1: %a, v2b: %a16 = p_startpgm
1200 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1209 //~gfx9! v1: %res0_cvt = v_cvt_f32_f16 %a16
1210 //~gfx9! v1: (precise)%res0 = v_fma_f32 %a, %a, %res0_cvt
1211 //~gfx10! v1: (precise)%res0 = v_fma_mix_f32 %a, %a, lo(%a16)
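/* Precision gate: on GFX9 the precise variant keeps an explicit
 * v_cvt_f32_f16 plus v_fma_f32, while on GFX10 v_fma_mix_f32 is considered
 * exact enough to fuse directly (res0). */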
1252 //>> v1: %a, v2b: %a16 = p_startpgm
1253 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1262 //! v1: %res0 = v_fma_mix_f32 -%a, lo(%a16), -0
1266 //! v1: %res1 = v_fma_mix_f32 |%a|, lo(%a16), -0
1271 //! v1: %res2 = v_fma_mix_f32 %a, -lo(%a16), -0
1275 //! v1: %res3 = v_fma_mix_f32 %a, -lo(%a16), -0
1280 //! v1: %res4 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1284 //! v1: %res5 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1289 //! v1: %res6 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1293 //! v1: %res7 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1297 //! v1: %res8 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1301 //! v1: %res9 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1305 //! v1: %res10 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1309 //! v1: %res11 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1313 //! v1: %res12 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1318 //! v1: %res13 = v_fma_mix_f32 lo(%a), %a, -0
1322 //! v1: %res14 = v_fma_mix_f32 hi(%a), %a, -0
1326 //! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
1327 //! v1: %res15 = v_mul_f32 %res15_cvt, %a
1331 //! v1: %res16_cvt = v_cvt_f32_f16 %a
1332 //! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
1336 //! v1: %res17_cvt = v_cvt_f32_f16 %a dst_sel:dword src0_sel:ubyte2
1337 //! v1: %res17 = v_mul_f32 %res17_cvt, %a
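/* Modifier propagation into v_fma_mix_f32 sources: neg and abs fold onto both
 * full-width (%a) and lo(%a16) operands, with -|x| composing and double
 * negation cancelling, as res2..res12 enumerate. res13/res14 select the
 * lo()/hi() halves of a 32-bit register, and res15..res17 (GFX9) express the
 * same sub-dword accesses through SDWA dst_sel/src0_sel instead. */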
1347 //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
1348 if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
1398 //! v1: (precise)%res0 = v_cvt_f32_f16 %res0_tmp
1403 //! v1: %res1 = v_cvt_f32_f16 %res1_tmp
1413 //>> v1: %a, v1: %b, v2b: %a16, v2b: %b16 = p_startpgm
1414 if (!setup_cs("v1 v1 v2b v2b", (amd_gfx_level)i))
1425 //! v1: %res0_add = v_add_f32 %1, %2
1430 //! v1: %res1_add = v_add_f32 %1, %2
1436 //! v1: %res2 = v_cvt_f32_f16 |%res2_add|
1441 //! v1: %res3 = v_cvt_f32_f16 -%res3_add
1451 //! v1: %res5_mul = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
1462 //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm
1463 if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
1474 //! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
1478 //! v1: %res1 = v_fma_mix_f32 %a, %b, lo(%c16)
1483 //! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, -0
1484 //! v1: %res2 = v_add_f32 %res2_mul, %c *2
1486 …writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000), fadd(fmul(f2f32…
1489 //! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)|
1493 //! v1: %res4 = v_fma_mix_f32 |%a|, |%b|, lo(%c16)
1497 //! v1: %res5 = v_fma_mix_f32 %a, -%b, lo(%c16)
1501 //! v1: %res6 = v_fma_mix_f32 |%a|, -|%b|, lo(%c16)
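/* fma_mix with f2f32 on the sources: the converted f16 operand rides into
 * v_fma_mix_f32 as lo(), the *2 output modifier lands on a following
 * v_add_f32 because VOP3P has no omod field (res2), and neg/abs modifiers
 * distribute over both 16- and 32-bit sources (res3..res6). */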
1516 //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
1517 if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
1529 //~gfx9! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
1530 //~gfx10! v1: (precise)%res0_tmp = v_fma_mix_f32 lo(%a16), %b, -0
1531 //~gfx10! v1: %res0 = v_add_f32 %res0_tmp, %c
1535 //~gfx9! v1: (precise)%res1 = v_fma_mix_f32 lo(%a16), %b, %c
1536 //~gfx10! v1: %res1_tmp = v_fma_mix_f32 lo(%a16), %b, -0
1537 //~gfx10! v1: (precise)%res1 = v_add_f32 %res1_tmp, %c
1553 //! v1: %res4 = v_cvt_f32_f16 %res4_tmp
1558 //! v1: %res5 = v_cvt_f32_f16 %res5_tmp
1568 //! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
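/* Precise contraction splits: whether mul+add contract into a single
 * v_fma_mix_f32 depends on which instruction carries the precise flag and on
 * the GFX level (res0/res1); the remaining checks keep an explicit
 * v_cvt_f32_f16 (res4/res5) or re-express the conversion as a 1.0*x mix
 * (res7). */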
1578 //>> v1: %a, v2b: %a16 = p_startpgm
1579 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1587 //! v1: %res0 = v_fma_mix_f32 lo(%a16), %a, -0 clamp
1605 //>> v1: %a, v2b: %a16 = p_startpgm
1606 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1614 /* The optimizer copy-propagates v2b=p_extract_vector(v1, 0) and p_as_uniform, so the
1618 //! v1: %res0_cvt = v_cvt_f32_f16 %a16
1624 //! v1: %res1 = v_mul_f32 %res1_cvt, %a
1633 //! v1: %res3_mul = v_mul_f32 %a, %a
1634 //! v1: %res3 = v_cvt_f32_f16 %res3_mul
1638 //! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0
1644 //! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul
1648 //! v1: %res6_mul = v_mul_f32 %a, %a
1649 //! v1: %res6 = v_fma_mix_f32 1.0, lo(%res6_mul), %a
1654 //! v1: %res7 = v_fma_mix_f32 1.0, %res7_mul, lo(%a16)
1659 //! v1: %res8 = v_fma_mix_f32 lo(%a16), %a16, -0
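/* Conversion folds around f16/f32 (the last two groups): clamp is absorbed
 * into v_fma_mix_f32 (res0 of the clamp group), and after the optimizer
 * copy-propagates p_extract_vector/p_as_uniform the checks enumerate which
 * mul/cvt combinations may merge into a mix instruction versus which keep an
 * explicit v_cvt_f32_f16. */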
1675 writeout((*idx)++, bld.vop3p(op, bld.def(v1), bld.copy(bld.def(v1), Operand::c32(val)), in vop3p_constant()
1690 //>> v1: %a = p_startpgm
1691 if (!setup_cs("v1", GFX10_3, CHIP_UNKNOWN, variant))
1696 //; insert_pattern('v1: %%res%u = %s $got%u %%a' % (i, opcode, i))
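/* vop3p constant legalization: the helper above builds one packed-math
 * instruction per constant, and the embedded Python emits a matching check
 * per opcode, verifying how each inline constant is encoded as a VOP3P
 * operand. */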