; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s

; Tests for GlobalISel selection of <2 x i16> integer add.
; GFX9 has the packed v_pk_add_u16 instruction; GFX8 (fiji) does not, so the
; vector add is split into a low-half add plus an SDWA add on the high half,
; recombined with v_or_b32.

; Basic VGPR case: one packed add on GFX9; add + SDWA add + OR on GFX8.
define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
; GFX9-LABEL: v_add_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
  %add = add <2 x i16> %a, %b
  ret <2 x i16> %add
}

; fneg on the LHS (bitcast from <2 x half>): GFX9 folds the sign flip into the
; pk_add's neg_lo/neg_hi source modifiers; GFX8 materializes it as an XOR with
; 0x80008000 (both half sign bits) before the split add.
define <2 x i16> @v_add_v2i16_fneg_lhs(<2 x half> %a, <2 x i16> %b) {
; GFX9-LABEL: v_add_v2i16_fneg_lhs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[1,0] neg_hi:[1,0]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_fneg_lhs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
  %neg.a = fneg <2 x half> %a
  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
  %add = add <2 x i16> %cast.neg.a, %b
  ret <2 x i16> %add
}

; Same as above but the fneg is on the RHS operand (neg modifiers on src1).
define <2 x i16> @v_add_v2i16_fneg_rhs(<2 x i16> %a, <2 x half> %b) {
; GFX9-LABEL: v_add_v2i16_fneg_rhs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_fneg_rhs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
  %neg.b = fneg <2 x half> %b
  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
  %add = add <2 x i16> %a, %cast.neg.b
  ret <2 x i16> %add
}

; fneg on both operands: GFX9 sets neg modifiers on both sources; GFX8 shares
; the 0x80008000 sign mask in an SGPR and XORs both inputs.
define <2 x i16> @v_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) {
; GFX9-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_fneg_lhs_fneg_rhs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_mov_b32 s4, 0x80008000
; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
; GFX8-NEXT:    v_xor_b32_e32 v1, s4, v1
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v1
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
  %neg.a = fneg <2 x half> %a
  %neg.b = fneg <2 x half> %b
  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
  %add = add <2 x i16> %cast.neg.a, %cast.neg.b
  ret <2 x i16> %add
}

; Splat constant -64 in both lanes: GFX9 materializes the packed constant
; 0xffc0ffc0 in a VGPR; GFX8 reuses a single 0xffc0 SGPR for both halves.
define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc0ffc0
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0xffc0
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_add_u16_e32 v1, s4, v0
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
  %add = add <2 x i16> %a, <i16 -64, i16 -64>
  ret <2 x i16> %add
}

; Non-inline-imm value (-64) only in the low lane; high lane adds 4.
define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4ffc0
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, 4
; GFX8-NEXT:    v_add_u16_e32 v1, 0xffc0, v0
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
  %add = add <2 x i16> %a, <i16 -64, i16 4>
  ret <2 x i16> %add
}

; Mirror of the previous test: -64 in the high lane, 4 in the low lane.
define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffc00004
; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
; GFX8-NEXT:    v_add_u16_e32 v2, 4, v0
; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
  %add = add <2 x i16> %a, <i16 4, i16 -64>
  ret <2 x i16> %add
}

; SGPR (uniform) variants below: both targets lower to scalar ALU ops — shift
; out the high half, add each half separately, then repack (s_pack_ll_b32_b16
; on GFX9; shift/and/or on GFX8, which lacks the pack instruction).
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
; GFX9-NEXT:    s_add_i32 s0, s0, 0xffc0ffc0
; GFX9-NEXT:    s_add_i32 s1, s1, 0xffc0
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s3, 0xffff
; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
; GFX8-NEXT:    s_mov_b32 s1, 0xffc0
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_add_i32 s2, s2, s1
; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
  %add = add <2 x i16> %a, <i16 -64, i16 -64>
  %cast = bitcast <2 x i16> %add to i32
  ret i32 %cast
}

; Scalar path, -64 in the low lane only.
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
; GFX9-NEXT:    s_add_i32 s0, s0, 0x4ffc0
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
; GFX8-NEXT:    s_add_i32 s1, s1, 4
; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
  %add = add <2 x i16> %a, <i16 -64, i16 4>
  %cast = bitcast <2 x i16> %add to i32
  ret i32 %cast
}

; Scalar path, -64 in the high lane only.
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s1, s0, 16
; GFX9-NEXT:    s_add_i32 s0, s0, 0xffc00004
; GFX9-NEXT:    s_add_i32 s1, s1, 0xffc0
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0xffff
; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_add_i32 s0, s0, 4
; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s2
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
  %add = add <2 x i16> %a, <i16 4, i16 -64>
  %cast = bitcast <2 x i16> %add to i32
  ret i32 %cast
}

; Scalar path with two variable SGPR operands.
define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX9-LABEL: s_add_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
; GFX9-NEXT:    s_add_i32 s0, s0, s1
; GFX9-NEXT:    s_add_i32 s2, s2, s3
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s3, 0xffff
; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_and_b32 s1, s1, s3
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_add_i32 s2, s2, s4
; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
  %add = add <2 x i16> %a, %b
  %cast = bitcast <2 x i16> %add to i32
  ret i32 %cast
}

; Scalar path with fneg on the LHS: both targets lower the sign flip to an
; s_xor with 0x80008000 before the split adds (no scalar neg modifier exists).
define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg %b) {
; GFX9-LABEL: s_add_v2i16_fneg_lhs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_xor_b32 s0, s0, 0x80008000
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
; GFX9-NEXT:    s_add_i32 s0, s0, s1
; GFX9-NEXT:    s_add_i32 s2, s2, s3
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_fneg_lhs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
; GFX8-NEXT:    s_mov_b32 s3, 0xffff
; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_and_b32 s1, s1, s3
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_add_i32 s2, s2, s4
; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
  %neg.a = fneg <2 x half> %a
  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
  %add = add <2 x i16> %cast.neg.a, %b
  %cast = bitcast <2 x i16> %add to i32
  ret i32 %cast
}

; Scalar path with fneg on the RHS.
define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg %b) {
; GFX9-LABEL: s_add_v2i16_fneg_rhs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_xor_b32 s1, s1, 0x80008000
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
; GFX9-NEXT:    s_add_i32 s0, s0, s1
; GFX9-NEXT:    s_add_i32 s2, s2, s3
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_fneg_rhs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
; GFX8-NEXT:    s_mov_b32 s3, 0xffff
; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_and_b32 s1, s1, s3
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_add_i32 s2, s2, s4
; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
  %neg.b = fneg <2 x half> %b
  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
  %add = add <2 x i16> %a, %cast.neg.b
  %cast = bitcast <2 x i16> %add to i32
  ret i32 %cast
}

; Scalar path with fneg on both operands; the 0x80008000 mask is shared in s2.
define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s2, 0x80008000
; GFX9-NEXT:    s_xor_b32 s1, s1, s2
; GFX9-NEXT:    s_xor_b32 s0, s0, s2
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
; GFX9-NEXT:    s_add_i32 s0, s0, s1
; GFX9-NEXT:    s_add_i32 s2, s2, s3
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0x80008000
; GFX8-NEXT:    s_xor_b32 s1, s1, s2
; GFX8-NEXT:    s_xor_b32 s0, s0, s2
; GFX8-NEXT:    s_mov_b32 s3, 0xffff
; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_and_b32 s1, s1, s3
; GFX8-NEXT:    s_add_i32 s0, s0, s1
; GFX8-NEXT:    s_add_i32 s2, s2, s4
; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
; GFX8-NEXT:    s_and_b32 s0, s0, s3
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
  %neg.a = fneg <2 x half> %a
  %neg.b = fneg <2 x half> %b
  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
  %add = add <2 x i16> %cast.neg.a, %cast.neg.b
  %cast = bitcast <2 x i16> %add to i32
  ret i32 %cast
}