; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[8:9]
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, s4, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
; VI-NEXT:    s_load_dword s6, s[8:9], 0x0
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s4, 16
; VI-NEXT:    s_lshr_b32 s7, s6, 16
; VI-NEXT:    s_sub_i32 s4, s4, s6
; VI-NEXT:    s_sub_i32 s5, s5, s7
; VI-NEXT:    s_and_b32 s4, s4, 0xffff
; VI-NEXT:    s_lshl_b32 s5, s5, 16
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: VI should not scalarize arg access.
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_lshr_b32 s3, s0, 16
; VI-NEXT:    s_sub_i32 s1, s1, s3
; VI-NEXT:    s_sub_i32 s0, s2, s0
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 1
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v2, 1, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; The high element gives fp
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v3, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }