; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s

; Code generation tests for the IR 'shl' instruction on <2 x i16> and
; <4 x i16> vectors, compiled for three subtargets (gfx900, tonga, bonaire).
; In the expected output below, gfx900 uses the packed v_pk_lshlrev_b16
; instruction, while the other two subtargets operate on the 16-bit halves
; with 16-bit or 32-bit shifts plus masking and an OR to recombine them.
;
; Each llc invocation above is checked with exactly one prefix, and every
; prefix that is passed to FileCheck has check lines in this file.

; Both the shifted value and the shift amount are kernel arguments.
define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s0, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s3, 0xffff
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_lshr_b32 s8, s0, 16
; VI-NEXT:    s_and_b32 s2, s2, s3
; VI-NEXT:    s_and_b32 s0, s0, s3
; VI-NEXT:    s_lshl_b32 s0, s2, s0
; VI-NEXT:    s_lshl_b32 s1, s1, s8
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, s3
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
; CI-NEXT:    s_load_dword s0, s[0:1], 0xc
; CI-NEXT:    s_mov_b32 s3, 0xffff
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    s_and_b32 s8, s0, s3
; CI-NEXT:    s_lshr_b32 s0, s0, 16
; CI-NEXT:    s_lshl_b32 s0, s1, s0
; CI-NEXT:    s_lshl_b32 s1, s2, s8
; CI-NEXT:    s_lshl_b32 s0, s0, 16
; CI-NEXT:    s_and_b32 s1, s1, s3
; CI-NEXT:    s_or_b32 s0, s1, s0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
  %result = shl <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; Both operands are loaded from memory: the value from %in indexed by the
; workitem id, the shift amount from the element that follows it.
define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v5
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v5, s0, v3
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_lshl_b32_e32 v3, v4, v3
; CI-NEXT:    v_lshl_b32_e32 v2, v2, v5
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; The shifted value is loaded from memory; the shift amount is the kernel
; argument %sgpr.
define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_v_s_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, s0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v4, s0, v3
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_and_b32 s8, s8, s0
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, s1, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Operand roles swapped relative to shl_v_s_v2i16: the kernel argument %sgpr
; is the shifted value and the shift amount is loaded from memory.
define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_s_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, s0
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v3, s0, v2
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, s1, v2
; CI-NEXT:    v_lshl_b32_e32 v3, s8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; The constant splat <8, 8> is the shifted value; the shift amount is loaded
; from memory.
define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v4, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, 8
; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, 8, v2
; CI-NEXT:    v_lshl_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_and_b32_e32 v3, 0xfff8, v3
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; The shifted value is loaded from memory; the shift amount is the constant
; splat <8, 8>.
define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Four-element variant of v_shl_v2i16: both operands are loaded from memory.
define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v1, v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v3, v0
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v8, s0, v4
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_and_b32_e32 v9, s0, v5
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_lshl_b32_e32 v5, v7, v5
; CI-NEXT:    v_lshl_b32_e32 v3, v3, v9
; CI-NEXT:    v_lshl_b32_e32 v4, v6, v4
; CI-NEXT:    v_lshl_b32_e32 v2, v2, v8
; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_or_b32_e32 v3, v3, v5
; CI-NEXT:    v_or_b32_e32 v2, v2, v4
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Four-element variant of shl_v_imm_v2i16: a loaded value shifted by the
; constant splat <8, 8, 8, 8>.
define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_mov_b32 s2, 0xff000000
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_and_b32_e32 v0, s2, v0
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_and_b32_e32 v4, s2, v4
; VI-NEXT:    v_or_b32_e32 v1, v1, v4
; VI-NEXT:    v_or_b32_e32 v0, v5, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xff00
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT:    v_and_b32_e32 v4, s0, v4
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_or_b32_e32 v3, v3, v4
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Intrinsic that supplies the per-workitem index used to address %in and %out.
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }