; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s

define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_lshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_lshr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dword s5, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s4, 0xffff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_lshr_b32 s1, s5, 16
; VI-NEXT:    s_lshr_b32 s6, s0, 16
; VI-NEXT:    s_lshr_b32 s1, s1, s6
; VI-NEXT:    s_and_b32 s5, s5, s4
; VI-NEXT:    s_and_b32 s0, s0, s4
; VI-NEXT:    s_lshr_b32 s0, s5, s0
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_lshr_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
; CI-NEXT:    s_load_dword s0, s[0:1], 0xc
; CI-NEXT:    s_mov_b32 s3, 0xffff
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    s_lshr_b32 s8, s0, 16
; CI-NEXT:    s_lshr_b32 s1, s1, s8
; CI-NEXT:    s_and_b32 s2, s2, s3
; CI-NEXT:    s_and_b32 s0, s0, s3
; CI-NEXT:    s_lshr_b32 s0, s2, s0
; CI-NEXT:    s_lshl_b32 s1, s1, 16
; CI-NEXT:    s_or_b32 s0, s0, s1
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
  %result = lshr <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v5
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_lshr_b32_e32 v2, v2, v3
; CI-NEXT:    v_lshr_b32_e32 v3, v4, v5
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = lshr <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_v_s_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s0, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_s_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v4, s0, v3
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_and_b32 s8, s8, s0
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_lshrrev_b32_e32 v3, s1, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_s_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_s_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v4, v3, s0
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_and_b32 s8, s8, s0
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_lshr_b32_e32 v3, s1, v3
; CI-NEXT:    v_lshr_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v4, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v2, v3, 8
; VI-NEXT:    v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_lshr_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshr_b32_e32 v2, 8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 24, v0
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_imm_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v6, v3, v1
; VI-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v0
; VI-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v1, v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v3, v0
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_and_b32_e32 v4, s0, v4
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_and_b32_e32 v5, s0, v5
; CI-NEXT:    v_lshr_b32_e32 v3, v3, v5
; CI-NEXT:    v_lshr_b32_e32 v5, v7, v9
; CI-NEXT:    v_lshr_b32_e32 v2, v2, v4
; CI-NEXT:    v_lshr_b32_e32 v4, v6, v8
; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_or_b32_e32 v3, v3, v5
; CI-NEXT:    v_or_b32_e32 v2, v2, v4
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = lshr <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_imm_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_imm_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xff00ff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }