; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI

define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s1, s2, s3
; SI-NEXT:    s_add_i32 s1, s1, s0
; SI-NEXT:    s_ashr_i32 s0, s1, 31
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s1, s2, s3
; VI-NEXT:    s_add_i32 s1, s1, s0
; VI-NEXT:    s_ashr_i32 s0, s1, 31
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s1, s0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s1, s0, 31
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so we can verify
; that it is selected correctly. In s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register.
; To fix this we need to be able to optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s1, s0, 24
; SI-NEXT:    s_bfe_i32 s2, s0, 0x80010
; SI-NEXT:    s_bfe_i32 s3, s0, 0x80008
; SI-NEXT:    s_sext_i32_i8 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s0
; VI-NEXT:    s_ashr_i32 s1, s0, 24
; VI-NEXT:    s_bfe_i32 s2, s0, 0x80010
; VI-NEXT:    s_sext_i32_i8 s0, s0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_mov_b32_e32 v1, s0
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: need to optimize the same sequence as in the test above to
; avoid this shift.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 48
; SI-NEXT:    s_ashr_i32 s5, s6, 16
; SI-NEXT:    s_sext_i32_i16 s6, s6
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    s_sext_i32_i16 s7, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_ashr_i32 s5, s6, 16
; VI-NEXT:    s_sext_i32_i16 s6, s6
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_ashr_i32 s4, s7, 16
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    s_sext_i32_i16 s7, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }