; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, i16 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_constant_load_zext_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %ext = zext i16 %load to i32
  %add = add i32 %ext, 999
  %or = or i32 %add, 4
  store i32 %or, i32 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_constant_load_sext_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_sext_i32_i16 s1, s1
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %ext = sext i16 %load to i32
  %add = add i32 %ext, 999
  %or = or i32 %add, 4
  store i32 %or, i32 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
; SI-LABEL: widen_i17_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s7, s[6:7], 0x0
; SI-NEXT:    s_mov_b32 s4, 2
; SI-NEXT:    s_mov_b32 s5, s0
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_add_i32 s7, s7, 34
; SI-NEXT:    s_or_b32 s7, s7, 4
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    s_bfe_u32 s8, s7, 0x10010
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i17_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v2, 2
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_mov_b32_e32 v3, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_i32 s0, s0, 34
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_bfe_u32 s0, s0, 0x10010
; VI-NEXT:    flat_store_short v[0:1], v4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
  %load = load i17, i17 addrspace(4)* %arg, align 4
  %add = add i17 %load, 34
  %or = or i17 %add, 4
  store i17 %or, i17 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
; SI-LABEL: widen_f16_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_add_f32_e32 v0, 4.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_f16_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_f16_e64 v2, s0, 4.0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load half, half addrspace(4)* %arg, align 4
  %add = fadd half %load, 4.0
  store half %add, half addrspace(1)* null
  ret void
}

; FIXME: valu usage on VI
define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
; SI-LABEL: widen_v2i8_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[2:3], 0x0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s1, 0xff00
; SI-NEXT:    s_add_i32 s1, s1, 12
; SI-NEXT:    s_or_b32 s1, s1, 4
; SI-NEXT:    s_and_b32 s1, s1, 0xff
; SI-NEXT:    s_or_b32 s1, s4, s1
; SI-NEXT:    s_addk_i32 s1, 0x2c00
; SI-NEXT:    s_or_b32 s4, s1, 0x300
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_v2i8_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 44
; VI-NEXT:    v_mov_b32_e32 v1, 3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s1, s0, 0xffff
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    s_add_i32 s1, s1, 12
; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; VI-NEXT:    s_or_b32 s0, s1, 4
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    s_and_b32 s0, s0, 0xff
; VI-NEXT:    v_or_b32_e32 v2, s0, v0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4
  %add = add <2 x i8> %load, <i8 12, i8 44>
  %or = or <2 x i8> %add, <i8 4, i8 3>
  store <2 x i8> %or, <2 x i8> addrspace(1)* null
  ret void
}

define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) {
; SI-LABEL: no_widen_i16_constant_divergent_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x3e7, v0
; SI-NEXT:    v_or_b32_e32 v0, 4, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u16_e32 v0, 0x3e7, v0
; VI-NEXT:    v_or_b32_e32 v2, 4, v0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep.arg = getelementptr inbounds i16, i16 addrspace(4)* %arg, i64 %tid.ext
  %load = load i16, i16 addrspace(4)* %gep.arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, i16 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
; SI-LABEL: widen_i1_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s1, 1
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i1_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i1, i1 addrspace(4)* %arg, align 4
  %and = and i1 %load, true
  store i1 %and, i1 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_zextload_i64_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(4)* %arg, align 4
  %zext = zext i16 %load to i32
  %add = add i32 %zext, 999
  %or = or i32 %add, 4
  store i32 %or, i32 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) {
; SI-LABEL: widen_i1_zext_to_i64_constant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s1, s1, 1
; SI-NEXT:    s_add_u32 s4, s1, 0x3e7
; SI-NEXT:    s_addc_u32 s5, 0, 0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s0, s0, 1
; VI-NEXT:    s_add_u32 s0, s0, 0x3e7
; VI-NEXT:    s_addc_u32 s1, 0, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
  %load = load i1, i1 addrspace(4)* %arg, align 4
  %zext = zext i1 %load to i64
  %add = add i64 %zext, 999
  store i64 %add, i64 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
; SI-LABEL: widen_i16_constant32_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s1, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_addk_i32 s0, 0x3e7
; SI-NEXT:    s_or_b32 s4, s0, 4
; SI-NEXT:    s_mov_b32 s0, s1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_constant32_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s1, 0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 4
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(6)* %arg, align 4
  %add = add i16 %load, 999
  %or = or i16 %add, 4
  store i16 %or, i16 addrspace(1)* null
  ret void
}

define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) {
; SI-LABEL: widen_i16_global_invariant_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s1, s[0:1], 0x0
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_addk_i32 s1, 0x3e7
; SI-NEXT:    s_or_b32 s4, s1, 1
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: widen_i16_global_invariant_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_addk_i32 s0, 0x3e7
; VI-NEXT:    s_or_b32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0
  %add = add i16 %load, 999
  %or = or i16 %add, 1
  store i16 %or, i16 addrspace(1)* null
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

!0 = !{}