; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s

; Checks kernel-argument lowering (kernarg segment loads, zero/sign
; extension of sub-dword arguments) across SI, VI (Mesa and HSA ABIs),
; and the r600 Evergreen/Cayman targets.

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EGCM: VTX_READ_8{{.*}} #3
; EGCM: KC0[2].Y
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword


; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
  %ext = sext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; i16 zext masks with 0xffff (was 0xff, which only matched as a substring).
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword

; EGCM: VTX_READ_16
; EGCM: KC0[2].Y
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword

; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c


; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword

; EG: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
  %ext = sext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; Prefixes below were misspelled VI-MESA/VI-HSA (unregistered, so the
; checks were dead); the RUN lines register MESA-VI and HSA-VI.
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; Prefixes below were misspelled VI-HSA/VI-MESA (unregistered, so the
; checks were dead); the RUN lines register HSA-VI and MESA-VI.
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34

; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X

; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44

; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2

; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i65_arg:
; HSA-VI: kernarg_segment_byte_size = 24
; HSA-VI: kernarg_segment_alignment = 4
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
entry:
  store i65 %in, i65 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; Was "SGCN:" (unregistered prefix, dead check); use the buffer/flat form
; like the sibling i1 tests so both MESA and HSA runs are covered.
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
  ret void
}

; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32

; With the SelectionDAG argument lowering, the alignments for the
; struct members are not properly considered, making these wrong.

; FIXME: Total argument size is computed wrong
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 40
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 28
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}

; FIXME: Why not all scalar loads?
; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
; HSA-VI: flat_load_ushort
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}