; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EGCM: VTX_READ_8{{.*}} #3
; EGCM: KC0[2].Y
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword


; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
  %ext = sext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; An i16 zext masks with 0xffff (the old 0xff check only passed because
; FileCheck matches unanchored substrings).
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword

; EGCM: VTX_READ_16
; EGCM: KC0[2].Y
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword

; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c


; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword

; EG: BFE_INT   T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)

; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
  %ext = sext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; The prefixes below were previously spelled VI-MESA/VI-HSA, which are not
; defined by any RUN line, so these checks were silently dead.
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; The prefixes below were previously spelled VI-HSA/VI-MESA, which are not
; defined by any RUN line, so these checks were silently dead.
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; The MESA-VI-DAG pair below was duplicated verbatim; the redundant copy
; (tolerated only via -allow-deprecated-dag-overlap) has been removed.
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34

; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X

; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8
; EGCM: VTX_READ_8

; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16
; EGCM: VTX_READ_16

; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44

; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2

; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i65_arg:
; HSA-VI: kernarg_segment_byte_size = 24
; HSA-VI: kernarg_segment_alignment = 4
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
entry:
  store i65 %in, i65 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; The store check previously used the dead prefix "SGCN" (defined in no RUN
; line). HSA emits flat stores, so the intended live prefix is MESA-GCN.
; MESA-GCN: buffer_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  ; Fixed invalid IR: the store's pointer type was "i32addrspace(1)*"
  ; (missing space), which does not parse.
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
  ret void
}

; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32

; With the SelectionDAG argument lowering, the alignments for the
; struct members is not properly considered, making these wrong.

; FIXME: Total argument size is computed wrong
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 40
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 28
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}

; FIXME: Why not all scalar loads?
; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
; HSA-VI: flat_load_ushort
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}
