• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; FIXME: Manually added checks for metadata nodes at bottom
3; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s
4; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s
5
6target datalayout = "A5"
7
8define amdgpu_kernel void @kern_noargs() {
9; HSA-LABEL: @kern_noargs(
10; HSA-NEXT:    ret void
11;
12; MESA-LABEL: @kern_noargs(
13; MESA-NEXT:    ret void
14;
15  ret void
16}
17
18define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
19; HSA-LABEL: @kern_i8(
20; HSA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
21; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0
22; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
23; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
24; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
25; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
26; HSA-NEXT:    ret void
27;
28; MESA-LABEL: @kern_i8(
29; MESA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
30; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36
31; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
32; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
33; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
34; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
35; MESA-NEXT:    ret void
36;
37  store i8 %arg, i8 addrspace(1)* undef, align 1
38  ret void
39}
40
41define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
42; HSA-LABEL: @kern_i16(
43; HSA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
44; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0
45; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
46; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
47; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
48; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
49; HSA-NEXT:    ret void
50;
51; MESA-LABEL: @kern_i16(
52; MESA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
53; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36
54; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
55; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
56; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
57; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
58; MESA-NEXT:    ret void
59;
60  store i16 %arg, i16 addrspace(1)* undef, align 1
61  ret void
62}
63
64define amdgpu_kernel void @kern_f16(half %arg) #0 {
65; HSA-LABEL: @kern_f16(
66; HSA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
67; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0
68; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
69; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
70; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
71; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
72; HSA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
73; HSA-NEXT:    ret void
74;
75; MESA-LABEL: @kern_f16(
76; MESA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
77; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36
78; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
79; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
80; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
81; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
82; MESA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
83; MESA-NEXT:    ret void
84;
85  store half %arg, half addrspace(1)* undef, align 1
86  ret void
87}
88
89define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
90; HSA-LABEL: @kern_zeroext_i8(
91; HSA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
92; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
93; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
94; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
95; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
96; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
97; HSA-NEXT:    ret void
98;
99; MESA-LABEL: @kern_zeroext_i8(
100; MESA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
101; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
102; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
103; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
104; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
105; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
106; MESA-NEXT:    ret void
107;
108  store i8 %arg, i8 addrspace(1)* undef, align 1
109  ret void
110}
111
112define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
113; HSA-LABEL: @kern_zeroext_i16(
114; HSA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
115; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
116; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
117; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
118; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
119; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
120; HSA-NEXT:    ret void
121;
122; MESA-LABEL: @kern_zeroext_i16(
123; MESA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
124; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
125; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
126; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
127; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
128; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
129; MESA-NEXT:    ret void
130;
131  store i16 %arg, i16 addrspace(1)* undef, align 1
132  ret void
133}
134
135define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
136; HSA-LABEL: @kern_signext_i8(
137; HSA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
138; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
139; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
140; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
141; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
142; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
143; HSA-NEXT:    ret void
144;
145; MESA-LABEL: @kern_signext_i8(
146; MESA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
147; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
148; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
149; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
150; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
151; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
152; MESA-NEXT:    ret void
153;
154  store i8 %arg, i8 addrspace(1)* undef, align 1
155  ret void
156}
157
158define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
159; HSA-LABEL: @kern_signext_i16(
160; HSA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
161; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
162; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
163; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
164; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
165; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
166; HSA-NEXT:    ret void
167;
168; MESA-LABEL: @kern_signext_i16(
169; MESA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
170; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
171; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
172; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
173; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
174; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
175; MESA-NEXT:    ret void
176;
177  store i16 %arg, i16 addrspace(1)* undef, align 1
178  ret void
179}
180
181define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
182; HSA-LABEL: @kern_i8_i8(
183; HSA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
184; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
185; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
186; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
187; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
188; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
189; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
190; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
191; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
192; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
193; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
194; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
195; HSA-NEXT:    ret void
196;
197; MESA-LABEL: @kern_i8_i8(
198; MESA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
199; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
200; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
201; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
202; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
203; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
204; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
205; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
206; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
207; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
208; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
209; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
210; MESA-NEXT:    ret void
211;
212  store volatile i8 %arg0, i8 addrspace(1)* undef, align 1
213  store volatile i8 %arg1, i8 addrspace(1)* undef, align 1
214  ret void
215}
216
217define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
218; HSA-LABEL: @kern_v3i8(
219; HSA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
220; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
221; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
222; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
223; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
224; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
225; HSA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
226; HSA-NEXT:    ret void
227;
228; MESA-LABEL: @kern_v3i8(
229; MESA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
230; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
231; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
232; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
233; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
234; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
235; MESA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
236; MESA-NEXT:    ret void
237;
238  store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4
239  ret void
240}
241
242define amdgpu_kernel void @kern_i24(i24 %arg0) {
243; HSA-LABEL: @kern_i24(
244; HSA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
245; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0
246; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
247; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
248; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
249; HSA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
250; HSA-NEXT:    ret void
251;
252; MESA-LABEL: @kern_i24(
253; MESA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
254; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36
255; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
256; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
257; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
258; MESA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
259; MESA-NEXT:    ret void
260;
261  store i24 %arg0, i24 addrspace(1)* undef
262  ret void
263}
264
265define amdgpu_kernel void @kern_i32(i32 %arg0) {
266; HSA-LABEL: @kern_i32(
267; HSA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
268; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 0
269; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
270; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
271; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
272; HSA-NEXT:    ret void
273;
274; MESA-LABEL: @kern_i32(
275; MESA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
276; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
277; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
278; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
279; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
280; MESA-NEXT:    ret void
281;
282  store i32 %arg0, i32 addrspace(1)* undef
283  ret void
284}
285
286define amdgpu_kernel void @kern_f32(float %arg0) {
287; HSA-LABEL: @kern_f32(
288; HSA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
289; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 0
290; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
291; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
292; HSA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
293; HSA-NEXT:    ret void
294;
295; MESA-LABEL: @kern_f32(
296; MESA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
297; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
298; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
299; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
300; MESA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
301; MESA-NEXT:    ret void
302;
303  store float %arg0, float addrspace(1)* undef
304  ret void
305}
306
307define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
308; HSA-LABEL: @kern_v3i32(
309; HSA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
310; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 0
311; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
312; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
313; HSA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
314; HSA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
315; HSA-NEXT:    ret void
316;
317; MESA-LABEL: @kern_v3i32(
318; MESA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
319; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
320; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
321; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
322; MESA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
323; MESA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
324; MESA-NEXT:    ret void
325;
326  store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4
327  ret void
328}
329
330define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
331; HSA-LABEL: @kern_v8i32(
332; HSA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
333; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
334; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
335; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
336; HSA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
337; HSA-NEXT:    ret void
338;
339; MESA-LABEL: @kern_v8i32(
340; MESA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
341; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
342; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
343; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
344; MESA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
345; MESA-NEXT:    ret void
346;
347  store <8 x i32> %arg, <8 x i32> addrspace(1)* undef
348  ret void
349}
350
351define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
352; HSA-LABEL: @kern_v8i64(
353; HSA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
354; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
355; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
356; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
357; HSA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
358; HSA-NEXT:    ret void
359;
360; MESA-LABEL: @kern_v8i64(
361; MESA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(100) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
362; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
363; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
364; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
365; MESA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
366; MESA-NEXT:    ret void
367;
368  store <8 x i64> %arg, <8 x i64> addrspace(1)* undef
369  ret void
370}
371
372define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
373; HSA-LABEL: @kern_v16i64(
374; HSA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(128) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
375; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
376; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
377; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
378; HSA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
379; HSA-NEXT:    ret void
380;
381; MESA-LABEL: @kern_v16i64(
382; MESA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(164) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
383; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
384; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
385; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
386; MESA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
387; MESA-NEXT:    ret void
388;
389  store <16 x i64> %arg, <16 x i64> addrspace(1)* undef
390  ret void
391}
392
393define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
394; HSA-LABEL: @kern_i32_v3i32(
395; HSA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
396; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0
397; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
398; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
399; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 16
400; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
401; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
402; HSA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
403; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
404; HSA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
405; HSA-NEXT:    ret void
406;
407; MESA-LABEL: @kern_i32_v3i32(
408; MESA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
409; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
410; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
411; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
412; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 52
413; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
414; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
415; MESA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
416; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
417; MESA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
418; MESA-NEXT:    ret void
419;
420  store i32 %arg0, i32 addrspace(1)* undef
421  store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4
422  ret void
423}
424
425%struct.a = type { i32, i8, [4 x i8] }
426%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }
427
428define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
429; HSA-LABEL: @kern_struct_a(
430; HSA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
431; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
432; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
433; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
434; HSA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef, align 4
435; HSA-NEXT:    ret void
436;
437; MESA-LABEL: @kern_struct_a(
438; MESA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
439; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
440; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
441; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
442; MESA-NEXT:    store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef, align 4
443; MESA-NEXT:    ret void
444;
445  store %struct.a %arg0, %struct.a addrspace(1)* undef
446  ret void
447}
448
449define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
450; HSA-LABEL: @kern_struct_b_packed(
451; HSA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
452; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
453; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
454; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
455; HSA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
456; HSA-NEXT:    ret void
457;
458; MESA-LABEL: @kern_struct_b_packed(
459; MESA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
460; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
461; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
462; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
463; MESA-NEXT:    store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
464; MESA-NEXT:    ret void
465;
466  store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef
467  ret void
468}
469
470define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
471; HSA-LABEL: @kern_implicit_arg_num_bytes(
472; HSA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
473; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0
474; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
475; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
476; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
477; HSA-NEXT:    ret void
478;
479; MESA-LABEL: @kern_implicit_arg_num_bytes(
480; MESA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
481; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
482; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
483; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
484; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
485; MESA-NEXT:    ret void
486;
487  store i32 %arg0, i32 addrspace(1)* undef
488  ret void
489}
490
491define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %arg1) #1 {
492; HSA-LABEL: @kernel_implicitarg_no_struct_align(
493; HSA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(112) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
494; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64
495; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
496; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
497; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
498; HSA-NEXT:    ret void
499;
500; MESA-LABEL: @kernel_implicitarg_no_struct_align(
501; MESA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(108) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
502; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100
503; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
504; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
505; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
506; MESA-NEXT:    ret void
507;
508  store i32 %arg1, i32 addrspace(1)* undef
509  ret void
510}
511
512define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
513; HSA-LABEL: @kern_lds_ptr(
514; HSA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
515; HSA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0
516; HSA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
517; HSA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
518; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
519; HSA-NEXT:    ret void
520;
521; MESA-LABEL: @kern_lds_ptr(
522; MESA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
523; MESA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
524; MESA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
525; MESA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
526; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
527; MESA-NEXT:    ret void
528;
529  store i32 0, i32 addrspace(3)* %lds, align 4
530  ret void
531}
532
533define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
534; HSA-LABEL: @kern_lds_ptr_si(
535; HSA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
536; HSA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]], i64 0
537; HSA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
538; HSA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
539; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
540; HSA-NEXT:    ret void
541;
542; MESA-LABEL: @kern_lds_ptr_si(
543; MESA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
544; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
545; MESA-NEXT:    ret void
546;
547  store i32 0, i32 addrspace(3)* %lds, align 4
548  ret void
549}
550
551define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
552; HSA-LABEL: @kern_realign_i8_i8(
553; HSA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
554; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
555; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
556; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
557; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
558; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
559; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
560; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
561; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
562; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
563; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
564; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
565; HSA-NEXT:    ret void
566;
567; MESA-LABEL: @kern_realign_i8_i8(
568; MESA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
569; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
570; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
571; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
572; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
573; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
574; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
575; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
576; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
577; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
578; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
579; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
580; MESA-NEXT:    ret void
581;
582  store volatile i8 %arg0, i8 addrspace(1)* undef
583  store volatile i8 %arg1, i8 addrspace(1)* undef
584  ret void
585}
586
587define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
588; HSA-LABEL: @kern_realign_i8_i8_i8(
589; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
590; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
591; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
592; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
593; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
594; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
595; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
596; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
597; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
598; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
599; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
600; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
601; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
602; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
603; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
604; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
605; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
606; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
607; HSA-NEXT:    ret void
608;
609; MESA-LABEL: @kern_realign_i8_i8_i8(
610; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
611; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
612; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
613; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
614; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
615; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
616; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
617; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
618; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
619; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
620; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
621; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
622; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
623; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
624; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
625; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
626; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
627; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
628; MESA-NEXT:    ret void
629;
630  store volatile i8 %arg0, i8 addrspace(1)* undef
631  store volatile i8 %arg1, i8 addrspace(1)* undef
632  store volatile i8 %arg2, i8 addrspace(1)* undef
633  ret void
634}
635
636define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
637; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
638; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
639; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
640; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
641; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
642; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
643; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
644; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
645; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
646; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
647; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
648; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
649; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
650; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
651; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
652; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
653; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
654; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
655; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
656; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
657; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
658; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
659; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
660; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
661; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
662; HSA-NEXT:    ret void
663;
664; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
665; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
666; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
667; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
668; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
669; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
670; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
671; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
672; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
673; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
674; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
675; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
676; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
677; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
678; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
679; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
680; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
681; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
682; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
683; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
684; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
685; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
686; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
687; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
688; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
689; MESA-NEXT:    ret void
690;
691  store volatile i8 %arg0, i8 addrspace(1)* undef
692  store volatile i8 %arg1, i8 addrspace(1)* undef
693  store volatile i8 %arg2, i8 addrspace(1)* undef
694  store volatile i8 %arg3, i8 addrspace(1)* undef
695  ret void
696}
697
698define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
699; HSA-LABEL: @kern_realign_i8_v3i8(
700; HSA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
701; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
702; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
703; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
704; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
705; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
706; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
707; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
708; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
709; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
710; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
711; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
712; HSA-NEXT:    ret void
713;
714; MESA-LABEL: @kern_realign_i8_v3i8(
715; MESA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
716; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
717; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
718; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
719; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
720; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 40
721; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
722; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
723; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
724; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
725; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
726; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
727; MESA-NEXT:    ret void
728;
729  store volatile i8 %arg0, i8 addrspace(1)* undef
730  store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef
731  ret void
732}
733
734define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
735; HSA-LABEL: @kern_realign_i8_i16(
736; HSA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
737; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
738; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
739; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
740; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
741; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
742; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
743; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
744; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
745; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
746; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
747; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
748; HSA-NEXT:    ret void
749;
750; MESA-LABEL: @kern_realign_i8_i16(
751; MESA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
752; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
753; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
754; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
755; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
756; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
757; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
758; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
759; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
760; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
761; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
762; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
763; MESA-NEXT:    ret void
764;
765  store volatile i8 %arg0, i8 addrspace(1)* undef
766  store volatile i16 %arg1, i16 addrspace(1)* undef
767  ret void
768}
769
770define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
771; HSA-LABEL: @kern_realign_i1_i1(
772; HSA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
773; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
774; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
775; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
776; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
777; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
778; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
779; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
780; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
781; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
782; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
783; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
784; HSA-NEXT:    ret void
785;
786; MESA-LABEL: @kern_realign_i1_i1(
787; MESA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
788; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
789; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
790; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
791; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
792; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
793; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
794; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
795; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
796; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
797; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
798; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
799; MESA-NEXT:    ret void
800;
801  store volatile i1 %arg0, i1 addrspace(1)* undef
802  store volatile i1 %arg1, i1 addrspace(1)* undef
803  ret void
804}
805
806define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
807; HSA-LABEL: @kern_realign_i1_i1_i1(
808; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
809; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
810; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
811; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
812; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
813; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
814; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
815; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
816; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
817; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
818; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
819; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
820; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
821; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
822; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
823; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
824; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
825; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
826; HSA-NEXT:    ret void
827;
828; MESA-LABEL: @kern_realign_i1_i1_i1(
829; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
830; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
831; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
832; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
833; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
834; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
835; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
836; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
837; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
838; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
839; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
840; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
841; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
842; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
843; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
844; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
845; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
846; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
847; MESA-NEXT:    ret void
848;
849  store volatile i1 %arg0, i1 addrspace(1)* undef
850  store volatile i1 %arg1, i1 addrspace(1)* undef
851  store volatile i1 %arg2, i1 addrspace(1)* undef
852  ret void
853}
854
855define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
856; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
857; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
858; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
859; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
860; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
861; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
862; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
863; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
864; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
865; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
866; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
867; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
868; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
869; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
870; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
871; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
872; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
873; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
874; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
875; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
876; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
877; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
878; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
879; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
880; HSA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
881; HSA-NEXT:    ret void
882;
883; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
884; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
885; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
886; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
887; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
888; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
889; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
890; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
891; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
892; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
893; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
894; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
895; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
896; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
897; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
898; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
899; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
900; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
901; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
902; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
903; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
904; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
905; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
906; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
907; MESA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
908; MESA-NEXT:    ret void
909;
910  store volatile i1 %arg0, i1 addrspace(1)* undef
911  store volatile i1 %arg1, i1 addrspace(1)* undef
912  store volatile i1 %arg2, i1 addrspace(1)* undef
913  store volatile i1 %arg3, i1 addrspace(1)* undef
914  ret void
915}
916
917define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
918; HSA-LABEL: @kern_realign_i1_v3i1(
919; HSA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
920; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
921; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
922; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
923; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
924; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4
925; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
926; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
927; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
928; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
929; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
930; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4
931; HSA-NEXT:    ret void
932;
933; MESA-LABEL: @kern_realign_i1_v3i1(
934; MESA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
935; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
936; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
937; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
938; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
939; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 40
940; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
941; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
942; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
943; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
944; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
945; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4
946; MESA-NEXT:    ret void
947;
948  store volatile i1 %arg0, i1 addrspace(1)* undef
949  store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef
950  ret void
951}
952
953define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
954; HSA-LABEL: @kern_realign_i1_i16(
955; HSA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
956; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
957; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
958; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
959; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
960; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
961; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
962; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
963; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
964; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
965; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
966; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
967; HSA-NEXT:    ret void
968;
969; MESA-LABEL: @kern_realign_i1_i16(
970; MESA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
971; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
972; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
973; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
974; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
975; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
976; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
977; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
978; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
979; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
980; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
981; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
982; MESA-NEXT:    ret void
983;
984  store volatile i1 %arg0, i1 addrspace(1)* undef
985  store volatile i16 %arg1, i16 addrspace(1)* undef
986  ret void
987}
988
989define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
990; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
991; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
992; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
993; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
994; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
995; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
996; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
997; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
998; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
999; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
1000; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
1001; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
1002; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1003; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1004; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
1005; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
1006; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
1007; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1008; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1009; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
1010; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
1011; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1012; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1013; HSA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1014; HSA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
1015; HSA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
1016; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1017; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1018; HSA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1019; HSA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
1020; HSA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1021; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1022; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1023; HSA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1024; HSA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
1025; HSA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
1026; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
1027; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
1028; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
1029; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
1030; HSA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
1031; HSA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
1032; HSA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
1033; HSA-NEXT:    ret void
1034;
1035; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
1036; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1037; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1038; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1039; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1040; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
1041; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1042; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1043; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1044; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
1045; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
1046; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1047; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1048; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1049; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
1050; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
1051; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1052; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1053; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1054; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
1055; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
1056; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1057; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1058; MESA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1059; MESA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
1060; MESA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
1061; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1062; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1063; MESA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1064; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
1065; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1066; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1067; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1068; MESA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1069; MESA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
1070; MESA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
1071; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
1072; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
1073; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
1074; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
1075; MESA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
1076; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
1077; MESA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
1078; MESA-NEXT:    ret void
1079;
1080  store volatile i8 %arg0, i8 addrspace(1)* undef
1081  store volatile i8 %arg1, i8 addrspace(1)* undef
1082  store volatile i8 %arg2, i8 addrspace(1)* undef
1083  store volatile i8 %arg3, i8 addrspace(1)* undef
1084  store volatile i8 %arg5, i8 addrspace(1)* undef
1085  store volatile i8 %arg6, i8 addrspace(1)* undef
1086  store volatile i8 %arg7, i8 addrspace(1)* undef
1087  ret void
1088}
1089
1090define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
1091; HSA-LABEL: @kern_realign_f16_f16(
1092; HSA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1093; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1094; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1095; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1096; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
1097; HSA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
1098; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1099; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1100; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1101; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
1102; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
1103; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
1104; HSA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
1105; HSA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
1106; HSA-NEXT:    ret void
1107;
1108; MESA-LABEL: @kern_realign_f16_f16(
1109; MESA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1110; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
1111; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1112; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1113; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
1114; MESA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
1115; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
1116; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1117; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1118; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
1119; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
1120; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
1121; MESA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
1122; MESA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
1123; MESA-NEXT:    ret void
1124;
1125  store volatile half %arg0, half addrspace(1)* undef
1126  store volatile half %arg1, half addrspace(1)* undef
1127  ret void
1128}
1129
1130define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
1131; HSA-LABEL: @kern_global_ptr(
1132; HSA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1133; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1134; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1135; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1136; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1137; HSA-NEXT:    ret void
1138;
1139; MESA-LABEL: @kern_global_ptr(
1140; MESA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1141; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1142; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1143; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1144; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1145; MESA-NEXT:    ret void
1146;
1147  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1148  ret void
1149}
1150
1151define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 {
1152; HSA-LABEL: @kern_global_ptr_dereferencable(
1153; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1154; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
1155; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1156; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable !1
1157; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1158; HSA-NEXT:    ret void
1159;
1160; MESA-LABEL: @kern_global_ptr_dereferencable(
1161; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1162; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
1163; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1164; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !1
1165; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1166; MESA-NEXT:    ret void
1167;
1168  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1169  ret void
1170}
1171
1172define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 {
1173; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
1174; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1175; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
1176; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1177; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable_or_null !2
1178; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1179; HSA-NEXT:    ret void
1180;
1181; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
1182; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1183; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
1184; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1185; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !2
1186; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1187; MESA-NEXT:    ret void
1188;
1189  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1190  ret void
1191}
1192
1193define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 {
1194; HSA-LABEL: @kern_nonnull_global_ptr(
1195; HSA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1196; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1197; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1198; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !nonnull !0
1199; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1200; HSA-NEXT:    ret void
1201;
1202; MESA-LABEL: @kern_nonnull_global_ptr(
1203; MESA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1204; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1205; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1206; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !nonnull !0
1207; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1208; MESA-NEXT:    ret void
1209;
1210  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1211  ret void
1212}
1213
1214define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 {
1215; HSA-LABEL: @kern_align32_global_ptr(
1216; HSA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1217; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1218; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1219; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !align !3
1220; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1221; HSA-NEXT:    ret void
1222;
1223; MESA-LABEL: @kern_align32_global_ptr(
1224; MESA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1225; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1226; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1227; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !3
1228; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1229; MESA-NEXT:    ret void
1230;
1231  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1232  ret void
1233}
1234
1235define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
1236; HSA-LABEL: @kern_noalias_global_ptr(
1237; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1238; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1239; HSA-NEXT:    ret void
1240;
1241; MESA-LABEL: @kern_noalias_global_ptr(
1242; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1243; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1244; MESA-NEXT:    ret void
1245;
1246  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1247  ret void
1248}
1249
1250define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
1251; HSA-LABEL: @kern_noalias_global_ptr_x2(
1252; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1253; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1254; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1255; HSA-NEXT:    ret void
1256;
1257; MESA-LABEL: @kern_noalias_global_ptr_x2(
1258; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1259; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1260; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1261; MESA-NEXT:    ret void
1262;
1263  store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
1264  store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef
1265  ret void
1266}
1267
1268define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
1269; HSA-LABEL: @struct_i8_i8_arg(
1270; HSA-NEXT:  entry:
1271; HSA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1272; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0
1273; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
1274; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1275; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1276; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1277; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1278; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1279; HSA-NEXT:    ret void
1280;
1281; MESA-LABEL: @struct_i8_i8_arg(
1282; MESA-NEXT:  entry:
1283; MESA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1284; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
1285; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
1286; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1287; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1288; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1289; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1290; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1291; MESA-NEXT:    ret void
1292;
1293entry:
1294  %elt0 = extractvalue {i8, i8} %in, 0
1295  %elt1 = extractvalue {i8, i8} %in, 1
1296  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1297  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
1298  ret void
1299}
1300
1301define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
1302; HSA-LABEL: @struct_i8_i16_arg(
1303; HSA-NEXT:  entry:
1304; HSA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1305; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0
1306; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
1307; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1308; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1309; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1310; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1311; HSA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
1312; HSA-NEXT:    ret void
1313;
1314; MESA-LABEL: @struct_i8_i16_arg(
1315; MESA-NEXT:  entry:
1316; MESA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1317; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
1318; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
1319; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1320; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1321; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1322; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1323; MESA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
1324; MESA-NEXT:    ret void
1325;
1326entry:
1327  %elt0 = extractvalue {i8, i16} %in, 0
1328  %elt1 = extractvalue {i8, i16} %in, 1
1329  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1330  store volatile i16 %elt1, i16 addrspace(1)* null, align 4
1331  ret void
1332}
1333
1334define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
1335; HSA-LABEL: @array_2xi8_arg(
1336; HSA-NEXT:  entry:
1337; HSA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1338; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0
1339; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
1340; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1341; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1342; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1343; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1344; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1345; HSA-NEXT:    ret void
1346;
1347; MESA-LABEL: @array_2xi8_arg(
1348; MESA-NEXT:  entry:
1349; MESA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1350; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
1351; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
1352; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1353; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1354; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1355; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1356; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1357; MESA-NEXT:    ret void
1358;
1359entry:
1360  %elt0 = extractvalue [2 x i8] %in, 0
1361  %elt1 = extractvalue [2 x i8] %in, 1
1362  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1363  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
1364  ret void
1365}
1366
1367define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
1368; HSA-LABEL: @array_2xi1_arg(
1369; HSA-NEXT:  entry:
1370; HSA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1371; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0
1372; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
1373; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1374; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1375; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1376; HSA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
1377; HSA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
1378; HSA-NEXT:    ret void
1379;
1380; MESA-LABEL: @array_2xi1_arg(
1381; MESA-NEXT:  entry:
1382; MESA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1383; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
1384; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
1385; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1386; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1387; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1388; MESA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
1389; MESA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
1390; MESA-NEXT:    ret void
1391;
1392entry:
1393  %elt0 = extractvalue [2 x i1] %in, 0
1394  %elt1 = extractvalue [2 x i1] %in, 1
1395  store volatile i1 %elt0, i1 addrspace(1)* null, align 4
1396  store volatile i1 %elt1, i1 addrspace(1)* null, align 4
1397  ret void
1398}
1399
1400define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
1401; HSA-LABEL: @only_empty_struct(
1402; HSA-NEXT:    ret void
1403;
1404; MESA-LABEL: @only_empty_struct(
1405; MESA-NEXT:    [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1406; MESA-NEXT:    ret void
1407;
1408  ret void
1409}
1410
1411define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
1412; HSA-LABEL: @empty_struct_with_other(
1413; HSA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1414; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
1415; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
1416; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1417; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
1418; HSA-NEXT:    ret void
1419;
1420; MESA-LABEL: @empty_struct_with_other(
1421; MESA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1422; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
1423; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
1424; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1425; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
1426; MESA-NEXT:    ret void
1427;
1428  store i32 %arg1, i32 addrspace(1)* undef
1429  ret void
1430}
1431
1432; Should insert code after the allocas
1433define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
1434; HSA-LABEL: @static_alloca_kern_i32(
1435; HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1436; HSA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1437; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 0
1438; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
1439; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1440; HSA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
1441; HSA-NEXT:    ret void
1442;
1443; MESA-LABEL: @static_alloca_kern_i32(
1444; MESA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1445; MESA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1446; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 36
1447; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
1448; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1449; MESA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
1450; MESA-NEXT:    ret void
1451;
1452  %alloca = alloca i32, addrspace(5)
1453  store volatile i32 %arg0, i32 addrspace(5)* %alloca
1454  ret void
1455}
1456
1457; Make sure we don't break the IR if an alloca depends on the
1458; kernargs.
1459define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
1460; HSA-LABEL: @dyn_alloca_kernarg_i32(
1461; HSA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1462; HSA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1463; HSA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 0
1464; HSA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
1465; HSA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1466; HSA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1467; HSA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
1468; HSA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
1469; HSA-NEXT:    ret void
1470;
1471; MESA-LABEL: @dyn_alloca_kernarg_i32(
1472; MESA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1473; MESA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1474; MESA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 36
1475; MESA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
1476; MESA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1477; MESA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1478; MESA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
1479; MESA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
1480; MESA-NEXT:    ret void
1481;
1482  %alloca0 = alloca i32, addrspace(5)
1483  %alloca1 = alloca i32, i32 %n, addrspace(5)
1484  store volatile i32 0, i32 addrspace(5)* %alloca0
1485  store volatile i32 1, i32 addrspace(5)* %alloca1
1486  ret void
1487}
1488
1489; Byref pointers should only be treated as offsets from kernarg
1490define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
1491; HSA-LABEL: @byref_constant_i8_arg(
1492; HSA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1493; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0
1494; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1495; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1496; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8
1497; HSA-NEXT:    [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1498; HSA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
1499; HSA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1500; HSA-NEXT:    ret void
1501;
1502; MESA-LABEL: @byref_constant_i8_arg(
1503; MESA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1504; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36
1505; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1506; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1507; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44
1508; MESA-NEXT:    [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1509; MESA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
1510; MESA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1511; MESA-NEXT:    ret void
1512;
1513  %in = load i8, i8 addrspace(4)* %in.byref
1514  %ext = zext i8 %in to i32
1515  store i32 %ext, i32 addrspace(1)* %out, align 4
1516  ret void
1517}
1518
1519define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
1520; HSA-LABEL: @byref_constant_i16_arg(
1521; HSA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1522; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0
1523; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1524; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1525; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8
1526; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)*
1527; HSA-NEXT:    [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2
1528; HSA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
1529; HSA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1530; HSA-NEXT:    ret void
1531;
1532; MESA-LABEL: @byref_constant_i16_arg(
1533; MESA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1534; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36
1535; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1536; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1537; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44
1538; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)*
1539; MESA-NEXT:    [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2
1540; MESA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
1541; MESA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1542; MESA-NEXT:    ret void
1543;
1544  %in = load i16, i16 addrspace(4)* %in.byref
1545  %ext = zext i16 %in to i32
1546  store i32 %ext, i32 addrspace(1)* %out, align 4
1547  ret void
1548}
1549
1550define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) {
1551; HSA-LABEL: @byref_constant_i32_arg(
1552; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1553; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1554; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1555; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1556; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1557; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1558; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1559; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1560; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1561; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1562; HSA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1563; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1564; HSA-NEXT:    ret void
1565;
1566; MESA-LABEL: @byref_constant_i32_arg(
1567; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1568; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1569; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1570; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1571; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1572; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1573; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1574; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1575; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1576; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1577; MESA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1578; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1579; MESA-NEXT:    ret void
1580;
1581  %in = load i32, i32 addrspace(4)* %in.byref
1582  store volatile i32 %in, i32 addrspace(1)* %out, align 4
1583  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1584  ret void
1585}
1586
1587define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) {
1588; HSA-LABEL: @byref_constant_v4i32_arg(
1589; HSA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1590; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0
1591; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
1592; HSA-NEXT:    [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1593; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16
1594; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
1595; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32
1596; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1597; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1598; HSA-NEXT:    [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16
1599; HSA-NEXT:    store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4
1600; HSA-NEXT:    [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)*
1601; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4
1602; HSA-NEXT:    ret void
1603;
1604; MESA-LABEL: @byref_constant_v4i32_arg(
1605; MESA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1606; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36
1607; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
1608; MESA-NEXT:    [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1609; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52
1610; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
1611; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68
1612; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1613; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1614; MESA-NEXT:    [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16
1615; MESA-NEXT:    store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4
1616; MESA-NEXT:    [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)*
1617; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4
1618; MESA-NEXT:    ret void
1619;
1620  %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref
1621  store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
1622  %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)*
1623  store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4
1624  ret void
1625}
1626
1627define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
1628; HSA-LABEL: @byref_align_constant_i32_arg(
1629; HSA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1630; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1631; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1632; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1633; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256
1634; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1635; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260
1636; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1637; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1638; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1639; HSA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1640; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1641; HSA-NEXT:    ret void
1642;
1643; MESA-LABEL: @byref_align_constant_i32_arg(
1644; MESA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(300) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1645; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1646; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1647; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1648; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292
1649; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1650; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296
1651; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1652; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 8, !invariant.load !0
1653; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1654; MESA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1655; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1656; MESA-NEXT:    ret void
1657;
1658  %in = load i32, i32 addrspace(4)* %in.byref
1659  store volatile i32 %in, i32 addrspace(1)* %out, align 4
1660  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1661  ret void
1662}
1663
1664define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
1665; HSA-LABEL: @byref_natural_align_constant_v16i32_arg(
1666; HSA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(132) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1667; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0
1668; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1669; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1670; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64
1671; HSA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)*
1672; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128
1673; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1674; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1675; HSA-NEXT:    [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64
1676; HSA-NEXT:    [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)*
1677; HSA-NEXT:    store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4
1678; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1679; HSA-NEXT:    ret void
1680;
1681; MESA-LABEL: @byref_natural_align_constant_v16i32_arg(
1682; MESA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(168) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1683; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36
1684; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1685; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1686; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100
1687; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)*
1688; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164
1689; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1690; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1691; MESA-NEXT:    [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64
1692; MESA-NEXT:    [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)*
1693; MESA-NEXT:    store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4
1694; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1695; MESA-NEXT:    ret void
1696;
1697  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
1698  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
1699  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
1700  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1701  ret void
1702}
1703
1704; Also accept byref kernel arguments with other global address spaces.
1705define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) {
1706; HSA-LABEL: @byref_global_i32_arg(
1707; HSA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1708; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1709; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1710; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1711; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1712; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)*
1713; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
1714; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1715; HSA-NEXT:    ret void
1716;
1717; MESA-LABEL: @byref_global_i32_arg(
1718; MESA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1719; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1720; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1721; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1722; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1723; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)*
1724; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
1725; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1726; MESA-NEXT:    ret void
1727;
1728  %in = load i32, i32 addrspace(1)* %in.byref
1729  store i32 %in, i32 addrspace(1)* %out, align 4
1730  ret void
1731}
1732
1733define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) {
1734; HSA-LABEL: @byref_flat_i32_arg(
1735; HSA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1736; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0
1737; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1738; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1739; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8
1740; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32*
1741; HSA-NEXT:    [[IN:%.*]] = load i32, i32* [[TMP1]], align 4
1742; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1743; HSA-NEXT:    ret void
1744;
1745; MESA-LABEL: @byref_flat_i32_arg(
1746; MESA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1747; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36
1748; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1749; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1750; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44
1751; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32*
1752; MESA-NEXT:    [[IN:%.*]] = load i32, i32* [[TMP1]], align 4
1753; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1754; MESA-NEXT:    ret void
1755;
1756  %in = load i32, i32* %in.byref
1757  store i32 %in, i32 addrspace(1)* %out, align 4
1758  ret void
1759}
1760
1761define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) {
1762; HSA-LABEL: @byref_constant_32bit_i32_arg(
1763; HSA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1764; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0
1765; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1766; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1767; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8
1768; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)*
1769; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4
1770; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1771; HSA-NEXT:    ret void
1772;
1773; MESA-LABEL: @byref_constant_32bit_i32_arg(
1774; MESA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1775; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36
1776; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1777; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1778; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44
1779; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)*
1780; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4
1781; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1782; MESA-NEXT:    ret void
1783;
1784  %in = load i32, i32 addrspace(6)* %in.byref
1785  store i32 %in, i32 addrspace(1)* %out, align 4
1786  ret void
1787}
1788
1789define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) %in.byref) {
1790; HSA-LABEL: @byref_unknown_as_i32_arg(
1791; HSA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1792; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0
1793; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1794; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1795; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8
1796; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)*
1797; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4
1798; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1799; HSA-NEXT:    ret void
1800;
1801; MESA-LABEL: @byref_unknown_as_i32_arg(
1802; MESA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1803; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36
1804; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1805; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1806; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44
1807; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)*
1808; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4
1809; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1810; MESA-NEXT:    ret void
1811;
1812  %in = load i32, i32 addrspace(999)* %in.byref
1813  store i32 %in, i32 addrspace(1)* %out, align 4
1814  ret void
1815}
1816
1817; Invalid, but should not crash.
1818define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) %in.byref) {
1819; HSA-LABEL: @byref_local_i32_arg(
1820; HSA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1821; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1822; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1823; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1824; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1825; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)*
1826; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4
1827; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1828; HSA-NEXT:    ret void
1829;
1830; MESA-LABEL: @byref_local_i32_arg(
1831; MESA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1832; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1833; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1834; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1835; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1836; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)*
1837; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4
1838; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1839; MESA-NEXT:    ret void
1840;
1841  %in = load i32, i32 addrspace(3)* %in.byref
1842  store i32 %in, i32 addrspace(1)* %out, align 4
1843  ret void
1844}
1845
1846define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) {
1847; HSA-LABEL: @multi_byref_constant_i32_arg(
1848; HSA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1849; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1850; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1851; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1852; HSA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1853; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1854; HSA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1855; HSA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1856; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16
1857; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1858; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1859; HSA-NEXT:    [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1860; HSA-NEXT:    [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4
1861; HSA-NEXT:    store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1862; HSA-NEXT:    store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1863; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1864; HSA-NEXT:    ret void
1865;
1866; MESA-LABEL: @multi_byref_constant_i32_arg(
1867; MESA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1868; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1869; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1870; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1871; MESA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1872; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1873; MESA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1874; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1875; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52
1876; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1877; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1878; MESA-NEXT:    [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1879; MESA-NEXT:    [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4
1880; MESA-NEXT:    store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1881; MESA-NEXT:    store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1882; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1883; MESA-NEXT:    ret void
1884;
1885  %in0 = load i32, i32 addrspace(4)* %in0.byref
1886  %in1 = load i32, i32 addrspace(4)* %in1.byref
1887  store volatile i32 %in0, i32 addrspace(1)* %out, align 4
1888  store volatile i32 %in1, i32 addrspace(1)* %out, align 4
1889  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1890  ret void
1891}
1892
1893define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) {
1894; HSA-LABEL: @byref_constant_i32_arg_offset0(
1895; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1896; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0
1897; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1898; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1899; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* undef, align 4
1900; HSA-NEXT:    ret void
1901;
1902; MESA-LABEL: @byref_constant_i32_arg_offset0(
1903; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1904; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36
1905; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1906; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1907; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* undef, align 4
1908; MESA-NEXT:    ret void
1909;
1910  %in = load i32, i32 addrspace(4)* %in.byref
1911  store i32 %in, i32 addrspace(1)* undef, align 4
1912  ret void
1913}
1914
1915attributes #0 = { nounwind "target-cpu"="kaveri" }
1916attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
1917attributes #2 = { nounwind "target-cpu"="tahiti" }
1918
1919; GCN: 0 = !{}
1920; GCN: !1 = !{i64 42}
1921; GCN: !2 = !{i64 128}
1922; GCN: !3 = !{i64 1024}
1923