// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
// RUN: -fallow-half-arguments-and-returns -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// REQUIRES: aarch64-registered-target

// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics

#include <arm_neon.h>
10
// Vector form
12
13 // CHECK-LABEL: @test_vfmlal_low_f16(
14 // CHECK-NEXT: entry:
15 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
16 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
17 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
18 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
19 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
20 //
test_vfmlal_low_f16(float32x2_t a,float16x4_t b,float16x4_t c)21 float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
22 return vfmlal_low_f16(a, b, c);
23 }
24
25 // CHECK-LABEL: @test_vfmlsl_low_f16(
26 // CHECK-NEXT: entry:
27 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
28 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
29 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
30 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
31 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
32 //
test_vfmlsl_low_f16(float32x2_t a,float16x4_t b,float16x4_t c)33 float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
34 return vfmlsl_low_f16(a, b, c);
35 }
36
37 // CHECK-LABEL: @test_vfmlal_high_f16(
38 // CHECK-NEXT: entry:
39 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
40 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
41 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
42 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
43 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
44 //
test_vfmlal_high_f16(float32x2_t a,float16x4_t b,float16x4_t c)45 float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
46 return vfmlal_high_f16(a, b, c);
47 }
48
49 // CHECK-LABEL: @test_vfmlsl_high_f16(
50 // CHECK-NEXT: entry:
51 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
52 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
53 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
54 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #3
55 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
56 //
test_vfmlsl_high_f16(float32x2_t a,float16x4_t b,float16x4_t c)57 float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
58 return vfmlsl_high_f16(a, b, c);
59 }
60
61 // CHECK-LABEL: @test_vfmlalq_low_f16(
62 // CHECK-NEXT: entry:
63 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
64 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
65 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
66 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
67 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
68 //
test_vfmlalq_low_f16(float32x4_t a,float16x8_t b,float16x8_t c)69 float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
70 return vfmlalq_low_f16(a, b, c);
71 }
72
73 // CHECK-LABEL: @test_vfmlslq_low_f16(
74 // CHECK-NEXT: entry:
75 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
76 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
77 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
78 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
79 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
80 //
test_vfmlslq_low_f16(float32x4_t a,float16x8_t b,float16x8_t c)81 float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
82 return vfmlslq_low_f16(a, b, c);
83 }
84
85 // CHECK-LABEL: @test_vfmlalq_high_f16(
86 // CHECK-NEXT: entry:
87 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
88 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
89 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
90 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
91 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
92 //
test_vfmlalq_high_f16(float32x4_t a,float16x8_t b,float16x8_t c)93 float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
94 return vfmlalq_high_f16(a, b, c);
95 }
96
97 // CHECK-LABEL: @test_vfmlslq_high_f16(
98 // CHECK-NEXT: entry:
99 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
100 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
101 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
102 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #3
103 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
104 //
test_vfmlslq_high_f16(float32x4_t a,float16x8_t b,float16x8_t c)105 float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
106 return vfmlslq_high_f16(a, b, c);
107 }
108
// Indexed form
110
111 // CHECK-LABEL: @test_vfmlal_lane_low_f16(
112 // CHECK-NEXT: entry:
113 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
114 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
115 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
116 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
117 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
118 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
119 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
120 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
121 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
122 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
123 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
124 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
125 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
126 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
127 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
128 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
129 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
130 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
131 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
132 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
133 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
134 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
135 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
136 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
137 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
138 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
139 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
140 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0
141 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
142 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
143 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
144 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
145 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
146 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
147 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
148 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0
149 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
150 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
151 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
152 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
153 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
154 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
155 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
156 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
157 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
158 //
test_vfmlal_lane_low_f16(float32x2_t a,float16x4_t b,float16x4_t c)159 float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
160 return vfmlal_lane_low_f16(a, b, c, 0);
161 }
162
163 // CHECK-LABEL: @test_vfmlal_lane_high_f16(
164 // CHECK-NEXT: entry:
165 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
166 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
167 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
168 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
169 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
170 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
171 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
172 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
173 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
174 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
175 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
176 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
177 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
178 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
179 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
180 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
181 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
182 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
183 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
184 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
185 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
186 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
187 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
188 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
189 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
190 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
191 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
192 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 1
193 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
194 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
195 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
196 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
197 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
198 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
199 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
200 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1
201 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
202 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
203 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
204 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
205 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
206 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
207 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
208 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
209 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
210 //
test_vfmlal_lane_high_f16(float32x2_t a,float16x4_t b,float16x4_t c)211 float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
212 return vfmlal_lane_high_f16(a, b, c, 1);
213 }
214
215 // CHECK-LABEL: @test_vfmlalq_lane_low_f16(
216 // CHECK-NEXT: entry:
217 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
218 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
219 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
220 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
221 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
222 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
223 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
224 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
225 // CHECK-NEXT: [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
226 // CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2
227 // CHECK-NEXT: [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
228 // CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2
229 // CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
230 // CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2
231 // CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
232 // CHECK-NEXT: [[__REINT1_71665:%.*]] = alloca i16, align 2
233 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
234 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
235 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
236 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
237 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
238 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
239 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
240 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
241 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
242 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
243 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
244 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
245 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
246 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
247 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
248 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
249 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
250 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
251 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
252 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2
253 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
254 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
255 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
256 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
257 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
258 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
259 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
260 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2
261 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
262 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
263 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
264 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
265 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
266 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
267 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
268 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2
269 // CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
270 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
271 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
272 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
273 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
274 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
275 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
276 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2
277 // CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
278 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
279 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
280 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
281 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
282 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
283 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
284 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2
285 // CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
286 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
287 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
288 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
289 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
290 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
291 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
292 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2
293 // CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
294 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
295 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
296 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
297 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
298 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
299 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
300 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
301 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
302 //
test_vfmlalq_lane_low_f16(float32x4_t a,float16x8_t b,float16x4_t c)303 float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
304 return vfmlalq_lane_low_f16(a, b, c, 2);
305 }
306
307 // CHECK-LABEL: @test_vfmlalq_lane_high_f16(
308 // CHECK-NEXT: entry:
309 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
310 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
311 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
312 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
313 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
314 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
315 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
316 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
317 // CHECK-NEXT: [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
318 // CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2
319 // CHECK-NEXT: [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
320 // CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2
321 // CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
322 // CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2
323 // CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
324 // CHECK-NEXT: [[__REINT1_71665:%.*]] = alloca i16, align 2
325 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
326 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
327 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
328 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
329 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
330 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
331 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
332 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
333 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
334 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
335 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
336 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
337 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
338 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
339 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
340 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
341 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
342 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
343 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
344 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3
345 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
346 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
347 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
348 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
349 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
350 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
351 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
352 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 3
353 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
354 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
355 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
356 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
357 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
358 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
359 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
360 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3
361 // CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
362 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
363 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
364 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
365 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
366 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
367 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
368 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3
369 // CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
370 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
371 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
372 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
373 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
374 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
375 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
376 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3
377 // CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
378 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
379 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
380 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
381 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
382 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
383 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
384 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3
385 // CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
386 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
387 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
388 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
389 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
390 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
391 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
392 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
393 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
394 //
test_vfmlalq_lane_high_f16(float32x4_t a,float16x8_t b,float16x4_t c)395 float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
396 return vfmlalq_lane_high_f16(a, b, c, 3);
397 }
398
399 // CHECK-LABEL: @test_vfmlal_laneq_low_f16(
400 // CHECK-NEXT: entry:
401 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
402 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
403 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
404 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
405 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
406 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
407 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
408 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
409 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
410 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
411 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
412 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
413 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
414 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
415 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
416 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
417 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
418 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
419 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
420 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
421 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
422 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
423 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
424 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
425 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
426 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
427 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
428 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4
429 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
430 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
431 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
432 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
433 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
434 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
435 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
436 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4
437 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
438 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
439 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
440 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
441 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
442 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
443 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
444 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
445 // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
446 //
test_vfmlal_laneq_low_f16(float32x2_t a,float16x4_t b,float16x8_t c)447 float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
448 return vfmlal_laneq_low_f16(a, b, c, 4);
449 }
450
451 // CHECK-LABEL: @test_vfmlal_laneq_high_f16(
452 // CHECK-NEXT: entry:
453 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
454 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
455 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
456 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
457 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
458 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
459 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
460 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
461 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
462 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
463 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
464 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
465 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
466 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
467 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
468 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
469 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
470 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
471 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
472 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
473 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
474 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
475 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
476 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
477 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
478 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
479 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
480 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5
481 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
482 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
483 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
484 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
485 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
486 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
487 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
488 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5
489 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
490 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
491 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
492 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
493 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
494 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
495 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
496 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
497 // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
498 //
// High-half variant with a laneq index: the assertions above pin that lane 5 of
// `c` is splatted into a <4 x half> vector and the call lowers to the
// second-half intrinsic @llvm.aarch64.neon.fmlal2.v2f32.v4f16.
float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
  return vfmlal_laneq_high_f16(a, b, c, 5);
}
502
503 // CHECK-LABEL: @test_vfmlalq_laneq_low_f16(
504 // CHECK-NEXT: entry:
505 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
506 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
507 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
508 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
509 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
510 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
511 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
512 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
513 // CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
514 // CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2
515 // CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
516 // CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2
517 // CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
518 // CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, align 2
519 // CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
520 // CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2
521 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
522 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
523 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
524 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
525 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
526 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
527 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
528 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
529 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
530 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
531 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
532 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
533 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
534 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
535 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
536 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
537 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
538 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
539 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
540 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6
541 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
542 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
543 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
544 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
545 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
546 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
547 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
548 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6
549 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
550 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
551 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
552 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
553 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
554 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
555 // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
556 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6
557 // CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
558 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
559 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
560 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
561 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
562 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
563 // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
564 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6
565 // CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
566 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
567 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
568 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
569 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
570 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
571 // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
572 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6
573 // CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
574 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
575 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
576 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
577 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
578 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
579 // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
580 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6
581 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
582 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
583 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
584 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
585 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
586 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
587 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
588 // CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
589 // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
590 //
// 128-bit (q-form) low-half variant: the assertions above pin that lane 6 of
// `c` is splatted into a full <8 x half> vector and the call lowers to
// @llvm.aarch64.neon.fmlal.v4f32.v8f16 returning <4 x float>.
float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlalq_laneq_low_f16(a, b, c, 6);
}
594
595 // CHECK-LABEL: @test_vfmlalq_laneq_high_f16(
596 // CHECK-NEXT: entry:
597 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
598 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
599 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
600 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
601 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
602 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
603 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
604 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
605 // CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
606 // CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2
607 // CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
608 // CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2
609 // CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
610 // CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, align 2
611 // CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
612 // CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2
613 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
614 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
615 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
616 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
617 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
618 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
619 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
620 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
621 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
622 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
623 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
624 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
625 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
626 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
627 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
628 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
629 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
630 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
631 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
632 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7
633 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
634 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
635 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
636 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
637 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
638 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
639 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
640 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7
641 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
642 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
643 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
644 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
645 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
646 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
647 // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
648 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7
649 // CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
650 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
651 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
652 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
653 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
654 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
655 // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
656 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7
657 // CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
658 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
659 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
660 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
661 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
662 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
663 // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
664 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7
665 // CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
666 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
667 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
668 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
669 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
670 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
671 // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
672 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7
673 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
674 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
675 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
676 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
677 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
678 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
679 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
680 // CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
681 // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
682 //
// 128-bit (q-form) high-half variant: the assertions above pin that lane 7 of
// `c` is splatted into a <8 x half> vector and the call lowers to the
// second-half intrinsic @llvm.aarch64.neon.fmlal2.v4f32.v8f16.
float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlalq_laneq_high_f16(a, b, c, 7);
}
686
687 // CHECK-LABEL: @test_vfmlsl_lane_low_f16(
688 // CHECK-NEXT: entry:
689 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
690 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
691 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
692 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
693 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
694 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
695 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
696 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
697 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
698 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
699 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
700 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 0
701 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
702 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
703 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
704 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
705 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
706 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
707 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
708 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0
709 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
710 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
711 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
712 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
713 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
714 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
715 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
716 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 0
717 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
718 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
719 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
720 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
721 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
722 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
723 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
724 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 0
725 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
726 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
727 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
728 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
729 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
730 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
731 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
732 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
733 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
734 //
// Fused multiply-subtract long, low half, lane of a 64-bit vector: the
// assertions above pin that lane 0 of `c` is splatted into a <4 x half> vector
// and the call lowers to @llvm.aarch64.neon.fmlsl.v2f32.v4f16.
float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlsl_lane_low_f16(a, b, c, 0);
}
738
739 // CHECK-LABEL: @test_vfmlsl_lane_high_f16(
740 // CHECK-NEXT: entry:
741 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
742 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
743 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
744 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
745 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
746 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
747 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
748 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
749 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
750 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
751 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
752 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
753 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
754 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
755 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
756 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
757 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
758 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
759 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
760 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1
761 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
762 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
763 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
764 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
765 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
766 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
767 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
768 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 1
769 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
770 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
771 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
772 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
773 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
774 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
775 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
776 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 1
777 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
778 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
779 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
780 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
781 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
782 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
783 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
784 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
785 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
786 //
// Fused multiply-subtract long, high half: the assertions above pin that lane 1
// of `c` is splatted into a <4 x half> vector and the call lowers to the
// second-half intrinsic @llvm.aarch64.neon.fmlsl2.v2f32.v4f16.
float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
  return vfmlsl_lane_high_f16(a, b, c, 1);
}
790
791 // CHECK-LABEL: @test_vfmlslq_lane_low_f16(
792 // CHECK-NEXT: entry:
793 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
794 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
795 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
796 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
797 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
798 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
799 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
800 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
801 // CHECK-NEXT: [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
802 // CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2
803 // CHECK-NEXT: [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
804 // CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2
805 // CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
806 // CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2
807 // CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
808 // CHECK-NEXT: [[__REINT1_71665:%.*]] = alloca i16, align 2
809 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
810 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
811 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
812 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2
813 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
814 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
815 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
816 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
817 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
818 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
819 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
820 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
821 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
822 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
823 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
824 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
825 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
826 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
827 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
828 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 2
829 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
830 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
831 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
832 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
833 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
834 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
835 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
836 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 2
837 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
838 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
839 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
840 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
841 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
842 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
843 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
844 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 2
845 // CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
846 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
847 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
848 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
849 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
850 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
851 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
852 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 2
853 // CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
854 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
855 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
856 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
857 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
858 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
859 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
860 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 2
861 // CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
862 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
863 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
864 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
865 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
866 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
867 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
868 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 2
869 // CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
870 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
871 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
872 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
873 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
874 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
875 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
876 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
877 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
878 //
// Checks that vfmlslq_lane_low_f16 broadcasts lane 2 of the 64-bit 'c'
// operand into an <8 x half> vector (see the VECINIT chain above) and lowers
// to the llvm.aarch64.neon.fmlsl.v4f32.v8f16 intrinsic.
// NOTE: body must stay in sync with the autogenerated CHECK lines above;
// regenerate with utils/update_cc_test_checks.py after any change.
float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
  return vfmlslq_lane_low_f16(a, b, c, 2);
}
882
883 // CHECK-LABEL: @test_vfmlslq_lane_high_f16(
884 // CHECK-NEXT: entry:
885 // CHECK-NEXT: [[__REINT_716:%.*]] = alloca <4 x half>, align 8
886 // CHECK-NEXT: [[__REINT1_716:%.*]] = alloca i16, align 2
887 // CHECK-NEXT: [[__REINT_7164:%.*]] = alloca <4 x half>, align 8
888 // CHECK-NEXT: [[__REINT1_7165:%.*]] = alloca i16, align 2
889 // CHECK-NEXT: [[__REINT_71614:%.*]] = alloca <4 x half>, align 8
890 // CHECK-NEXT: [[__REINT1_71615:%.*]] = alloca i16, align 2
891 // CHECK-NEXT: [[__REINT_71624:%.*]] = alloca <4 x half>, align 8
892 // CHECK-NEXT: [[__REINT1_71625:%.*]] = alloca i16, align 2
893 // CHECK-NEXT: [[__REINT_71634:%.*]] = alloca <4 x half>, align 8
894 // CHECK-NEXT: [[__REINT1_71635:%.*]] = alloca i16, align 2
895 // CHECK-NEXT: [[__REINT_71644:%.*]] = alloca <4 x half>, align 8
896 // CHECK-NEXT: [[__REINT1_71645:%.*]] = alloca i16, align 2
897 // CHECK-NEXT: [[__REINT_71654:%.*]] = alloca <4 x half>, align 8
898 // CHECK-NEXT: [[__REINT1_71655:%.*]] = alloca i16, align 2
899 // CHECK-NEXT: [[__REINT_71664:%.*]] = alloca <4 x half>, align 8
900 // CHECK-NEXT: [[__REINT1_71665:%.*]] = alloca i16, align 2
901 // CHECK-NEXT: store <4 x half> [[C:%.*]], <4 x half>* [[__REINT_716]], align 8
902 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_716]] to <4 x i16>*
903 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
904 // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
905 // CHECK-NEXT: store i16 [[VGET_LANE]], i16* [[__REINT1_716]], align 2
906 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_716]] to half*
907 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
908 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
909 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_7164]], align 8
910 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half>* [[__REINT_7164]] to <4 x i16>*
911 // CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 8
912 // CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
913 // CHECK-NEXT: store i16 [[VGET_LANE8]], i16* [[__REINT1_7165]], align 2
914 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7165]] to half*
915 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
916 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
917 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71614]], align 8
918 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half>* [[__REINT_71614]] to <4 x i16>*
919 // CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 8
920 // CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP9]], i32 3
921 // CHECK-NEXT: store i16 [[VGET_LANE18]], i16* [[__REINT1_71615]], align 2
922 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71615]] to half*
923 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
924 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
925 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71624]], align 8
926 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half>* [[__REINT_71624]] to <4 x i16>*
927 // CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 8
928 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP13]], i32 3
929 // CHECK-NEXT: store i16 [[VGET_LANE28]], i16* [[__REINT1_71625]], align 2
930 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71625]] to half*
931 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
932 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
933 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71634]], align 8
934 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x half>* [[__REINT_71634]] to <4 x i16>*
935 // CHECK-NEXT: [[TMP17:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 8
936 // CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP17]], i32 3
937 // CHECK-NEXT: store i16 [[VGET_LANE38]], i16* [[__REINT1_71635]], align 2
938 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71635]] to half*
939 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
940 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
941 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71644]], align 8
942 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x half>* [[__REINT_71644]] to <4 x i16>*
943 // CHECK-NEXT: [[TMP21:%.*]] = load <4 x i16>, <4 x i16>* [[TMP20]], align 8
944 // CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP21]], i32 3
945 // CHECK-NEXT: store i16 [[VGET_LANE48]], i16* [[__REINT1_71645]], align 2
946 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71645]] to half*
947 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
948 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
949 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71654]], align 8
950 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x half>* [[__REINT_71654]] to <4 x i16>*
951 // CHECK-NEXT: [[TMP25:%.*]] = load <4 x i16>, <4 x i16>* [[TMP24]], align 8
952 // CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP25]], i32 3
953 // CHECK-NEXT: store i16 [[VGET_LANE58]], i16* [[__REINT1_71655]], align 2
954 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71655]] to half*
955 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
956 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
957 // CHECK-NEXT: store <4 x half> [[C]], <4 x half>* [[__REINT_71664]], align 8
958 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x half>* [[__REINT_71664]] to <4 x i16>*
959 // CHECK-NEXT: [[TMP29:%.*]] = load <4 x i16>, <4 x i16>* [[TMP28]], align 8
960 // CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP29]], i32 3
961 // CHECK-NEXT: store i16 [[VGET_LANE68]], i16* [[__REINT1_71665]], align 2
962 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71665]] to half*
963 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
964 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
965 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
966 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
967 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
968 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
969 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
970 //
// Checks that vfmlslq_lane_high_f16 broadcasts lane 3 of the 64-bit 'c'
// operand into an <8 x half> vector and lowers to the
// llvm.aarch64.neon.fmlsl2.v4f32.v8f16 intrinsic (the "2"/high-half form).
// NOTE: body must stay in sync with the autogenerated CHECK lines above;
// regenerate with utils/update_cc_test_checks.py after any change.
float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
  return vfmlslq_lane_high_f16(a, b, c, 3);
}
974
975 // CHECK-LABEL: @test_vfmlsl_laneq_low_f16(
976 // CHECK-NEXT: entry:
977 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
978 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
979 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
980 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
981 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
982 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
983 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
984 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
985 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
986 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
987 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
988 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
989 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
990 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
991 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
992 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
993 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
994 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
995 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
996 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4
997 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
998 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
999 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
1000 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
1001 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
1002 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
1003 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
1004 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 4
1005 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
1006 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
1007 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
1008 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
1009 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
1010 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
1011 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
1012 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 4
1013 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
1014 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
1015 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
1016 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
1017 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1018 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
1019 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
1020 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
1021 // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
1022 //
// Checks that vfmlsl_laneq_low_f16 broadcasts lane 4 of the 128-bit 'c'
// operand into a <4 x half> vector and lowers to the
// llvm.aarch64.neon.fmlsl.v2f32.v4f16 intrinsic.
// NOTE: body must stay in sync with the autogenerated CHECK lines above;
// regenerate with utils/update_cc_test_checks.py after any change.
float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
  return vfmlsl_laneq_low_f16(a, b, c, 4);
}
1026
1027 // CHECK-LABEL: @test_vfmlsl_laneq_high_f16(
1028 // CHECK-NEXT: entry:
1029 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
1030 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
1031 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
1032 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
1033 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
1034 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
1035 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
1036 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
1037 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
1038 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
1039 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
1040 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
1041 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
1042 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
1043 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
1044 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP3]], i32 0
1045 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
1046 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
1047 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
1048 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5
1049 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
1050 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
1051 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
1052 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP7]], i32 1
1053 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
1054 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
1055 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
1056 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 5
1057 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
1058 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
1059 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
1060 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP11]], i32 2
1061 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
1062 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
1063 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
1064 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 5
1065 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
1066 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
1067 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
1068 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP15]], i32 3
1069 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
1070 // CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
1071 // CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
1072 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #3
1073 // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
1074 //
// Checks that vfmlsl_laneq_high_f16 broadcasts lane 5 of the 128-bit 'c'
// operand into a <4 x half> vector and lowers to the
// llvm.aarch64.neon.fmlsl2.v2f32.v4f16 intrinsic (the "2"/high-half form).
// NOTE: body must stay in sync with the autogenerated CHECK lines above;
// regenerate with utils/update_cc_test_checks.py after any change.
float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
  return vfmlsl_laneq_high_f16(a, b, c, 5);
}
1078
1079 // CHECK-LABEL: @test_vfmlslq_laneq_low_f16(
1080 // CHECK-NEXT: entry:
1081 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
1082 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
1083 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
1084 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
1085 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
1086 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
1087 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
1088 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
1089 // CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
1090 // CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2
1091 // CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
1092 // CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2
1093 // CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
1094 // CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, align 2
1095 // CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
1096 // CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2
1097 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
1098 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
1099 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
1100 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
1101 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
1102 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
1103 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
1104 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
1105 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
1106 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
1107 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
1108 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6
1109 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
1110 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
1111 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
1112 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
1113 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
1114 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
1115 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
1116 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 6
1117 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
1118 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
1119 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
1120 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
1121 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
1122 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
1123 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
1124 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 6
1125 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
1126 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
1127 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
1128 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
1129 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
1130 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
1131 // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
1132 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 6
1133 // CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
1134 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
1135 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
1136 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
1137 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
1138 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
1139 // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
1140 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 6
1141 // CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
1142 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
1143 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
1144 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
1145 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
1146 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
1147 // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
1148 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 6
1149 // CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
1150 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
1151 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
1152 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
1153 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
1154 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
1155 // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
1156 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 6
1157 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
1158 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
1159 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
1160 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
1161 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1162 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
1163 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
1164 // CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
1165 // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
1166 //
// Checks that vfmlslq_laneq_low_f16 broadcasts lane 6 of the 128-bit 'c'
// operand into an <8 x half> vector and lowers to the
// llvm.aarch64.neon.fmlsl.v4f32.v8f16 intrinsic.
// NOTE: body must stay in sync with the autogenerated CHECK lines above;
// regenerate with utils/update_cc_test_checks.py after any change.
float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlslq_laneq_low_f16(a, b, c, 6);
}
1170
1171 // CHECK-LABEL: @test_vfmlslq_laneq_high_f16(
1172 // CHECK-NEXT: entry:
1173 // CHECK-NEXT: [[__REINT_719:%.*]] = alloca <8 x half>, align 16
1174 // CHECK-NEXT: [[__REINT1_719:%.*]] = alloca i16, align 2
1175 // CHECK-NEXT: [[__REINT_7194:%.*]] = alloca <8 x half>, align 16
1176 // CHECK-NEXT: [[__REINT1_7195:%.*]] = alloca i16, align 2
1177 // CHECK-NEXT: [[__REINT_71914:%.*]] = alloca <8 x half>, align 16
1178 // CHECK-NEXT: [[__REINT1_71915:%.*]] = alloca i16, align 2
1179 // CHECK-NEXT: [[__REINT_71924:%.*]] = alloca <8 x half>, align 16
1180 // CHECK-NEXT: [[__REINT1_71925:%.*]] = alloca i16, align 2
1181 // CHECK-NEXT: [[__REINT_71934:%.*]] = alloca <8 x half>, align 16
1182 // CHECK-NEXT: [[__REINT1_71935:%.*]] = alloca i16, align 2
1183 // CHECK-NEXT: [[__REINT_71944:%.*]] = alloca <8 x half>, align 16
1184 // CHECK-NEXT: [[__REINT1_71945:%.*]] = alloca i16, align 2
1185 // CHECK-NEXT: [[__REINT_71954:%.*]] = alloca <8 x half>, align 16
1186 // CHECK-NEXT: [[__REINT1_71955:%.*]] = alloca i16, align 2
1187 // CHECK-NEXT: [[__REINT_71964:%.*]] = alloca <8 x half>, align 16
1188 // CHECK-NEXT: [[__REINT1_71965:%.*]] = alloca i16, align 2
1189 // CHECK-NEXT: store <8 x half> [[C:%.*]], <8 x half>* [[__REINT_719]], align 16
1190 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_719]] to <8 x i16>*
1191 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
1192 // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
1193 // CHECK-NEXT: store i16 [[VGETQ_LANE]], i16* [[__REINT1_719]], align 2
1194 // CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[__REINT1_719]] to half*
1195 // CHECK-NEXT: [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
1196 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
1197 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_7194]], align 16
1198 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half>* [[__REINT_7194]] to <8 x i16>*
1199 // CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 16
1200 // CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7
1201 // CHECK-NEXT: store i16 [[VGETQ_LANE8]], i16* [[__REINT1_7195]], align 2
1202 // CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[__REINT1_7195]] to half*
1203 // CHECK-NEXT: [[TMP7:%.*]] = load half, half* [[TMP6]], align 2
1204 // CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP7]], i32 1
1205 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71914]], align 16
1206 // CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half>* [[__REINT_71914]] to <8 x i16>*
1207 // CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[TMP8]], align 16
1208 // CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP9]], i32 7
1209 // CHECK-NEXT: store i16 [[VGETQ_LANE18]], i16* [[__REINT1_71915]], align 2
1210 // CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[__REINT1_71915]] to half*
1211 // CHECK-NEXT: [[TMP11:%.*]] = load half, half* [[TMP10]], align 2
1212 // CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP11]], i32 2
1213 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71924]], align 16
1214 // CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half>* [[__REINT_71924]] to <8 x i16>*
1215 // CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, <8 x i16>* [[TMP12]], align 16
1216 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP13]], i32 7
1217 // CHECK-NEXT: store i16 [[VGETQ_LANE28]], i16* [[__REINT1_71925]], align 2
1218 // CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[__REINT1_71925]] to half*
1219 // CHECK-NEXT: [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
1220 // CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP15]], i32 3
1221 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71934]], align 16
1222 // CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x half>* [[__REINT_71934]] to <8 x i16>*
1223 // CHECK-NEXT: [[TMP17:%.*]] = load <8 x i16>, <8 x i16>* [[TMP16]], align 16
1224 // CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP17]], i32 7
1225 // CHECK-NEXT: store i16 [[VGETQ_LANE38]], i16* [[__REINT1_71935]], align 2
1226 // CHECK-NEXT: [[TMP18:%.*]] = bitcast i16* [[__REINT1_71935]] to half*
1227 // CHECK-NEXT: [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
1228 // CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP19]], i32 4
1229 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71944]], align 16
1230 // CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x half>* [[__REINT_71944]] to <8 x i16>*
1231 // CHECK-NEXT: [[TMP21:%.*]] = load <8 x i16>, <8 x i16>* [[TMP20]], align 16
1232 // CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP21]], i32 7
1233 // CHECK-NEXT: store i16 [[VGETQ_LANE48]], i16* [[__REINT1_71945]], align 2
1234 // CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[__REINT1_71945]] to half*
1235 // CHECK-NEXT: [[TMP23:%.*]] = load half, half* [[TMP22]], align 2
1236 // CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP23]], i32 5
1237 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71954]], align 16
1238 // CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x half>* [[__REINT_71954]] to <8 x i16>*
1239 // CHECK-NEXT: [[TMP25:%.*]] = load <8 x i16>, <8 x i16>* [[TMP24]], align 16
1240 // CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP25]], i32 7
1241 // CHECK-NEXT: store i16 [[VGETQ_LANE58]], i16* [[__REINT1_71955]], align 2
1242 // CHECK-NEXT: [[TMP26:%.*]] = bitcast i16* [[__REINT1_71955]] to half*
1243 // CHECK-NEXT: [[TMP27:%.*]] = load half, half* [[TMP26]], align 2
1244 // CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP27]], i32 6
1245 // CHECK-NEXT: store <8 x half> [[C]], <8 x half>* [[__REINT_71964]], align 16
1246 // CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x half>* [[__REINT_71964]] to <8 x i16>*
1247 // CHECK-NEXT: [[TMP29:%.*]] = load <8 x i16>, <8 x i16>* [[TMP28]], align 16
1248 // CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP29]], i32 7
1249 // CHECK-NEXT: store i16 [[VGETQ_LANE68]], i16* [[__REINT1_71965]], align 2
1250 // CHECK-NEXT: [[TMP30:%.*]] = bitcast i16* [[__REINT1_71965]] to half*
1251 // CHECK-NEXT: [[TMP31:%.*]] = load half, half* [[TMP30]], align 2
1252 // CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP31]], i32 7
1253 // CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
1254 // CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
1255 // CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
1256 // CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #3
1257 // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
1258 //
// Checks that vfmlslq_laneq_high_f16 broadcasts lane 7 of the 128-bit 'c'
// operand into an <8 x half> vector and lowers to the
// llvm.aarch64.neon.fmlsl2.v4f32.v8f16 intrinsic (the "2"/high-half form).
// NOTE: body must stay in sync with the autogenerated CHECK lines above;
// regenerate with utils/update_cc_test_checks.py after any change.
float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
  return vfmlslq_laneq_high_f16(a, b, c, 7);
}
1262