; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck %s

; Gather of <4 x i16> with i32 offsets scaled by 2 (uxtw #1), result zero-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: zext_scaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Gather of <4 x i16> with i32 offsets scaled by 2, result sign-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @sext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: sext_scaled_i16_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Gather of <4 x i32> with i32 offsets scaled by 4 (uxtw #2).
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; TODO: scaled_f16_i32

; Gather of <4 x float> with i32 offsets scaled by 4 (pointers bitcast from i32*).
define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_f32_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Gather of <4 x i32> with zero-extended i16 offsets (offsets loaded via vldrh.u32).
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Gather of <4 x i32> with sign-extended i16 offsets (offsets loaded via vldrh.s32).
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: signed_scaled_i32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Gather of <4 x float> with zero-extended i16 offsets.
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Gather of <4 x float> with sign-extended i16 offsets.
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Gather of <4 x i16> with sign-extended i16 offsets, result zero-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Gather of <4 x i16> with sign-extended i16 offsets, result sign-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.sext = sext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Gather of <4 x i16> with zero-extended i16 offsets, result zero-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Gather of <4 x i16> with zero-extended i16 offsets, result sign-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
  %offs.zext = zext <4 x i16> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Gather of <4 x i32> with zero-extended i8 offsets (offsets loaded via vldrb.u32).
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: unsigned_scaled_b_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Gather of <4 x i32> with sign-extended i8 offsets (offsets loaded via vldrb.s32).
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: signed_scaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Gather of <4 x float> with zero-extended i8 offsets.
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: a_unsigned_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Gather of <4 x float> with sign-extended i8 offsets.
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: b_signed_scaled_f32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext
  %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*>
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Gather of <4 x i16> with sign-extended i8 offsets, result zero-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_signed_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Gather of <4 x i16> with sign-extended i8 offsets, result sign-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_signed_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.sext = sext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Gather of <4 x i16> with zero-extended i8 offsets, result zero-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: zext_unsigned_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.zext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.zext
}

; Gather of <4 x i16> with zero-extended i8 offsets, result sign-extended to <4 x i32>.
define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_scaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %gather.sext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %gather.sext
}

; Two-level GEP with a variable offset vector plus constant 5: per the CHECK lines,
; this is lowered with explicit address arithmetic (vshl/vadd) and a plain
; vector-of-pointers gather rather than a single scaled-offset gather.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.i32 q0, #0x14
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q1, q1, q0
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Two-level GEP with all-constant offsets: per the CHECK lines, the combined
; offsets (5, 8, 11, 14) are materialized from a constant pool and matched as a
; scaled-offset gather. %offptr is intentionally unused.
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base, <4 x i32>* %offptr) {
; CHECK-LABEL: scaled_i32_i32_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI21_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI21_0:
; CHECK-NEXT:    .long 5 @ 0x5
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 11 @ 0xb
; CHECK-NEXT:    .long 14 @ 0xe
entry:
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Masked-gather intrinsic declarations used by the tests above.
declare <4 x i8>  @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)