; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s


; Scatter of 4 x i32 with a constant +4 offset increment and an alternating
; (lane 0/2 only) mask: expected to lower to a single VPT-predicated
; vstrw with a [base, offsets, uxtw #2] scattered addressing mode.
define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, i32* %dst, <4 x i32> %offs) {
; CHECK-LABEL: scatter_inc_minipred_4i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movw r1, #3855
; CHECK-NEXT:    vmov.i32 q2, #0x4
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
  %2 = getelementptr inbounds i32, i32* %dst, <4 x i32> %1
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
  ret void
}
20
; Scatter of 8 x i16 through i32 offsets with a +8 increment; the current
; codegen (per the autogenerated checks) expands this to scalar strh stores
; rather than a hardware scatter.
define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, i16* %dst, <8 x i32> %offs) {
; CHECK-LABEL: scatter_inc_mini_8i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    vmov.i32 q3, #0x10
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov.u16 r2, q0[0]
; CHECK-NEXT:    vadd.i32 q4, q1, q3
; CHECK-NEXT:    vshl.i32 q1, q2, #1
; CHECK-NEXT:    vmov r1, s16
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vadd.i32 q1, q1, q3
; CHECK-NEXT:    strh r2, [r1]
; CHECK-NEXT:    vmov.u16 r1, q0[1]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u16 r1, q0[2]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u16 r1, q0[4]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u16 r1, q0[6]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u16 r1, q0[7]
; CHECK-NEXT:    strh r1, [r0]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %2 = getelementptr inbounds i16, i16* %dst, <8 x i32> %1
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %data, <8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
64
; Scatter of 16 x i8 through i32 offsets with a +16 increment; like the
; 8i16 case, the autogenerated checks show this being scalarized into
; per-lane strb stores (the 4th offset vector is reloaded from the stack).
define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, i8* %dst, <16 x i32> %offs) {
; CHECK-LABEL: scatter_inc_mini_16i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov.i32 q5, #0x10
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q4, q1, q5
; CHECK-NEXT:    vmov.u8 r2, q0[0]
; CHECK-NEXT:    vmov r1, s16
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q3, q5
; CHECK-NEXT:    vadd.i32 q2, q2, q5
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    add r1, sp, #32
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[1]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vadd.i32 q1, q1, q5
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.u8 r1, q0[2]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.u8 r1, q0[4]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov.u8 r1, q0[7]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.u8 r1, q0[8]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    vmov.u8 r1, q0[9]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.u8 r1, q0[10]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.u8 r1, q0[11]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.u8 r1, q0[15]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = getelementptr inbounds i8, i8* %dst, <16 x i32> %1
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %data, <16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}
136
; Loop with three interleaved scatters (stride 3, offsets 0/1/2): expected to
; lower to pre-incrementing vstrw.32 [qN, #48]! stores inside a low-overhead
; (dls/le) loop, with the three base vectors built from constant pools.
define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, i32* %dst, i32 %n) {
; CHECK-LABEL: scatter_inc_v4i32_complex:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #24
; CHECK-NEXT:    sub sp, #24
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB3_5
; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT:    adr r4, .LCPI3_2
; CHECK-NEXT:    bic r2, r1, #3
; CHECK-NEXT:    vldrw.u32 q3, [r4]
; CHECK-NEXT:    sub.w r12, r2, #4
; CHECK-NEXT:    adr.w lr, .LCPI3_1
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q3, [lr]
; CHECK-NEXT:    adr.w r12, .LCPI3_0
; CHECK-NEXT:    vadd.i32 q4, q3, r0
; CHECK-NEXT:    vldrw.u32 q3, [r12]
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:  .LBB3_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:    vmov q6, q4
; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:  .LBB3_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vstrw.32 q0, [q5, #48]!
; CHECK-NEXT:    vstrw.32 q1, [q6, #48]!
; CHECK-NEXT:    vstrw.32 q2, [q7, #48]!
; CHECK-NEXT:    le lr, .LBB3_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT:    cmp r2, r1
; CHECK-NEXT:    bne .LBB3_2
; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #24
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop {r4, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI3_0:
; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
; CHECK-NEXT:  .LCPI3_1:
; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
; CHECK-NEXT:  .LCPI3_2:
; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i32, i32* %dst, <4 x i32> %0
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data1, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  %3 = getelementptr inbounds i32, i32* %dst, <4 x i32> %2
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data2, <4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
  %5 = getelementptr inbounds i32, i32* %dst, <4 x i32> %4
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data3, <4 x i32*> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void
}
234
235
; Declarations of the masked scatter intrinsics exercised (or available to)
; the tests above.
declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
245