• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; vldst4: codegen test for a loop that loads <32 x half> from %pIn,
; de-interleaves it with stride 4 into four <8 x half> vectors, multiplies
; each by a splatted half scalar, re-interleaves the four products with the
; same stride and stores the <32 x half> result to %pOut.
; The FileCheck assertions that follow the define line are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead
; of editing them by hand.
4define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
5; CHECK-LABEL: vldst4:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, r5, r7, lr}
8; CHECK-NEXT:    push {r4, r5, r7, lr}
9; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
10; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
11; CHECK-NEXT:    .pad #80
12; CHECK-NEXT:    sub sp, #80
13; CHECK-NEXT:    mul r12, r3, r2
14; CHECK-NEXT:    movs r2, #0
15; CHECK-NEXT:    cmp.w r2, r12, lsr #2
16; CHECK-NEXT:    beq.w .LBB0_3
17; CHECK-NEXT:  @ %bb.1: @ %vector.ph
18; CHECK-NEXT:    mvn r3, #7
19; CHECK-NEXT:    ldr r5, [sp, #160]
20; CHECK-NEXT:    and.w r3, r3, r12, lsr #2
21; CHECK-NEXT:    sub.w r12, r3, #8
22; CHECK-NEXT:    movs r3, #1
23; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
24; CHECK-NEXT:    dls lr, lr
25; CHECK-NEXT:  .LBB0_2: @ %vector.body
26; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
27; CHECK-NEXT:    vldrh.u16 q5, [r0, #32]
28; CHECK-NEXT:    vldrh.u16 q3, [r0, #48]
29; CHECK-NEXT:    vldrh.u16 q7, [r0], #64
30; CHECK-NEXT:    vmov r2, s20
31; CHECK-NEXT:    vmovx.f16 s8, s12
32; CHECK-NEXT:    vmov.16 q0[4], r2
33; CHECK-NEXT:    vmov r3, s22
34; CHECK-NEXT:    vmov.16 q0[5], r3
35; CHECK-NEXT:    vmov r2, s12
36; CHECK-NEXT:    vmov.16 q0[6], r2
37; CHECK-NEXT:    vmov r2, s28
38; CHECK-NEXT:    vldrh.u16 q6, [r0, #-48]
39; CHECK-NEXT:    vmov.16 q1[0], r2
40; CHECK-NEXT:    vmov r3, s30
41; CHECK-NEXT:    vmov.16 q1[1], r3
42; CHECK-NEXT:    vmov r2, s24
43; CHECK-NEXT:    vmov.16 q1[2], r2
44; CHECK-NEXT:    vmov r2, s14
45; CHECK-NEXT:    vmov.16 q0[7], r2
46; CHECK-NEXT:    vmov r2, s26
47; CHECK-NEXT:    vmov.16 q1[3], r2
48; CHECK-NEXT:    vmov.f32 s6, s2
49; CHECK-NEXT:    vmov.f32 s7, s3
50; CHECK-NEXT:    vmul.f16 q0, q1, r5
51; CHECK-NEXT:    vmovx.f16 s4, s24
52; CHECK-NEXT:    vmov q4, q0
53; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
54; CHECK-NEXT:    vmovx.f16 s0, s30
55; CHECK-NEXT:    vmov r3, s3
56; CHECK-NEXT:    vmov r2, s0
57; CHECK-NEXT:    vmovx.f16 s0, s28
58; CHECK-NEXT:    vmov r4, s0
59; CHECK-NEXT:    vmov.16 q0[0], r4
60; CHECK-NEXT:    vmov.16 q0[1], r2
61; CHECK-NEXT:    vmov r2, s4
62; CHECK-NEXT:    vmovx.f16 s4, s22
63; CHECK-NEXT:    vmov.16 q0[2], r2
64; CHECK-NEXT:    vmov r2, s4
65; CHECK-NEXT:    vmovx.f16 s4, s20
66; CHECK-NEXT:    vmov r4, s4
67; CHECK-NEXT:    vmov.16 q1[4], r4
68; CHECK-NEXT:    vmov.16 q1[5], r2
69; CHECK-NEXT:    vmov r2, s8
70; CHECK-NEXT:    vmovx.f16 s8, s14
71; CHECK-NEXT:    vmov.16 q1[6], r2
72; CHECK-NEXT:    vmov r2, s8
73; CHECK-NEXT:    vmovx.f16 s8, s26
74; CHECK-NEXT:    vmov.16 q1[7], r2
75; CHECK-NEXT:    vmov r2, s8
76; CHECK-NEXT:    vmov.16 q0[3], r2
77; CHECK-NEXT:    vmovx.f16 s8, s13
78; CHECK-NEXT:    vmov.f32 s2, s6
79; CHECK-NEXT:    vmov.f32 s3, s7
80; CHECK-NEXT:    vmov.16 q1[0], r3
81; CHECK-NEXT:    vmul.f16 q0, q0, r5
82; CHECK-NEXT:    vmov r3, s23
83; CHECK-NEXT:    vmov r2, s3
84; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
85; CHECK-NEXT:    vmovx.f16 s0, s19
86; CHECK-NEXT:    vmov.16 q1[1], r2
87; CHECK-NEXT:    vmov r2, s0
88; CHECK-NEXT:    vmov.16 q1[4], r2
89; CHECK-NEXT:    vmov r2, s21
90; CHECK-NEXT:    vmov.16 q0[4], r2
91; CHECK-NEXT:    vmov r2, s13
92; CHECK-NEXT:    vmov.16 q0[5], r3
93; CHECK-NEXT:    vmov r3, s29
94; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
95; CHECK-NEXT:    vmov.16 q0[6], r2
96; CHECK-NEXT:    vmov r2, s31
97; CHECK-NEXT:    vmov.16 q1[0], r3
98; CHECK-NEXT:    vmov.16 q1[1], r2
99; CHECK-NEXT:    vmov r2, s25
100; CHECK-NEXT:    vmov.16 q1[2], r2
101; CHECK-NEXT:    vmov r2, s15
102; CHECK-NEXT:    vmov.16 q0[7], r2
103; CHECK-NEXT:    vmov r2, s27
104; CHECK-NEXT:    vmov.16 q1[3], r2
105; CHECK-NEXT:    vmov.f32 s6, s2
106; CHECK-NEXT:    vmov.f32 s7, s3
107; CHECK-NEXT:    vmovx.f16 s0, s31
108; CHECK-NEXT:    vmov r2, s0
109; CHECK-NEXT:    vmovx.f16 s0, s29
110; CHECK-NEXT:    vmov r4, s0
111; CHECK-NEXT:    vmul.f16 q4, q1, r5
112; CHECK-NEXT:    vmov.16 q0[0], r4
113; CHECK-NEXT:    vmovx.f16 s4, s25
114; CHECK-NEXT:    vmov.16 q0[1], r2
115; CHECK-NEXT:    vmov r2, s4
116; CHECK-NEXT:    vmovx.f16 s4, s23
117; CHECK-NEXT:    vmov.16 q0[2], r2
118; CHECK-NEXT:    vmov r2, s4
119; CHECK-NEXT:    vmovx.f16 s4, s21
120; CHECK-NEXT:    vmov r4, s4
121; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
122; CHECK-NEXT:    vmov.16 q1[4], r4
123; CHECK-NEXT:    vmov r3, s16
124; CHECK-NEXT:    vmov.16 q1[5], r2
125; CHECK-NEXT:    vmov r2, s8
126; CHECK-NEXT:    vmovx.f16 s8, s15
127; CHECK-NEXT:    vmov.16 q1[6], r2
128; CHECK-NEXT:    vmov r2, s8
129; CHECK-NEXT:    vmovx.f16 s8, s27
130; CHECK-NEXT:    vmov.16 q1[7], r2
131; CHECK-NEXT:    vmov r2, s8
132; CHECK-NEXT:    vmov.16 q0[3], r2
133; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
134; CHECK-NEXT:    vmov.f32 s2, s6
135; CHECK-NEXT:    vmov.f32 s3, s7
136; CHECK-NEXT:    vmov.16 q1[2], r3
137; CHECK-NEXT:    vmul.f16 q6, q0, r5
138; CHECK-NEXT:    vmovx.f16 s0, s16
139; CHECK-NEXT:    vmov r2, s24
140; CHECK-NEXT:    vmov.16 q1[3], r2
141; CHECK-NEXT:    vmov r2, s0
142; CHECK-NEXT:    vmovx.f16 s0, s24
143; CHECK-NEXT:    vmov.16 q1[6], r2
144; CHECK-NEXT:    vmov r2, s0
145; CHECK-NEXT:    vmovx.f16 s0, s8
146; CHECK-NEXT:    vmov.16 q1[7], r2
147; CHECK-NEXT:    vmov r2, s8
148; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
149; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
150; CHECK-NEXT:    vmov.16 q5[0], r2
151; CHECK-NEXT:    vmov r2, s0
152; CHECK-NEXT:    vmov r3, s4
153; CHECK-NEXT:    vmovx.f16 s0, s4
154; CHECK-NEXT:    vmov.16 q5[1], r3
155; CHECK-NEXT:    vmov r3, s25
156; CHECK-NEXT:    vmov.16 q5[4], r2
157; CHECK-NEXT:    vmov r2, s0
158; CHECK-NEXT:    vmov.16 q5[5], r2
159; CHECK-NEXT:    vmov r2, s17
160; CHECK-NEXT:    vmov.16 q3[2], r2
161; CHECK-NEXT:    vmovx.f16 s0, s17
162; CHECK-NEXT:    vmov.16 q3[3], r3
163; CHECK-NEXT:    vmov r2, s0
164; CHECK-NEXT:    vmovx.f16 s0, s25
165; CHECK-NEXT:    vmov.16 q3[6], r2
166; CHECK-NEXT:    vmov r2, s0
167; CHECK-NEXT:    vmovx.f16 s0, s9
168; CHECK-NEXT:    vmov.16 q3[7], r2
169; CHECK-NEXT:    vmov r2, s9
170; CHECK-NEXT:    vmov.16 q7[0], r2
171; CHECK-NEXT:    vmov r3, s5
172; CHECK-NEXT:    vmov.16 q7[1], r3
173; CHECK-NEXT:    vmov r2, s0
174; CHECK-NEXT:    vmovx.f16 s0, s5
175; CHECK-NEXT:    vmov.16 q7[4], r2
176; CHECK-NEXT:    vmov r2, s0
177; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
178; CHECK-NEXT:    vmov.16 q7[5], r2
179; CHECK-NEXT:    vmov r3, s26
180; CHECK-NEXT:    vmov r2, s2
181; CHECK-NEXT:    vmovx.f16 s0, s2
182; CHECK-NEXT:    vmov.16 q2[2], r2
183; CHECK-NEXT:    vmov q4, q1
184; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
185; CHECK-NEXT:    vmov.16 q2[3], r3
186; CHECK-NEXT:    vmov r2, s0
187; CHECK-NEXT:    vmovx.f16 s0, s26
188; CHECK-NEXT:    vmov.16 q2[6], r2
189; CHECK-NEXT:    vmov r2, s0
190; CHECK-NEXT:    vmov.16 q2[7], r2
191; CHECK-NEXT:    vmov r2, s6
192; CHECK-NEXT:    vmov r3, s18
193; CHECK-NEXT:    vmov.16 q0[0], r2
194; CHECK-NEXT:    vmovx.f16 s4, s6
195; CHECK-NEXT:    vmov.16 q0[1], r3
196; CHECK-NEXT:    vmov r2, s4
197; CHECK-NEXT:    vmovx.f16 s4, s18
198; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
199; CHECK-NEXT:    vmov.16 q0[4], r2
200; CHECK-NEXT:    vmov r2, s4
201; CHECK-NEXT:    vmov.16 q0[5], r2
202; CHECK-NEXT:    vmov r2, s19
203; CHECK-NEXT:    vmov r3, s27
204; CHECK-NEXT:    vmov.16 q1[2], r2
205; CHECK-NEXT:    vmovx.f16 s16, s19
206; CHECK-NEXT:    vmov.16 q1[3], r3
207; CHECK-NEXT:    vmov r2, s16
208; CHECK-NEXT:    vmovx.f16 s16, s27
209; CHECK-NEXT:    vmov.16 q1[6], r2
210; CHECK-NEXT:    vmov r2, s16
211; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
212; CHECK-NEXT:    vmov.16 q1[7], r2
213; CHECK-NEXT:    vmov.f32 s1, s9
214; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
215; CHECK-NEXT:    vmovx.f16 s16, s19
216; CHECK-NEXT:    vmov.f32 s3, s11
217; CHECK-NEXT:    vmov r2, s16
218; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
219; CHECK-NEXT:    vmov.f32 s21, s25
220; CHECK-NEXT:    vstrh.16 q0, [r1, #32]
221; CHECK-NEXT:    vmov.16 q4[5], r2
222; CHECK-NEXT:    vmov.f32 s29, s13
223; CHECK-NEXT:    vmov q2, q4
224; CHECK-NEXT:    vmov.f32 s23, s27
225; CHECK-NEXT:    vmov.f32 s9, s5
226; CHECK-NEXT:    vmov.f32 s11, s7
227; CHECK-NEXT:    vstrh.16 q2, [r1, #48]
228; CHECK-NEXT:    vstrh.16 q5, [r1], #64
229; CHECK-NEXT:    vmov.f32 s31, s15
230; CHECK-NEXT:    vstrh.16 q7, [r1, #-48]
231; CHECK-NEXT:    le lr, .LBB0_2
232; CHECK-NEXT:  .LBB0_3: @ %while.end
233; CHECK-NEXT:    add sp, #80
234; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
235; CHECK-NEXT:    pop {r4, r5, r7, pc}
236entry:
  ; The scale is passed as an i32; only its low 16 bits are used, and they
  ; are reinterpreted (bitcast, not converted) as a half.
237  %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
238  %l0 = bitcast i16 %tmp.0.extract.trunc to half
  ; %shr = (numCols * numRows) >> 2; the loop is skipped when it is zero.
239  %mul = mul i32 %numCols, %numRows
240  %shr = lshr i32 %mul, 2
241  %cmp38 = icmp eq i32 %shr, 0
242  br i1 %cmp38, label %while.end, label %vector.ph
243
244vector.ph:                                        ; preds = %entry
  ; 1073741816 == 0x3FFFFFF8: clears the low three bits of %shr, rounding it
  ; down to a multiple of 8 (the top two bits are already zero after the
  ; lshr by 2 above). %n.vec is the loop bound for %index.
245  %n.vec = and i32 %shr, 1073741816
  ; %l2, %ind.end, %l3, %ind.end48 and %ind.end50 have no users in this
  ; function.
246  %l2 = shl nuw i32 %n.vec, 2
247  %ind.end = getelementptr half, half* %pIn, i32 %l2
248  %l3 = shl nuw i32 %n.vec, 2
249  %ind.end48 = getelementptr half, half* %pOut, i32 %l3
250  %ind.end50 = sub nsw i32 %shr, %n.vec
  ; Splat the half scale %l0 across all eight lanes.
251  %broadcast.splatinsert55 = insertelement <8 x half> undef, half %l0, i32 0
252  %broadcast.splat56 = shufflevector <8 x half> %broadcast.splatinsert55, <8 x half> undef, <8 x i32> zeroinitializer
253  br label %vector.body
254
255vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %index advances by 8 per iteration; each iteration reads and writes 32
  ; consecutive halfs starting at element offset %index * 4.
256  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
257  %l4 = shl i32 %index, 2
258  %next.gep = getelementptr half, half* %pIn, i32 %l4
259  %l5 = shl i32 %index, 2
260  %l6 = bitcast half* %next.gep to <32 x half>*
261  %wide.vec = load <32 x half>, <32 x half>* %l6, align 2
  ; De-interleave with stride 4: the four shuffles pick elements
  ; 0,4,...,28 / 1,5,...,29 / 2,6,...,30 / 3,7,...,31 of %wide.vec.
262  %strided.vec = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
263  %strided.vec52 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
264  %strided.vec53 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
265  %strided.vec54 = shufflevector <32 x half> %wide.vec, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  ; Multiply each de-interleaved vector by the splatted scale.
266  %l7 = fmul <8 x half> %strided.vec, %broadcast.splat56
267  %l8 = fmul <8 x half> %strided.vec52, %broadcast.splat56
268  %l9 = fmul <8 x half> %strided.vec53, %broadcast.splat56
269  %l10 = fmul <8 x half> %strided.vec54, %broadcast.splat56
270  %l11 = getelementptr inbounds half, half* %pOut, i32 %l5
271  %l12 = bitcast half* %l11 to <32 x half>*
  ; Concatenate the four products pairwise (%l7:%l8 and %l9:%l10), then
  ; re-interleave them so output element 4*i+k is lane i of product k,
  ; mirroring the layout that was loaded.
272  %l13 = shufflevector <8 x half> %l7, <8 x half> %l8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
273  %l14 = shufflevector <8 x half> %l9, <8 x half> %l10, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
274  %interleaved.vec = shufflevector <16 x half> %l13, <16 x half> %l14, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
275  store <32 x half> %interleaved.vec, <32 x half>* %l12, align 2
276  %index.next = add i32 %index, 8
  ; NOTE(review): the entry guard only excludes %shr == 0. For %shr in 1..7
  ; %n.vec is 0, so this equality test is not satisfied on the first
  ; iteration (%index.next is 8). Presumably an artifact of reducing the
  ; test case, since no remainder loop is present and only codegen is
  ; checked here; confirm before reusing this IR as executable code.
277  %l15 = icmp eq i32 %index.next, %n.vec
278  br i1 %l15, label %while.end, label %vector.body
279
280while.end:                                        ; preds = %vector.body, %entry
281  ret void
282}
283