• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55(
#     size_t channels,
#     size_t output_width,
#     const float** input,
#     const float* weights,
#     float* output,
#     size_t input_stride,
#     size_t output_increment,
#     const union xnn_f32_output_params params[restrict static 1])
#
# Depthwise 9-tap (3x3) f32 convolution microkernel, software-pipelined
# for Cortex-A55: every FMLA is paired with a 64-bit LD1 so the in-order
# core dual-issues load + FMA each cycle. The main loop processes
# 8 channels per iteration as two interleaved 4-channel groups
# ("current" in v0/v1, "next" in v2/v3), each group split into
# .lo/.hi 2-lane halves.
#
# Register roles inside the pixel loop (label 0):
#   x0  channels            x1  output_width
#   x2  input (9 row pointers per pixel; advanced by input_stride)
#   x3  weights             x4  output
#   x5  input_stride        x6  output_increment
#   x7  params on entry, then reused as i0
#   x7-x15  input row pointers i0-i8
#   x16 channel counter c   x17 weights cursor w
#   v30 output max (FMIN operand), v31 output min (FMAX operand)
BEGIN_FUNCTION xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55

        # Save d8-d15 on stack (low halves of v8-v15 are callee-saved
        # under AAPCS64; the kernel clobbers v8-v15 as scratch)
        STP  d8,  d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Load clamping params while x7 still holds the params pointer:
        # v30 = params->max, v31 = params->min, replicated to all lanes
        # (v30 is only used with FMIN, v31 only with FMAX below)
        LD2R {v30.4s, v31.4s}, [x7]

0:
        # Per-pixel setup: load the 9 input row pointers for this pixel.
        #  x7 := i0
        #  x8 := i1
        LDP x7, x8, [x2]
        #  x9 := i2
        # x10 := i3
        LDP x9, x10, [x2, 16]
        # x11 := i4
        # x12 := i5
        LDP x11, x12, [x2, 32]
        # x13 := i6
        # x14 := i7
        LDP x13, x14, [x2, 48]
        # x15 := i8
        LDR x15, [x2, 64]
        # input += input_stride
        ADD x2, x2, x5

        # x16 := c = channels
        # c -= 8
        SUBS x16, x0, 8
        # x17 := w = weights (re-read from the start for every pixel)
        MOV x17, x3

        # skip main loop if c < 8
        B.LO 3f

          # SWP prologue: start the first 4-channel group and preload
          # the "next" group so the steady-state loop (label 1) always
          # has one group in flight.

          # Load vbias.lo
          LD1 {v0.2S}, [x17], 8

          # Load vbias.hi
          LD1 {v1.2S}, [x17], 8

          # Load vi0.lo
          LD1 {v4.2S}, [x7], 8

          # Load vk0.lo
          LD1 {v5.2S}, [x17], 8

          # Load vi0.hi
          LD1 {v6.2S}, [x7], 8

          # Load vk0.hi
          LD1 {v7.2S}, [x17], 8

          # Load vi1.lo
          LD1 {v8.2S}, [x8], 8

          # Load vk1.lo
          LD1 {v9.2S}, [x17], 8

          # Load vi1.hi
          LD1 {v10.2S}, [x8], 8

          # Load vk1.hi
          LD1 {v11.2S}, [x17], 8

          # Load vi2.lo
          LD1 {v12.2S}, [x9], 8

          # Load vk2.lo
          LD1 {v13.2S}, [x17], 8

          # Load vi2.hi
          LD1 {v14.2S}, [x9], 8

          # Load vk2.hi
          LD1 {v15.2S}, [x17], 8

          # Load vi3.lo
          LD1 {v16.2S}, [x10], 8

          # Load vk3.lo
          LD1 {v17.2S}, [x17], 8

          # Load vi3.hi
          LD1 {v18.2S}, [x10], 8

          # Load vk3.hi
          LD1 {v19.2S}, [x17], 8

          # Load vi4.lo
          LD1 {v20.2S}, [x11], 8

          # Load vk4.lo
          LD1 {v21.2S}, [x17], 8

          # Load vi4.hi
          LD1 {v22.2S}, [x11], 8

          # Load vk4.hi
          LD1 {v23.2S}, [x17], 8

          # Load vi5.lo
          LD1 {v24.2S}, [x12], 8

          # Load vk5.lo
          LD1 {v25.2S}, [x17], 8

          # Load vi5.hi
          LD1 {v26.2S}, [x12], 8

          # Load vk5.hi
          LD1 {v27.2S}, [x17], 8

          # From here each FMLA consumes a vi/vk pair whose registers
          # are immediately reloaded with data for a later tap/group.

          # vacc.lo += vi0.lo * vk0.lo
          FMLA v0.2S, v4.2S, v5.2S
          # Load vi6.lo
          LD1 {v4.2S}, [x13], 8

          # Load vk6.lo
          LD1 {v5.2S}, [x17], 8

          # vacc.hi += vi0.hi * vk0.hi
          FMLA v1.2S, v6.2S, v7.2S
          # Load vi6.hi
          LD1 {v6.2S}, [x13], 8

          # Load vk6.hi
          LD1 {v7.2S}, [x17], 8

          # vacc.lo += vi1.lo * vk1.lo
          FMLA v0.2S, v8.2S, v9.2S
          # Load vi7.lo
          LD1 {v8.2S}, [x14], 8

          # Load vk7.lo
          LD1 {v9.2S}, [x17], 8

          # vacc.hi += vi1.hi * vk1.hi
          FMLA v1.2S, v10.2S, v11.2S
          # Load vi7.hi
          LD1 {v10.2S}, [x14], 8

          # Load vk7.hi
          LD1 {v11.2S}, [x17], 8

          # vacc.lo += vi2.lo * vk2.lo
          FMLA v0.2S, v12.2S, v13.2S
          # Load vi8.lo
          LD1 {v12.2S}, [x15], 8

          # Load vk8.lo
          LD1 {v13.2S}, [x17], 8

          # vacc.hi += vi2.hi * vk2.hi
          FMLA v1.2S, v14.2S, v15.2S
          # Load vi8.hi
          LD1 {v14.2S}, [x15], 8

          # Load vk8.hi
          LD1 {v15.2S}, [x17], 8

          # Load vbias_next.lo
          LD1 {v2.2S}, [x17], 8

          # Load vbias_next.hi
          LD1 {v3.2S}, [x17], 8

          # vacc.lo += vi3.lo * vk3.lo
          FMLA v0.2S, v16.2S, v17.2S
          # Load vi0_next.lo
          LD1 {v16.2S}, [x7], 8

          # Load vk0_next.lo
          LD1 {v17.2S}, [x17], 8

          # vacc.hi += vi3.hi * vk3.hi
          FMLA v1.2S, v18.2S, v19.2S
          # Load vi0_next.hi
          LD1 {v18.2S}, [x7], 8

          # Load vk0_next.hi
          LD1 {v19.2S}, [x17], 8

          # vacc.lo += vi4.lo * vk4.lo
          FMLA v0.2S, v20.2S, v21.2S
          # Load vi1_next.lo
          LD1 {v20.2S}, [x8], 8

          # Load vk1_next.lo
          LD1 {v21.2S}, [x17], 8

          # vacc.hi += vi4.hi * vk4.hi
          FMLA v1.2S, v22.2S, v23.2S
          # Load vi1_next.hi
          LD1 {v22.2S}, [x8], 8

          # Load vk1_next.hi
          LD1 {v23.2S}, [x17], 8

          # vacc.lo += vi5.lo * vk5.lo
          FMLA v0.2S, v24.2S, v25.2S
          # Load vi2_next.lo
          LD1 {v24.2S}, [x9], 8

          # Load vk2_next.lo
          LD1 {v25.2S}, [x17], 8

          # vacc.hi += vi5.hi * vk5.hi
          FMLA v1.2S, v26.2S, v27.2S
          # Load vi2_next.hi
          LD1 {v26.2S}, [x9], 8

          # Load vk2_next.hi
          LD1 {v27.2S}, [x17], 8

          # vacc.lo += vi6.lo * vk6.lo
          FMLA v0.2S, v4.2S, v5.2S
          # Load vi3_next.lo
          LD1 {v4.2S}, [x10], 8

          # Load vk3_next.lo
          LD1 {v5.2S}, [x17], 8

          # vacc.hi += vi6.hi * vk6.hi
          FMLA v1.2S, v6.2S, v7.2S
          # Load vi3_next.hi
          LD1 {v6.2S}, [x10], 8

          # Load vk3_next.hi
          LD1 {v7.2S}, [x17], 8

          # vacc.lo += vi7.lo * vk7.lo
          FMLA v0.2S, v8.2S, v9.2S
          # Load vi4_next.lo
          LD1 {v8.2S}, [x11], 8

          # Load vk4_next.lo
          LD1 {v9.2S}, [x17], 8

          # vacc.hi += vi7.hi * vk7.hi
          FMLA v1.2S, v10.2S, v11.2S
          # Load vi4_next.hi
          LD1 {v10.2S}, [x11], 8

          # Load vk4_next.hi
          LD1 {v11.2S}, [x17], 8

          # vacc.lo += vi8.lo * vk8.lo
          FMLA v0.2S, v12.2S, v13.2S
          # Load vi5_next.lo
          LD1 {v12.2S}, [x12], 8

          # Load vk5_next.lo
          LD1 {v13.2S}, [x17], 8

          # vacc.hi += vi8.hi * vk8.hi
          FMLA v1.2S, v14.2S, v15.2S
          # Load vi5_next.hi
          LD1 {v14.2S}, [x12], 8

          # Load vk5_next.hi
          LD1 {v15.2S}, [x17], 8

          # vacc_next.lo += vi0_next.lo * vk0_next.lo
          FMLA v2.2S, v16.2S, v17.2S
          # Load vi6_next.lo
          LD1 {v16.2S}, [x13], 8

          # vacc.lo = min(vacc.lo, vmax)
          FMIN v0.2S, v0.2S, v30.2S
          # Load vk6_next.lo
          LD1 {v17.2S}, [x17], 8

          # vacc_next.hi += vi0_next.hi * vk0_next.hi
          FMLA v3.2S, v18.2S, v19.2S
          # Load vi6_next.hi
          LD1 {v18.2S}, [x13], 8

          # vacc.hi = min(vacc.hi, vmax)
          FMIN v1.2S, v1.2S, v30.2S
          # Load vk6_next.hi
          LD1 {v19.2S}, [x17], 8

          # vacc_next.lo += vi1_next.lo * vk1_next.lo
          FMLA v2.2S, v20.2S, v21.2S
          # Load vi7_next.lo
          LD1 {v20.2S}, [x14], 8

          # vacc.lo = max(vacc.lo, vmin)
          FMAX v0.2S, v0.2S, v31.2S
          # Load vk7_next.lo
          LD1 {v21.2S}, [x17], 8

          # vacc_next.hi += vi1_next.hi * vk1_next.hi
          FMLA v3.2S, v22.2S, v23.2S
          # Load vi7_next.hi
          LD1 {v22.2S}, [x14], 8

          # vacc.hi = max(vacc.hi, vmin)
          FMAX v1.2S, v1.2S, v31.2S
          # Load vk7_next.hi
          LD1 {v23.2S}, [x17], 8

          # vacc_next.lo += vi2_next.lo * vk2_next.lo
          FMLA v2.2S, v24.2S, v25.2S
          # Load vi8_next.lo
          LD1 {v24.2S}, [x15], 8

          # Load vk8_next.lo
          LD1 {v25.2S}, [x17], 8

          # vacc_next.hi += vi2_next.hi * vk2_next.hi
          FMLA v3.2S, v26.2S, v27.2S
          # Load vi8_next.hi
          LD1 {v26.2S}, [x15], 8

          # Store vacc (4 clamped channels)
          STP d0, d1, [x4], 16

          # c -= 8
          SUBS x16, x16, 8
          # Load vk8_next.hi
          LD1 {v27.2S}, [x17], 8

          B.LO 2f

1:
            # SWP steady-state iteration: finish taps 3-8 of the
            # previous group (v2/v3) while starting taps 0-8 of the
            # current group (v0/v1) and preloading the next group.

            # Load vbias.lo
            LD1 {v0.2S}, [x17], 8

            # Load vbias.hi
            LD1 {v1.2S}, [x17], 8

            # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
            FMLA v2.2S, v4.2S, v5.2S
            # Load vi0.lo
            LD1 {v4.2S}, [x7], 8

            # Load vk0.lo
            LD1 {v5.2S}, [x17], 8

            # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
            FMLA v3.2S, v6.2S, v7.2S
            # Load vi0.hi
            LD1 {v6.2S}, [x7], 8

            # Load vk0.hi
            LD1 {v7.2S}, [x17], 8

            # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
            FMLA v2.2S, v8.2S, v9.2S
            # Load vi1.lo
            LD1 {v8.2S}, [x8], 8

            # Load vk1.lo
            LD1 {v9.2S}, [x17], 8

            # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
            FMLA v3.2S, v10.2S, v11.2S
            # Load vi1.hi
            LD1 {v10.2S}, [x8], 8

            # Load vk1.hi
            LD1 {v11.2S}, [x17], 8

            # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
            FMLA v2.2S, v12.2S, v13.2S
            # Load vi2.lo
            LD1 {v12.2S}, [x9], 8

            # Load vk2.lo
            LD1 {v13.2S}, [x17], 8

            # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
            FMLA v3.2S, v14.2S, v15.2S
            # Load vi2.hi
            LD1 {v14.2S}, [x9], 8

            # Load vk2.hi
            LD1 {v15.2S}, [x17], 8

            # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
            FMLA v2.2S, v16.2S, v17.2S
            # Load vi3.lo
            LD1 {v16.2S}, [x10], 8

            # Load vk3.lo
            LD1 {v17.2S}, [x17], 8

            # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
            FMLA v3.2S, v18.2S, v19.2S
            # Load vi3.hi
            LD1 {v18.2S}, [x10], 8

            # Load vk3.hi
            LD1 {v19.2S}, [x17], 8

            # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
            FMLA v2.2S, v20.2S, v21.2S
            # Load vi4.lo
            LD1 {v20.2S}, [x11], 8

            # Load vk4.lo
            LD1 {v21.2S}, [x17], 8

            # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
            FMLA v3.2S, v22.2S, v23.2S
            # Load vi4.hi
            LD1 {v22.2S}, [x11], 8

            # Load vk4.hi
            LD1 {v23.2S}, [x17], 8

            # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
            FMLA v2.2S, v24.2S, v25.2S
            # Load vi5.lo
            LD1 {v24.2S}, [x12], 8

            # Load vk5.lo
            LD1 {v25.2S}, [x17], 8

            # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
            FMLA v3.2S, v26.2S, v27.2S
            # Load vi5.hi
            LD1 {v26.2S}, [x12], 8

            # Load vk5.hi
            LD1 {v27.2S}, [x17], 8

            # vacc.lo += vi0.lo * vk0.lo
            FMLA v0.2S, v4.2S, v5.2S
            # Load vi6.lo
            LD1 {v4.2S}, [x13], 8

            # vacc_prev.lo = min(vacc_prev.lo, vmax)
            FMIN v2.2S, v2.2S, v30.2S
            # Load vk6.lo
            LD1 {v5.2S}, [x17], 8

            # vacc.hi += vi0.hi * vk0.hi
            FMLA v1.2S, v6.2S, v7.2S
            # Load vi6.hi
            LD1 {v6.2S}, [x13], 8

            # vacc_prev.hi = min(vacc_prev.hi, vmax)
            FMIN v3.2S, v3.2S, v30.2S
            # Load vk6.hi
            LD1 {v7.2S}, [x17], 8

            # vacc.lo += vi1.lo * vk1.lo
            FMLA v0.2S, v8.2S, v9.2S
            # Load vi7.lo
            LD1 {v8.2S}, [x14], 8

            # vacc_prev.lo = max(vacc_prev.lo, vmin)
            FMAX v2.2S, v2.2S, v31.2S
            # Load vk7.lo
            LD1 {v9.2S}, [x17], 8

            # vacc.hi += vi1.hi * vk1.hi
            FMLA v1.2S, v10.2S, v11.2S
            # Load vi7.hi
            LD1 {v10.2S}, [x14], 8

            # vacc_prev.hi = max(vacc_prev.hi, vmin)
            FMAX v3.2S, v3.2S, v31.2S
            # Load vk7.hi
            LD1 {v11.2S}, [x17], 8

            # vacc.lo += vi2.lo * vk2.lo
            FMLA v0.2S, v12.2S, v13.2S
            # Load vi8.lo
            LD1 {v12.2S}, [x15], 8

            # Load vk8.lo
            LD1 {v13.2S}, [x17], 8

            # vacc.hi += vi2.hi * vk2.hi
            FMLA v1.2S, v14.2S, v15.2S
            # Load vi8.hi
            LD1 {v14.2S}, [x15], 8

            # Store vacc_prev (4 clamped channels)
            STP d2, d3, [x4], 16

            # Load vk8.hi
            LD1 {v15.2S}, [x17], 8

            # Load vbias_next.lo
            LD1 {v2.2S}, [x17], 8

            # Load vbias_next.hi
            LD1 {v3.2S}, [x17], 8

            # vacc.lo += vi3.lo * vk3.lo
            FMLA v0.2S, v16.2S, v17.2S
            # Load vi0_next.lo
            LD1 {v16.2S}, [x7], 8

            # Load vk0_next.lo
            LD1 {v17.2S}, [x17], 8

            # vacc.hi += vi3.hi * vk3.hi
            FMLA v1.2S, v18.2S, v19.2S
            # Load vi0_next.hi
            LD1 {v18.2S}, [x7], 8

            # Load vk0_next.hi
            LD1 {v19.2S}, [x17], 8

            # vacc.lo += vi4.lo * vk4.lo
            FMLA v0.2S, v20.2S, v21.2S
            # Load vi1_next.lo
            LD1 {v20.2S}, [x8], 8

            # Load vk1_next.lo
            LD1 {v21.2S}, [x17], 8

            # vacc.hi += vi4.hi * vk4.hi
            FMLA v1.2S, v22.2S, v23.2S
            # Load vi1_next.hi
            LD1 {v22.2S}, [x8], 8

            # Load vk1_next.hi
            LD1 {v23.2S}, [x17], 8

            # vacc.lo += vi5.lo * vk5.lo
            FMLA v0.2S, v24.2S, v25.2S
            # Load vi2_next.lo
            LD1 {v24.2S}, [x9], 8

            # Load vk2_next.lo
            LD1 {v25.2S}, [x17], 8

            # vacc.hi += vi5.hi * vk5.hi
            FMLA v1.2S, v26.2S, v27.2S
            # Load vi2_next.hi
            LD1 {v26.2S}, [x9], 8

            # Load vk2_next.hi
            LD1 {v27.2S}, [x17], 8

            # vacc.lo += vi6.lo * vk6.lo
            FMLA v0.2S, v4.2S, v5.2S
            # Load vi3_next.lo
            LD1 {v4.2S}, [x10], 8

            # Load vk3_next.lo
            LD1 {v5.2S}, [x17], 8

            # vacc.hi += vi6.hi * vk6.hi
            FMLA v1.2S, v6.2S, v7.2S
            # Load vi3_next.hi
            LD1 {v6.2S}, [x10], 8

            # Load vk3_next.hi
            LD1 {v7.2S}, [x17], 8

            # vacc.lo += vi7.lo * vk7.lo
            FMLA v0.2S, v8.2S, v9.2S
            # Load vi4_next.lo
            LD1 {v8.2S}, [x11], 8

            # Load vk4_next.lo
            LD1 {v9.2S}, [x17], 8

            # vacc.hi += vi7.hi * vk7.hi
            FMLA v1.2S, v10.2S, v11.2S
            # Load vi4_next.hi
            LD1 {v10.2S}, [x11], 8

            # Load vk4_next.hi
            LD1 {v11.2S}, [x17], 8

            # vacc.lo += vi8.lo * vk8.lo
            FMLA v0.2S, v12.2S, v13.2S
            # Load vi5_next.lo
            LD1 {v12.2S}, [x12], 8

            # Load vk5_next.lo
            LD1 {v13.2S}, [x17], 8

            # vacc.hi += vi8.hi * vk8.hi
            FMLA v1.2S, v14.2S, v15.2S
            # Load vi5_next.hi
            LD1 {v14.2S}, [x12], 8

            # Load vk5_next.hi
            LD1 {v15.2S}, [x17], 8

            # vacc_next.lo += vi0_next.lo * vk0_next.lo
            FMLA v2.2S, v16.2S, v17.2S
            # Load vi6_next.lo
            LD1 {v16.2S}, [x13], 8

            # vacc.lo = min(vacc.lo, vmax)
            FMIN v0.2S, v0.2S, v30.2S
            # Load vk6_next.lo
            LD1 {v17.2S}, [x17], 8

            # vacc_next.hi += vi0_next.hi * vk0_next.hi
            FMLA v3.2S, v18.2S, v19.2S
            # Load vi6_next.hi
            LD1 {v18.2S}, [x13], 8

            # vacc.hi = min(vacc.hi, vmax)
            FMIN v1.2S, v1.2S, v30.2S
            # Load vk6_next.hi
            LD1 {v19.2S}, [x17], 8

            # vacc_next.lo += vi1_next.lo * vk1_next.lo
            FMLA v2.2S, v20.2S, v21.2S
            # Load vi7_next.lo
            LD1 {v20.2S}, [x14], 8

            # vacc.lo = max(vacc.lo, vmin)
            FMAX v0.2S, v0.2S, v31.2S
            # Load vk7_next.lo
            LD1 {v21.2S}, [x17], 8

            # vacc_next.hi += vi1_next.hi * vk1_next.hi
            FMLA v3.2S, v22.2S, v23.2S
            # Load vi7_next.hi
            LD1 {v22.2S}, [x14], 8

            # vacc.hi = max(vacc.hi, vmin)
            FMAX v1.2S, v1.2S, v31.2S
            # Load vk7_next.hi
            LD1 {v23.2S}, [x17], 8

            # vacc_next.lo += vi2_next.lo * vk2_next.lo
            FMLA v2.2S, v24.2S, v25.2S
            # Load vi8_next.lo
            LD1 {v24.2S}, [x15], 8

            # Load vk8_next.lo
            LD1 {v25.2S}, [x17], 8

            # vacc_next.hi += vi2_next.hi * vk2_next.hi
            FMLA v3.2S, v26.2S, v27.2S
            # Load vi8_next.hi
            LD1 {v26.2S}, [x15], 8

            # Store vacc (4 clamped channels)
            STP d0, d1, [x4], 16

            # c -= 8
            SUBS x16, x16, 8
            # Load vk8_next.hi
            LD1 {v27.2S}, [x17], 8

            B.HS 1b

2:
          # SWP epilogue: drain the pipeline — finish taps 3-8 of the
          # last in-flight group, clamp, and store it.

          # vacc_prev.lo += vi3_prev.lo * vk3_prev.lo
          FMLA v2.2S, v4.2S, v5.2S

          # vacc_prev.hi += vi3_prev.hi * vk3_prev.hi
          FMLA v3.2S, v6.2S, v7.2S

          # vacc_prev.lo += vi4_prev.lo * vk4_prev.lo
          FMLA v2.2S, v8.2S, v9.2S

          # vacc_prev.hi += vi4_prev.hi * vk4_prev.hi
          FMLA v3.2S, v10.2S, v11.2S

          # vacc_prev.lo += vi5_prev.lo * vk5_prev.lo
          FMLA v2.2S, v12.2S, v13.2S

          # vacc_prev.hi += vi5_prev.hi * vk5_prev.hi
          FMLA v3.2S, v14.2S, v15.2S

          # vacc_prev.lo += vi6_prev.lo * vk6_prev.lo
          FMLA v2.2S, v16.2S, v17.2S

          # vacc_prev.hi += vi6_prev.hi * vk6_prev.hi
          FMLA v3.2S, v18.2S, v19.2S

          # vacc_prev.lo += vi7_prev.lo * vk7_prev.lo
          FMLA v2.2S, v20.2S, v21.2S

          # vacc_prev.hi += vi7_prev.hi * vk7_prev.hi
          FMLA v3.2S, v22.2S, v23.2S

          # vacc_prev.lo += vi8_prev.lo * vk8_prev.lo
          FMLA v2.2S, v24.2S, v25.2S

          # vacc_prev.hi += vi8_prev.hi * vk8_prev.hi
          FMLA v3.2S, v26.2S, v27.2S

          # vacc_prev.lo = min(vacc_prev.lo, vmax)
          FMIN v2.2S, v2.2S, v30.2S

          # vacc_prev.hi = min(vacc_prev.hi, vmax)
          FMIN v3.2S, v3.2S, v30.2S

          # vacc_prev.lo = max(vacc_prev.lo, vmin)
          FMAX v2.2S, v2.2S, v31.2S

          # vacc_prev.hi = max(vacc_prev.hi, vmin)
          FMAX v3.2S, v3.2S, v31.2S

          # Store vacc_prev
          STP d2, d3, [x4], 16

3:
        # Tail: x16 currently holds c - 8 (mod 8 arithmetic); bit 2 of
        # (c - 8) equals bit 2 of c, so TBZ skips this 4-channel chunk
        # when (c & 4) == 0.
        TBZ x16, 2, 4f

        # q0 = bias, q1-q9 = kernel taps k0-k8 (4 channels each)
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        # q10-q18 = 4 channels from each input row i0-i8
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        # vacc = bias + sum over the 9 taps of i_j * k_j
        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        # clamp to [vmin, vmax]
        FMIN v0.4S, v0.4S, v30.4S
        FMAX v0.4S, v0.4S, v31.4S

        STR q0, [x4], 16

4:
        # restore actual c value (x16 = remainder channel count)
        ADD x16, x16, 8
        # skip remainder processing if c == 0
        # NOTE(review): when the remainder is exactly 4 this path still
        # executes (both stores below are skipped); the extra weight and
        # input loads presumably rely on XNNPACK buffer padding — confirm.
        CBZ x16, 6f

        # Compute a full 4-wide result; only 1-3 lanes will be stored.
        # q0 = bias, q1-q9 = k0-k8, q10-q18 = i0-i8
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        FMIN v0.4S, v0.4S, v30.4S
        FMAX v0.4S, v0.4S, v31.4S

        # store 2 channels if (c & 2) != 0
        TBZ x16, 1, 5f

        ST1 {v0.2S}, [x4], 8
        # shift the upper lane pair down for the possible 1-channel store
        DUP d0, v0.D[1]

5:
        # store 1 channel if (c & 1) != 0
        TBZ x16, 0, 6f

        ST1 {v0.S}[0], [x4], 4

6:
        # output_width -= 1
        SUBS x1, x1, 1
        # output += output_increment
        ADD x4, x4, x6
        # process next pixel if output_width != 0
        B.NE 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif