// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Computes one row of C in tiles of 12 columns:
#   c0[n] = clamp(bias[n] + sum over k of a0[k] * w[k][n])
# nc is counted in floats (12 are consumed per tile), kc in bytes of A
# (the unrolled paths step it by 32 bytes = 8 floats).
# For every tile w is read sequentially: 12 bias floats (48 bytes) followed
# by 12 weights (48 bytes) for each float of A.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches none of them, so nothing is saved on the stack.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v2 v3

# Other registers
# x0        k, byte counter derived from kc for the unrolled paths
# x14       cn_stride, added to c0 after each full tile
# x8        params pointer
# v0 v1     values of A (4 floats each)
# v16-v18   accumulators, seeded with the bias
# v5-v7     second set of accumulators, seeded with zero
# v20-v31   weights

# A53 based on A57/A75 but with LD64

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values
        # v2 is used as the lower bound (FMAX), v3 as the upper bound (FMIN)
        LD2R {v2.4s, v3.4s}, [x8]
0:
        # Start of one tile of 12 columns.
        # Load initial bias from w into accumulators
        LD1 {v16.16b, v17.16b, v18.16b}, [x5], 48

        MOVI v5.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v6.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        MOVI v7.4s, 0
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        B.LO 3f

        # Prologue - 4 floats of A (16 bytes)
        # Read first block of 1 A and B.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDP q28, q29, [x5], 32
        LDP q30, q31, [x5], 32
        LDR q0, [x3], 16

        # Is there at least 32.  yes do main loop
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each weight register is reloaded only after the FMLA that reads it,
        # so the loads below fetch data for the block that follows.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16
        LDR q31, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR  q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v1.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v1.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v1.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v1.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v1.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v1.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v1.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v1.s[3]
        LDR q30, [x5], 16
        SUBS x0, x0, 32
        LDR q31, [x5], 16
        B.HS 1b

2:
        # Epilogue
        # Consumes the block loaded by the prologue or the last loop pass,
        # plus one final block of 4 floats of A.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16

        # Second block of 4.  FMA for second 4.  Only the last weight
        # register (q31) of this block is still loaded here.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q31, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[0]
        FMLA  v5.4s, v23.4s, v1.s[1]
        FMLA  v6.4s, v24.4s, v1.s[1]
        FMLA  v7.4s, v25.4s, v1.s[1]
        FMLA v16.4s, v26.4s, v1.s[2]
        FMLA v17.4s, v27.4s, v1.s[2]
        FMLA v18.4s, v28.4s, v1.s[2]
        FMLA  v5.4s, v29.4s, v1.s[3]
        FMLA  v6.4s, v30.4s, v1.s[3]
        FMLA  v7.4s, v31.4s, v1.s[3]

3:
        # x0 differs from kc by a multiple of 32, so bits 4, 3 and 2 of x0
        # are the same as those bits of kc.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second set of accumulators into the first
        FADD v16.4s, v16.4s, v5.4s
        FADD v17.4s, v17.4s, v6.4s
        FADD v18.4s, v18.4s, v7.4s
        # The flags set here drive both B.LO and B.HI below; no instruction
        # in between writes the flags.
        SUBS x1, x1, 12

        # Clamp
        FMAX v16.4s, v16.4s, v2.4s
        FMAX v17.4s, v17.4s, v2.4s
        FMAX v18.4s, v18.4s, v2.4s
        FMIN v16.4s, v16.4s, v3.4s
        FMIN v17.4s, v17.4s, v3.4s
        FMIN v18.4s, v18.4s, v3.4s

        # Store full 1 x 12
        # Fewer than 12 columns left goes to the partial store instead
        B.LO 9f

        ST1 {v16.16b, v17.16b, v18.16b}, [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc

        # More columns remain, start the next tile
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDR q0, [x3], 16
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[2]
        FMLA v17.4s, v21.4s, v0.s[2]
        FMLA v18.4s, v22.4s, v0.s[2]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[3]
        FMLA v17.4s, v21.4s, v0.s[3]
        FMLA v18.4s, v22.4s, v0.s[3]

        # Skip the 2 float remainder when bit 3 is clear
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]
7:
        # No single float left, go reduce, clamp and store
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]
        B 4b

        # Store odd channels
        # Reached with fewer than 12 columns left.  The count is rebuilt and
        # decomposed into 8 + 4 + 2 + 1; after each partial store the next
        # unsaved values are moved down into v16.
9:
        ADD x1, x1, 12
        TBZ x1, 3, 10f
        STP q16, q17, [x6], 32
        MOV v16.16b, v18.16b

10:
        # Bits 3 and 2 cannot both be set (that would be 12 or more), so
        # when this store runs v16 and v17 still hold columns 0-7.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

12:
        TBZ x1, 0, 13f
        STR s16, [x6]
13:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
// ELF targets only: emit an empty .note.GNU-stack section with no flags,
// the conventional marker that this object does not need an executable stack.
.section ".note.GNU-stack","",%progbits
#endif