// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a single row of C in blocks of 8 columns.  For each block the
# accumulators start from 8 floats read from w (or from acc in the gemminc
# variant), then for every float of A the next 8 floats of w are multiplied
# by it and accumulated.  The result is clamped with v4/v5 and stored to c.
#
# kc is a byte count (32 bytes = 8 floats of A).  Only bits 4..2 of the
# remainder are tested, so kc is presumably always a multiple of 4 bytes -
# NOTE(review): confirm against the callers.
# nc is a column count; a final block of fewer than 8 columns is stored
# piecewise (4, 2, 1 floats).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# Neither range is touched by this kernel, so nothing is saved or restored.

# A pointer
# x3  a0

# B (weights) pointer
# x5  w

# C pointer
# x6  c0

# Vector registers
# A      v0  v1
# B      v20 v21 v22 v23 v24 v25 v26 v27
# C      v16 v17  (v18 v19 hold a second partial sum, added in before clamping)

# Clamp v4 v5
# v4 is the FMIN operand, v5 is the FMAX operand

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # v4 = first float of params, v5 = second float, each replicated to all 4 lanes
        LD2R {v4.4s, v5.4s}, [x8]
0:
        # Start of one block of 8 output columns
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        B.LO 3f

        # Prologue - preload B for 4 floats of A (4 x 8 floats) and 4 floats of A
        # Read first block of 1 A and B.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Are there at least 8 more floats (32 bytes) of A?  Yes - run the main loop
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Loads for the next half-step are interleaved with the FMLAs of the
        # current one; each B register is reloaded only after its last use.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 96]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32
        B.HS 1b

2:
        # Epilogue - consumes the 4 floats of A preloaded in v0 plus 4 more,
        # without preloading anything for a further iteration

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # x0 is kc minus a multiple of 32 here, so bits 4..2 of x0 match kc
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Add the second partial sum into the first.  The SUBS result (nc - 8)
        # drives both B.LO and B.HI below; no instruction in between sets flags.
        FADD v16.4s, v16.4s, v18.4s
        SUBS x1, x1, 8
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMIN v16.4s, v16.4s, v4.4s
        FMIN v17.4s, v17.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s

        # Store full 1 x 8, unless fewer than 8 columns were left
        B.LO 9f

        STP q16, q17, [x6]
        ADD x6, x6, x14

        SUB  x3,  x3, x2 // a0 -= kc

        # Loop while columns remain (nc - 8 > 0)
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # Skip the 2-float step when bit 3 of the remainder is clear
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # Skip the 1-float step when bit 2 of the remainder is clear
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # x1 is nc - 8 here; its low 3 bits equal those of nc.  After each
        # partial store the remaining lanes are moved down into the low lanes of v16.
9:
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

10:
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

11:
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

// On ELF targets emit an empty .note.GNU-stack section, which tells the
// linker that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif