// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
#
# Single-row F32 GEMM micro-kernel with output clamping.  For every group of
# 8 output columns it computes
#     c[0..7] = clamp(bias[0..7] + sum_k a[k] * b[k][0..7])
# where w holds, per column group, 8 bias floats followed by 8 floats of B
# for each k.  w (x5) is read strictly sequentially and is never rewound.
#
# Units and conventions, as used by the code:
#   kc        - byte count of one row of A (compared against 32 = 8 floats, and
#               subtracted from the A pointer to rewind it).  Bits 0-1 are
#               never examined, so kc is presumably a multiple of 4 - confirm
#               against the caller.
#   nc        - number of output columns still to produce; reduced by 8 per pass.
#   cn_stride - byte offset added to the C pointer after a full 8-column store.
#   params    - two consecutive floats: the first is the lower clamp bound
#               (applied with FMAX), the second the upper bound (FMIN).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of them are touched here, so nothing is saved and the stack is only read.

# Register usage
# x0        k, bytes of A still to process (mr is unused, so x0 is recycled)
# x1        nc, columns remaining
# x2        kc
# x3        a0, A pointer
# x5        w, packed bias + B pointer
# x6        c0, C pointer
# x8        params pointer
# x14       cn_stride
# v0  v1    A values, 4 floats each
# v4  v5    clamp bounds: v4 lower, v5 upper
# v16 v17   accumulators for columns 0-3 / 4-7, seeded with the bias
# v18 v19   second accumulator pair, seeded with zero, added in at label 4
# v20-v27   B values, 4 rows of 8 floats

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values, each replicated across all 4 lanes
        LD2R {v4.4s, v5.4s}, [x8]

        # Start of one group of 8 output columns
0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        # Fewer than 8 floats of A: only the remainder code runs
        B.LO 3f

        # Prologue - 4 floats of A (16 bytes)
        # Read first block of 1 A and B.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Is there at least another 8 floats (32 bytes) beyond the 8 that the
        # epilogue will consume?  If so run the main loop, otherwise go
        # straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Software pipelined: each half multiplies the block loaded by the
        # previous half while loading the block for the next half.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 96]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32
        # The LDP above does not alter the flags set by SUBS
        B.HS 1b

2:
        # Epilogue - 8 floats of A (32 bytes)
        # Consumes the block already in v0 / v20-v27 and loads one final block.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Only multiples of 32 have been subtracted from x0, so its bits 4..2
        # still equal those of kc and select the 16 / 8 / 4 byte remainders.

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Merge the two accumulator pairs.  The flags set by SUBS are consumed
        # by B.LO and B.HI further down; none of the instructions in between
        # modify them.
        FADD v16.4s, v16.4s, v18.4s
        SUBS x1, x1, 8
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Store full 1 x 8, unless fewer than 8 columns were left
        B.LO 9f

        STP q16, q17, [x6]
        ADD x6, x6, x14

        SUB  x3,  x3, x2 // a0 -= kc

        # More than 8 columns were left before this pass: do the next group
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # Skip the 2-float step when bit 3 of k is clear
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # Finish unless a single float of A is still pending
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # x1 now holds the column count minus 8; its low three bits equal those
        # of the column count, so bits 2, 1, 0 select stores of 4, 2, 1 floats.
9:
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        # Columns 4-7 become the pending data for the narrower stores below
        MOV v16.16b, v17.16b

10:
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        # Shift the upper two floats down into the low half
        DUP d16, v16.d[1]

11:
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
// Emit an empty .note.GNU-stack section (no flags) for ELF targets; GNU
// linkers read this as the object not needing an executable stack.
.section ".note.GNU-stack","",%progbits
#endif
