// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers (one per row of the 6-row tile)
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers (one per row of the 6-row tile)
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Clamp v6, (v4), (v5)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        # Load acc (x15) and params (x8) pointers from stack arguments
        LDP     x15, x8, [sp, 8]

        # Clamp A and C pointers: rows beyond mr alias the previous row, so
        # out-of-range rows compute (and later store) duplicate data harmlessly.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params: d6 = {scale, min, max} as packed fp16
        LDR     d6, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        LDR     x8, [sp]                // load cn_stride

        # Outer loop: one iteration per 8 columns of C (nc)
0:
        # Load initial accumulators for the 6x8 tile from acc
        LDP     q20, q22, [x15], 32
        LDP     q24, q26, [x15], 32
        LDP     q28, q30, [x15], 32

        # Is there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        # Main loop - 4 halffloats of A (8 bytes) per iteration
        # 24 FMA + 6 ld64 A + 4 LDR B
1:
        LDR     d0,  [x3], 8
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     d1,  [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5,  [x4], 8
        SUBS    x0, x0, 8
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16

        FMLA    v20.8h, v17.8h,  v0.h[1]
        FMLA    v22.8h, v17.8h,  v1.h[1]
        FMLA    v24.8h, v17.8h,  v2.h[1]
        FMLA    v26.8h, v17.8h,  v3.h[1]
        FMLA    v28.8h, v17.8h,  v4.h[1]
        FMLA    v30.8h, v17.8h,  v5.h[1]

        FMLA    v20.8h, v18.8h,  v0.h[2]
        FMLA    v22.8h, v18.8h,  v1.h[2]
        FMLA    v24.8h, v18.8h,  v2.h[2]
        FMLA    v26.8h, v18.8h,  v3.h[2]
        FMLA    v28.8h, v18.8h,  v4.h[2]
        FMLA    v30.8h, v18.8h,  v5.h[2]

        FMLA    v20.8h, v19.8h,  v0.h[3]
        FMLA    v22.8h, v19.8h,  v1.h[3]
        FMLA    v24.8h, v19.8h,  v2.h[3]
        FMLA    v26.8h, v19.8h,  v3.h[3]
        FMLA    v28.8h, v19.8h,  v4.h[3]
        FMLA    v30.8h, v19.8h,  v5.h[3]
        B.HS    1b

        # Is there a remainder? - 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 4f
        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 5f
2:
        # Scale and Clamp: C = min(max(C * scale, min), max)
        FMUL    v20.8h, v20.8h, v6.h[0]
        DUP     v4.8h, v6.h[1]           // broadcast min
        FMUL    v22.8h, v22.8h, v6.h[0]
        DUP     v5.8h, v6.h[2]           // broadcast max
        FMUL    v24.8h, v24.8h, v6.h[0]
        FMUL    v26.8h, v26.8h, v6.h[0]
        FMUL    v28.8h, v28.8h, v6.h[0]
        FMUL    v30.8h, v30.8h, v6.h[0]
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        SUBS    x1, x1, 8               // nc -= 8; sets flags for B.LO/B.HI below
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h

        # Store full 6 x 8; partial-width store if fewer than 8 columns remain
        B.LO    6f

        ST1     {v30.16b},  [x7], x8
        SUB     x3,  x3, x2             // a0 -= kc (rewind A for next nc block)
        ST1     {v28.16b}, [x13], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v26.16b}, [x14], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v24.16b}, [x17], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v22.16b}, [x16], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v20.16b},  [x6], x8
        SUB     x4,  x4, x2             // a5 -= kc

        B.HI    0b
        RET

3:
        # kc < 8 bytes: dispatch directly to the remainder handlers
        TBZ     x0, 2, 5f
4:
        # Remainder - 2 halffloats of A (4 bytes)
        LDR     s0,  [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5,  [x4], 4

        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]

        FMLA    v20.8h, v17.8h,  v0.h[1]
        FMLA    v22.8h, v17.8h,  v1.h[1]
        FMLA    v24.8h, v17.8h,  v2.h[1]
        FMLA    v26.8h, v17.8h,  v3.h[1]
        FMLA    v28.8h, v17.8h,  v4.h[1]
        FMLA    v30.8h, v17.8h,  v5.h[1]

        TBZ     x0, 1, 2b

5:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR     h0,  [x3], 2
        LDR     q16,  [x5], 16
        LDR     h1,  [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5,  [x4], 2
        FMLA    v20.8h, v16.8h,  v0.h[0]
        FMLA    v22.8h, v16.8h,  v1.h[0]
        FMLA    v24.8h, v16.8h,  v2.h[0]
        FMLA    v26.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v30.8h, v16.8h,  v5.h[0]
        B       2b

        # Store odd width: 4, then 2, then 1 column, shifting lanes down between steps
6:
        TBZ     x1, 2, 7f
        STR     d30,  [x7], 8
        STR     d28, [x13], 8
        DUP     d30, v30.d[1]
        DUP     d28, v28.d[1]
        STR     d26, [x14], 8
        STR     d24, [x17], 8
        DUP     d26, v26.d[1]
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        STR     d20,  [x6], 8
        DUP     d22, v22.d[1]
        DUP     d20, v20.d[1]

7:
        TBZ     x1, 1, 8f
        STR     s30,  [x7], 4
        STR     s28, [x13], 4
        DUP     s30, v30.s[1]
        DUP     s28, v28.s[1]
        STR     s26, [x14], 4
        STR     s24, [x17], 4
        DUP     s26, v26.s[1]
        DUP     s24, v24.s[1]
        STR     s22, [x16], 4
        STR     s20,  [x6], 4
        DUP     s22, v22.s[1]
        DUP     s20, v20.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h30,  [x7]
        STR     h28, [x13]
        STR     h26, [x14]
        STR     h24, [x17]
        STR     h22, [x16]
        STR     h20,  [x6]
9:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
// Mark the stack non-executable on ELF targets (suppresses executable-stack warnings).
.section ".note.GNU-stack","",%progbits
#endif