// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/8x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)

#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
# x19 a5
# x20 a6
#  x4 a7

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x21 c5
# x22 c6
#  x7 c7

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# A6   v6
# A7   v7
# B   v16 v17 v18 v19
# C   v24
# C   v25
# C   v26
# C   v27
# C   v28
# C   v29
# C   v30
# C   v31

# Scale v20, Clamp v21 (lower bound) v22 (upper bound)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64

        // Overview (as implemented below):
        //   - Builds 8 row pointers into A and 8 into C. A row whose index is
        //     not below mr reuses the pointer of the previous row.
        //   - For every block of 8 output columns: seeds the 8 accumulators
        //     v24-v31 with the 8 halffloats at w, accumulates A * B over kc
        //     bytes of each A row, multiplies by v20, clamps with v21/v22 and
        //     stores one 8-halffloat row per C pointer.
        //
        // Register reuse:
        //   x0  mr during pointer setup, then the running k byte counter
        //   x4  a_stride during pointer setup, then the a7 pointer
        //   x7  cm_stride during pointer setup, then the c7 pointer
        //   x8  params pointer, then cn_stride

        # Load params pointer
        // Done before the stack push below because the offset is relative to
        // the sp value on entry.
        LDR     x8, [sp, 8]

        # Save x19,x20,x21,x22 on stack
        STP     x19, x20, [sp, -32]!
        STP     x21, x22, [sp, 16]

        # Clamp A and C pointers
        // Each CMP feeds two CSEL pairs: LO selects on mr < N and LS on
        // mr <= N. The ADD, CSEL and LD3R instructions in between leave the
        // condition flags untouched.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load params
        // Broadcasts 3 consecutive halffloats: v20 is the FMUL scale, v21 the
        // FMAX bound and v22 the FMIN bound used after accumulation.
        LD3R    {v20.8h, v21.8h, v22.8h}, [x8]

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x19, x12, x4            // a5 = a4 + a_stride
        ADD     x21, x13, x7            // c5 = c4 + cm_stride
        CSEL    x19, x12, x19, LO       //   a5 = a4
        CSEL    x21, x13, x21, LO       //   c5 = c4

        ADD     x20, x19, x4            // a6 = a5 + a_stride
        ADD     x22, x21, x7            // c6 = c5 + cm_stride
                                        // if mr <= 6
        CSEL    x20, x19, x20, LS       //   a6 = a5
        CSEL    x22, x21, x22, LS       //   c6 = c5

        // From here on x4 and x7 hold row pointers; the strides are gone.
        CMP     x0, 8                   // if mr < 8
        ADD     x4, x20, x4             // a7 = a6 + a_stride
        ADD     x7, x22, x7             // c7 = c6 + cm_stride
        CSEL    x4, x20, x4, LO         //   a7 = a6
        CSEL    x7, x22, x7, LO         //   c7 = c6

        // cn_stride was at [sp] on entry; sp is now 32 bytes lower.
        LDR     x8, [sp, 32]            // load cn_stride

        // Outer loop: one iteration per block of 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDR     q24, [x5], 16
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v28.16b, v24.16b
        MOV     v29.16b, v24.16b
        MOV     v30.16b, v24.16b
        MOV     v31.16b, v24.16b

        # Are there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 32 FMA + 8 ld64 A + 4 LDR B
1:
        LDR     d0,  [x3], 8
        LDR     q16,  [x5], 16
        LDR     q17,  [x5], 16
        LDR     d1,  [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x19], 8
        LDR     d6, [x20], 8
        LDR     d7,  [x4], 8
        // Flags set here are consumed by the B.HS at the bottom of the loop;
        // the loads and FMLAs in between do not modify them.
        SUBS    x0, x0, 8
        FMLA    v24.8h, v16.8h,  v0.h[0]
        FMLA    v25.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v2.h[0]
        FMLA    v27.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v16.8h,  v5.h[0]
        FMLA    v30.8h, v16.8h,  v6.h[0]
        FMLA    v31.8h, v16.8h,  v7.h[0]
        LDR     q18,  [x5], 16
        LDR     q19,  [x5], 16

        FMLA    v24.8h, v17.8h,  v0.h[1]
        FMLA    v25.8h, v17.8h,  v1.h[1]
        FMLA    v26.8h, v17.8h,  v2.h[1]
        FMLA    v27.8h, v17.8h,  v3.h[1]
        FMLA    v28.8h, v17.8h,  v4.h[1]
        FMLA    v29.8h, v17.8h,  v5.h[1]
        FMLA    v30.8h, v17.8h,  v6.h[1]
        FMLA    v31.8h, v17.8h,  v7.h[1]

        FMLA    v24.8h, v18.8h,  v0.h[2]
        FMLA    v25.8h, v18.8h,  v1.h[2]
        FMLA    v26.8h, v18.8h,  v2.h[2]
        FMLA    v27.8h, v18.8h,  v3.h[2]
        FMLA    v28.8h, v18.8h,  v4.h[2]
        FMLA    v29.8h, v18.8h,  v5.h[2]
        FMLA    v30.8h, v18.8h,  v6.h[2]
        FMLA    v31.8h, v18.8h,  v7.h[2]

        FMLA    v24.8h, v19.8h,  v0.h[3]
        FMLA    v25.8h, v19.8h,  v1.h[3]
        FMLA    v26.8h, v19.8h,  v2.h[3]
        FMLA    v27.8h, v19.8h,  v3.h[3]
        FMLA    v28.8h, v19.8h,  v4.h[3]
        FMLA    v29.8h, v19.8h,  v5.h[3]
        FMLA    v30.8h, v19.8h,  v6.h[3]
        FMLA    v31.8h, v19.8h,  v7.h[3]
        B.HS    1b

        // x0 now equals (remaining bytes - 8). Subtracting 8 does not change
        // bits 0-2, so bits 2 and 1 of x0 are those of the remaining count.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 4f
        # Is there a remainder?- 1 halffloats of A (2 bytes)
        TBNZ    x0, 1, 5f
2:
        # Scale and Clamp
        FMUL    v24.8h, v24.8h, v20.8h
        FMUL    v25.8h, v25.8h, v20.8h
        FMUL    v26.8h, v26.8h, v20.8h
        FMUL    v27.8h, v27.8h, v20.8h
        FMUL    v28.8h, v28.8h, v20.8h
        FMUL    v29.8h, v29.8h, v20.8h
        FMUL    v30.8h, v30.8h, v20.8h
        FMUL    v31.8h, v31.8h, v20.8h
        FMAX    v24.8h, v24.8h, v21.8h
        FMAX    v25.8h, v25.8h, v21.8h
        FMAX    v26.8h, v26.8h, v21.8h
        FMAX    v27.8h, v27.8h, v21.8h
        FMAX    v28.8h, v28.8h, v21.8h
        FMAX    v29.8h, v29.8h, v21.8h
        FMAX    v30.8h, v30.8h, v21.8h
        FMAX    v31.8h, v31.8h, v21.8h
        // nc -= 8. These flags drive both the B.LO below and the B.HI after
        // the stores; FMIN, ST1 and SUB do not modify them.
        SUBS    x1, x1, 8
        FMIN    v24.8h, v24.8h, v22.8h
        FMIN    v25.8h, v25.8h, v22.8h
        FMIN    v26.8h, v26.8h, v22.8h
        FMIN    v27.8h, v27.8h, v22.8h
        FMIN    v28.8h, v28.8h, v22.8h
        FMIN    v29.8h, v29.8h, v22.8h
        FMIN    v30.8h, v30.8h, v22.8h
        FMIN    v31.8h, v31.8h, v22.8h

        # Store full 8 x 8, unless fewer than 8 columns were left
        B.LO    6f

        // Each C pointer advances by cn_stride; each A pointer is rewound by
        // the kc bytes consumed above so the next column block rereads A.
        ST1     {v24.16b},  [x6], x8
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v25.16b}, [x16], x8
        SUB     x9,  x9, x2             // a1 -= kc
        ST1     {v26.16b}, [x17], x8
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v27.16b}, [x14], x8
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b}, [x13], x8
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v29.16b}, [x21], x8
        SUB     x19, x19, x2            // a5 -= kc
        ST1     {v30.16b}, [x22], x8
        SUB     x20, x20, x2            // a6 -= kc
        ST1     {v31.16b},  [x7], x8
        SUB     x4,  x4, x2             // a7 -= kc

        // Continue while columns remain (nc was greater than 8 before the SUBS).
        B.HI    0b

        # Restore x19,x20,x21,x22 from stack
        LDP     x21, x22, [sp, 16]
        LDP     x19, x20, [sp], 32
        RET

        // kc < 8 bytes: x0 = kc - 8, so bit 2 still tells whether 4 bytes are
        // present. With bit 2 clear the code goes straight to the 2-byte
        // remainder without testing bit 1.
        // NOTE(review): this presumably relies on kc being a nonzero multiple
        // of 2 bytes - confirm against the caller contract.
3:
        TBZ     x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0,  [x3], 4
        LDR     q16,  [x5], 16
        LDR     q17,  [x5], 16
        LDR     s1,  [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x19], 4
        LDR     s6, [x20], 4
        LDR     s7,  [x4], 4

        FMLA    v24.8h, v16.8h,  v0.h[0]
        FMLA    v25.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v2.h[0]
        FMLA    v27.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v16.8h,  v5.h[0]
        FMLA    v30.8h, v16.8h,  v6.h[0]
        FMLA    v31.8h, v16.8h,  v7.h[0]

        FMLA    v24.8h, v17.8h,  v0.h[1]
        FMLA    v25.8h, v17.8h,  v1.h[1]
        FMLA    v26.8h, v17.8h,  v2.h[1]
        FMLA    v27.8h, v17.8h,  v3.h[1]
        FMLA    v28.8h, v17.8h,  v4.h[1]
        FMLA    v29.8h, v17.8h,  v5.h[1]
        FMLA    v30.8h, v17.8h,  v6.h[1]
        FMLA    v31.8h, v17.8h,  v7.h[1]

        // No single trailing halffloat: go scale, clamp and store.
        TBZ     x0, 1, 2b

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR     h0,  [x3], 2
        LDR     q16,  [x5], 16
        LDR     h1,  [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5, [x19], 2
        LDR     h6, [x20], 2
        LDR     h7,  [x4], 2

        FMLA    v24.8h, v16.8h,  v0.h[0]
        FMLA    v25.8h, v16.8h,  v1.h[0]
        FMLA    v26.8h, v16.8h,  v2.h[0]
        FMLA    v27.8h, v16.8h,  v3.h[0]
        FMLA    v28.8h, v16.8h,  v4.h[0]
        FMLA    v29.8h, v16.8h,  v5.h[0]
        FMLA    v30.8h, v16.8h,  v6.h[0]
        FMLA    v31.8h, v16.8h,  v7.h[0]
        B       2b

        # Store odd width
        // x1 = (remaining columns - 8) here, so bits 2, 1 and 0 match those of
        // the remaining column count: store 4, then 2, then 1 halffloats per
        // row as indicated. After each partial store the DUP moves the next
        // unstored lanes down to lane 0.
6:
        TBZ     x1, 2, 7f
        STR     d24,  [x6], 8
        STR     d25, [x16], 8
        DUP     d24, v24.d[1]
        DUP     d25, v25.d[1]
        STR     d26, [x17], 8
        STR     d27, [x14], 8
        DUP     d26, v26.d[1]
        DUP     d27, v27.d[1]
        STR     d28, [x13], 8
        STR     d29, [x21], 8
        DUP     d28, v28.d[1]
        DUP     d29, v29.d[1]
        STR     d30, [x22], 8
        STR     d31,  [x7], 8
        DUP     d30, v30.d[1]
        DUP     d31, v31.d[1]
7:
        TBZ     x1, 1, 8f
        STR     s24,  [x6], 4
        STR     s25, [x16], 4
        DUP     s24, v24.s[1]
        DUP     s25, v25.s[1]
        STR     s26, [x17], 4
        STR     s27, [x14], 4
        DUP     s26, v26.s[1]
        DUP     s27, v27.s[1]
        STR     s28, [x13], 4
        STR     s29, [x21], 4
        DUP     s28, v28.s[1]
        DUP     s29, v29.s[1]
        STR     s30, [x22], 4
        STR     s31,  [x7], 4
        DUP     s30, v30.s[1]
        DUP     s31, v31.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h24,  [x6]
        STR     h25, [x16]
        STR     h26, [x17]
        STR     h27, [x14]
        STR     h28, [x13]
        STR     h29, [x21]
        STR     h30, [x22]
        STR     h31,  [x7]
9:
        # Restore x19,x20,x21,x22 from stack
        LDP     x21, x22, [sp, 16]
        LDP     x19, x20, [sp], 32
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64

// On ELF targets, emit an empty .note.GNU-stack section (GNU toolchain
// convention for an object that does not request an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif