• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-ld32.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x8)
22
23#     const void*restrict acc,   [sp + 8] -> x15
24#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# A pointers
29#  x3 a0
30#  x9 a1
31# x10 a2
32# x11 a3
33# x12 a4
34#  x4 a5
35
36# C pointers
37#  x6 c0
38# x16 c1
39# x17 c2
40# x14 c3
41# x13 c4
42#  x7 c5
43
44# Vector register usage
45# A0   v0
46# A1   v1
47# A2   v2
48# A3   v3
49# A4   v4
50# A5   v5
51# B   v16 v17 v18 v19
52# C   v20 v21
53# C   v22 v23
54# C   v24 v25
55# C   v26 v27
56# C   v28 v29
57# C   v30 v31
58# Clamp v6, (v4), (v5)
59# unused A   v8 v9 v10 v11
60# unused B   v12 v13 v14 v15
61
62
63BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32
64
        // Overview (derived from the code in this function):
        //   For every tile of up to 16 output columns, the 6x16 half-precision
        //   accumulator tile v20-v31 is loaded from acc (x15), updated with
        //   FMLA products of A row elements (v0-v5) and B vectors read
        //   sequentially from w (x5), multiplied by v6.h[0], clamped with
        //   FMAX against v6.h[1] and FMIN against v6.h[2], and stored to the
        //   six C row pointers.
        //   Only x0-x17, v0-v6 and v16-v31 are written and sp is never
        //   modified, so nothing is saved or restored.
65        # Load acc, params pointer
66        LDP     x15, x8, [sp, 8]        // x15 = [sp + 8] (acc), x8 = [sp + 16] (params)
67
68        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so all six rows can
        // always be loaded, computed and stored unconditionally.
69        CMP     x0, 2                   // if mr < 2
70        ADD     x9, x3, x4              // a1 = a0 + a_stride
71        ADD     x16, x6, x7             // c1 = c0 + cm_stride
72        CSEL    x9, x3, x9, LO          //   a1 = a0
73        CSEL    x16, x6, x16, LO        //   c1 = c0
74
75        # Load params
76        LDR     d6, [x8]                // 4 halffloats; h[0], h[1], h[2] are used at label 2
77
        // The flags from "CMP x0, 2" are still live below: ADD and LDR do
        // not modify them, so LS here means mr <= 2.
78        ADD     x10, x9, x4             // a2 = a1 + a_stride
79        ADD     x17, x16, x7            // c2 = c1 + cm_stride
80                                        // if mr <= 2
81        CSEL    x10, x9, x10, LS        //   a2 = a1
82        CSEL    x17, x16, x17, LS       //   c2 = c1
83
84        CMP     x0, 4                   // if mr < 4
85        ADD     x11, x10, x4            // a3 = a2 + a_stride
86        ADD     x14, x17, x7            // c3 = c2 + cm_stride
87        CSEL    x11, x10, x11, LO       //   a3 = a2
88        CSEL    x14, x17, x14, LO       //   c3 = c2
89
        // Same flag reuse as above: LS still refers to "CMP x0, 4".
90        ADD     x12, x11, x4            // a4 = a3 + a_stride
91        ADD     x13, x14, x7            // c4 = c3 + cm_stride
92                                        // if mr <= 4
93        CSEL    x12, x11, x12, LS       //   a4 = a3
94        CSEL    x13, x14, x13, LS       //   c4 = c3
95
        // From here on x4 holds a5 and x7 holds c5; a_stride and cm_stride
        // are overwritten because they are not needed again.
96        CMP     x0, 6                   // if mr < 6
97        ADD     x4, x12, x4             // a5 = a4 + a_stride
98        ADD     x7, x13, x7             // c5 = c4 + cm_stride
99        CSEL    x4, x12, x4, LO         //   a5 = a4
100        CSEL    x7, x13, x7, LO         //   c5 = c4
101
        // x8 is reused: the params pointer is dead once d6 has been loaded.
102        LDR     x8, [sp]                // load cn_stride
103
104
1050:
        // Outer loop: one pass per tile of up to 16 output columns.
106        # Load initial accumulators
        // 12 q registers = 192 bytes are read per tile; x15 is post-incremented
        // and never rewound, so each tile reads the next 192 bytes of acc.
107        LDP     q20, q21, [x15], 32
108        LDP     q22, q23, [x15], 32
109        LDP     q24, q25, [x15], 32
110        LDP     q26, q27, [x15], 32
111        LDP     q28, q29, [x15], 32
112        LDP     q30, q31, [x15], 32
113
114         # Is there at least 2 halffloats (4 bytes)?
        // x0 (mr) is no longer needed and becomes the k byte counter.
        // NOTE(review): with kc == 0 the branch below still runs the
        // remainder at label 3 once - presumably callers guarantee kc is a
        // nonzero multiple of 2 bytes; confirm.
115        SUBS    x0, x2, 4               // k = kc - 4
116        B.LO    3f                      // kc < 4: skip the main loop
117
118        # Main loop - 2 halffloats of A (4 bytes)
119        # 24 FMA + 6 ld32 A + 4 LDR B
        // Each iteration reads 4 bytes from every A row and 64 bytes of w.
        // v16/v17 hold the 16 B values paired with A element h[0],
        // v18/v19 the 16 B values paired with A element h[1].
1201:
121        LDR     s0,  [x3], 4
122        LDR     q16, [x5], 16
123        LDR     q17, [x5], 16
124        LDR     s1,  [x9], 4
125        LDR     s2, [x10], 4
126        LDR     s3, [x11], 4
127        LDR     s4, [x12], 4
128        LDR     s5,  [x4], 4
129        SUBS    x0, x0, 4               // k -= 4; flags are consumed by B.HS at the loop end
130        FMLA    v20.8h, v16.8h,  v0.h[0]
131        FMLA    v22.8h, v16.8h,  v1.h[0]
132        FMLA    v24.8h, v16.8h,  v2.h[0]
133        FMLA    v26.8h, v16.8h,  v3.h[0]
134        LDR     q18, [x5], 16
135        LDR     q19, [x5], 16
136        FMLA    v28.8h, v16.8h,  v4.h[0]
137        FMLA    v30.8h, v16.8h,  v5.h[0]
138        FMLA    v21.8h, v17.8h,  v0.h[0]
139        FMLA    v23.8h, v17.8h,  v1.h[0]
140        FMLA    v25.8h, v17.8h,  v2.h[0]
141        FMLA    v27.8h, v17.8h,  v3.h[0]
142        FMLA    v29.8h, v17.8h,  v4.h[0]
143        FMLA    v31.8h, v17.8h,  v5.h[0]
144
145        FMLA    v20.8h, v18.8h,  v0.h[1]
146        FMLA    v22.8h, v18.8h,  v1.h[1]
147        FMLA    v24.8h, v18.8h,  v2.h[1]
148        FMLA    v26.8h, v18.8h,  v3.h[1]
149        FMLA    v28.8h, v18.8h,  v4.h[1]
150        FMLA    v30.8h, v18.8h,  v5.h[1]
151        FMLA    v21.8h, v19.8h,  v0.h[1]
152        FMLA    v23.8h, v19.8h,  v1.h[1]
153        FMLA    v25.8h, v19.8h,  v2.h[1]
154        FMLA    v27.8h, v19.8h,  v3.h[1]
155        FMLA    v29.8h, v19.8h,  v4.h[1]
156        FMLA    v31.8h, v19.8h,  v5.h[1]
157        B.HS    1b                      // repeat while the SUBS above did not borrow (>= 4 bytes left)
158
159        # Is there a remainder?- 1 halffloat of A (2 bytes)
        // x0 now equals (bytes of A still unread) - 4. For an even kc that is
        // -4 (nothing left, bit 1 clear) or -2 (one halffloat left, bit 1 set).
160        TBNZ    x0, 1, 3f
1612:
162        # Scale and Clamp
        // v4 and v5 (A4/A5 row data) are dead at this point and are reused
        // for the broadcast clamp bounds.
163        FMUL    v20.8h, v20.8h, v6.h[0]
164        DUP     v4.8h, v6.h[1]          // v4 = lower bound, applied with FMAX
165        FMUL    v21.8h, v21.8h, v6.h[0]
166        DUP     v5.8h, v6.h[2]          // v5 = upper bound, applied with FMIN
167        FMUL    v22.8h, v22.8h, v6.h[0]
168        FMUL    v23.8h, v23.8h, v6.h[0]
169        FMUL    v24.8h, v24.8h, v6.h[0]
170        FMUL    v25.8h, v25.8h, v6.h[0]
171        FMUL    v26.8h, v26.8h, v6.h[0]
172        FMUL    v27.8h, v27.8h, v6.h[0]
173        FMUL    v28.8h, v28.8h, v6.h[0]
174        FMUL    v29.8h, v29.8h, v6.h[0]
175        FMUL    v30.8h, v30.8h, v6.h[0]
176        FMUL    v31.8h, v31.8h, v6.h[0]
177        FMAX    v20.8h, v20.8h, v4.8h
178        FMAX    v21.8h, v21.8h, v4.8h
179        FMAX    v22.8h, v22.8h, v4.8h
180        FMAX    v23.8h, v23.8h, v4.8h
181        FMAX    v24.8h, v24.8h, v4.8h
182        FMAX    v25.8h, v25.8h, v4.8h
183        FMAX    v26.8h, v26.8h, v4.8h
184        FMAX    v27.8h, v27.8h, v4.8h
185        FMAX    v28.8h, v28.8h, v4.8h
186        FMAX    v29.8h, v29.8h, v4.8h
187        FMAX    v30.8h, v30.8h, v4.8h
188        FMAX    v31.8h, v31.8h, v4.8h
        // The flags set here are consumed by "B.LO 4f" and, after the full
        // store, by "B.HI 0b"; none of the FMIN, ST1 or SUB instructions in
        // between modify them.
189        SUBS    x1, x1, 16              // nc -= 16
190        FMIN    v20.8h, v20.8h, v5.8h
191        FMIN    v21.8h, v21.8h, v5.8h
192        FMIN    v22.8h, v22.8h, v5.8h
193        FMIN    v23.8h, v23.8h, v5.8h
194        FMIN    v24.8h, v24.8h, v5.8h
195        FMIN    v25.8h, v25.8h, v5.8h
196        FMIN    v26.8h, v26.8h, v5.8h
197        FMIN    v27.8h, v27.8h, v5.8h
198        FMIN    v28.8h, v28.8h, v5.8h
199        FMIN    v29.8h, v29.8h, v5.8h
200        FMIN    v30.8h, v30.8h, v5.8h
201        FMIN    v31.8h, v31.8h, v5.8h
202
203        # Store full 6 x 16
204        B.LO    4f                      // fewer than 16 columns were left: partial store
205
        // Each C pointer is post-incremented by cn_stride (x8). Each A pointer
        // is moved back by kc bytes for the next column tile; x5 (w) is not
        // rewound and keeps advancing.
        // NOTE(review): the rewind matches the bytes consumed above only if
        // kc is a nonzero multiple of 2 - confirm against the caller.
206        ST1     {v30.16b, v31.16b},  [x7], x8
207        SUB     x3,  x3, x2             // a0 -= kc
208        ST1     {v28.16b, v29.16b}, [x13], x8
209        SUB     x9,  x9, x2             // a1 -= kc
210        ST1     {v26.16b, v27.16b}, [x14], x8
211        SUB     x10, x10, x2            // a2 -= kc
212        ST1     {v24.16b, v25.16b}, [x17], x8
213        SUB     x11, x11, x2            // a3 -= kc
214        ST1     {v22.16b, v23.16b}, [x16], x8
215        SUB     x12, x12, x2            // a4 -= kc
216        ST1     {v20.16b, v21.16b},  [x6], x8
217        SUB     x4,  x4, x2             // a5 -= kc
218
219        B.HI    0b                      // columns remain (nc - 16 > 0): next tile
220        RET
221
2223:
223        # Remainder- 1 halffloat of A (2 bytes)
        // Reads 2 bytes from every A row and 32 bytes of w, then rejoins the
        // scale/clamp code at label 2.
224        LDR     h0,  [x3], 2
225        LDR     q16, [x5], 16
226        LDR     q17, [x5], 16
227        LDR     h1,  [x9], 2
228        LDR     h2, [x10], 2
229        LDR     h3, [x11], 2
230        LDR     h4, [x12], 2
231        LDR     h5,  [x4], 2
232        FMLA    v20.8h, v16.8h,  v0.h[0]
233        FMLA    v22.8h, v16.8h,  v1.h[0]
234        FMLA    v24.8h, v16.8h,  v2.h[0]
235        FMLA    v26.8h, v16.8h,  v3.h[0]
236        FMLA    v28.8h, v16.8h,  v4.h[0]
237        FMLA    v30.8h, v16.8h,  v5.h[0]
238        FMLA    v21.8h, v17.8h,  v0.h[0]
239        FMLA    v23.8h, v17.8h,  v1.h[0]
240        FMLA    v25.8h, v17.8h,  v2.h[0]
241        FMLA    v27.8h, v17.8h,  v3.h[0]
242        FMLA    v29.8h, v17.8h,  v4.h[0]
243        FMLA    v31.8h, v17.8h,  v5.h[0]
244        B       2b
245
246        # Store odd width
        // x1 holds (columns left) - 16 here, so its low 4 bits equal the
        // number of columns left (1..15). Bits 3, 2, 1 and 0 select stores of
        // 8, 4, 2 and 1 columns; after each store the not-yet-stored lanes
        // are moved down to the bottom of the even-numbered register.
2474:
248        TBZ     x1, 3, 5f               // no group of 8 columns
249        STR     q30,  [x7], 16
250        MOV     v30.16b, v31.16b
251        STR     q28, [x13], 16
252        MOV     v28.16b, v29.16b
253        STR     q26, [x14], 16
254        MOV     v26.16b, v27.16b
255        STR     q24, [x17], 16
256        MOV     v24.16b, v25.16b
257        STR     q22, [x16], 16
258        MOV     v22.16b, v23.16b
259        STR     q20,  [x6], 16
260        MOV     v20.16b, v21.16b
261
2625:
263        TBZ     x1, 2, 6f               // no group of 4 columns
264        STR     d30,  [x7], 8
265        STR     d28, [x13], 8
266        DUP     d30, v30.d[1]
267        DUP     d28, v28.d[1]
268        STR     d26, [x14], 8
269        STR     d24, [x17], 8
270        DUP     d26, v26.d[1]
271        DUP     d24, v24.d[1]
272        STR     d22, [x16], 8
273        STR     d20,  [x6], 8
274        DUP     d22, v22.d[1]
275        DUP     d20, v20.d[1]
276
2776:
278        TBZ     x1, 1, 7f               // no group of 2 columns
279        STR     s30,  [x7], 4
280        STR     s28, [x13], 4
281        DUP     s30, v30.s[1]
282        DUP     s28, v28.s[1]
283        STR     s26, [x14], 4
284        STR     s24, [x17], 4
285        DUP     s26, v26.s[1]
286        DUP     s24, v24.s[1]
287        STR     s22, [x16], 4
288        STR     s20,  [x6], 4
289        DUP     s22, v22.s[1]
290        DUP     s20, v20.s[1]
291
2927:
293        TBZ     x1, 0, 8f               // no single trailing column
        // Last column: the C pointers are not advanced because the function
        // returns right after these stores.
294        STR     h30,  [x7]
295        STR     h28, [x13]
296        STR     h26, [x14]
297        STR     h24, [x17]
298        STR     h22, [x16]
299        STR     h20,  [x6]
3008:
301        RET
302
303END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32
304
305#ifdef __ELF__
306.section ".note.GNU-stack","",%progbits
307#endif
308