// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Overview
#   Half-precision GEMM micro-kernel producing a tile of up to 6 rows by
#   8 columns per pass. For every 8-column block it
#     - loads the 6x8 starting accumulators from acc (x15, 96 bytes per block),
#     - accumulates A * B with FMLA, reading A 4 halffloats (8 bytes) per row
#       per main-loop iteration, with 2- and 1-halffloat remainder paths,
#     - multiplies by the scale in v6, then applies FMAX with v4 and FMIN
#       with v5 (scale and the two bounds are read from params),
#     - stores 8 columns per row, or a 4/2/1-column tail on the last block.
#   kc is handled as a byte count and nc as a column (halffloat) count.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel only touches x0-x17, v0-v6 and v16-v30, so nothing is saved
# or restored and no stack frame is created.

# Scalar register usage
#  x0 mr on entry, then k (bytes of A left, biased by -8), then cn_stride
#  x1 nc, decremented by 8 per column block
#  x2 kc in bytes (kept to rewind the A pointers)
#  x5 w, post-incremented as B is consumed
#  x8 params pointer (advanced by 2 after the scale is loaded)
# x15 acc, post-incremented as accumulators are consumed

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Clamp v6, (v4), (v5)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        # Load acc, params pointer
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so their loads and
        # stores repeat a valid row instead of touching memory past it.
        # Each CMP feeds two CSEL pairs: LO for the row below the constant
        # and LS for the row at the constant.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # From here on x4 and x7 hold a5 and c5; a_stride and cm_stride
        # are no longer needed.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load params scale value
        # v6 = scale broadcast to all 8 lanes; x8 then points at the pair
        # of clamp values that follows it.
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2

0:
        # Load initial accumulators
        # One q register (8 halffloats) per row, rows 0..5 in order.
        LDP q20, q22, [x15], 32
        LDP q24, q26, [x15], 32
        LDP q28, q30, [x15], 32

        # Is there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
1:
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LDR  q17, [x5], 16
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        SUBS x0, x0, 8           // flags consumed by B.HS at the loop end
        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16

        FMLA v20.8h, v17.8h,  v0.h[1]
        FMLA v22.8h, v17.8h,  v1.h[1]
        FMLA v24.8h, v17.8h,  v2.h[1]
        FMLA v26.8h, v17.8h,  v3.h[1]
        FMLA v28.8h, v17.8h,  v4.h[1]
        FMLA v30.8h, v17.8h,  v5.h[1]

        FMLA v20.8h, v18.8h,  v0.h[2]
        FMLA v22.8h, v18.8h,  v1.h[2]
        FMLA v24.8h, v18.8h,  v2.h[2]
        FMLA v26.8h, v18.8h,  v3.h[2]
        FMLA v28.8h, v18.8h,  v4.h[2]
        FMLA v30.8h, v18.8h,  v5.h[2]

        FMLA v20.8h, v19.8h,  v0.h[3]
        FMLA v22.8h, v19.8h,  v1.h[3]
        FMLA v24.8h, v19.8h,  v2.h[3]
        FMLA v26.8h, v19.8h,  v3.h[3]
        FMLA v28.8h, v19.8h,  v4.h[3]
        FMLA v30.8h, v19.8h,  v5.h[3]
        B.HS 1b                  // loop while the SUBS above did not borrow

        # x0 is now negative, but it differs from the leftover byte count
        # by a multiple of 8, so bits 2 and 1 still say whether 4 and/or
        # 2 bytes of A remain.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloats of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params values
        # A4/A5 are finished with, so v4/v5 are reused: the two halffloats
        # at x8 are broadcast into v4 (operand of FMAX, i.e. lower bound)
        # and v5 (operand of FMIN, i.e. upper bound).
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        # nc -= 8. Nothing below modifies the flags before B.LO and B.HI,
        # so both branches test this subtraction.
        SUBS x1, x1, 8
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 6f                  // fewer than 8 columns were left

        # Rows are stored from c5 down to c0, each C pointer advancing by
        # cn_stride. The A pointers are rewound by kc so the next column
        # block rereads the same rows of A.
        ST1 {v30.16b},  [x7], x0
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v28.16b}, [x13], x0
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v26.16b}, [x14], x0
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v24.16b}, [x17], x0
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v22.16b}, [x16], x0
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v20.16b},  [x6], x0
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                  // more than 8 columns were left: next block
        RET

3:
        # kc < 8 bytes: x0 = kc - 8, whose bits 2 and 1 match those of kc.
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LDR  q17, [x5], 16
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4

        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]

        FMLA v20.8h, v17.8h,  v0.h[1]
        FMLA v22.8h, v17.8h,  v1.h[1]
        FMLA v24.8h, v17.8h,  v2.h[1]
        FMLA v26.8h, v17.8h,  v3.h[1]
        FMLA v28.8h, v17.8h,  v4.h[1]
        FMLA v30.8h, v17.8h,  v5.h[1]

        TBZ x0, 1, 2b            // no single halffloat left: go scale and clamp

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR   h0,  [x3], 2
        LDR  q16,  [x5], 16
        LDR   h1,  [x9], 2
        LDR   h2, [x10], 2
        LDR   h3, [x11], 2
        LDR   h4, [x12], 2
        LDR   h5,  [x4], 2
        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]
        B 2b

        # Store odd width
        # x1 = nc - 8 here (negative); its low three bits equal those of
        # the remaining column count. Bit 2 selects a 4-column store, bit 1
        # a 2-column store and bit 0 a single column. After each partial
        # store the DUP moves the next unwritten lanes down to lane 0.
6:
        TBZ x1, 2, 7f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

7:
        TBZ x1, 1, 8f
        STR s30,  [x7], 4
        DUP s30, v30.s[1]
        STR s28, [x13], 4
        DUP s28, v28.s[1]
        STR s26, [x14], 4
        DUP s26, v26.s[1]
        STR s24, [x17], 4
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        DUP s22, v22.s[1]
        STR s20,  [x6], 4
        DUP s20, v20.s[1]

8:
        TBZ x1, 0, 9f
        STR h30,  [x7]
        STR h28, [x13]
        STR h26, [x14]
        STR h24, [x17]
        STR h22, [x16]
        STR h20,  [x6]
9:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
