// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

// Summary
//   Half-precision GEMM micro-kernel producing up to 4 rows x 8 columns of C
//   per pass. For every 8-column tile the accumulators start from 8
//   halffloats read from w, then accumulate A * B with B also streamed from
//   w, and finally the result is multiplied by a scale and clamped to
//   [lower bound, upper bound] before being stored.
//
// Units, as used by the code below
//   kc        bytes of one A row (A pointers are rewound by kc after a tile;
//             the main loop consumes 8 bytes = 4 halffloats per iteration).
//   nc        output columns (decremented by 8 per full tile; its low three
//             bits select the 4/2/1-column partial stores).
//   a_stride, cm_stride, cn_stride   bytes (added directly to pointers).
//
// NOTE(review): the k remainder logic assumes kc is a non-zero multiple of
// 2 bytes - confirm against the callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel only touches x0-x12, x14, v0-v6, v16, v18, v20-v23, v28 and
// v30, so nothing is saved or restored and no stack frame is created.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# B   v20 v21 v22 v23
# C   v16
# C   v18
# C   v28
# C   v30
# Clamp v4, v5, v6
# unused A   v7 v8 v9 v10 v11
# unused B   v19

// Local labels
//   0  start of an 8-column tile (reload accumulators from w)
//   1  main k loop, 4 halffloats of A per row per iteration
//   2  scale, clamp and store
//   3  entry for kc < 8 bytes
//   4  k remainder of 2 halffloats
//   5  k remainder of 1 halffloat
//   6  partial-width store, 4 columns
//   7  partial-width store, 2 columns
//   8  partial-width store, 1 column
//   9  return

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

        # Load cn_stride, params pointer
        LDP     x14, x8, [sp]

        # Load params values
        // Three consecutive halffloats at [x8], each broadcast to all lanes:
        // v4 is the multiplier (FMUL), v5 the lower bound (FMAX) and v6 the
        // upper bound (FMIN) applied at label 2.
        LD3R    {v4.8h, v5.8h, v6.8h}, [x8]

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so they read valid A
        // data and write the same C locations as that previous row.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride (x4 stops being a_stride)
        ADD     x7, x10, x7             // c3 = c2 + cm_stride (x7 stops being cm_stride)
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

0:
        # Load initial bias from w into accumulators
        // All four rows start from the same 8 halffloats.
        LDR     q16, [x5], 16
        MOV     v18.16b, v16.16b
        MOV     v28.16b, v16.16b
        MOV     v30.16b, v16.16b

        # Are there at least 4 halffloats (8 bytes)?
        // mr is no longer needed, so x0 becomes the k byte counter.
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        # Main loop - 4 halffloats of A (8 bytes)
        // Each iteration also consumes 64 bytes of w (4 vectors of 8
        // halffloats). The flags set by SUBS stay live until B.HS because
        // LDR and FMLA do not write them.
1:
        LDR     d0,  [x3], 8
        LDR     q20,  [x5], 16
        LDR     q21,  [x5], 16
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3,  [x4], 8
        SUBS    x0, x0, 8
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v18.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16

        FMLA    v16.8h, v21.8h, v0.h[1]
        FMLA    v18.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]

        FMLA    v16.8h, v22.8h, v0.h[2]
        FMLA    v18.8h, v22.8h, v1.h[2]
        FMLA    v28.8h, v22.8h, v2.h[2]
        FMLA    v30.8h, v22.8h, v3.h[2]

        FMLA    v16.8h, v23.8h, v0.h[3]
        FMLA    v18.8h, v23.8h, v1.h[3]
        FMLA    v28.8h, v23.8h, v2.h[3]
        FMLA    v30.8h, v23.8h, v3.h[3]
        B.HS    1b                      // loop while another 8 bytes remain

        // x0 is now negative, but only multiples of 8 were subtracted, so
        // its low three bits still equal those of kc.
        # Is there a remainder? - 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 4f
        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 5f
2:
        # Scale and Clamp
        // SUBS is interleaved with the vector ops; its flags are consumed by
        // B.LO below and again by B.HI after the stores, none of the
        // instructions in between write the flags.
        FMUL    v16.8h, v16.8h, v4.8h
        SUBS    x1, x1, 8               // nc -= 8
        FMUL    v18.8h, v18.8h, v4.8h
        FMUL    v28.8h, v28.8h, v4.8h
        FMUL    v30.8h, v30.8h, v4.8h
        FMAX    v16.8h, v16.8h, v5.8h
        FMAX    v18.8h, v18.8h, v5.8h
        FMAX    v28.8h, v28.8h, v5.8h
        FMAX    v30.8h, v30.8h, v5.8h
        FMIN    v16.8h, v16.8h, v6.8h
        FMIN    v18.8h, v18.8h, v6.8h
        FMIN    v28.8h, v28.8h, v6.8h
        FMIN    v30.8h, v30.8h, v6.8h

        # Store full 4 x 8
        B.LO    6f                      // fewer than 8 columns were left

        // Each C pointer advances by cn_stride to the next tile and each A
        // pointer is rewound to the start of its row. x5 is not rewound, it
        // keeps walking forward through w.
        ST1     {v16.16b},  [x6], x14
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v18.16b},  [x9], x14
        SUB     x11, x11, x2            // a1 -= kc
        ST1     {v28.16b}, [x10], x14
        SUB     x12, x12, x2            // a2 -= kc
        ST1     {v30.16b},  [x7], x14
        SUB     x4,  x4, x2             // a3 -= kc

        B.HI    0b                      // more columns remain
        RET

        // kc < 8 bytes: the main loop is skipped. x0 = kc - 8 has the same
        // low three bits as kc; with bit 2 clear only 1 halffloat is left.
3:
        TBZ     x0, 2, 5f
4:
        # Remainder - 2 halffloats of A (4 bytes)
        LDR     s0,  [x3], 4
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3,  [x4], 4

        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v18.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]

        FMLA    v16.8h, v21.8h, v0.h[1]
        FMLA    v18.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]

        TBZ     x0, 1, 2b               // no final single halffloat

5:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR     h0,  [x3], 2
        LDR     q20, [x5], 16
        LDR     h1, [x11], 2
        LDR     h2, [x12], 2
        LDR     h3 , [x4], 2
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v18.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        B       2b

        # Store odd width
        // x1 = nc - 8 here (negative); its low three bits equal those of the
        // remaining column count. After each partial store the not yet
        // written upper part of every accumulator is moved down to lane 0.
6:
        TBZ     x1, 2, 7f               // 4 columns (8 bytes)
        STR     d16, [x6], 8
        STR     d18, [x9], 8
        DUP     d16, v16.d[1]
        DUP     d18, v18.d[1]
        STR     d28, [x10], 8
        STR     d30, [x7], 8
        DUP     d28, v28.d[1]
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f               // 2 columns (4 bytes)
        STR     s16,  [x6], 4
        STR     s18,  [x9], 4
        DUP     s16, v16.s[1]
        DUP     s18, v18.s[1]
        STR     s28, [x10], 4
        STR     s30,  [x7], 4
        DUP     s28, v28.s[1]
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f               // 1 column (2 bytes)
        STR     h16,  [x6]
        STR     h18,  [x9]
        STR     h28, [x10]
        STR     h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
