// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/1x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Computes one row of 16 halffloat outputs per outer iteration:
#   c0[0..15] = clamp((bias[0..15] + sum over k of a0[k] * w[k][0..15]) * scale)
#
# Units, as used by the code below:
#   nc        - counted in output columns (decremented by 16 per block of 16 stored).
#   kc        - counted in bytes of A (2 bytes per halffloat, a0 is rewound by kc).
#   cn_stride - counted in bytes (added directly to the c0 pointer).
#
# Packed weights, per block of 16 columns: 16 halffloats of bias (32 bytes)
# followed by 16 halffloats (32 bytes) for every halffloat of A.
#
# NOTE(review): the code assumes kc is a nonzero multiple of 2 bytes. With
# kc = 0 the initial B.LO would still execute the remainder step once.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only x0-x8, x14, v0, v4-v6 and v16-v23 are written here, so nothing is saved.

# Loop counter
# x0  k = bytes of A still to process, biased by -4

# A pointer
# x3  a0

# C pointer
# x6  c0

# A values
# v0  (h[0] and h[1] in the main loop, h[0] only in the remainder)

# B (weights)
# v20 v21  columns 0-7 and 8-15 for the first halffloat of A
# v22 v23  columns 0-7 and 8-15 for the second halffloat of A

# C accumulators
# v16 v17  columns 0-7 and 8-15, initialised with the bias
# v18 v19  second set, initialised to zero and added to v16 v17 at the end

# Clamp v4, v5, v6
# v4  first params element, used as the multiplier (scale)
# v5  second params element, used as the FMAX operand (lower bound)
# v6  third params element, used as the FMIN operand (upper bound)

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, params pointer
        LDP     x14, x8, [sp]

        # Load params values
        # LD3R reads three consecutive halffloats and replicates each one
        # across all 8 lanes of v4, v5 and v6 respectively.
        LD3R    {v4.8h, v5.8h, v6.8h}, [x8]
0:
        # Load initial bias from w into accumulators
        LDP     q16, q17, [x5], 32

        MOVI    v18.8h, 0               // second set of C for pipelining FMLA
        MOVI    v19.8h, 0

        # Are there at least 2 halffloats (4 bytes) of A?
        SUBS    x0, x2, 4               // k = kc - 4

        B.LO    3f                      // kc < 4: only the 1 halffloat remainder runs

        # Main loop - 2 halffloats of A (4 bytes), 2 x 16 halffloats of w (64 bytes)
1:
        LDR     s0, [x3], 4             // v0.h[0], v0.h[1] = next 2 halffloats of A
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        SUBS    x0, x0, 4               // k -= 4; FMLA leaves these flags for B.HS
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v17.8h, v21.8h, v0.h[0]
        FMLA    v18.8h, v22.8h, v0.h[1]
        FMLA    v19.8h, v23.8h, v0.h[1]
        B.HS    1b                      // repeat while at least 4 more bytes remained

        # Is there a remainder? - 1 halffloat of A (2 bytes)
        # Here x0 = (kc mod 4) - 4, so bit 1 is set when 2 bytes of A are left.
        TBNZ    x0, 1, 3f

2:
        FADD    v16.8h, v16.8h, v18.8h  // fold the second accumulator set into the first
        FADD    v17.8h, v17.8h, v19.8h
        SUBS    x1, x1, 16              // nc -= 16; flags stay live for B.LO and B.HI below

        # Scale and Clamp

        FMUL    v16.8h, v16.8h, v4.8h
        FMUL    v17.8h, v17.8h, v4.8h
        FMAX    v16.8h, v16.8h, v5.8h
        FMAX    v17.8h, v17.8h, v5.8h
        FMIN    v16.8h, v16.8h, v6.8h
        FMIN    v17.8h, v17.8h, v6.8h

        # Store full 1 x 16
        B.LO    4f                      // fewer than 16 columns were left: partial store

        STP     q16, q17, [x6]
        ADD     x6, x6, x14             // c0 += cn_stride

        SUB     x3,  x3, x2             // a0 -= kc

        B.HI    0b                      // columns remain (nc was > 16): next block of 16

        RET

3:
        # Remainder - 1 halffloat of A (2 bytes), 16 halffloats of w (32 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     h0, [x3], 2
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v17.8h, v21.8h, v0.h[0]
        B       2b

        # Store odd channels
        # x1 = nc - 16 is negative here, so its low 4 bits hold the number of
        # columns still to store. Each step stores the low lanes of v16 and
        # then moves the next unstored lanes down into them.
4:
        TBZ     x1, 3, 5f
        STR     q16, [x6], 16           // 8 columns
        MOV     v16.16b, v17.16b

5:
        TBZ     x1, 2, 6f
        STR     d16, [x6], 8            // 4 columns
        DUP     d16, v16.d[1]

6:
        TBZ     x1, 1, 7f
        STR     s16, [x6], 4            // 2 columns
        DUP     s16, v16.s[1]

7:
        TBZ     x1, 0, 8f
        STR     h16, [x6]               // 1 column
8:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif