// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0  (never read; x0 is reused as the k counter)
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x11
#
# Arguments whose register is shown in parentheses are never read here.
#
# Produces one row of int8 outputs, 16 columns per pass of the outer loop.
# For each pass: 16 int32 accumulators are initialised from w, signed 4-byte
# dot products of A and B are added with SDOT, and the sums are requantized,
# offset, narrowed to int8 and clamped before being stored to c.
#
# kc is rounded up to a multiple of 4 on entry, so a and w are read for the
# rounded length.  NOTE(review): assumes kc > 0 and that the caller packs and
# pads both buffers accordingly - confirm against the packing code.
#
# Bytes read through x11, in order: two 32-bit values (scale, shift), one
# 16-bit bias, then two 8-bit clamp bounds (min, max).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are touched, so nothing is saved or restored.

# Register usage
# A0  x3 v0
# B   x5 v4  v5  v6  v7  v16  v17 v18 v19
# C0  x6 v28 v29 v30 v31
# k   x0   (bytes of A left, biased by -8)
# nc  x1   kc  x2   params  x11   cn_stride  x12
# v0 v1 v2 v4 v5 v6 v7 are reused as temporaries during requantization
# unused v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64
        ADD     x2, x2, 3           // kc = (kc + 3) & ~3
        BIC     x2, x2, 3

        .p2align 3
0:
        # Load initial bias from w into accumulators (16 x int32 = 64 bytes)
        LDP     q28, q29, [x5], 32
        SUBS    x0, x2, 8           // k = kc - 8
        LDP     q30, q31, [x5], 32
        LDR     x11, [sp, 8]        // params; reloaded every pass because x11 is post-incremented below

        # Are there at least 8 bytes of A?  Otherwise only the 4 byte step runs.
        B.LO    3f

        # Main loop - 8 bytes of A against 128 bytes of B per iteration
        .p2align 3
1:
        LDR     d0,  [x3], 8
        LDR     q16, [x5, 0]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]   // first 4 bytes of A
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]
        LDR     q4, [x5, 64]
        SDOT    v31.4s, v19.16b, v0.4b[0]
        LDR     q5, [x5, 80]
        SDOT    v28.4s, v4.16b,  v0.4b[1]   // second 4 bytes of A
        LDR     q6, [x5, 96]
        SDOT    v29.4s, v5.16b,  v0.4b[1]
        LDR     q7, [x5, 112]
        SDOT    v30.4s, v6.16b,  v0.4b[1]
        ADD     x5, x5, 128
        SDOT    v31.4s, v7.16b,  v0.4b[1]
        SUBS    x0, x0, 8
        B.HS    1b

        # Remainder check.  x0 is now -8 (all of kc consumed) or -4 (exactly
        # 4 bytes of A left); bit 2 is set only in the -4 case.
        TBNZ    x0, 2, 3f

2:
        # Apply params - scale, shift, bias and clamp
        LD2R    {v0.4s, v1.4s}, [x11], 8    // v0 = scale, v1 = shift
        SQRDMULH  v4.4s, v28.4s, v0.4s      // saturating rounding doubling multiply, high half
        SQRDMULH  v5.4s, v29.4s, v0.4s
        CMEQ    v2.4s, v1.4s, 0             // mask = (shift == 0)
        SQRDMULH  v6.4s, v30.4s, v0.4s
        SQRDMULH  v7.4s, v31.4s, v0.4s
        BIC     v28.16b, v28.16b, v2.16b    // zero the accumulators where shift == 0
        BIC     v29.16b, v29.16b, v2.16b
        BIC     v30.16b, v30.16b, v2.16b
        BIC     v31.16b, v31.16b, v2.16b
        SSRA    v4.4s, v28.4s, 31           // product += acc >> 31, i.e. -1 for negative accumulators
        SSRA    v5.4s, v29.4s, 31
        SSRA    v6.4s, v30.4s, 31
        SSRA    v7.4s, v31.4s, 31
        SRSHL   v4.4s, v4.4s, v1.4s         // signed rounding shift by v1 (presumably <= 0, i.e. a right shift - TODO confirm)
        SRSHL   v5.4s, v5.4s, v1.4s
        SRSHL   v6.4s, v6.4s, v1.4s
        SRSHL   v7.4s, v7.4s, v1.4s
        LD1R    {v2.8h}, [x11], 2           // 16-bit bias (presumably the output zero point)
        SQXTN   v4.4h, v4.4s                // saturating narrow int32 -> int16
        SQXTN   v6.4h, v6.4s
        SQXTN2  v4.8h, v5.4s
        SQXTN2  v6.8h, v7.4s
        LD2R    {v0.16b, v1.16b}, [x11]     // v0 = min, v1 = max
        SQADD   v4.8h, v4.8h, v2.8h         // add bias with saturation
        SQADD   v6.8h, v6.8h, v2.8h
        LDR     x12, [sp]                   // cn_stride
        SQXTN   v4.8b, v4.8h                // saturating narrow int16 -> int8
        SQXTN2  v4.16b, v6.8h
        SUBS    x1, x1, 16                  // nc -= 16; flags are consumed by B.LO and B.NE below
        SMAX    v4.16b, v4.16b, v0.16b
        SMIN    v4.16b, v4.16b, v1.16b
        B.LO    4f                          // fewer than 16 columns were left

        # Store full 1 x 16
        ST1     {v4.16b}, [x6], x12
        SUB     x3,  x3, x2                 // a0 -= kc (rewind A for the next 16 columns)
        B.NE    0b                          // ST1 and SUB leave the flags from SUBS x1 intact

        RET

        # Remainder - 4 bytes of A against 64 bytes of B
        .p2align 3
3:
        LDR     s0,  [x3], 4
        LDR     q16, [x5, 0]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]
        ADD     x5, x5, 64
        SDOT    v31.4s, v19.16b, v0.4b[0]
        B       2b

        # Store odd width.  x1 = nc - 16 is negative here, but its low 4 bits
        # still equal the number of columns left, so each bit selects an
        # 8, 4, 2 or 1 byte store; DUP moves the next unwritten lanes down.
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d4, [x6], 8
        DUP     d4, v4.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s4, [x6], 4
        DUP     s4, v4.s[1]
6:
        TBZ     x1, 1, 7f
        ST1     {v4.h}[0], [x6], 2
        DUP     h4, v4.h[1]
7:
        TBZ     x1, 0, 8f
        ST1     {v4.b}[0], [x6]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64

#ifdef __ELF__
// Empty .note.GNU-stack section: marks this object as not needing an executable stack.
.section ".note.GNU-stack","",%progbits
#endif
