// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Signed 8-bit GEMM micro-kernel producing a single row of output, 16 columns
# per pass, using SDOT on groups of 4 bytes of A. Each pass seeds four int32x4
# accumulators from w, accumulates dot products over kc, requantizes with the
# values read from params, clamps, and stores up to 16 int8 results to c.
#
# void xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           (x4)
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          (x7)
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x11
#
# Arguments shown in parentheses (a_stride, cm_stride) are never read by this
# kernel. The incoming mr in x0 is not read either: x0 is overwritten and
# reused as the remaining-k counter.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# B   x5  v4  v5  v6  v7  v16 v17 v18 v19
# C0  x6 v28 v29 v30 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
#
# During requantization v0, v1, v2 and v4-v7 are reused as temporaries for the
# values loaded from params and for the narrowed results.

BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64
        # Round kc up to a multiple of 4, the number of A bytes consumed by
        # one SDOT group. A may therefore be read up to 3 bytes past kc.
        ADD     x2, x2, 3        // kc = (kc + 3) & ~3
        BIC     x2, x2, 3

        .p2align 3
0:
        # Outer loop, one pass per group of up to 16 output columns.
        # Load initial bias from w into accumulators
        # (16 int32 values, 64 bytes, advancing x5 past them).
        LDP     q28, q29, [x5], 32
        SUBS    x0, x2, 8        // k = kc - 8
        LDP     q30, q31, [x5], 32
        LDR     x11, [sp, 8]     // params, reloaded each pass because x11 is post-incremented below

        # Is there at least 8 bytes?
        # The flags tested here come from the SUBS above. The loads in
        # between leave them untouched.
        B.LO    3f

        # Main loop - 8 bytes of A
        # Per iteration 128 bytes of w are consumed: v16-v19 are paired with
        # A bytes 0-3 (v0.4b[0]) and v4-v7 with A bytes 4-7 (v0.4b[1]).
        # Every SDOT adds four 4-element int8 dot products into one
        # accumulator, so v28..v31 cover output columns 0-3, 4-7, 8-11, 12-15.
        # Loads and SDOTs are interleaved on purpose. Keep the order.
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDR     q16, [x5, 0]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]
        LDR     q4, [x5, 64]
        SDOT    v31.4s, v19.16b, v0.4b[0]
        LDR     q5, [x5, 80]
        SDOT    v28.4s, v4.16b, v0.4b[1]
        LDR     q6, [x5, 96]
        SDOT    v29.4s, v5.16b, v0.4b[1]
        LDR     q7, [x5, 112]
        SDOT    v30.4s, v6.16b, v0.4b[1]
        ADD     x5, x5, 128
        SDOT    v31.4s, v7.16b, v0.4b[1]
        SUBS    x0, x0, 8
        B.HS    1b

        # Is there a remainder?
        # At this point x0 is negative. Because kc was rounded to a multiple
        # of 4 it is either -8 (nothing left) or -4 (one 4-byte group of A
        # left), and bit 2 is set only for -4.
        TBNZ    x0, 2, 3f

2:
        # Apply params - scale, shift, bias and clamp
        # Fields are read from params in order: two 32-bit values (used as
        # multiplier and shift), one 16-bit value (added after narrowing),
        # then two bytes (lower and upper clamp bounds).
        LD2R    {v0.4s, v1.4s}, [x11], 8   // v0 = multiplier, v1 = shift, broadcast to all lanes
        SQRDMULH v4.4s, v28.4s, v0.4s
        SQRDMULH v5.4s, v29.4s, v0.4s
        CMEQ    v2.4s, v1.4s, 0            // v2 = all-ones in lanes where shift is 0
        SQRDMULH v6.4s, v30.4s, v0.4s
        SQRDMULH v7.4s, v31.4s, v0.4s
        # Zero the accumulators where shift is 0 so the sign adjustment
        # below leaves those lanes unchanged.
        BIC     v28.16b, v28.16b, v2.16b
        BIC     v29.16b, v29.16b, v2.16b
        BIC     v30.16b, v30.16b, v2.16b
        BIC     v31.16b, v31.16b, v2.16b
        # Add the accumulator sign (-1 for negative, 0 otherwise) to the
        # scaled value. NOTE(review): presumably this makes the rounding
        # shift below round ties away from zero - confirm.
        SSRA    v4.4s, v28.4s, 31          // signed shift right accumulate
        SSRA    v5.4s, v29.4s, 31
        SSRA    v6.4s, v30.4s, 31
        SSRA    v7.4s, v31.4s, 31
        # SRSHL shifts by the signed per-lane amount in v1, and a negative
        # amount is a rounding right shift. NOTE(review): presumably params
        # stores the shift negated - confirm against the params setup code.
        SRSHL   v4.4s, v4.4s, v1.4s        // signed rounding shift left
        SRSHL   v5.4s, v5.4s, v1.4s
        SRSHL   v6.4s, v6.4s, v1.4s
        SRSHL   v7.4s, v7.4s, v1.4s
        # The 16-bit value loaded next is what the original comment calls the
        # bias. NOTE(review): presumably the output zero point - confirm
        # against the xnn_qs8_gemm_params layout.
        LD1R    {v2.8h}, [x11], 2          // add bias
        SQXTN   v4.4h, v4.4s               // saturating narrow int32 -> int16
        SQXTN   v6.4h, v6.4s
        SQXTN2  v4.8h, v5.4s               // v4 = columns 0-7
        SQXTN2  v6.8h, v7.4s               // v6 = columns 8-15
        LD2R    {v0.16b, v1.16b}, [x11]    // clamp to min/max
        SQADD   v4.8h, v4.8h, v2.8h
        SQADD   v6.8h, v6.8h, v2.8h
        LDR     x12, [sp]                  // cn_stride
        SQXTN   v4.8b, v4.8h               // saturating narrow int16 -> int8
        SQXTN2  v4.16b, v6.8h              // v4 = all 16 int8 outputs
        SUBS    x1, x1, 16                 // nc -= 16, flags consumed by B.LO and B.NE below
        SMAX    v4.16b, v4.16b, v0.16b     // apply lower bound
        SMIN    v4.16b, v4.16b, v1.16b     // apply upper bound
        B.LO    4f                         // fewer than 16 columns were left

        # Store full 1 x 16
        # ST1 and SUB do not modify the flags, so B.NE still sees the result
        # of the SUBS on nc and loops while columns remain.
        ST1     {v4.16b}, [x6], x12        // c += cn_stride
        SUB     x3, x3, x2                 // a0 -= kc
        B.NE    0b

        RET

        # Remainder - 4 bytes of A
        # Reached from the main loop when one 4-byte group is left, or
        # directly from the top when the rounded kc is below 8. Consumes
        # 64 bytes of w, then rejoins the requantization code at label 2.
        # NOTE(review): a rounded kc of 0 would also land here and still
        # consume 4 bytes - presumably callers never pass kc of 0, confirm.
        .p2align 3
3:
        LDR     s0, [x3], 4
        LDR     q16, [x5, 0]
        LDR     q17, [x5, 16]
        SDOT    v28.4s, v16.16b, v0.4b[0]
        LDR     q18, [x5, 32]
        SDOT    v29.4s, v17.16b, v0.4b[0]
        LDR     q19, [x5, 48]
        SDOT    v30.4s, v18.16b, v0.4b[0]
        ADD     x5, x5, 64
        SDOT    v31.4s, v19.16b, v0.4b[0]
        B       2b

        # Store odd width
        # Here x1 holds the remaining column count minus 16. Subtracting 16
        # leaves the low four bits unchanged, so bits 3..0 of x1 select
        # stores of 8, 4, 2 and 1 bytes. After each partial store the
        # not yet written bytes are moved down to the bottom of v4.
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d4, [x6], 8
        DUP     d4, v4.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s4, [x6], 4
        DUP     s4, v4.s[1]
6:
        TBZ     x1, 1, 7f
        ST1     {v4.h}[0], [x6], 2
        DUP     h4, v4.h[1]
7:
        TBZ     x1, 0, 8f
        ST1     {v4.b}[0], [x6]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_ukernel_1x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif