// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview: each pass of the outer loop (label 0) produces one row of 8 floats
# of C.  It walks kc bytes of A (x3) and the matching 8-wide rows of w (x5),
# clamps the result against the two params values and stores it through x6.
# kc is counted in bytes: x0 holds the remaining byte count and x3 is rewound
# by x2 after every pass.  x5 is never rewound, so the next pass continues
# reading w where the previous pass stopped.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches none of them (only x0-x3, x5, x6, x8, x14, x15 and
# v0, v1, v4, v5, v16-v27), so nothing is saved and sp is only read.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v4 v5

# Other vector registers
# v0 v1    A values, 4 floats each
# v16 v17  accumulators for the 8 output columns
# v18 v19  second set of accumulators, summed into v16 v17 at label 4
# v20-v27  B values read from w

# A53 based on A57/A75 but with LD64
# NOTE(review): every load visible in this file is a 128-bit LDR/LDP of q
# registers; confirm what "LD64" refers to.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R broadcasts the first float at [x8] to v4 and the second to v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Outer loop: one pass per group of up to 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        # Fewer than 32 bytes of A: go straight to the remainder handling.
        B.LO 3f

        # 16 prologue
        # Read first block of 1 A and B.
        # Loads 4 floats of A (16 bytes) into v0 and the 8 B vectors v20-v27
        # that the first FMLA block of the main loop or epilogue consumes.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Is there at least 32.  yes do main loop
        # Otherwise exactly one 32-byte block is left for the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA is paired with a load that overwrites a B register whose
        # previous value has already been consumed by an earlier FMLA.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16
        LDR q27, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        LDR q26, [x5], 16
        # The flags set here are consumed by B.HS; the LDR in between does
        # not modify them.
        SUBS x0, x0, 32
        LDR q27, [x5], 16
        B.HS 1b

2:
        # Epilogue
        # Same arithmetic as one main loop iteration, but nothing is
        # preloaded for a following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16

        # Second block of 4.  Only the last B register (v27) of this block is
        # still loaded here; there are no loads for a further block.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Only multiples of 32 have been subtracted from kc, so bits 4, 3 and
        # 2 of x0 still equal those of kc and select the 16, 8 and 4 byte
        # remainders.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second accumulator set into the first.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        # FMAX against v4 applies the lower bound, FMIN against v5 the upper
        # bound.  SUBS computes nc -= 8; its flags are used by both B.LO and
        # B.HI below, since none of the instructions in between set flags.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Store full 1 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 9f

        # Store 8 floats and advance c0 by cn_stride (x14).
        ST1 {v16.16b, v17.16b}, [x6], x14
        SUB x3, x3, x2  // a0 -= kc

        # Columns remain (nc was above 8 before the SUBS): run another pass.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        # Consumes 8 B vectors: two per float of A.
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        LDR q24, [x5], 16
        LDR q25, [x5], 16
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # Skip the 2 float step when bit 3 of x0 is clear.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # No single float left (bit 2 of x0 clear): go clamp and store.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # x1 is nc - 8 here, so its low three bits equal those of the number
        # of columns left.  Bits 2, 1 and 0 select stores of 4, 2 and 1
        # floats; after each store the remaining lanes are moved down to the
        # bottom of v16.
9:
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

10:
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

11:
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif