// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Template for a 1x12 F32 GEMM micro-kernel with min/max clamping.
# One pass of the outer loop (label 0) produces 12 floats of C from one row of
# A and the packed weights w, then advances to the next 12 columns.
# INC is a template variable. When it is set, the initial accumulators are read
# from the extra acc argument; otherwise they are read from the head of w.

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Notes on the argument list above:
#   kc is compared against byte counts (32, 16, 8, 4), so it is a size in bytes.
#   x2 keeps kc for the whole call; x0 is overwritten and used as the running
#   k counter (the incoming mr value is never read).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are touched below, so nothing is saved or restored.

# A pointer
# x3 a0

# C pointer
# x6 c0

# Vector registers
# v0 v1    A values, 4 floats each
# v20-v31  B values (weights), 4 k steps x 12 floats
# v16-v18  accumulators for the 12 floats of C
# v5-v7    second accumulator set, added into v16-v18 at label 4

# Clamp v2 v3
# v2 is applied with FMAX (lower bound), v3 with FMIN (upper bound).

# A53 based on A57/A75 but with LD64
# NOTE(review): every load in this file is a 128-bit LDR q or LDP q; confirm
# whether the LD64 remark above still describes this kernel.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats at x8 and broadcasts the first to
        # all lanes of v2 and the second to all lanes of v3.
        LD2R {v2.4s, v3.4s}, [x8]
0:
        # Outer loop over nc: one iteration per block of 12 output columns.
        $if INC:
          # Load initial accumulators
          LD1 {v16.16b, v17.16b, v18.16b}, [x15], 48
        $else:
          # Load initial bias from w into accumulators
          LD1 {v16.16b, v17.16b, v18.16b}, [x5], 48

        MOVI v5.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v6.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        MOVI v7.4s, 0
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        # If not, skip the unrolled path and handle everything as remainder.
        SUBS x0, x2, 32  // k = kc - 32

        B.LO 3f

        # Prologue
        # Read first block of 1 A and B: 16 bytes (4 floats) of A into q0 and
        # the matching 4 x 12 floats of B into q20-q31.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDP q28, q29, [x5], 32
        LDP q30, q31, [x5], 32
        LDR q0, [x3], 16

        # Are there at least 32 more bytes of A? If so, run the main loop;
        # otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each iteration issues 24 FMLA and reloads q0, q1 and 2 x 12 vectors
        # of B. Lanes 0 and 2 of A accumulate into v16-v18, lanes 1 and 3 into
        # v5-v7. Each B register is reloaded only after the FMLA that reads it.
        # NOTE(review): the FMLA / LDR interleaving is presumably tuned for the
        # Cortex-A53 pipeline; keep the order unless it is re-benchmarked.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16
        LDR q31, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v1.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v1.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v1.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v1.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v1.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v1.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v1.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v1.s[3]
        LDR q30, [x5], 16
        SUBS x0, x0, 32
        LDR q31, [x5], 16
        B.HS 1b

2:
        # Epilogue
        # Same arithmetic as one main loop iteration, but nothing is loaded
        # for a following iteration: q0 is not reloaded and B is read only
        # once more (for the second block).

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16

        # Second block of 4.  FMA for second 4, no loads.
        # (The one LDR below completes the B loads started in the first block.)
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q31, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[0]
        FMLA v5.4s, v23.4s, v1.s[1]
        FMLA v6.4s, v24.4s, v1.s[1]
        FMLA v7.4s, v25.4s, v1.s[1]
        FMLA v16.4s, v26.4s, v1.s[2]
        FMLA v17.4s, v27.4s, v1.s[2]
        FMLA v18.4s, v28.4s, v1.s[2]
        FMLA v5.4s, v29.4s, v1.s[3]
        FMLA v6.4s, v30.4s, v1.s[3]
        FMLA v7.4s, v31.4s, v1.s[3]

3:
        # Remainder dispatch. Only multiples of 32 have been subtracted from
        # kc to form x0, so bits 4, 3 and 2 of x0 still equal those of kc.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second accumulator set into the first.
        FADD v16.4s, v16.4s, v5.4s
        FADD v17.4s, v17.4s, v6.4s
        FADD v18.4s, v18.4s, v7.4s
        # nc -= 12. The flags set here are consumed by B.LO and B.HI below;
        # none of the instructions in between write the flags.
        SUBS x1, x1, 12

        # Clamp
        FMAX v16.4s, v16.4s, v2.4s
        FMAX v17.4s, v17.4s, v2.4s
        FMAX v18.4s, v18.4s, v2.4s
        FMIN v16.4s, v16.4s, v3.4s
        FMIN v17.4s, v17.4s, v3.4s
        FMIN v18.4s, v18.4s, v3.4s

        # Store full 1 x 12
        # Fewer than 12 columns were left: take the partial store path.
        B.LO 9f

        # Store 12 floats and advance c0 by cn_stride.
        ST1 {v16.16b, v17.16b, v18.16b}, [x6], x14
        SUB x3, x3, x2  // a0 -= kc

        # More columns remain: start the next block of 12.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        # The remainder paths accumulate into v16-v18 only.
        LDR q0, [x3], 16
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[2]
        FMLA v17.4s, v21.4s, v0.s[2]
        FMLA v18.4s, v22.4s, v0.s[2]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[3]
        FMLA v17.4s, v21.4s, v0.s[3]
        FMLA v18.4s, v22.4s, v0.s[3]

        # No 2 float remainder: skip to the 1 float test at label 7.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]
7:
        # No 1 float remainder: go to the clamp and store at label 4.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]
        B 4b

        # Store odd channels
        # Reached when fewer than 12 columns remain. Adding 12 back restores
        # the remaining column count in x1; its bits 3, 2, 1 and 0 select
        # stores of 8, 4, 2 and 1 floats. After each store the next unstored
        # values are moved down into v16.
9:
        ADD x1, x1, 12
        TBZ x1, 3, 10f
        STP q16, q17, [x6], 32
        MOV v16.16b, v18.16b

10:
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

12:
        TBZ x1, 0, 13f
        STR s16, [x6]
13:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
