// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Summary of what the code below does:
//   - For each group of 12 output floats, v20-v22 are initialised from the
//     first 48 bytes at w, then updated with one FMLA per B vector for every
//     float of A, consuming 48 bytes of w per float of A.
//   - kc is handled as a byte count (16 bytes = 4 floats of A per main loop
//     iteration); x0 holds the running remainder of kc.
//   - The result is clamped with FMIN against v30 and FMAX against v31, the
//     first and second float at params, replicated across lanes by LD2R.
//   - nc is decremented by 12 per stored group; c advances by cn_stride bytes
//     and a is rewound by kc bytes before the next group.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// x18 is the AAPCS64 platform register (reserved on Apple, Windows and
// Android targets), so it is deliberately NOT used as scratch here.  The B
// shadow that previously lived in x18 now lives in x11, which is caller-saved
// and not otherwise referenced in this kernel.  Because this file is
// generated, the same substitution has to be made in the template named in
// the header or it will be lost on regeneration.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4    x7 x10 x16  first set of B
# B   v5  v6  v7   x17 x11  x9
# B  v23 v24 v25    x7 x10 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x11  x9
# C  v20 v21 v22
//
// Each 128-bit B vector is assembled in two steps: LDR dN fills the low
// 64 bits of the vector, LDR xM fetches the high 64 bits into the GPR shadow
// listed above, and a later INS vN.d[1], xM merges the two.

BEGIN_FUNCTION xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load clamping_params values
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x3], 8          // a0

        LDR d2, [x5]             // vb0x0123
        LDR x7, [x5, 8]

        LDR d3, [x5, 16]         // vb0x4567
        LDR x10, [x5, 24]

        LDR d4, [x5, 32]         // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]         // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]         // vb1x4567
        LDR x11, [x5, 72]        // high half of v6 (shadow moved off x18)

        LDR d7, [x5, 80]         // vb1x89AB
        LDR x9, [x5, 88]
        INS v2.d[1], x7
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x11, [x5, 72]        // high half of v18
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x7         // v23 was loaded in block 2
        LDR x9, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]         // vb0x0123
        INS v17.d[1], x17
        LDR x7, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]        // vb0x4567
        INS v18.d[1], x11
        LDR x10, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]        // vb0x89AB
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]        // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]        // vb1x4567
        LDR x11, [x5, 168]       // high half of v6 for the next first group
        SUBS x0, x0, 16          // sets the flags consumed by B.HS below
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]        // vb1x89AB
        INS v2.d[1], x7
        LDR x9, [x5, 184]
        ADD x5, x5, 192          // ADD (not ADDS): flags from SUBS stay intact
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x11, [x5, 72]        // high half of v18
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x7         // v23 was loaded in block 2
        LDR x9, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x11

        # BLOCK 4
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15               // low 4 bits of k = leftover bytes of A

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 12          // nc -= 12; flags drive B.LO / B.HI below
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        B.LO 7f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
        SUB x3, x3, x2           // a0 -= kc
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 1 A.
        LDR d0, [x3], 8          // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4          // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

7:
        ADD x1, x1, 12           // undo the SUBS: x1 = remaining nc (< 12)
        # Store odd channels
        TBZ x1, 3, 8f
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b     // next pending floats move down into v20

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif