// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// NOTE(review): the GPR shadow for vb1x4567 was moved from x18 to x11 because
// x18 is the AAPCS64 platform register (reserved on Apple and Windows
// targets). This file is generated, so the same rename presumably has to be
// made in the template named above to survive regeneration - confirm.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

# Summary of what the code below does:
#   For every group of 12 output floats the accumulators v20-v22 are seeded
#   from acc (x15), kc bytes of A (x3) are multiplied against the B data
#   streamed from w (x5), the result is clamped and stored through c (x6).
#   nc (x1) counts output floats, kc (x2) counts bytes of A.
#   NOTE(review): kc is presumably a non-zero multiple of 4 bytes; the code
#   does not check this - confirm against the callers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# Registers written by this function:
#   x0 x1 x3 x5 x6 x7 x8 x9 x10 x11 x14 x15 x16 x17,
#   v0-v7 v17-v25 v30 v31, and the condition flags.
# None of them is callee-saved, so nothing is saved on the stack.
# x18 (platform register) is intentionally left untouched.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# A GPR shadow carries the upper 64 bits of a B vector: the lower half is
# loaded with LDR d, the upper half with LDR x and merged later with INS.
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4    x7 x10 x16  first set of B
# B   v5  v6  v7   x17 x11 x9
# B  v23 v24 v25    x7 x10 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x11 x9
# C  v20 v21 v22

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load clamping_params values
        # LD2R replicates the first float at [x8] into v30 and the second
        # into v31.  v30 is used as the FMIN bound (upper clamp) and v31 as
        # the FMAX bound (lower clamp) at label 4.
        LD2R {v30.4s, v31.4s}, [x8]

        # Outer loop: one pass per group of 12 output floats.
0:
        # Load initial accumulators
        # x15 is post-incremented, so each pass consumes the next 48 bytes.
        LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x3], 8     // a0

        LDR d2, [x5]       // vb0x0123
        LDR x7, [x5, 8]

        LDR d3, [x5, 16]   // vb0x4567
        LDR x10, [x5, 24]

        LDR d4, [x5, 32]   // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]   // vb1x4567
        LDR x11, [x5, 72]  // upper half shadow (x11, not the reserved x18)

        LDR d7, [x5, 80]   // vb1x89AB
        LDR x9, [x5, 88]
        INS v2.d[1], x7
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # Each iteration consumes the B vectors loaded by the previous
        # iteration (or the prologue) and loads the ones for the next.
1:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x11, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x7   // v23 was loaded in block 2
        LDR x9, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]        // vb0x0123
        INS v17.d[1], x17
        LDR x7, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]    // vb0x4567
        INS v18.d[1], x11
        LDR x10, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]   // vb0x89AB
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]   // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]   // vb1x4567
        LDR x11, [x5, 168]
        SUBS x0, x0, 16     // flags consumed by B.HS below; FMLA/LDR/INS/ADD leave them alone
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]   // vb1x89AB
        INS v2.d[1], x7
        LDR x9, [x5, 184]
        ADD x5, x5, 192
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x11, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x7   // v23 was loaded in block 2
        LDR x9, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x11

        # BLOCK 4
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15          // low 4 bits of x0 = leftover bytes of A; tested by B.NE below

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        # The SUBS is interleaved with the clamps; its flags drive both
        # B.LO 7f and B.HI 0b, since FMIN/FMAX/ST1/SUB do not alter flags.
        FMIN v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 12
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        # Fewer than 12 floats left: take the partial store at label 7.
        B.LO 7f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14  // c0 += cn_stride
        SUB x3, x3, x2 // a0 -= kc
        # Loop while output floats remain (nc was strictly greater than 12).
        B.HI 0b
        RET

        # Remainder of A.  x0 differs from kc by a multiple of 16 on every
        # path that reaches here, so bits 3 and 2 of x0 equal those of kc.
5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 1 A.
        LDR d0, [x3], 8   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        # No single trailing float: go clamp and store.
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

        # Partial store of the final (fewer than 12) output floats.
        # Bits 3..0 of the remaining count select stores of 8, 4, 2 and 1
        # floats; after each store the next unstored data is moved into v20.
7:
        ADD x1, x1, 12     // undo the SUBS at label 4: x1 = floats left
        # Store odd channels
        TBZ x1, 3, 8f
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif