// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// F32 indirect GEMM (IGEMM) micro-kernel producing a 1 x 12 output tile.
//
// Outline of what the code below does:
//   - For every group of 12 output channels (nc loop, label 0) the three
//     accumulators v20-v22 are initialised from 12 floats read from w.
//   - For every A pointer in the indirection buffer (ks loop, label 1) the
//     pointer is replaced by `zero` if it equals `zero`, otherwise a_offset
//     is added to it, and kc bytes of A are multiplied against w.
//   - kc is consumed 16 bytes (4 floats) at a time by a software-pipelined
//     prologue / main loop / epilogue, with 8-byte and 4-byte remainders
//     handled at labels 5 and 6.
//   - The accumulators are clamped (FMIN with v30, then FMAX with v31) and
//     stored; a partial tile (nc < 12) is stored piecewise at labels 8-12.
//
// kc and ks are byte counts: kc is decremented by 16 per 4 floats, and the
// ks counter by 8 per A pointer.
//
// Each 128-bit B vector in the pipelined path is assembled from two 64-bit
// loads: the low half goes straight into the vector register (LDR dN) and
// the high half goes through a general-purpose "shadow" register that is
// later inserted with INS vN.d[1].
// NOTE(review): presumably this split is what lets the loads pair with FMLA
// on Cortex-A53 - confirm against the core's optimisation guide before
// reordering any instruction in the BLOCK groups.
//
// x13 is used as a shadow register where x18 might otherwise be chosen: x18
// is the AAPCS64 platform register and is reserved on several operating
// systems, so a leaf routine must not clobber it.

# void xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointer
# x8  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4    x14 x15 x16  first set of B
# B   v5  v6  v7    x17 x13 x7
# B  v23 v24 v25    x14 x15 x16  second set of B (same x as first set)
# B  v17 v18 v19    x17 x13 x7
# C  v20 v21 v22

BEGIN_FUNCTION xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        // v30 = first float at params (upper bound, used by FMIN),
        // v31 = second float at params (lower bound, used by FMAX).
        // x8 is free to be reused as the A pointer after this load.
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]

        MOV x9, x3  // p = ks

1:
        # Load next A pointer
        LDR x8, [x4], 8

        CMP x8, x12           // if a0 == zero
        ADD x8, x8, x11       // a0 += a_offset
        CSEL x8, x12, x8, EQ  // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x8], 8  // a0

        LDR d2, [x5]  // vb0x0123
        LDR x14, [x5, 8]

        LDR d3, [x5, 16]  // vb0x4567
        LDR x15, [x5, 24]

        LDR d4, [x5, 32]  // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]  // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]  // vb1x4567
        LDR x13, [x5, 72]

        LDR d7, [x5, 80]  // vb1x89AB
        LDR x7, [x5, 88]
        INS v2.d[1], x14
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16  // 4 floats for main loop
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
2:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x8], 8  // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]  // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]  // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]  // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]  // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]  // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]  // vb1x89AB
        INS v23.d[1], x14  // v23 was loaded in block 2
        LDR x7, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x8], 8  // a0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]  // vb0x0123
        INS v17.d[1], x17
        LDR x14, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]  // vb0x4567
        INS v18.d[1], x13
        LDR x15, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]  // vb0x89AB
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]  // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]  // vb1x4567
        LDR x13, [x5, 168]
        SUBS x0, x0, 16
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]  // vb1x89AB
        INS v2.d[1], x14
        LDR x7, [x5, 184]
        ADD x5, x5, 192
        B.HS 2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d1, [x8], 8  // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]  // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]  // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]  // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]  // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]  // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]  // vb1x89AB
        INS v23.d[1], x14  // v23 was loaded in block 2
        LDR x7, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x13

        # BLOCK 4
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        // Sets the flags consumed by B.NE in BLOCK 7; the FMLAs in between
        // do not modify the condition flags.
        TST x0, 15

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // p -= MR * sizeof(void*)
        B.NE 1b

        # Clamp
        FMIN v20.4s, v20.4s, v30.4s
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        SUBS x1, x1, 12
        B.LO 8f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x10
        SUB x4, x4, x3  // a -= ks

        # nc loop
        // Flags are still those of SUBS x1 above: ST1 and SUB leave them intact.
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x8], 8  // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x8], 4  // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

8:
        // Undo the SUBS at the full-store check so x1 holds the remaining
        // channel count (less than 12); its bits 3..0 select the 8-, 4-, 2-
        // and 1-float stores below.
        ADD x1, x1, 12
        # Store odd channels
        TBZ x1, 3, 9f
        STP q20, q21, [x6]
        ADD x6, x6, 32
        MOV v20.16b, v22.16b

9:
        TBZ x1, 2, 10f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

10:
        TBZ x1, 1, 11f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

11:
        TBZ x1, 0, 12f
        STR s20, [x6]
12:
        RET

END_FUNCTION xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif