// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

# Overview (derived from the code below)
#   Starting from the accumulators read from acc, the kernel adds the products
#   of 4 rows of A with an 8-float-wide panel read from w, clamps the sums with
#   the two values read from params, and stores them through c0-c3.
#   - Label 0 is the top of the loop over nc; each pass consumes 8 columns,
#     reads 4 x 8 floats (128 bytes) from acc and advances c0-c3 by cn_stride.
#   - kc is counted in bytes: the main loop consumes 32 bytes (8 floats) of
#     each A row per iteration, and the A pointers are rewound by kc after
#     every pass over nc.  x0 holds the running k counter inside a pass.
#   - w (x5) and acc (x15) are only ever advanced, never rewound.
#   - Rows at or beyond mr reuse the A and C pointers of the previous row, so
#     all 4 rows are always computed and stored.
#   - params is read as two consecutive floats: the first is applied with FMIN
#     (upper bound), the second with FMAX (lower bound).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17           (row 0, stored through c0)
# C   v18 v19           (row 1, stored through c1)
# C   v28 v29           (row 2, stored through c2)
# C   v30 v31           (row 3, stored through c3)
# Clamp v4 v5           (shared with A0/A1 second block, so reloaded after use)

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57

        # Load cn_stride, acc
        # (stack arguments are read before sp is moved by the d8-d15 spill)
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load clamping_params values
        # v4 = first float of params broadcast, v5 = second float broadcast
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers
        # The flags of the first CMP serve both the mr < 2 and mr <= 2 selects.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        # Load initial accumulators
        # 4 rows x 8 floats, read row by row; x15 advances by 128 bytes.
        LDP q16, q17, [x15], 32
        LDP q18, q19, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 3f

        # Prologue - 16 bytes (4 floats) of each A row
        # Read first block of 4 A and B.
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Is there at least another 32 bytes of kc?  Yes: do main loop.
        # Otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        # The loads overwrite v4/v5, which held the clamp values.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4
        # of the next iteration (or of the epilogue).
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32          // k -= 32; FMLA does not touch the flags
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b                  // loop while another 32 bytes remain

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load clamping_params values
        # v4/v5 held A data until the FMLA just above; the remaining FMLAs of
        # this block only read v6/v7, and the remainder code below uses v0-v3.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

3:
        # Remainder- 4 floats of A (16 bytes)
        # Only multiples of 32 have been subtracted from kc to form x0, so
        # bits 4, 3 and 2 of x0 still equal those of kc.
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp
        # v4 is the upper bound (FMIN), v5 the lower bound (FMAX).
        FMIN v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO / B.HI below
        FMIN v17.4s, v17.4s, v4.4s
        FMIN v18.4s, v18.4s, v4.4s
        FMIN v19.4s, v19.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s
        FMAX v18.4s, v18.4s, v5.4s
        FMAX v19.4s, v19.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO 7f                  // fewer than 8 columns were left: partial store

        # Rows are stored from c3 down to c0.  Each store is interleaved with
        # rewinding an A pointer and stepping a C pointer by cn_stride; none of
        # these instructions modify the flags.
        STP q30, q31, [x7]
        SUB x3, x3, x2           // a0 -= kc
        ADD x7, x7, x14
        STP q28, q29, [x10]
        SUB x11, x11, x2         // a1 -= kc
        ADD x10, x10, x14
        STP q18, q19, [x9]
        SUB x12, x12, x2         // a2 -= kc
        ADD x9, x9, x14
        STP q16, q17, [x6]
        SUB x4, x4, x2           // a3 -= kc
        ADD x6, x6, x14

        B.HI 0b                  // more columns remain (nc was > 8)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width
        # x1 is nc - 8 here; bits 2, 1 and 0 still equal those of nc.
7:
        TBZ x1, 2, 8f            // skip unless 4 columns are to be stored
        STR q30, [x7], 16
        MOV v30.16b, v31.16b     // bring columns 4-7 down for the narrower stores
        STR q28, [x10], 16
        MOV v28.16b, v29.16b
        STR q18, [x9], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

8:
        TBZ x1, 1, 9f            // skip unless 2 columns are to be stored
        STR d30, [x7], 8
        DUP d30, v30.d[1]        // bring the upper 2 floats down
        STR d28, [x10], 8
        DUP d28, v28.d[1]
        STR d18, [x9], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

9:
        TBZ x1, 0, 10f           // skip unless 1 column is to be stored
        STR s30, [x7]
        STR s28, [x10]
        STR s18, [x9]
        STR s16, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
