// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

// Overview (derived from the instructions below):
//   - x0 holds mr only until the A/C row pointers have been clamped; after
//     that it is reused as the remaining-bytes-of-K counter (seeded from x2).
//   - Each pass of the outer loop (label 0) produces one 4 x 8 tile:
//       * the 8 accumulators are seeded from the first 8 floats at w;
//       * A is consumed 32 bytes per row per main-loop/epilogue pass, then in
//         16, 8 and 4 byte remainders selected by bits 4, 3 and 2 of x0;
//       * the tile is clamped with FMAX against v4 and FMIN against v5;
//       * a full tile is stored with STP, the A pointers are rewound by kc,
//         the C pointers advance by cn_stride, and the loop repeats while
//         columns remain; a final partial tile is stored in pieces of
//         4, 2 and 1 floats selected by bits 2, 1 and 0 of x1.
//   - v4-v7 double as the second set of A registers inside the main loop and
//     epilogue, so the clamp values in v4/v5 are reloaded from params at the
//     end of the epilogue.
//   - v8-v15 are used for B, so d8-d15 are saved on entry and restored on
//     both exit paths.

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, params pointer
        // Must happen before sp is moved by the register save below.
        LDP     x14, x8, [sp]

        # Load min/max values
        // Two consecutive floats at params are broadcast: first -> v4, second -> v5.
        LD2R    {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        STP     d8, d9, [sp, -64]!
        STP     d10, d11, [sp, 16]
        STP     d12, d13, [sp, 32]
        STP     d14, d15, [sp, 48]

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so the kernel can always
        // process 4 rows; aliased rows read and write the same memory.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride (x4 stops being a_stride)
        ADD     x7, x10, x7             // c3 = c2 + cm_stride (x7 stops being cm_stride)
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

0:
        # Load initial bias from w into accumulators
        // The same 8 bias floats seed all 4 rows.
        LDP     q16, q17, [x5], 32
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v17.16b
        MOV     v28.16b, v16.16b
        MOV     v29.16b, v17.16b
        MOV     v30.16b, v16.16b
        MOV     v31.16b, v17.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    3f

        # Prologue - first 4 floats (16 bytes) of each A row
        # Read first block of 4 A and B.
        LDR     q0, [x3], 16
        LDP     q20, q21, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3, [x4], 16
        LDP     q22, q23, [x5], 32
        LDP     q24, q25, [x5], 32
        LDP     q26, q27, [x5], 32

        # Is there at least 32. yes do main loop
        SUBS    x0, x0, 32
        B.LO    2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        LDR     q4, [x3], 16            // overwrites the clamp value in v4
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        LDR     q5, [x11], 16           // overwrites the clamp value in v5
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v23.4s, v3.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        PRFM    PLDL1KEEP, [x5, 128]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v24.4s, v1.s[2]
        PRFM    PLDL1KEEP, [x5, 192]
        FMLA    v19.4s, v25.4s, v1.s[2]
        FMLA    v28.4s, v24.4s, v2.s[2]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v29.4s, v25.4s, v2.s[2]
        FMLA    v30.4s, v24.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v31.4s, v25.4s, v3.s[2]
        FMLA    v16.4s, v26.4s, v0.s[3]
        FMLA    v17.4s, v27.4s, v0.s[3]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        FMLA    v28.4s, v26.4s, v2.s[3]
        FMLA    v29.4s, v27.4s, v2.s[3]
        FMLA    v30.4s, v26.4s, v3.s[3]
        FMLA    v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v16.4s, v8.4s, v4.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v17.4s, v9.4s, v4.s[0]
        FMLA    v18.4s, v8.4s, v5.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v19.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        LDP     q24, q25, [x5], 32
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        LDP     q26, q27, [x5], 32
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v16.4s, v10.4s, v4.s[1]
        LDR     q0, [x3], 16
        FMLA    v17.4s, v11.4s, v4.s[1]
        FMLA    v18.4s, v10.4s, v5.s[1]
        LDR     q1, [x11], 16
        FMLA    v19.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        LDR     q2, [x12], 16
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        LDR     q3, [x4], 16
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v16.4s, v12.4s, v4.s[2]
        FMLA    v17.4s, v13.4s, v4.s[2]
        FMLA    v18.4s, v12.4s, v5.s[2]
        FMLA    v19.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]
        FMLA    v16.4s, v14.4s, v4.s[3]
        FMLA    v17.4s, v15.4s, v4.s[3]
        FMLA    v18.4s, v14.4s, v5.s[3]
        FMLA    v19.4s, v15.4s, v5.s[3]
        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        SUBS    x0, x0, 32              // FMLA does not touch flags, so B.HS sees this result
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]
        B.HS    1b

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        LDR     q4, [x3], 16            // overwrites the clamp value in v4
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        LDR     q5, [x11], 16           // overwrites the clamp value in v5
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        LDR     q6, [x12], 16
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        LDR     q7, [x4], 16
        FMLA    v31.4s, v23.4s, v3.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v24.4s, v1.s[2]
        FMLA    v19.4s, v25.4s, v1.s[2]
        FMLA    v28.4s, v24.4s, v2.s[2]
        FMLA    v29.4s, v25.4s, v2.s[2]
        FMLA    v30.4s, v24.4s, v3.s[2]
        FMLA    v31.4s, v25.4s, v3.s[2]
        FMLA    v16.4s, v26.4s, v0.s[3]
        FMLA    v17.4s, v27.4s, v0.s[3]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        FMLA    v28.4s, v26.4s, v2.s[3]
        FMLA    v29.4s, v27.4s, v2.s[3]
        FMLA    v30.4s, v26.4s, v3.s[3]
        FMLA    v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA    v16.4s, v8.4s, v4.s[0]
        FMLA    v17.4s, v9.4s, v4.s[0]
        FMLA    v18.4s, v8.4s, v5.s[0]
        FMLA    v19.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        FMLA    v31.4s, v9.4s, v7.s[0]

        FMLA    v16.4s, v10.4s, v4.s[1]
        FMLA    v17.4s, v11.4s, v4.s[1]
        FMLA    v18.4s, v10.4s, v5.s[1]
        FMLA    v19.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        FMLA    v31.4s, v11.4s, v7.s[1]

        FMLA    v16.4s, v12.4s, v4.s[2]
        FMLA    v17.4s, v13.4s, v4.s[2]
        FMLA    v18.4s, v12.4s, v5.s[2]
        FMLA    v19.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]

        FMLA    v16.4s, v14.4s, v4.s[3]
        FMLA    v17.4s, v15.4s, v4.s[3]
        FMLA    v18.4s, v14.4s, v5.s[3]
        FMLA    v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        // Placed after the last reads of v4/v5 as A data; the remaining FMLAs
        // below only use v6/v7.
        LD2R    {v4.4s, v5.4s}, [x8]

        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

3:
        # Remainder- 4 floats of A (16 bytes)
        // x0 differs from kc by a multiple of 32 here, so bits 4..2 of x0
        // equal bits 4..2 of kc.
        TBZ     x0, 4, 4f

        LDR     q0, [x3], 16
        LDP     q20, q21, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3, [x4], 16
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        LDP     q24, q25, [x5], 32
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        LDP     q26, q27, [x5], 32
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        FMLA    v31.4s, v23.4s, v3.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v24.4s, v1.s[2]
        FMLA    v19.4s, v25.4s, v1.s[2]
        FMLA    v28.4s, v24.4s, v2.s[2]
        FMLA    v29.4s, v25.4s, v2.s[2]
        FMLA    v30.4s, v24.4s, v3.s[2]
        FMLA    v31.4s, v25.4s, v3.s[2]
        FMLA    v16.4s, v26.4s, v0.s[3]
        FMLA    v17.4s, v27.4s, v0.s[3]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        FMLA    v28.4s, v26.4s, v2.s[3]
        FMLA    v29.4s, v27.4s, v2.s[3]
        FMLA    v30.4s, v26.4s, v3.s[3]
        FMLA    v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ     x0, 3, 5f

        LDR     d0, [x3], 8
        LDP     q20, q21, [x5], 32
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3, [x4], 8
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        FMLA    v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ     x0, 2, 6f

        LDR     s0, [x3], 4
        LDP     q20, q21, [x5], 32
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3, [x4], 4
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]

6:
        # Clamp
        // FMAX against v4 applies the lower bound, FMIN against v5 the upper.
        FMAX    v16.4s, v16.4s, v4.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX    v17.4s, v17.4s, v4.4s
        FMAX    v18.4s, v18.4s, v4.4s
        FMAX    v19.4s, v19.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v16.4s, v16.4s, v5.4s
        FMIN    v17.4s, v17.4s, v5.4s
        FMIN    v18.4s, v18.4s, v5.4s
        FMIN    v19.4s, v19.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO    7f                      // fewer than 8 columns were left

        STP     q16, q17, [x6]
        SUB     x3, x3, x2              // a0 -= kc
        ADD     x6, x6, x14             // c0 += cn_stride
        STP     q18, q19, [x9]
        SUB     x11, x11, x2            // a1 -= kc
        ADD     x9, x9, x14             // c1 += cn_stride
        STP     q28, q29, [x10]
        SUB     x12, x12, x2            // a2 -= kc
        ADD     x10, x10, x14           // c2 += cn_stride
        STP     q30, q31, [x7]
        SUB     x4, x4, x2              // a3 -= kc
        ADD     x7, x7, x14             // c3 += cn_stride

        // SUB/ADD (non-flag-setting) and STP leave the flags from SUBS x1 intact.
        B.HI    0b                      // more columns remain

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

        # Store odd width
        // x1 is now nc - 8; subtracting 8 leaves bits 2..0 unchanged, so they
        // still encode how many columns (4 + 2 + 1) are left to store.
7:
        TBZ     x1, 2, 8f
        STR     q16, [x6], 16
        MOV     v16.16b, v17.16b        // shift the upper 4 columns down
        STR     q18, [x9], 16
        MOV     v18.16b, v19.16b
        STR     q28, [x10], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

8:
        TBZ     x1, 1, 9f
        STR     d16, [x6], 8
        DUP     d16, v16.d[1]           // shift the upper 2 columns down
        STR     d18, [x9], 8
        DUP     d18, v18.d[1]
        STR     d28, [x10], 8
        DUP     d28, v28.d[1]
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s16, [x6]
        STR     s18, [x9]
        STR     s28, [x10]
        STR     s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif