// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

// Overview of the code below:
//   label 0:  per group of 8 output columns (x1 counts nc down by 8)
//             - 8 bias floats are read from w and copied to all 4 rows
//   label 1:  per group of 4 A pointers (x9 counts ks down by 32 bytes)
//             - each pointer equal to zero is used as is, any other pointer
//               gets a_offset added
//             - kc bytes of every row are multiplied against w:
//               labels 2/3 handle 8 floats at a time, labels 4/5/6 handle
//               the 4, 2 and 1 float remainders
//   after the ks loop the 4x8 accumulators are clamped and stored.
//
// x0 holds mr only until the C pointers are clamped; afterwards it is the
// running k byte counter.
// v4/v5 hold the clamp bounds but are also used as A registers by the
// main loop and epilogue, which is why the epilogue reloads them from x8.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, a_offset
        // Stack arguments are read before sp is moved by the register save.
        LDP     x10, x11, [sp]

        # Load zero, params pointer
        LDP     x12, x8, [sp, 16]

        # Load min/max values
        // v4 = first float at [x8] in every lane (applied with FMAX below),
        // v5 = second float at [x8] in every lane (applied with FMIN below).
        LD2R    {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        // Pre-decrement allocates an 80 byte frame: x20 at [sp],
        // d8-d15 at [sp + 16] .. [sp + 79].
        STR     x20, [sp, -80]!

        # Save d8-d15 on stack
        STP     d8, d9, [sp, 16]
        STP     d10, d11, [sp, 32]
        STP     d12, d13, [sp, 48]
        STP     d14, d15, [sp, 64]

        # Clamp C pointers
        // Rows at index >= mr alias the previous row. ADD and CSEL leave the
        // flags alone, so both CSELs below reuse the flags of CMP x0, 2.
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

0:
        # Load initial bias from w into accumulators
        // The same 8 bias floats seed all four rows of C.
        LDP     q16, q17, [x5], 32
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v17.16b
        MOV     v28.16b, v16.16b
        MOV     v29.16b, v17.16b
        MOV     v30.16b, v16.16b
        MOV     v31.16b, v17.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 4 A pointers
        LDP     x20, x13, [x4], 16
        LDP     x14, x15, [x4], 16

        // aN = (aN == zero) ? zero : aN + a_offset
        CMP     x20, x12                // if a0 == zero
        ADD     x20, x20, x11           // a0 += a_offset
        CSEL    x20, x12, x20, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x13, x12                // if a1 == zero
        ADD     x13, x13, x11           // a1 += a_offset
        CSEL    x13, x12, x13, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x14, x12                // if a2 == zero
        ADD     x14, x14, x11           // a2 += a_offset
        CSEL    x14, x12, x14, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x15, x12                // if a3 == zero
        ADD     x15, x15, x11           // a3 += a_offset
        CSEL    x15, x12, x15, EQ       //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        // On the short path x0 wraps, but bits 4..2 still match those of kc,
        // which is all the remainder code at labels 4/5/6 looks at.
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    4f

        # Prologue - 16 bytes of each A row
        # Read first block of 4 A and B.
        LDR     q0, [x20], 16
        LDP     q20, q21, [x5], 32
        LDR     q1, [x13], 16
        LDR     q2, [x14], 16
        LDR     q3, [x15], 16
        LDP     q22, q23, [x5], 32
        LDP     q24, q25, [x5], 32
        LDP     q26, q27, [x5], 32

        # Is there at least 32 more bytes? If yes, do the main loop
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        LDR     q4, [x20], 16
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        LDR     q5, [x13], 16
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        LDR     q6, [x14], 16
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        LDR     q7, [x15], 16
        FMLA    v31.4s, v23.4s, v3.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        PRFM    PLDL1KEEP, [x5, 128]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v24.4s, v1.s[2]
        PRFM    PLDL1KEEP, [x5, 192]
        FMLA    v19.4s, v25.4s, v1.s[2]
        FMLA    v28.4s, v24.4s, v2.s[2]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v29.4s, v25.4s, v2.s[2]
        FMLA    v30.4s, v24.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x5, 320]
        FMLA    v31.4s, v25.4s, v3.s[2]
        FMLA    v16.4s, v26.4s, v0.s[3]
        FMLA    v17.4s, v27.4s, v0.s[3]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        FMLA    v28.4s, v26.4s, v2.s[3]
        FMLA    v29.4s, v27.4s, v2.s[3]
        FMLA    v30.4s, v26.4s, v3.s[3]
        FMLA    v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v16.4s, v8.4s, v4.s[0]
        LDP     q20, q21, [x5], 32
        FMLA    v17.4s, v9.4s, v4.s[0]
        FMLA    v18.4s, v8.4s, v5.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v19.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        LDP     q24, q25, [x5], 32
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        LDP     q26, q27, [x5], 32
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v16.4s, v10.4s, v4.s[1]
        LDR     q0, [x20], 16
        FMLA    v17.4s, v11.4s, v4.s[1]
        FMLA    v18.4s, v10.4s, v5.s[1]
        LDR     q1, [x13], 16
        FMLA    v19.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        LDR     q2, [x14], 16
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        LDR     q3, [x15], 16
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v16.4s, v12.4s, v4.s[2]
        FMLA    v17.4s, v13.4s, v4.s[2]
        FMLA    v18.4s, v12.4s, v5.s[2]
        FMLA    v19.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]
        FMLA    v16.4s, v14.4s, v4.s[3]
        FMLA    v17.4s, v15.4s, v4.s[3]
        FMLA    v18.4s, v14.4s, v5.s[3]
        FMLA    v19.4s, v15.4s, v5.s[3]
        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        SUBS    x0, x0, 32              // FMLA leaves the flags for B.HS intact
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

        B.HS    2b

3:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDP     q8, q9, [x5], 32
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        LDP     q10, q11, [x5], 32
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        LDP     q12, q13, [x5], 32
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        LDP     q14, q15, [x5], 32
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        LDR     q4, [x20], 16
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        LDR     q5, [x13], 16
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        LDR     q6, [x14], 16
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        LDR     q7, [x15], 16
        FMLA    v31.4s, v23.4s, v3.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v24.4s, v1.s[2]
        FMLA    v19.4s, v25.4s, v1.s[2]
        FMLA    v28.4s, v24.4s, v2.s[2]
        FMLA    v29.4s, v25.4s, v2.s[2]
        FMLA    v30.4s, v24.4s, v3.s[2]
        FMLA    v31.4s, v25.4s, v3.s[2]
        FMLA    v16.4s, v26.4s, v0.s[3]
        FMLA    v17.4s, v27.4s, v0.s[3]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        FMLA    v28.4s, v26.4s, v2.s[3]
        FMLA    v29.4s, v27.4s, v2.s[3]
        FMLA    v30.4s, v26.4s, v3.s[3]
        FMLA    v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, noloads
        FMLA    v16.4s, v8.4s, v4.s[0]
        FMLA    v17.4s, v9.4s, v4.s[0]
        FMLA    v18.4s, v8.4s, v5.s[0]
        FMLA    v19.4s, v9.4s, v5.s[0]
        FMLA    v28.4s, v8.4s, v6.s[0]
        FMLA    v29.4s, v9.4s, v6.s[0]
        FMLA    v30.4s, v8.4s, v7.s[0]
        FMLA    v31.4s, v9.4s, v7.s[0]
        FMLA    v16.4s, v10.4s, v4.s[1]
        FMLA    v17.4s, v11.4s, v4.s[1]
        FMLA    v18.4s, v10.4s, v5.s[1]
        FMLA    v19.4s, v11.4s, v5.s[1]
        FMLA    v28.4s, v10.4s, v6.s[1]
        FMLA    v29.4s, v11.4s, v6.s[1]
        FMLA    v30.4s, v10.4s, v7.s[1]
        FMLA    v31.4s, v11.4s, v7.s[1]
        FMLA    v16.4s, v12.4s, v4.s[2]
        FMLA    v17.4s, v13.4s, v4.s[2]
        FMLA    v18.4s, v12.4s, v5.s[2]
        FMLA    v19.4s, v13.4s, v5.s[2]
        FMLA    v28.4s, v12.4s, v6.s[2]
        FMLA    v29.4s, v13.4s, v6.s[2]
        FMLA    v30.4s, v12.4s, v7.s[2]
        FMLA    v31.4s, v13.4s, v7.s[2]

        FMLA    v16.4s, v14.4s, v4.s[3]
        FMLA    v17.4s, v15.4s, v4.s[3]
        FMLA    v18.4s, v14.4s, v5.s[3]
        FMLA    v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        // v4/v5 were overwritten with A data above; the four FMLAs just
        // before this point are their last use as A, the ones after it
        // only read v6/v7.
        LD2R    {v4.4s, v5.4s}, [x8]

        FMLA    v28.4s, v14.4s, v6.s[3]
        FMLA    v29.4s, v15.4s, v6.s[3]
        FMLA    v30.4s, v14.4s, v7.s[3]
        FMLA    v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        // Bit 4 of x0 (value 16) set means 16 more bytes per row remain.
        TBZ     x0, 4, 5f

        LDR     q0, [x20], 16
        LDP     q20, q21, [x5], 32
        LDR     q1, [x13], 16
        LDR     q2, [x14], 16
        LDR     q3, [x15], 16
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        LDP     q24, q25, [x5], 32
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        LDP     q26, q27, [x5], 32
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        FMLA    v31.4s, v23.4s, v3.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v24.4s, v1.s[2]
        FMLA    v19.4s, v25.4s, v1.s[2]
        FMLA    v28.4s, v24.4s, v2.s[2]
        FMLA    v29.4s, v25.4s, v2.s[2]
        FMLA    v30.4s, v24.4s, v3.s[2]
        FMLA    v31.4s, v25.4s, v3.s[2]
        FMLA    v16.4s, v26.4s, v0.s[3]
        FMLA    v17.4s, v27.4s, v0.s[3]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        FMLA    v28.4s, v26.4s, v2.s[3]
        FMLA    v29.4s, v27.4s, v2.s[3]
        FMLA    v30.4s, v26.4s, v3.s[3]
        FMLA    v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        // Bit 3 of x0 (value 8) set means 8 more bytes per row remain.
        TBZ     x0, 3, 6f

        LDR     d0, [x20], 8
        LDP     q20, q21, [x5], 32
        LDR     d1, [x13], 8
        LDR     d2, [x14], 8
        LDR     d3, [x15], 8
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        FMLA    v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        // Bit 2 of x0 (value 4) set means 4 more bytes per row remain.
        TBZ     x0, 2, 7f

        LDR     s0, [x20], 4
        LDP     q20, q21, [x5], 32
        LDR     s1, [x13], 4
        LDR     s2, [x14], 4
        LDR     s3, [x15], 4
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        // 32 bytes is the size of the 4 pointers consumed at label 1.
        SUBS    x9, x9, 32              // ks -= MR * sizeof(void*)
        B.HI    1b

        # Clamp
        FMAX    v16.4s, v16.4s, v4.4s
        FMAX    v17.4s, v17.4s, v4.4s
        FMAX    v18.4s, v18.4s, v4.4s
        FMAX    v19.4s, v19.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v16.4s, v16.4s, v5.4s
        FMIN    v17.4s, v17.4s, v5.4s
        FMIN    v18.4s, v18.4s, v5.4s
        FMIN    v19.4s, v19.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS    x1, x1, 8
        B.LO    8f

        // Rows are written c3 first, c0 last; each pointer then advances by
        // cn_stride for the next group of columns.
        STP     q30, q31, [x7]
        ADD     x7, x7, x10
        STP     q28, q29, [x17]
        ADD     x17, x17, x10
        STP     q18, q19, [x16]
        ADD     x16, x16, x10
        STP     q16, q17, [x6]
        ADD     x6, x6, x10

        // Rewind the A pointer array to its start for the next column group.
        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // None of the STP/ADD/SUB above set flags, so this branch still tests
        // the result of SUBS x1, x1, 8: loop while columns remain.
        B.HI    0b

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 64]
        LDP     d12, d13, [sp, 48]
        LDP     d10, d11, [sp, 32]
        LDP     d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR     x20, [sp], 80
        RET

        # Store odd width
        // Reached with x1 = nc - 8 (wrapped); its low 3 bits equal those of
        // the remaining column count, one bit per partial store below.
8:
        TBZ     x1, 2, 9f
        // 4 columns: store the low vector, then shift the high vector down.
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x17], 16
        MOV     v28.16b, v29.16b
        STR     q18, [x16], 16
        MOV     v18.16b, v19.16b
        STR     q16, [x6], 16
        MOV     v16.16b, v17.16b

9:
        TBZ     x1, 1, 10f
        // 2 columns: store the low half, then move the high half down.
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]
        STR     d28, [x17], 8
        DUP     d28, v28.d[1]
        STR     d18, [x16], 8
        DUP     d18, v18.d[1]
        STR     d16, [x6], 8
        DUP     d16, v16.d[1]

10:
        TBZ     x1, 0, 11f
        // 1 column: store the lowest lane.
        STR     s30, [x7]
        STR     s28, [x17]
        STR     s18, [x16]
        STR     s16, [x6]
11:
        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 64]
        LDP     d12, d13, [sp, 48]
        LDP     d10, d11, [sp, 32]
        LDP     d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR     x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif