// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# Overview
#   Computes a tile of 4 rows x 8 floats of C.  The 8 accumulator registers
#   are seeded from the first 8 floats at w, then updated with FMLA using
#   lanes of A (rows a0-a3) against 8-float rows of w.  The A row pointers are
#   fetched 4 at a time from the array at x4; a pointer equal to `zero` is
#   used as is, any other pointer has a_offset added.  The result is clamped
#   with the two floats read from params and stored through c0-c3.
#
#   x0 holds mr only until the C pointers are clamped; afterwards it is
#   reused as the remaining-k byte counter.  x9 is the working copy of ks.
#   PREFETCH is a template switch: when set, PRFM hints on w are emitted in
#   the main loop.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5
#
# Note: v4/v5 serve both as clamp bounds and as A0/A1 registers for the second
# block of 4.  The main loop and epilogue overwrite them, so the epilogue
# reloads the bounds from params before the clamp is reached.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Load cn_stride, a_offset
        # (stack arguments are read before sp is moved by the save below)
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # v4 = first float at params, v5 = second float, each replicated to
        # all 4 lanes.  v4 feeds FMAX and v5 feeds FMIN in the clamp.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        # Pre-index allocates an 80 byte frame: x20 at [sp], d8-d15 at 16..79.
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP d8, d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row pointer.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # The same 8 floats seed all 4 rows.
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        CMP x20, x12            // if a0 == zero
        ADD x20, x20, x11       // a0 += a_offset
        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 + a_offset
        CMP x13, x12            // if a1 == zero
        ADD x13, x13, x11       // a1 += a_offset
        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 + a_offset
        CMP x14, x12            // if a2 == zero
        ADD x14, x14, x11       // a2 += a_offset
        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 + a_offset
        CMP x15, x12            // if a3 == zero
        ADD x15, x15, x11       // a3 += a_offset
        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of 4 floats (16 bytes) of each A row and the
        # matching 4 rows of 8 floats of B.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Is there at least 8 more floats (32 bytes)?  If so, run the main loop.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
        # Software pipelined: each half issues the FMAs for data loaded by the
        # previous half while loading the data for the next half.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16        // overwrites the clamp bound held in v4
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16        // overwrites the clamp bound held in v5
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4
        # of the next iteration (or of the epilogue).
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # Same work as one main loop iteration, without loading a further
        # first block and without prefetch.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        # The 4 FMAs below are the last readers of A data in v4/v5, so the
        # clamp bounds can be reloaded immediately after them.
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # x0 is kc minus a multiple of 32, so its low 5 bits match those of
        # kc.  Bit 4 set means 16 bytes (4 floats) remain.
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        # Bit 3 set means 8 bytes (2 floats) remain.
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        # Bit 2 set means 4 bytes (1 float) remain.
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        # Each pass consumed 4 pointers (32 bytes) from the array at x4.
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        # Lower bound (v4) first, then upper bound (v5).
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # The flags set by this SUBS drive both B.LO here and B.HI at the end
        # of the store sequence; nothing in between modifies them.
        SUBS x1, x1, 8
        B.LO 8f

        # Rows are written from c3 down to c0, so when C pointers alias
        # (clamped above) the store of the lower row lands last.
        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # x1 is negative here (nc - 8); its low 3 bits still equal those of
        # the remaining column count, which select 4, 2 and 1 float stores.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b     // shift upper 4 columns down for later stores
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]        // move upper 2 floats into the low half
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16, [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif