// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview (derived from the code below):
#   Each pass of the outer loop (label 0) accumulates one tile of 4 rows by
#   8 floats, clamps it against the two values loaded from params, and stores
#   it through c0-c3. The pass repeats while more than 8 columns remain in nc.
#   kc is consumed as a byte count: 32 bytes (8 floats) of every A row per
#   main loop iteration, then 16 / 8 / 4 byte remainders.
#   Template switches:
#     INC      - accumulators start from acc (x15) instead of the bias in w.
#     PREFETCH - emits PRFM on the w stream; selects the a75 name over a57.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v20 v21 v22 v23
# B  v24 v25 v26 v27
# C  v16 v17
# C  v18 v19
# C  v28 v29
# C  v30 v31
# Clamp v4 v5
#   v4 / v5 double as A registers inside the main loop and epilogue, so the
#   clamp values are reloaded from params at the end of the epilogue.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Stack arguments are read before sp is moved by the d8-d15 save below.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # v4 = first float of params (used with FMAX), v5 = second (used with FMIN).
        # This copy is the one used when kc < 32 bytes and the epilogue is skipped.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers
        # Rows at or beyond mr reuse the pointers of the previous row.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop over nc: one 4 x 8 tile per pass.
        # From here on x0 no longer holds mr; it is reused as the k counter.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 3f

        # Prologue - loads only, no FMA
        # Read first block of 4 floats of A (16 bytes per row) and matching B.
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes of A beyond the 32 reserved above?
        # Yes: run the main loop. No: go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16             // overwrites the clamp value held in v4
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16            // overwrites the clamp value held in v5
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32              // k -= 32; flags consumed by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        # Placed after the last read of v4 / v5 as A data; the remaining FMAs
        # only read v6 / v7.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        # Remainder handling tests bits 4, 3 and 2 of x0. Only multiples of 32
        # have been subtracted from kc, so the low 5 bits of x0 still equal
        # the low 5 bits of kc.
3:
        # Remainder- 4 floats of A (16 bytes)
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp
        # FMAX against v4 applies the lower bound, FMIN against v5 the upper.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8               // nc -= 8; flags drive B.LO and B.HI below
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 7f

        # The stores interleave with rewinding each A pointer by kc and with
        # advancing each C pointer by cn_stride. None of these set flags.
        $if INC:
          # Rows are written c3 first, c0 last.
          # NOTE(review): presumably so that the lowest row is written last when
          # the mr clamp aliases C pointers - confirm.
          STP q30, q31, [x7]
          SUB x3, x3, x2               // a0 -= kc
          ADD x7, x7, x14
          STP q28, q29, [x10]
          SUB x11, x11, x2             // a1 -= kc
          ADD x10, x10, x14
          STP q18, q19, [x9]
          SUB x12, x12, x2             // a2 -= kc
          ADD x9, x9, x14
          STP q16, q17, [x6]
          SUB x4, x4, x2               // a3 -= kc
          ADD x6, x6, x14
        $else:
          STP q16, q17, [x6]
          SUB x3, x3, x2               // a0 -= kc
          ADD x6, x6, x14
          STP q18, q19, [x9]
          SUB x11, x11, x2             // a1 -= kc
          ADD x9, x9, x14
          STP q28, q29, [x10]
          SUB x12, x12, x2             // a2 -= kc
          ADD x10, x10, x14
          STP q30, q31, [x7]
          SUB x4, x4, x2               // a3 -= kc
          ADD x7, x7, x14

        # Flags are still those of SUBS x1, x1, 8: loop while columns remain.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width
        # x1 is negative here, but its low 3 bits still equal the number of
        # columns left (a multiple of 8 was subtracted).
7:
        # 4 columns: store the low quad of each row, then shift the high quad down.
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

8:
        # 2 columns: store the low pair of each row, then shift the next pair down.
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        # 1 column: store the remaining single float of each row.
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x10]
          STR s18, [x9]
          STR s16, [x6]
        $else:
          STR s16, [x6]
          STR s18, [x9]
          STR s28, [x10]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif