// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Overview
#   Each pass of the outer loop (label 0) produces one tile of 4 rows by
#   8 floats.  The 8 accumulators (two q registers per row) are seeded from
#   the first 32 bytes of w (bias, copied to every row) or, in the INC
#   variant, from 128 bytes of acc.  They are then updated with FMLA-by-lane
#   over kc bytes of each A row, clamped, and stored to c.
#
#   kc is a byte count: the k counter in x0 is decremented by 32 per 8 floats
#   and the A pointers are rewound by kc (x2) after every full store.
#   nc counts floats: it is decremented by 8 per full tile, and bits 2, 1, 0
#   select 4-, 2- and 1-float stores for a partial tile.
#   Rows at index >= mr alias the previous row: both their A and C pointers
#   are set equal to the previous row's with CSEL.
#   LD2R reads two consecutive floats at params: the first (v4) is applied
#   with FMIN, the second (v5) with FMAX.
#
#   x0 holds mr only until the A/C pointers have been set up; afterwards it is
#   reused as the k counter.  v4-v7 double as the second block of A in the
#   main loop and epilogue, so the clamp values in v4/v5 are reloaded at the
#   end of the epilogue.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Stack arguments are read here, before sp is moved by the d8-d15 save.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # (needed up front for the path that skips the epilogue reload)
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack (pre-indexed store allocates 64 bytes)
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers
        # ADD does not update the flags, so each CSEL pair uses the CMP above it;
        # the LS selects still use the flags from CMP x0, 2.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop over nc: one 4x8 tile per pass.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 (mr) is no longer needed and becomes the k counter from here on.
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 3f

        # Prologue
        # Read first block of 4 floats (16 bytes) of each A row and 4 rows of B.
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes of A?  If yes, do the main loop.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Loads for the next block are interleaved with the FMAs of the current one.
        # In the a75 (PREFETCH) variant, PRFM touches w ahead of the x5 read pointer.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load clamping_params values
        # v4/v5 held A data until the two FMLAs above; the remaining FMLAs only
        # read v6/v7, so the clamp values can be restored here.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        # Remainder handling.  Subtracting multiples of 32 leaves bits 4..0 of
        # the k counter unchanged, so bits 4, 3 and 2 of x0 still give the
        # leftover 16, 8 and 4 bytes of kc.
3:
        # Remainder- 4 floats of A (16 bytes)
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp
        # The flags set by SUBS (nc -= 8) are consumed by B.LO and B.HI below;
        # none of the instructions in between modify them.
        FMIN v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8
        FMIN v17.4s, v17.4s, v4.4s
        FMIN v18.4s, v18.4s, v4.4s
        FMIN v19.4s, v19.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s
        FMAX v18.4s, v18.4s, v5.4s
        FMAX v19.4s, v19.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns were left: take the partial-store path instead.
        B.LO 7f

        # Each row is stored, its C pointer advanced by cn_stride and its A
        # pointer rewound by kc for the next pass.
        # NOTE(review): the INC variant stores c3 first and c0 last, presumably
        # so that row 0 wins when clamped rows alias the same C pointer - confirm.
        $if INC:
          STP q30, q31, [x7]
          SUB x3, x3, x2           // a0 -= kc
          ADD x7, x7, x14
          STP q28, q29, [x10]
          SUB x11, x11, x2         // a1 -= kc
          ADD x10, x10, x14
          STP q18, q19, [x9]
          SUB x12, x12, x2         // a2 -= kc
          ADD x9, x9, x14
          STP q16, q17, [x6]
          SUB x4, x4, x2           // a3 -= kc
          ADD x6, x6, x14
        $else:
          STP q16, q17, [x6]
          SUB x3, x3, x2           // a0 -= kc
          ADD x6, x6, x14
          STP q18, q19, [x9]
          SUB x11, x11, x2         // a1 -= kc
          ADD x9, x9, x14
          STP q28, q29, [x10]
          SUB x12, x12, x2         // a2 -= kc
          ADD x10, x10, x14
          STP q30, q31, [x7]
          SUB x4, x4, x2           // a3 -= kc
          ADD x7, x7, x14

        # Columns remain (nc was greater than 8): compute the next tile.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width
        # x1 has already had 8 subtracted, which leaves bits 2..0 unchanged, so
        # they still encode the 4 / 2 / 1 leftover columns.  After each partial
        # store the not-yet-stored elements are moved down to the low lanes.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x10]
          STR s18, [x9]
          STR s16, [x6]
        $else:
          STR s16, [x6]
          STR s18, [x9]
          STR s28, [x10]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif