// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0  v4
# A1 x15 v1  v5
# A2 x13 v2  v6
# A3  x4 v3  v7
# B   x5 v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x14 temp for Cortex-A55 loads

# Overview (derived from the code below)
#
# Signed 8-bit GEMM tile kernel using SDOT. Each pass of the outer loop at
# label 0 produces a tile of 4 rows by 16 columns of int8 output and consumes
# 16 columns of nc.
#
# Rows: the A and C pointers of rows 1..3 are clamped to the previous row when
# mr is too small, so all 4 rows are always computed and stored; the clamped
# rows simply repeat the work of the row they alias.
#
# K: kc is rounded up to a multiple of 4 bytes. K is consumed 16 bytes at a
# time by the prologue / main loop / epilogue, and a final 4, 8 or 12 bytes by
# the remainder code at label 5.
#
# Per tile, w is read sequentially as: 64 bytes of int32 initial accumulator
# values (bias), then the int8 weights (64 bytes per 4 bytes of K), then
# 64 bytes of float per-channel scales.
#
# Requantization (label 3): int32 accumulators are converted to float,
# multiplied by the per-channel scale, converted back to int32 with FCVTNS
# (round to nearest, ties to even), narrowed with saturation to int16, offset
# by a 16-bit value from params with saturation, narrowed with saturation to
# int8 and finally clamped with the two 8-bit values from params.
#
# params is read as one 16-bit value followed by two 8-bit values (min, max).
#
# Stack: 32 bytes are pushed to save d8-d11, because v8-v11 hold B.

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        CMP x0, 2                       // if mr < 2

        # Stack arguments are read before sp is moved by the STP below.
        LDP x12, x11, [sp]              // cn_stride, params

        ADD x15, x3, x4                 // a1 = a0 + a_stride
        ADD x8, x6, x7                  // c1 = c0 + cm_stride

        # Allocate 32 bytes of stack and save d8, d9 at the new sp.
        STP d8, d9, [sp, -32]!

        CSEL x15, x3, x15, LO           // a1 = a0
        CSEL x8, x6, x8, LO             // c1 = c0
        ADD x2, x2, 3                   // kc = (kc + 3) & ~3

        ADD x13, x15, x4                // a2 = a1 + a_stride
        ADD x9, x8, x7                  // c2 = c1 + cm_stride
                                        // if mr <= 2
        # Flags are still those of CMP x0, 2 above; ADD does not set flags.
        CSEL x13, x15, x13, LS          // a2 = a1
        CSEL x9, x8, x9, LS             // c2 = c1
        BIC x2, x2, 3                   // clear low 2 bits: completes the round up of kc

        STP d10, d11, [sp, 16]          // save d10, d11 in the upper half of the frame

        CMP x0, 4                       // if mr < 4
        ADD x4, x13, x4                 // a3 = a2 + a_stride
        ADD x7, x9, x7                  // c3 = c2 + cm_stride
        CSEL x4, x13, x4, LO            // a3 = a2
        CSEL x7, x9, x7, LO             // c3 = c2

        # Outer loop over nc: one 4 x 16 output tile per pass.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # The same 16 values are copied to all 4 rows.
        LDP q16, q20, [x5], 32
        MOV v17.16b, v16.16b
        MOV v18.16b, v16.16b
        LDP q24, q28, [x5], 32
        MOV v19.16b, v16.16b
        MOV v21.16b, v20.16b
        MOV v22.16b, v20.16b
        MOV v23.16b, v20.16b
        MOV v25.16b, v24.16b
        MOV v26.16b, v24.16b
        SUBS x0, x2, 16                 // k = kc - 16
        MOV v27.16b, v24.16b
        MOV v29.16b, v28.16b
        MOV v30.16b, v28.16b
        MOV v31.16b, v28.16b
        # Is there at least 16 bytes for prologue/epilogue?
        B.LO 4f

        # prologue - read A and B values for block 0 and 1
        # On exit v8 is fully loaded, v9 has its low half loaded and x14 holds
        # the high half of v9. The main loop and epilogue expect this state.
        LDR d0, [x3], 8
        LDR q8, [x5], 16
        LDR d1, [x15], 8
        LDR d2, [x13], 8
        LDR d3, [x4], 8
        SUBS x0, x0, 16                 // is there 16 for main loop?
        LDR d9, [x5], 8
        LDR x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO 2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # Each 128-bit B vector is assembled from two 64-bit loads: LDR d
        # writes the low half, LDR x14 followed by INS writes the high half.
        # A alternates between v0-v3 and v4-v7, 8 bytes per row at a time, so
        # the next 8 bytes are loaded while the current 8 are being used.

        .p2align 3
1:
        # Group 1 of 4: lane 0 of v0-v3; loads next A into v4-v7.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v0.4b[0]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v1.4b[0]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v3.4b[0]
        LDR d4, [x3], 8

        # BLOCK 1
        SDOT v20.4s, v9.16b, v0.4b[0]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v1.4b[0]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v3.4b[0]
        LDR d5, [x15], 8

        # BLOCK 2
        SDOT v24.4s, v10.16b, v0.4b[0]
        LDR d8, [x5], 8
        SDOT v25.4s, v10.16b, v1.4b[0]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v27.4s, v10.16b, v3.4b[0]
        LDR d6, [x13], 8

        # BLOCK 3
        SDOT v28.4s, v11.16b, v0.4b[0]
        LDR d9, [x5], 8
        SDOT v29.4s, v11.16b, v1.4b[0]
        INS v8.d[1], x14
        SDOT v30.4s, v11.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v31.4s, v11.16b, v3.4b[0]
        LDR d7, [x4], 8

        # Group 2 of 4: lane 1 of v0-v3.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v0.4b[1]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v1.4b[1]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT v20.4s, v9.16b, v0.4b[1]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v1.4b[1]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT v24.4s, v10.16b, v0.4b[1]
        LDR d8, [x5], 8
        SDOT v25.4s, v10.16b, v1.4b[1]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT v28.4s, v11.16b, v0.4b[1]
        LDR d9, [x5], 8
        SDOT v29.4s, v11.16b, v1.4b[1]
        INS v8.d[1], x14
        SDOT v30.4s, v11.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v31.4s, v11.16b, v3.4b[1]

        # Group 3 of 4: lane 0 of v4-v7; loads A for the next pass into v0-v3.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v4.4b[0]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v5.4b[0]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v7.4b[0]
        LDR d0, [x3], 8

        # BLOCK 1
        SDOT v20.4s, v9.16b, v4.4b[0]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v5.4b[0]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v7.4b[0]
        LDR d1, [x15], 8

        # BLOCK 2
        SDOT v24.4s, v10.16b, v4.4b[0]
        LDR d8, [x5], 8
        SDOT v25.4s, v10.16b, v5.4b[0]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v27.4s, v10.16b, v7.4b[0]
        LDR d2, [x13], 8

        # BLOCK 3
        SDOT v28.4s, v11.16b, v4.4b[0]
        LDR d9, [x5], 8
        SDOT v29.4s, v11.16b, v5.4b[0]
        INS v8.d[1], x14
        SDOT v30.4s, v11.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v31.4s, v11.16b, v7.4b[0]
        LDR d3, [x4], 8

        # Group 4 of 4: lane 1 of v4-v7; preloads B for the next pass.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v4.4b[1]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v5.4b[1]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v6.4b[1]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT v20.4s, v9.16b, v4.4b[1]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v5.4b[1]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v6.4b[1]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT v24.4s, v10.16b, v4.4b[1]
        LDR d8, [x5], 8                 // First B values for block 0 and 1
        SDOT v25.4s, v10.16b, v5.4b[1]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v6.4b[1]
        LDR x14, [x5], 8
        SDOT v27.4s, v10.16b, v7.4b[1]
        SUBS x0, x0, 16                 // k -= 16; flags consumed by B.HS below

        # BLOCK 3
        SDOT v28.4s, v11.16b, v4.4b[1]
        LDR d9, [x5], 8
        SDOT v29.4s, v11.16b, v5.4b[1]
        INS v8.d[1], x14
        SDOT v30.4s, v11.16b, v6.4b[1]
        LDR x14, [x5], 8
        SDOT v31.4s, v11.16b, v7.4b[1]
        B.HS 1b

        # Epilogue. Same as main loop but no preloads in final group
        # Processes the last full 16 bytes of K. A for the next pass is not
        # loaded, and B loads stop once v8-v11 for the last group are complete.
2:
        # Group 1 of 4: lane 0 of v0-v3; loads the remaining 8 bytes of A into v4-v7.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v0.4b[0]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v1.4b[0]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v3.4b[0]
        LDR d4, [x3], 8

        # BLOCK 1
        SDOT v20.4s, v9.16b, v0.4b[0]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v1.4b[0]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v3.4b[0]
        LDR d5, [x15], 8

        # BLOCK 2
        SDOT v24.4s, v10.16b, v0.4b[0]
        LDR d8, [x5], 8
        SDOT v25.4s, v10.16b, v1.4b[0]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v27.4s, v10.16b, v3.4b[0]
        LDR d6, [x13], 8

        # BLOCK 3
        SDOT v28.4s, v11.16b, v0.4b[0]
        LDR d9, [x5], 8
        SDOT v29.4s, v11.16b, v1.4b[0]
        INS v8.d[1], x14
        SDOT v30.4s, v11.16b, v2.4b[0]
        LDR x14, [x5], 8
        SDOT v31.4s, v11.16b, v3.4b[0]
        LDR d7, [x4], 8

        # Group 2 of 4: lane 1 of v0-v3.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v0.4b[1]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v1.4b[1]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT v20.4s, v9.16b, v0.4b[1]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v1.4b[1]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT v24.4s, v10.16b, v0.4b[1]
        LDR d8, [x5], 8
        SDOT v25.4s, v10.16b, v1.4b[1]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT v28.4s, v11.16b, v0.4b[1]
        LDR d9, [x5], 8
        SDOT v29.4s, v11.16b, v1.4b[1]
        INS v8.d[1], x14
        SDOT v30.4s, v11.16b, v2.4b[1]
        LDR x14, [x5], 8
        SDOT v31.4s, v11.16b, v3.4b[1]

        # Group 3 of 4: lane 0 of v4-v7; no further A loads.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v4.4b[0]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v5.4b[0]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT v20.4s, v9.16b, v4.4b[0]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v5.4b[0]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT v24.4s, v10.16b, v4.4b[0]
        LDR d8, [x5], 8
        SDOT v25.4s, v10.16b, v5.4b[0]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT v28.4s, v11.16b, v4.4b[0]
        LDR d9, [x5], 8
        SDOT v29.4s, v11.16b, v5.4b[0]
        INS v8.d[1], x14
        SDOT v30.4s, v11.16b, v6.4b[0]
        LDR x14, [x5], 8
        SDOT v31.4s, v11.16b, v7.4b[0]

        # Group 4 of 4: lane 1 of v4-v7; only v10 and v11 remain to be loaded.
        # BLOCK 0
        SDOT v16.4s, v8.16b, v4.4b[1]
        LDR d10, [x5], 8
        SDOT v17.4s, v8.16b, v5.4b[1]
        INS v9.d[1], x14
        SDOT v18.4s, v8.16b, v6.4b[1]
        LDR x14, [x5], 8
        SDOT v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT v20.4s, v9.16b, v4.4b[1]
        LDR d11, [x5], 8
        SDOT v21.4s, v9.16b, v5.4b[1]
        INS v10.d[1], x14
        SDOT v22.4s, v9.16b, v6.4b[1]
        LDR x14, [x5], 8
        SDOT v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT v24.4s, v10.16b, v4.4b[1]
        SDOT v25.4s, v10.16b, v5.4b[1]
        INS v11.d[1], x14
        SDOT v26.4s, v10.16b, v6.4b[1]
        SDOT v27.4s, v10.16b, v7.4b[1]
        AND x0, x2, 15                  // kc remainder 0 to 12

        # BLOCK 3
        SDOT v28.4s, v11.16b, v4.4b[1]
        SDOT v29.4s, v11.16b, v5.4b[1]
        SDOT v30.4s, v11.16b, v6.4b[1]
        SDOT v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ x0, 5f

        # Requantize the 16 int32 accumulators to int8 and store the tile.
        .p2align 3
3:
        SCVTF v16.4s, v16.4s
        SCVTF v17.4s, v17.4s
        # Load per channel scale values from weights
        # Four vectors of 4 scales are loaded, one per group of 4 columns.
        # They overwrite v4-v6, which are no longer needed for A.
        LDR q4, [x5], 16
        SCVTF v18.4s, v18.4s
        SCVTF v19.4s, v19.4s
        LDR q5, [x5], 16
        SCVTF v20.4s, v20.4s
        SCVTF v21.4s, v21.4s
        SCVTF v22.4s, v22.4s
        SCVTF v23.4s, v23.4s
        SCVTF v24.4s, v24.4s
        SCVTF v25.4s, v25.4s
        SCVTF v26.4s, v26.4s
        SCVTF v27.4s, v27.4s
        SCVTF v28.4s, v28.4s
        SCVTF v29.4s, v29.4s
        SCVTF v30.4s, v30.4s
        SCVTF v31.4s, v31.4s

        LDR q6, [x5], 16
        FMUL v16.4s, v16.4s, v4.4s
        FMUL v17.4s, v17.4s, v4.4s
        FMUL v18.4s, v18.4s, v4.4s
        FMUL v19.4s, v19.4s, v4.4s
        FMUL v20.4s, v20.4s, v5.4s
        LDR q4, [x5], 16                // v4 reused for the scales of columns 12-15
        FMUL v21.4s, v21.4s, v5.4s
        FMUL v22.4s, v22.4s, v5.4s
        FMUL v23.4s, v23.4s, v5.4s
        FMUL v24.4s, v24.4s, v6.4s
        FMUL v25.4s, v25.4s, v6.4s
        FMUL v26.4s, v26.4s, v6.4s
        FMUL v27.4s, v27.4s, v6.4s
        FMUL v28.4s, v28.4s, v4.4s
        FMUL v29.4s, v29.4s, v4.4s
        FMUL v30.4s, v30.4s, v4.4s
        FMUL v31.4s, v31.4s, v4.4s

        # Float to int32, rounding to nearest with ties to even.
        FCVTNS v16.4s, v16.4s
        FCVTNS v17.4s, v17.4s
        FCVTNS v18.4s, v18.4s
        FCVTNS v19.4s, v19.4s
        FCVTNS v20.4s, v20.4s
        FCVTNS v21.4s, v21.4s
        FCVTNS v22.4s, v22.4s
        FCVTNS v23.4s, v23.4s
        FCVTNS v24.4s, v24.4s
        FCVTNS v25.4s, v25.4s
        FCVTNS v26.4s, v26.4s
        FCVTNS v27.4s, v27.4s
        FCVTNS v28.4s, v28.4s
        FCVTNS v29.4s, v29.4s
        FCVTNS v30.4s, v30.4s
        FCVTNS v31.4s, v31.4s

        # Saturating narrow to int16. After the SQXTN2 group below, v16-v19
        # hold columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN v16.4h, v16.4s
        SQXTN v17.4h, v17.4s
        SQXTN v18.4h, v18.4s
        SQXTN v19.4h, v19.4s
        SQXTN v24.4h, v24.4s
        SQXTN v25.4h, v25.4s
        SQXTN v26.4h, v26.4s
        SQXTN v27.4h, v27.4s
        # NOTE(review): the 16-bit value is presumably the output zero point - confirm.
        LD1R {v6.8h}, [x11], 2          // add bias

        SQXTN2 v16.8h, v20.4s
        SQXTN2 v17.8h, v21.4s
        SQXTN2 v18.8h, v22.4s
        SQXTN2 v19.8h, v23.4s
        SQXTN2 v24.8h, v28.4s
        SQXTN2 v25.8h, v29.4s
        SQXTN2 v26.8h, v30.4s
        SQXTN2 v27.8h, v31.4s

        SQADD v16.8h, v16.8h, v6.8h
        SQADD v17.8h, v17.8h, v6.8h
        SQADD v18.8h, v18.8h, v6.8h
        SQADD v19.8h, v19.8h, v6.8h
        SQADD v24.8h, v24.8h, v6.8h
        SQADD v25.8h, v25.8h, v6.8h
        SQADD v26.8h, v26.8h, v6.8h
        SQADD v27.8h, v27.8h, v6.8h
        LD1R {v4.16b}, [x11], 1         // clamp min value

        # Saturating narrow to int8: v0-v3 hold the 16 columns of rows 0-3.
        SQXTN v0.8b, v16.8h
        SQXTN v1.8b, v17.8h
        SQXTN v2.8b, v18.8h
        SQXTN v3.8b, v19.8h
        LD1R {v5.16b}, [x11]            // clamp max value
        SQXTN2 v0.16b, v24.8h
        SQXTN2 v1.16b, v25.8h
        SQXTN2 v2.16b, v26.8h
        SQXTN2 v3.16b, v27.8h
        # x11 was advanced by 2 + 1 bytes by the post-indexed loads above.
        SUB x11, x11, 3                 // rewind params pointer

        SMAX v0.16b, v0.16b, v4.16b
        SMAX v1.16b, v1.16b, v4.16b
        SMAX v2.16b, v2.16b, v4.16b
        SMAX v3.16b, v3.16b, v4.16b
        SUBS x1, x1, 16                 // nc -= 16; flags used by B.LO and B.NE below
        SMIN v0.16b, v0.16b, v5.16b
        SMIN v1.16b, v1.16b, v5.16b
        SMIN v2.16b, v2.16b, v5.16b
        SMIN v3.16b, v3.16b, v5.16b
        B.LO 6f                         // fewer than 16 columns left: partial store

        # Store full 4 x 16
        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # the rounded kc so the next tile reads the same rows of A again.
        ST1 {v0.16b}, [x6], x12
        SUB x3, x3, x2                  // a0 -= kc
        ST1 {v1.16b}, [x8], x12
        SUB x15, x15, x2                // a1 -= kc
        ST1 {v2.16b}, [x9], x12
        SUB x13, x13, x2                // a2 -= kc
        ST1 {v3.16b}, [x7], x12
        SUB x4, x4, x2                  // a3 -= kc
        # Flags are still those of SUBS x1, x1, 16: loop while nc != 0.
        B.NE 0b

        # Restore d8-d11 from stack
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it is safe to read 16 bytes.
        # NOTE(review): a full 16 bytes is read from each A row even though
        # only x0 bytes are consumed; this relies on the caller making the
        # extra bytes readable - confirm against the caller contract.
        .p2align 3
4:
        AND x0, x2, 15                  // kc remainder 4 to 12
5:
        LDP q8, q9, [x5], 32
        LDP q10, q11, [x5], 32
        # Post-index by x0 so each A pointer advances only by the remainder.
        LD1 {v0.16b}, [x3], x0
        LD1 {v1.16b}, [x15], x0
        LD1 {v2.16b}, [x13], x0
        LD1 {v3.16b}, [x4], x0
        SDOT v16.4s, v8.16b, v0.4b[0]
        SDOT v17.4s, v8.16b, v1.4b[0]
        SDOT v18.4s, v8.16b, v2.4b[0]
        SDOT v19.4s, v8.16b, v3.4b[0]
        SDOT v20.4s, v9.16b, v0.4b[0]
        SDOT v21.4s, v9.16b, v1.4b[0]
        SDOT v22.4s, v9.16b, v2.4b[0]
        SDOT v23.4s, v9.16b, v3.4b[0]
        SDOT v24.4s, v10.16b, v0.4b[0]
        SDOT v25.4s, v10.16b, v1.4b[0]
        SDOT v26.4s, v10.16b, v2.4b[0]
        SDOT v27.4s, v10.16b, v3.4b[0]
        SDOT v28.4s, v11.16b, v0.4b[0]
        SDOT v29.4s, v11.16b, v1.4b[0]
        SDOT v30.4s, v11.16b, v2.4b[0]
        SDOT v31.4s, v11.16b, v3.4b[0]
        CMP x0, 4
        B.LS 3b                         // remainder was 4 bytes: done
        LDP q8, q9, [x5], 32
        LDP q10, q11, [x5], 32
        SDOT v16.4s, v8.16b, v0.4b[1]
        SDOT v17.4s, v8.16b, v1.4b[1]
        SDOT v18.4s, v8.16b, v2.4b[1]
        SDOT v19.4s, v8.16b, v3.4b[1]
        SDOT v20.4s, v9.16b, v0.4b[1]
        SDOT v21.4s, v9.16b, v1.4b[1]
        SDOT v22.4s, v9.16b, v2.4b[1]
        SDOT v23.4s, v9.16b, v3.4b[1]
        SDOT v24.4s, v10.16b, v0.4b[1]
        SDOT v25.4s, v10.16b, v1.4b[1]
        SDOT v26.4s, v10.16b, v2.4b[1]
        SDOT v27.4s, v10.16b, v3.4b[1]
        SDOT v28.4s, v11.16b, v0.4b[1]
        SDOT v29.4s, v11.16b, v1.4b[1]
        SDOT v30.4s, v11.16b, v2.4b[1]
        SDOT v31.4s, v11.16b, v3.4b[1]
        CMP x0, 8
        B.LS 3b                         // remainder was 8 bytes: done
        LDP q8, q9, [x5], 32
        LDP q10, q11, [x5], 32
        SDOT v16.4s, v8.16b, v0.4b[2]
        SDOT v17.4s, v8.16b, v1.4b[2]
        SDOT v18.4s, v8.16b, v2.4b[2]
        SDOT v19.4s, v8.16b, v3.4b[2]
        SDOT v20.4s, v9.16b, v0.4b[2]
        SDOT v21.4s, v9.16b, v1.4b[2]
        SDOT v22.4s, v9.16b, v2.4b[2]
        SDOT v23.4s, v9.16b, v3.4b[2]
        SDOT v24.4s, v10.16b, v0.4b[2]
        SDOT v25.4s, v10.16b, v1.4b[2]
        SDOT v26.4s, v10.16b, v2.4b[2]
        SDOT v27.4s, v10.16b, v3.4b[2]
        SDOT v28.4s, v11.16b, v0.4b[2]
        SDOT v29.4s, v11.16b, v1.4b[2]
        SDOT v30.4s, v11.16b, v2.4b[2]
        SDOT v31.4s, v11.16b, v3.4b[2]
        B 3b

        # Store odd width
        # x1 holds nc - 16 here; its low 4 bits equal those of the remaining
        # column count. Bits 3, 2, 1 and 0 select stores of 8, 4, 2 and 1
        # columns. After each store the not yet stored columns are moved down
        # to the bottom of v0-v3 with DUP.
        .p2align 3
6:
        TBZ x1, 3, 7f
        STR d0, [x6], 8
        STR d1, [x8], 8
        DUP d0, v0.d[1]
        DUP d1, v1.d[1]
        STR d2, [x9], 8
        STR d3, [x7], 8
        DUP d2, v2.d[1]
        DUP d3, v3.d[1]
7:
        TBZ x1, 2, 8f
        STR s0, [x6], 4
        STR s1, [x8], 4
        DUP s0, v0.s[1]
        DUP s1, v1.s[1]
        STR s2, [x9], 4
        STR s3, [x7], 4
        DUP s2, v2.s[1]
        DUP s3, v3.s[1]
8:
        TBZ x1, 1, 9f
        STR h0, [x6], 2
        STR h1, [x8], 2
        DUP h0, v0.h[1]
        DUP h1, v1.h[1]
        STR h2, [x9], 2
        STR h3, [x7], 2
        DUP h2, v2.h[1]
        DUP h3, v3.h[1]
9:
        TBZ x1, 0, 10f
        STR b0, [x6]
        STR b1, [x8]
        STR b2, [x9]
        STR b3, [x7]
10:
        # Restore d8-d11 from stack
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 32
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif