// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     size_t ks,                 x3 / x9
#     const int8_t**restrict a,  x4
#     const int8_t* restrict w,  x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x8
#     const int8_t* zero,                [sp + 16] -> x12
#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x13  v0  v4
# A1  x14  v1  v5
# A2  x15  v2  v6
# A3  x10  v3  v7
# B    x5  v8  v9 v10 v11
# C0   x6 v16 v20 v24 v28
# C1  x16 v17 v21 v25 v29
# C2  x17 v18 v22 v26 v30
# C3   x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x11 temp for Cortex-A55 loads

// Overview
//
// Signed 8-bit indirect GEMM microkernel producing a tile of up to 4 rows by
// 16 columns of int8 output per pass of the nc loop.  The sixteen 32-bit
// accumulator vectors v16-v31 are seeded from the start of the packed weights
// (w), updated with SDOT (each 32-bit lane accumulates a dot product of 4
// signed bytes), then requantized, narrowed with saturation, offset, clamped
// and stored.
//
// Label map
//   0:     nc loop   - start of one 16-column tile, loads the initial bias
//   1:     ks loop   - fetches the next group of 4 A row pointers
//   2:     main loop - 16 bytes of K per row per iteration, software pipelined
//   3:     epilogue  - last 16 bytes of pipelined K, no further preloads
//   4:     end of one ks step, followed by requantization and the full store
//   5:/6:  K remainder of 4, 8 or 12 bytes (kc is rounded up to a multiple of 4)
//   7:-11: partial-width store when fewer than 16 columns remain
//
// Stack layout
//   The stack arguments a_offset, zero and params are read before the callee
//   save area is pushed, so they use the offsets listed in the signature.
//   After the 32-byte push of d8-d11 every incoming stack argument moves up
//   by 32 bytes: cn_stride is read from [sp, 32] and params from [sp, 56].
//
// x11 has two roles
//   It holds the params pointer on entry, but the pipelined code at labels
//   2: and 3: reuses it as the integer temporary that carries the upper
//   64 bits of each weight vector (LDR x11 followed by INS vN.d[1], x11).
//   The epilogue therefore reloads params from the stack.  The remainder
//   path entered at 5: never writes x11.  After requantization x11 is
//   rewound by the 15 bytes of post-increment applied while reading params.

BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp C pointers
        // Rows at or beyond mr reuse the pointer of the previous row.  Rows
        // are stored from C3 down to C0, so when pointers alias, the data of
        // the lowest row sharing that pointer is the one written last.
        CMP     x0, 2                   // if mr < 2
        LDR     x8, [sp, 8]             // Load a_offset
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
        CSEL    x16, x6, x16, LO        //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        STP     d8, d9, [sp, -32]!      // Save d8-d11 on stack

        ADD     x17, x16, x7            // c2 = c1 + cm_stride
        STP     d10, d11, [sp, 16]
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x17, x16, x17, LS       //   c2 = c1
        BIC     x2, x2, 3               // completes the round up of kc to a multiple of 4

        CMP     x0, 4                   // if mr < 4
        ADD     x7, x17, x7             // c3 = c2 + cm_stride
        CSEL    x7, x17, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // 64 bytes (16 x int32) are read once and replicated to all 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        MOV     x9, x3                  // p = ks

        .p2align 3
1:
        # Load next 4 A pointers
        LDP     x13, x14, [x4], 16
        LDP     x15, x10, [x4], 16

        // A pointer equal to the zero buffer is used as is; any other
        // pointer has a_offset added.
        CMP     x13, x12                // if a0 == zero
        ADD     x13, x13, x8            // a0 += a_offset
        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
        CMP     x14, x12                // if a1 == zero
        ADD     x14, x14, x8            // a1 += a_offset
        CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
        CMP     x15, x12                // if a2 == zero
        ADD     x15, x15, x8            // a2 += a_offset
        CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
        CMP     x10, x12                // if a3 == zero
        ADD     x10, x10, x8            // a3 += a_offset
        CSEL    x10, x12, x10, EQ       //   a3 = zero, else += a3 + a_offset

        # Is there at least 16 bytes for prologue/epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # prologue - read A and B values for block 0 and 1
        // v8 is loaded whole; v9 is split into its low half (d9) and its
        // high half held in x11 until the first INS of the loop body.
        LDR     d0, [x13], 8
        LDR     q8, [x5], 16
        LDR     d1, [x14], 8
        LDR     d2, [x15], 8
        LDR     d3, [x10], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x11, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    3f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        //
        // Each group below has 4 blocks; block N updates output columns
        // 4N to 4N+3 for all 4 rows using one 16-byte weight vector.
        // Groups 1 and 2 consume K bytes 0-3 and 4-7 from v0-v3 while loading
        // v4-v7; groups 3 and 4 consume v4-v7 while reloading v0-v3.

        .p2align 3
2:
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x10], 8

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]
        LDR     d0, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]
        LDR     d1, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3, [x10], 8

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    2b

        # Epilogue. Same as main loop but no preloads in final group
3:
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x13], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x14], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x15], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x10], 8

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x11
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x11, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        // Final group: only the weights for blocks 2 and 3 are still fetched
        // (in blocks 0 and 1); nothing is preloaded for a following pass.
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x11
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x11
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x11, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x11
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        // x11 was used as the weight temporary above, so params is fetched
        // again from its stack slot (24 + 32 bytes pushed = 56).
        LDR     x11, [sp, 56]           // reload params pointer
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder? - 4 to 12 bytes of A
        CBNZ    x0, 6f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
        B.HI    1b

        # Apply params - preshift, scale, postshift, bias and clamp
        // params is read sequentially: three 32-bit values (pre-shift,
        // multiplier, post-shift), one 16-bit value and two 8-bit values.
        LD1R    {v4.4s}, [x11], 4
        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
        SQSHL   v17.4s, v17.4s, v4.4s
        SQSHL   v18.4s, v18.4s, v4.4s
        SQSHL   v19.4s, v19.4s, v4.4s
        SQSHL   v20.4s, v20.4s, v4.4s
        SQSHL   v21.4s, v21.4s, v4.4s
        SQSHL   v22.4s, v22.4s, v4.4s
        SQSHL   v23.4s, v23.4s, v4.4s
        LD1R    {v5.4s}, [x11], 4
        SQSHL   v24.4s, v24.4s, v4.4s
        SQSHL   v25.4s, v25.4s, v4.4s
        SQSHL   v26.4s, v26.4s, v4.4s
        SQSHL   v27.4s, v27.4s, v4.4s
        SQSHL   v28.4s, v28.4s, v4.4s
        SQSHL   v29.4s, v29.4s, v4.4s
        SQSHL   v30.4s, v30.4s, v4.4s
        SQSHL   v31.4s, v31.4s, v4.4s
        LD1R    {v6.4s}, [x11], 4
        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
        SQDMULH v17.4s, v17.4s, v5.4s
        SQDMULH v18.4s, v18.4s, v5.4s
        SQDMULH v19.4s, v19.4s, v5.4s
        SQDMULH v20.4s, v20.4s, v5.4s
        SQDMULH v21.4s, v21.4s, v5.4s
        SQDMULH v22.4s, v22.4s, v5.4s
        SQDMULH v23.4s, v23.4s, v5.4s
        SQDMULH v24.4s, v24.4s, v5.4s
        SQDMULH v25.4s, v25.4s, v5.4s
        SQDMULH v26.4s, v26.4s, v5.4s
        SQDMULH v27.4s, v27.4s, v5.4s
        SQDMULH v28.4s, v28.4s, v5.4s
        SQDMULH v29.4s, v29.4s, v5.4s
        SQDMULH v30.4s, v30.4s, v5.4s
        SQDMULH v31.4s, v31.4s, v5.4s
        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v6.4s
        SRSHL   v18.4s, v18.4s, v6.4s
        SRSHL   v19.4s, v19.4s, v6.4s
        SRSHL   v20.4s, v20.4s, v6.4s
        SRSHL   v21.4s, v21.4s, v6.4s
        SRSHL   v22.4s, v22.4s, v6.4s
        SRSHL   v23.4s, v23.4s, v6.4s
        SRSHL   v24.4s, v24.4s, v6.4s
        SRSHL   v25.4s, v25.4s, v6.4s
        SRSHL   v26.4s, v26.4s, v6.4s
        SRSHL   v27.4s, v27.4s, v6.4s
        SRSHL   v28.4s, v28.4s, v6.4s
        SRSHL   v29.4s, v29.4s, v6.4s
        SRSHL   v30.4s, v30.4s, v6.4s
        SRSHL   v31.4s, v31.4s, v6.4s

        // Saturating narrow 32 -> 16 bit.  Columns 0-7 of each row end up in
        // v16-v19 and columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        // NOTE(review): the 16-bit value added here is presumably the output
        // zero point - confirm against the params struct definition.
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow 16 -> 8 bit: v0-v3 hold the 16 int8 outputs of
        // rows 0-3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        LDR     x0, [sp, 32]            // cn_stride
        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        // 15 = 4 + 4 + 4 + 2 + 1 bytes of post-increment applied above.
        SUB     x11, x11, 15            // rewind params pointer
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    7f

        # Store full 4 x 16
        // Each C pointer advances by cn_stride (x0) for the next tile.
        ST1     {v3.16b}, [x7], x0
        ST1     {v2.16b}, [x17], x0
        ST1     {v1.16b}, [x16], x0
        ST1     {v0.16b}, [x6], x0

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        // None of the instructions since SUBS x1, x1, 16 write the flags,
        // so this branches while columns remain.
        B.HI    0b

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it is safe to read 16 bytes.
        // The A pointers are not advanced here; lanes [0], [1] and [2] of the
        // single 16-byte load supply K bytes 0-3, 4-7 and 8-11.
        .p2align 3
5:
        AND     x0, x2, 15              // kc remainder 4 to 12
6:
        LDR     q0, [x13]
        LDP     q8, q9, [x5], 32
        LDR     q1, [x14]
        LDR     q2, [x15]
        LDR     q3, [x10]
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[0]
        SDOT    v17.4s, v8.16b, v1.4b[0]
        SDOT    v18.4s, v8.16b, v2.4b[0]
        SDOT    v19.4s, v8.16b, v3.4b[0]
        SDOT    v20.4s, v9.16b, v0.4b[0]
        SDOT    v21.4s, v9.16b, v1.4b[0]
        SDOT    v22.4s, v9.16b, v2.4b[0]
        SDOT    v23.4s, v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        CMP     x0, 4                   // remainder of 4: done
        B.LS    4b
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[1]
        SDOT    v17.4s, v8.16b, v1.4b[1]
        SDOT    v18.4s, v8.16b, v2.4b[1]
        SDOT    v19.4s, v8.16b, v3.4b[1]
        SDOT    v20.4s, v9.16b, v0.4b[1]
        SDOT    v21.4s, v9.16b, v1.4b[1]
        SDOT    v22.4s, v9.16b, v2.4b[1]
        SDOT    v23.4s, v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        CMP     x0, 8                   // remainder of 8: done
        B.LS    4b
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[2]
        SDOT    v17.4s, v8.16b, v1.4b[2]
        SDOT    v18.4s, v8.16b, v2.4b[2]
        SDOT    v19.4s, v8.16b, v3.4b[2]
        SDOT    v20.4s, v9.16b, v0.4b[2]
        SDOT    v21.4s, v9.16b, v1.4b[2]
        SDOT    v22.4s, v9.16b, v2.4b[2]
        SDOT    v23.4s, v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       4b

        # Store odd width
        // x1 now holds nc - 16, whose low 4 bits equal those of the remaining
        // column count.  Bits 3, 2, 1 and 0 select stores of 8, 4, 2 and 1
        // bytes; after each store the next unwritten bytes are moved down to
        // the bottom of v0-v3.
        .p2align 3
7:
        TBZ     x1, 3, 8f
        STR     d3, [x7], 8
        STR     d2, [x17], 8
        DUP     d3, v3.d[1]
        DUP     d2, v2.d[1]
        STR     d1, [x16], 8
        STR     d0, [x6], 8
        DUP     d1, v1.d[1]
        DUP     d0, v0.d[1]
8:
        TBZ     x1, 2, 9f
        STR     s3, [x7], 4
        STR     s2, [x17], 4
        DUP     s3, v3.s[1]
        DUP     s2, v2.s[1]
        STR     s1, [x16], 4
        STR     s0, [x6], 4
        DUP     s1, v1.s[1]
        DUP     s0, v0.s[1]
9:
        TBZ     x1, 1, 10f
        STR     h3, [x7], 2
        STR     h2, [x17], 2
        DUP     h3, v3.h[1]
        DUP     h2, v2.h[1]
        STR     h1, [x16], 2
        STR     h0, [x6], 2
        DUP     h1, v1.h[1]
        DUP     h0, v0.h[1]
10:
        TBZ     x1, 0, 11f
        STR     b3, [x7]
        STR     b2, [x17]
        STR     b1, [x16]
        STR     b0, [x6]
11:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif