// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

$DATATYPE = "qc8" if CHANNELWISE else "qs8"
$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
# REWIND_DECREMENT is the number of bytes the params pointer (x11) advances
# through the post-indexed LD1R loads during requantization.  It is subtracted
# afterwards so the next column tile reads params from the start again.
$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
# Computes a tile of up to 4 rows by 16 columns of int8 output.  For each
# tile the int32 accumulators are initialized from 64 bytes read at w, SDOT
# accumulates 4 bytes of A against 4 bytes of B per 32-bit lane, and the
# result is requantized, narrowed with saturation to int8, clamped and stored.
# The column loop repeats until nc is exhausted.
#
# void xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union ${PARAMS_UNION} params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel saves and restores d8-d11 only (v8-v11 hold B).

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  v7
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x14 temp for Cortex-A55 loads
# (holds the upper 64 bits of a B vector until INS moves them into place)

BEGIN_FUNCTION xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so they recompute and
        # store the same values instead of touching memory past the tile.
        CMP     x0, 2                   // if mr < 2

        # Stack arguments are read before sp is moved by the STP below.
        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        STP     d8, d9, [sp, -32]!      // reserve 32 bytes, save d8-d9

        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6, x8, LO          //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9, x8, x9, LS          //   c2 = c1
        BIC     x2, x2, 3               // clear low 2 bits: kc now multiple of 4

        STP     d10, d11, [sp, 16]      // save d10-d11 in the reserved area

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7, x9, x7, LO          //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        # The same 16 values are copied to all 4 rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        # v8 is loaded whole; v9 gets its low half now and its upper half
        # from x14 at the first INS of the loop or epilogue.
        LDR     d0, [x3], 8
        LDR     q8, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9, [x5], 8
        LDR     x14, [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 rows of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        #
        # B registers rotate v8 -> v9 -> v10 -> v11: while one block
        # multiplies by vN, the low half of the vector two blocks ahead is
        # loaded, the upper half of the next vector is inserted from x14, and
        # x14 is refilled.

        .p2align 3
1:
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8            // low half of v10
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14            // complete v9 for BLOCK 1
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8            // upper half of v10
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x3], 8             // next 8 bytes of A0

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x15], 8            // next 8 bytes of A1

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8            // next 8 bytes of A2

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x4], 8             // next 8 bytes of A3

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]
        LDR     d0, [x3], 8             // A0 for the next iteration or epilogue

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]
        LDR     d1, [x15], 8            // A1 for the next iteration or epilogue

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x13], 8            // A2 for the next iteration or epilogue

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3, [x4], 8             // A3 for the next iteration or epilogue

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8, [x5], 8             // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        SUBS    x0, x0, 16              // k -= 16; flags consumed by B.HS below

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    1b

        # Epilogue. Same as main loop but no preloads in final group
        # Consumes the last full 16 bytes of K.  A is not loaded past the
        # second 8 bytes and B loads stop once v11 is complete.
2:
        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[0]
        LDR     d4, [x3], 8

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7, [x4], 8

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v0.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v0.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[0]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[0]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v23.4s, v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8, [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9, [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14, [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        # BLOCK 0
        SDOT    v16.4s, v8.16b, v4.4b[1]
        LDR     d10, [x5], 8
        SDOT    v17.4s, v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s, v8.16b, v6.4b[1]
        LDR     x14, [x5], 8
        SDOT    v19.4s, v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s, v9.16b, v4.4b[1]
        LDR     d11, [x5], 8
        SDOT    v21.4s, v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s, v9.16b, v6.4b[1]
        LDR     x14, [x5], 8            // last B load: upper half of v11
        SDOT    v23.4s, v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder?- 4 to 12 bytes of A
        CBNZ    x0, 5f

        # Requantize the 16 int32 accumulators, narrow them with saturation
        # to int8, clamp and store.  The remainder path branches back here.
        .p2align 3
3:
        $if REQUANTIZATION == "RNDNU":
          # Apply params - preshift, scale, postshift, bias and clamp
          LD1R    {v4.4s}, [x11], 4
          SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
          SQSHL   v17.4s, v17.4s, v4.4s
          SQSHL   v18.4s, v18.4s, v4.4s
          SQSHL   v19.4s, v19.4s, v4.4s
          SQSHL   v20.4s, v20.4s, v4.4s
          SQSHL   v21.4s, v21.4s, v4.4s
          SQSHL   v22.4s, v22.4s, v4.4s
          SQSHL   v23.4s, v23.4s, v4.4s
          LD1R    {v5.4s}, [x11], 4
          SQSHL   v24.4s, v24.4s, v4.4s
          SQSHL   v25.4s, v25.4s, v4.4s
          SQSHL   v26.4s, v26.4s, v4.4s
          SQSHL   v27.4s, v27.4s, v4.4s
          SQSHL   v28.4s, v28.4s, v4.4s
          SQSHL   v29.4s, v29.4s, v4.4s
          SQSHL   v30.4s, v30.4s, v4.4s
          SQSHL   v31.4s, v31.4s, v4.4s
          LD1R    {v6.4s}, [x11], 4
          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
          SQDMULH v17.4s, v17.4s, v5.4s
          SQDMULH v18.4s, v18.4s, v5.4s
          SQDMULH v19.4s, v19.4s, v5.4s
          SQDMULH v20.4s, v20.4s, v5.4s
          SQDMULH v21.4s, v21.4s, v5.4s
          SQDMULH v22.4s, v22.4s, v5.4s
          SQDMULH v23.4s, v23.4s, v5.4s
          SQDMULH v24.4s, v24.4s, v5.4s
          SQDMULH v25.4s, v25.4s, v5.4s
          SQDMULH v26.4s, v26.4s, v5.4s
          SQDMULH v27.4s, v27.4s, v5.4s
          SQDMULH v28.4s, v28.4s, v5.4s
          SQDMULH v29.4s, v29.4s, v5.4s
          SQDMULH v30.4s, v30.4s, v5.4s
          SQDMULH v31.4s, v31.4s, v5.4s
          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
          SRSHL   v17.4s, v17.4s, v6.4s
          SRSHL   v18.4s, v18.4s, v6.4s
          SRSHL   v19.4s, v19.4s, v6.4s
          SRSHL   v20.4s, v20.4s, v6.4s
          SRSHL   v21.4s, v21.4s, v6.4s
          SRSHL   v22.4s, v22.4s, v6.4s
          SRSHL   v23.4s, v23.4s, v6.4s
          SRSHL   v24.4s, v24.4s, v6.4s
          SRSHL   v25.4s, v25.4s, v6.4s
          SRSHL   v26.4s, v26.4s, v6.4s
          SRSHL   v27.4s, v27.4s, v6.4s
          SRSHL   v28.4s, v28.4s, v6.4s
          SRSHL   v29.4s, v29.4s, v6.4s
          SRSHL   v30.4s, v30.4s, v6.4s
          SRSHL   v31.4s, v31.4s, v6.4s
        $elif REQUANTIZATION == "FP32":
          SCVTF   v16.4s, v16.4s          // int32 accumulators to float
          SCVTF   v17.4s, v17.4s
          $if not CHANNELWISE:
            # Apply params - scale, bias and clamp
            LD1R    {v4.4s}, [x11], 4
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
          $else:
            # Load per channel scale values from weights
            LDR     q4, [x5], 16
            SCVTF   v18.4s, v18.4s
            SCVTF   v19.4s, v19.4s
            LDR     q5, [x5], 16
          SCVTF   v20.4s, v20.4s
          SCVTF   v21.4s, v21.4s
          SCVTF   v22.4s, v22.4s
          SCVTF   v23.4s, v23.4s
          SCVTF   v24.4s, v24.4s
          SCVTF   v25.4s, v25.4s
          SCVTF   v26.4s, v26.4s
          SCVTF   v27.4s, v27.4s
          SCVTF   v28.4s, v28.4s
          SCVTF   v29.4s, v29.4s
          SCVTF   v30.4s, v30.4s
          SCVTF   v31.4s, v31.4s

          $if CHANNELWISE:
            LDR     q6, [x5], 16
            FMUL    v16.4s, v16.4s, v4.4s   // scales for columns 0-3
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v5.4s   // scales for columns 4-7
            LDR     q4, [x5], 16            // v4 reused: scales for columns 12-15
            FMUL    v21.4s, v21.4s, v5.4s
            FMUL    v22.4s, v22.4s, v5.4s
            FMUL    v23.4s, v23.4s, v5.4s
            FMUL    v24.4s, v24.4s, v6.4s   // scales for columns 8-11
            FMUL    v25.4s, v25.4s, v6.4s
            FMUL    v26.4s, v26.4s, v6.4s
            FMUL    v27.4s, v27.4s, v6.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s
          $else:
            FMUL    v16.4s, v16.4s, v4.4s   // single scale broadcast in v4
            FMUL    v17.4s, v17.4s, v4.4s
            FMUL    v18.4s, v18.4s, v4.4s
            FMUL    v19.4s, v19.4s, v4.4s
            FMUL    v20.4s, v20.4s, v4.4s
            FMUL    v21.4s, v21.4s, v4.4s
            FMUL    v22.4s, v22.4s, v4.4s
            FMUL    v23.4s, v23.4s, v4.4s
            FMUL    v24.4s, v24.4s, v4.4s
            FMUL    v25.4s, v25.4s, v4.4s
            FMUL    v26.4s, v26.4s, v4.4s
            FMUL    v27.4s, v27.4s, v4.4s
            FMUL    v28.4s, v28.4s, v4.4s
            FMUL    v29.4s, v29.4s, v4.4s
            FMUL    v30.4s, v30.4s, v4.4s
            FMUL    v31.4s, v31.4s, v4.4s

          FCVTNS  v16.4s, v16.4s          // float to int32, round to nearest even
          FCVTNS  v17.4s, v17.4s
          FCVTNS  v18.4s, v18.4s
          FCVTNS  v19.4s, v19.4s
          FCVTNS  v20.4s, v20.4s
          FCVTNS  v21.4s, v21.4s
          FCVTNS  v22.4s, v22.4s
          FCVTNS  v23.4s, v23.4s
          FCVTNS  v24.4s, v24.4s
          FCVTNS  v25.4s, v25.4s
          FCVTNS  v26.4s, v26.4s
          FCVTNS  v27.4s, v27.4s
          FCVTNS  v28.4s, v28.4s
          FCVTNS  v29.4s, v29.4s
          FCVTNS  v30.4s, v30.4s
          FCVTNS  v31.4s, v31.4s

        # Narrow 32 -> 16 bits with saturation.  Afterwards v16-v19 hold
        # columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        # Narrow 16 -> 8 bits with saturation: v0-v3 = rows 0-3, 16 columns each.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags live until B.NE below
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f                      // fewer than 16 columns left

        # Store full 4 x 16
        # C pointers step by cn_stride; A pointers are rewound by the rounded
        # kc so the next column tile reads the same rows of A.
        ST1     {v0.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b                      // more columns remain (flags from SUBS x1)

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

        # Remainder- 4 to 12 bytes of A
        # Although C4, it is safe to read 16 bytes.
        # Label 4 is entered when kc < 16, label 5 after the epilogue.  The
        # A loads post-increment by the remainder only, so the later
        # rewind by kc stays exact.
        .p2align 3
4:
        AND     x0, x2, 15              // kc remainder 4 to 12
5:
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        LD1     {v0.16b}, [x3], x0
        LD1     {v1.16b}, [x15], x0
        LD1     {v2.16b}, [x13], x0
        LD1     {v3.16b}, [x4], x0
        SDOT    v16.4s, v8.16b, v0.4b[0]
        SDOT    v17.4s, v8.16b, v1.4b[0]
        SDOT    v18.4s, v8.16b, v2.4b[0]
        SDOT    v19.4s, v8.16b, v3.4b[0]
        SDOT    v20.4s, v9.16b, v0.4b[0]
        SDOT    v21.4s, v9.16b, v1.4b[0]
        SDOT    v22.4s, v9.16b, v2.4b[0]
        SDOT    v23.4s, v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        CMP     x0, 4
        B.LS    3b                      // remainder was 4 bytes
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[1]
        SDOT    v17.4s, v8.16b, v1.4b[1]
        SDOT    v18.4s, v8.16b, v2.4b[1]
        SDOT    v19.4s, v8.16b, v3.4b[1]
        SDOT    v20.4s, v9.16b, v0.4b[1]
        SDOT    v21.4s, v9.16b, v1.4b[1]
        SDOT    v22.4s, v9.16b, v2.4b[1]
        SDOT    v23.4s, v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        CMP     x0, 8
        B.LS    3b                      // remainder was 8 bytes
        LDP     q8, q9, [x5], 32
        LDP     q10, q11, [x5], 32
        SDOT    v16.4s, v8.16b, v0.4b[2]
        SDOT    v17.4s, v8.16b, v1.4b[2]
        SDOT    v18.4s, v8.16b, v2.4b[2]
        SDOT    v19.4s, v8.16b, v3.4b[2]
        SDOT    v20.4s, v9.16b, v0.4b[2]
        SDOT    v21.4s, v9.16b, v1.4b[2]
        SDOT    v22.4s, v9.16b, v2.4b[2]
        SDOT    v23.4s, v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       3b

        # Store odd width
        # x1 holds nc - 16 here; its low 4 bits equal those of the remaining
        # column count, so each TBZ selects an 8, 4, 2 or 1 byte store.  After
        # each store DUP moves the next unwritten lanes down to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 32
        RET

END_FUNCTION xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif