// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"
$assert DATATYPE in ["QC8", "QS8", "QU8"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

.syntax unified

// Template-time selections: signed vs. unsigned mnemonics, the ISA / CPU
// suffixes used in the kernel name, and the params type shown in the
// signature comment.
$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
$ISA = "neonv8" if ARMV8 else "neon"
$CPU = "a35" if ARMV8 else "a7"
$XMIN = "VMIN.U8" if DATATYPE == "QU8" else "VMIN.S8"
$XMAX = "VMAX.U8" if DATATYPE == "QU8" else "VMAX.S8"
$XXTL = "VMOVL.U8" if DATATYPE == "QU8" else "VMOVL.S8"
$SQXTXN = "VQMOVUN.S16" if DATATYPE == "QU8" else "VQMOVN.S16"
$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
// void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
//     size_t mr,                                     (r0)
//     size_t nc,                                      r1
//     size_t kc,                                     (r2) -> sp + 56 -> r5
//     size_t ks,                                     (r3) -> sp + 60 -> r14
//     const ${XINT8_T}**restrict a,            sp + 88  -> r2
//     const void*restrict w,              sp + 92  -> r9
//     ${XINT8_T}*restrict c,                   sp + 96  -> r11
//     size_t cm_stride,                   sp + 100  -> r6
//     size_t cn_stride,                   sp + 104  -> r12
//     size_t a_offset,                    sp + 108 -> (r5)
//     const ${XINT8_T}* zero,                  sp + 112 -> r7
//     ${PARAMS_UNION}*params); sp + 116 -> (r5)
//
// All "sp + N" offsets above are relative to sp after the function's 88-byte
// prologue push.

// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0      r3  d0-d1    q0
//
// B       r9  d8-d11   q4 q5
//
// C0     r11  d16-d17  q8   d18-d19  q9
//             q2, q3   acc2 (second accumulator set, folded into q8/q9
//                            after the ks loop)
//
// params      d12-d13  (q6)
//             d14      QU8 kernel zero point (QU8 only)
// clamp       d24, d25 output_min, output_max
//
// q1 is only used by the QC8 variants (second half of the per-channel
// multipliers loaded from w).
//
// Unused r4, r8, r10, d15, q10-q11, q13-q15

$if REQUANTIZATION == "RNDNU" and DATATYPE != "QU8":
  // params structure is 16 bytes
  //  struct {
  //    int32_t right_pre_shift;    d12[0]
  //    int32_t multiplier;         d12[1]
  //    int32_t right_post_shift;   d13[0]
  //    int16_t output_zero_point;  d13[2]
  //    int8_t output_min;          d13[6]
  //    int8_t output_max;          d13[7]
  //  } rndnu_neon;
$elif REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
  // params structure is 20 bytes
  //  struct {
  //    uint8_t kernel_zero_point[4];  d14
  //    int32_t right_pre_shift;       d12[0]
  //    int32_t multiplier;            d12[1]
  //    int32_t right_post_shift;      d13[0]
  //    int16_t output_zero_point;     d13[2]
  //    uint8_t output_min;            d13[6]
  //    uint8_t output_max;            d13[7]
  //  } rndnu_neon;
$elif DATATYPE == "QC8" and not ARMV8:
  // params structure is 10 bytes
  //  struct {
  //    float magic_bias;                           d12[0]
  //    int32_t magic_bias_less_output_zero_point;  d12[1]
  //    int8_t output_min;                          d13[6]
  //    int8_t output_max;                          d13[7]
  //  } xnn_qs8_minmax_params.neon;
$else:
  // params structure is 4 bytes
  //  struct {
  //    int16_t output_zero_point;  d13[2]
  //    int8_t output_min;          d13[6]
  //    int8_t output_max;          d13[7]
  //  } xnn_qs8_minmax_params.neonv8;

BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
        # Push 88 bytes
        # r2, r3 will be reloaded in outer loop.
        // r2 (kc) and r3 (ks) are pushed only so they can be re-read from the
        // stack at sp + 56 / sp + 60; they are skipped, not restored, on exit.
        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}     // +32
        $if DATATYPE == "QU8":
          VPUSH   {d8-d14}                            // +56 = 88
        $else:
          SUB     sp, sp, 8                           // +8
          VPUSH   {d8-d13}                            // +48 = 88

        LDR     r2,  [sp, 88]           // a
        LDR     r9,  [sp, 92]           // w
        LDR     r11, [sp, 96]           // c
        LDR     r6,  [sp, 100]          // cm_stride
        LDR     r12, [sp, 104]          // cn_stride
        LDR     r7,  [sp, 112]          // zero
        LDR     r5,  [sp, 116]          // params
        MOV     r14, r3                 // p = ks

        # Load params values
        $if DATATYPE == "QU8":
          VLD1.32 {d14[]}, [r5]!                      // QU8 kernel_zero_point
        $if REQUANTIZATION == "RNDNU":
          VLDM    r5, {d12-d13}                       // RNDNU params
        $elif DATATYPE == "QC8" and ARMV8:
          VLD1.32 {d13[]}, [r5]                       // QC8 neonv8 params
        $elif DATATYPE == "QC8" and not ARMV8:
          VLDM    r5!, {d12}                          // QC8 neon params
          VLD1.16 {d13[]}, [r5]

        $if PREFETCH:
          // NOTE(review): offset 112 breaks the 64-byte stride of the other
          // prefetches (128 would be expected) - confirm this is intentional.
          PLD     [r9,  64]                           // Prefetch B
          PLD     [r9, 112]
          PLD     [r9, 192]
          PLD     [r9, 256]
          PLD     [r9, 320]
          PLD     [r9, 384]

        // Outer loop: one iteration per group of 8 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias
        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
        VMOV.I32 q3, 0

        // ks loop: one iteration per entry of the indirection buffer a.
        .p2align 3
1:
        # Load next A pointer
        LDR     r3, [r2,  0]

        # Add a_offset
        LDR     r5, [sp, 108]           // a_offset
        ADD     r2, r2, 4
        CMP     r3,  r7                 // if a0 == zero
        ADD     r3,  r3, r5             // a0 += a_offset
        MOVEQ   r3,  r7                 //   a0 = zero (no offset), else a0 + a_offset

        LDR     r5, [sp, 56]            // kc
        SUBS    r5, r5, 8               // kc - 8
        BLO     5f                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!            // A0
        SUBS    r5, r5, 8               // k = k - 8
        VLD1.8  {d8},  [r9]!            // B0
        BLO     3f                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.
        // Even blocks accumulate into q8/q9, odd blocks into q2/q3. Each block
        // loads the B row consumed by the following block.

        .p2align 3
2:
        // Extend
        ${XXTL} q0, d0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        $if PREFETCH:
          PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        // d0 is free from here on (blocks 4-7 read d1), so the next 8 bytes
        // of A are loaded early.
        VLD1.8  {d0},  [r3]!            // A0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10
        SUBS    r5, r5, 8

        // BLOCK 7
        VLD1.8  {d8},  [r9]!            // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        BHS     2b

        // Epilogue
        // Same 8 blocks as the main loop, but without loading the next A0/B0.

        .p2align 3
3:
        // Extend
        ${XXTL} q0, d0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        $if PREFETCH:
          PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10
        // r5 becomes the 0..7 leftover channel count; the flags set here are
        // consumed by the BNE below (the VMLALs in between do not touch them).
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     6f

4:
        # ks loop
        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
        BHI     1b

        LDR     r14, [sp, 60]           // p = ks

        // Fold the second accumulator set into the first.
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        $if REQUANTIZATION == "RNDNU":
          # RNDNU quantization
          VDUP.32 q0, d12[0]            // right_pre_shift

          VQSHL.S32 q8,  q8, q0
          VQSHL.S32 q9,  q9, q0

          VDUP.32 q2, d13[0]            // right_post_shift

          VQDMULH.S32 q8,  q8, d12[1]   // multiplier
          VQDMULH.S32 q9,  q9, d12[1]

          VRSHL.S32 q8,  q8, q2
          VRSHL.S32 q9,  q9, q2
        $elif DATATYPE == "QC8" and ARMV8:
          # QC8 FP32 quantization
          VLD1.8  {q0-q1},  [r9]!

          VCVT.F32.S32 q8,  q8
          VCVT.F32.S32 q9,  q9

          VMUL.F32 q8,  q8, q0          // multiplier
          VMUL.F32 q9,  q9, q1

          VCVTN.S32.F32 q8,  q8
          VCVTN.S32.F32 q9,  q9
        $elif DATATYPE == "QC8" and not ARMV8:
          # QC8 FP32 quantization
          VLD1.8  {q0-q1},  [r9]!

          VDUP.32 q2, d12[0]            // magic_bias
          VDUP.32 q3, d12[1]            // magic_bias_less_output_zero_point

          VCVT.F32.S32 q8,  q8
          VCVT.F32.S32 q9,  q9

          VMUL.F32 q8,  q8, q0          // multiplier
          VMUL.F32 q9,  q9, q1

          VADD.F32 q8,  q8, q2          // magic_bias
          VADD.F32 q9,  q9, q2

          VQSUB.S32 q8,  q8, q3         // magic_bias_less_output_zero_point
          VQSUB.S32 q9,  q9, q3

        $if DATATYPE != "QC8" or ARMV8:
          VDUP.16 q0, d13[2]            // output_zero_point

        // Narrow 32 -> 16 -> 8 bits with saturation, then clamp to [min, max].
        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        $if DATATYPE != "QC8" or ARMV8:
          VQADD.S16 q8,  q8, q0

        VDUP.8  d24, d13[6]             // output_min

        ${SQXTXN} d0,  q8

        VDUP.8  d25, d13[7]             // output_max

        ${XMAX} d0, d0, d24

        SUBS    r1, r1, 8               // nc -= 8; flags feed BLO and BHI below

        ${XMIN} d0, d0, d25

        # Store full 1 x 8
        BLO     7f
        VST1.8  {d0}, [r11], r12        // c += cn_stride
        SUB     r2, r2, r14             // a -= ks
        BHI     0b

        $if DATATYPE == "QU8":
          VPOP    {d8-d14}
          ADD     sp, sp, 8             // skip r2, r3
        $else:
          VPOP    {d8-d13}
          ADD     sp, sp, 16            // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

        # Remainder- 1 to 7 bytes of A
        // Each step consumes one channel. A single CMP feeds both a BLO and
        // the following BEQ, as the NEON instructions in between leave the
        // flags intact.
        .p2align 3
5:
        AND     r5, r5, 7               // kc remainder 1 to 7
6:
        VLD1.8  {d0},  [r3]
        VLD1.8  {d8},  [r9]!

        ${XXTL} q0, d0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     4b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     4b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     4b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     4b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     4b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     4b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       4b

        # Store odd width
        // Fewer than 8 columns remain: write 4, 2 and 1 bytes according to
        // the low bits of r1, rotating the stored bytes out of d0 each time.
        .p2align 3
7:
        TST     r1, 4
        BEQ     8f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
8:
        TST     r1, 2
        BEQ     9f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2

9:
        TST     r1, 1
        BEQ     10f
        VST1.8  {d0[0]}, [r11]

10:
        $if DATATYPE == "QU8":
          VPOP    {d8-d14}
          ADD     sp, sp, 8             // skip r2, r3
        $else:
          VPOP    {d8-d13}
          ADD     sp, sp, 16            // skip pad of 8, r2, r3
        POP     {r5, r6, r7, r9, r11, pc}

END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif