// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert REQUANTIZATION in ["FP32", "RNDNU"]
$assert not CHANNELWISE or REQUANTIZATION == "FP32"
$assert DATATYPE in ["QC8", "QS8", "QU8"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"

#include <xnnpack/assembly.h>

.syntax unified

$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
$ISA = "neonv8" if ARMV8 else "neon"
$CPU = "a35" if ARMV8 else "a7"
$XMIN = "VMIN.U8" if DATATYPE == "QU8" else "VMIN.S8"
$XMAX = "VMAX.U8" if DATATYPE == "QU8" else "VMAX.S8"
$XXTL = "VMOVL.U8" if DATATYPE == "QU8" else "VMOVL.S8"
$SQXTXN = "VQMOVUN.S16" if DATATYPE == "QU8" else "VQMOVN.S16"
$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
// void xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}(
//     size_t mr,                       r0
//     size_t nc,                       r1
//     size_t kc,                       (r2) -> r5
//     const ${XINT8_T}*restrict a,     r3
//     size_t a_stride,                 sp + 96 -> (unused)
//     const void*restrict w,           sp + 100 -> r9
//     ${XINT8_T}*restrict c,           sp + 104 -> r11
//     size_t cm_stride,                sp + 108 -> (unused)
//     size_t cn_stride,                sp + 112 -> r7
//     ${PARAMS_UNION} params)          sp + 116 -> (r5)

// d8-d15, r4-r11, r14 (lr) need to be preserved if used. r13 (sp), r15 (pc) are reserved.

// Based on cortex_a53 microkernel but with Neon loads

// Register usage
// A0   r3  d0-d1  q0

// B    r9  d8-d9  q4  q5

// C0  r11  d16-d17  q8  d18-d19  q9
//          q2, q3 acc2

// Unused r4, r6, r8, r10, r12, d15, q10-q15, q1-q3

$if REQUANTIZATION == "RNDNU" and DATATYPE != "QU8":
  // params structure is 16 bytes
  //  struct {
  //    int32_t right_pre_shift;    d12[0]
  //    int32_t multiplier;         d12[1]
  //    int32_t right_post_shift;   d13[0]
  //    int16_t output_zero_point;  d13[2]
  //    int8_t output_min;          d13[6]
  //    int8_t output_max;          d13[7]
  //  } rndnu_neon;
$elif REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
  // params structure is 20 bytes
  //  struct {
  //    uint8_t kernel_zero_point[4];  d14
  //    int32_t right_pre_shift;       d12[0]
  //    int32_t multiplier;            d12[1]
  //    int32_t right_post_shift;      d13[0]
  //    int16_t output_zero_point;     d13[2]
  //    uint8_t output_min;            d13[6]
  //    uint8_t output_max;            d13[7]
  //  } rndnu_neon;
$elif DATATYPE == "QC8" and not ARMV8:
  // params structure is 10 bytes
  //  struct {
  //    float magic_bias;                           d12[0]
  //    int32_t magic_bias_less_output_zero_point;  d12[1]
  //    int8_t output_min;                          d13[6]
  //    int8_t output_max;                          d13[7]
  //  } xnn_qs8_minmax_params.neon;
$else:
  // params structure is 4 bytes
  //  struct {
  //    int16_t output_zero_point;  d13[2]
  //    int8_t output_min;          d13[6]
  //    int8_t output_max;          d13[7]
  //  } xnn_qs8_minmax_params.neonv8;

BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
        # Push 96 bytes
        PUSH    {r5, r7, r9, r11}                       // 16
        $if DATATYPE == "QU8":
          SUB     sp, sp, 24                            // +24
          VPUSH   {d8-d14}                              // +56 = 96
        $else:
          SUB     sp, sp, 32                            // +32
          VPUSH   {d8-d13}                              // +48 = 96

        LDR     r11, [sp, 104]                          // c
        LDR     r9, [sp, 100]                           // w
        LDR     r5, [sp, 116]                           // params

        # Load params values
        $if DATATYPE == "QU8":
          VLD1.32 {d14[]}, [r5]!                        // QU8 kernel_zero_point
        $if REQUANTIZATION == "RNDNU":
          VLDM    r5, {d12-d13}                         // RNDNU params
        $elif DATATYPE == "QC8" and ARMV8:
          VLD1.32 {d13[]}, [r5]                         // QC8 neonv8 params
        $elif DATATYPE == "QC8" and not ARMV8:
          VLDM    r5!, {d12}                            // QC8 neon params
          VLD1.16 {d13[]}, [r5]                         // output_min/max
        LDR     r7, [sp, 112]                           // cn_stride

        $if PREFETCH:
          PLD     [r9, 64]                              // Prefetch B
          PLD     [r9, 128]
          PLD     [r9, 192]
          PLD     [r9, 256]
          PLD     [r9, 320]
          PLD     [r9, 384]

        .p2align 3
0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}                          // Bias
        VMOV.I32 q2, 0                                  // second set of C for pipelining FMLA
        SUBS    r5, r2, 8                               // k = kc - 8
        VMOV.I32 q3, 0
        $if PREFETCH:
          PLD     [r3, 64]                              // Prefetch A
        BLO     4f                                      // less than 8 channels?

        // Prologue - load A0 and B0
        VLD1.8  {d0},  [r3]!                            // A0
        SUBS    r5, r5, 8                               // k = k - 8
        VLD1.8  {d8},  [r9]!                            // B0
        BLO     2f                                      // less than 8 channels?

        // Main loop - 8 bytes
        // 64 bytes for weights.

        .p2align 3
1:
        // Extend
        ${XXTL} q0, d0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        $if PREFETCH:
          PLD     [r9, 448]

        // BLOCK 0
        VLD1.8  {d10},  [r9]!                           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 1
        VLD1.8  {d8},  [r9]!                            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 2
        VLD1.8  {d10},  [r9]!                           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 3
        VLD1.8  {d8},  [r9]!                            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        VLD1.8  {d0},  [r3]!                            // A0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 4
        VLD1.8  {d10},  [r9]!                           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 5
        VLD1.8  {d8},  [r9]!                            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        // BLOCK 6
        VLD1.8  {d10},  [r9]!                           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        // BLOCK 7
        VLD1.8  {d8},  [r9]!                            // B0
        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]
        SUBS    r5, r5, 8
        BHS     1b

        // Epilogue

        .p2align 3
2:
        ${XXTL} q0, d0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                           // B1
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!                            // B2
        VMLAL.S16 q2, d10, d0[1]
        VMLAL.S16 q3, d11, d0[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                           // B3
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!                            // B4
        VMLAL.S16 q2, d10, d0[3]
        VMLAL.S16 q3, d11, d0[3]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                           // B5
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10

        VLD1.8  {d8},  [r9]!                            // B6
        VMLAL.S16 q2, d10, d1[1]
        VMLAL.S16 q3, d11, d1[1]
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8

        VLD1.8  {d10},  [r9]!                           // B7
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        $if DATATYPE == "QU8":
          VSUBL.U8 q5, d10, d14
        $else:
          VMOVL.S8 q5, d10
        ADDS    r5, r5, 8

        VMLAL.S16 q2, d10, d1[3]
        VMLAL.S16 q3, d11, d1[3]

        # Is there a remainder?- 1-7 bytes of A
        BNE     4f

3:
        VADD.S32 q8, q8, q2
        VADD.S32 q9, q9, q3

        $if REQUANTIZATION == "RNDNU":
          # RNDNU quantization
          VDUP.32 q0, d12[0]                            // right_pre_shift

          VQSHL.S32 q8, q8, q0
          VQSHL.S32 q9, q9, q0

          VDUP.32 q2, d13[0]                            // right_post_shift

          VQDMULH.S32 q8, q8, d12[1]                    // multiplier
          VQDMULH.S32 q9, q9, d12[1]

          VRSHL.S32 q8, q8, q2
          VRSHL.S32 q9, q9, q2
        $elif DATATYPE == "QC8" and ARMV8:
          # QC8 FP32 quantization
          VLD1.8  {q0-q1},  [r9]!

          VCVT.F32.S32 q8, q8
          VCVT.F32.S32 q9, q9

          VMUL.F32 q8, q8, q0                           // multiplier
          VMUL.F32 q9, q9, q1

          VCVTN.S32.F32 q8, q8
          VCVTN.S32.F32 q9, q9
        $elif DATATYPE == "QC8" and not ARMV8:
          # QC8 FP32 quantization
          VLD1.8  {q0-q1},  [r9]!

          VDUP.32 q2, d12[0]                            // magic_bias
          VDUP.32 q3, d12[1]                            // magic_bias_less_output_zero_point

          VCVT.F32.S32 q8, q8
          VCVT.F32.S32 q9, q9

          VMUL.F32 q8, q8, q0                           // multiplier
          VMUL.F32 q9, q9, q1

          VADD.F32 q8, q8, q2                           // magic_bias
          VADD.F32 q9, q9, q2

          VQSUB.S32 q8, q8, q3                          // magic_bias_less_output_zero_point
          VQSUB.S32 q9, q9, q3

        $if DATATYPE != "QC8" or ARMV8:
          VDUP.16 q0, d13[2]                            // output_zero_point

        VQMOVN.S32 d16, q8
        VQMOVN.S32 d17, q9

        $if DATATYPE != "QC8" or ARMV8:
          VQADD.S16 q8, q8, q0

        VDUP.8  d24, d13[6]                             // output_min

        ${SQXTXN} d0, q8

        VDUP.8  d25, d13[7]                             // output_max

        ${XMAX} d0, d0, d24

        SUBS    r1, r1, 8

        ${XMIN} d0, d0, d25

        # Store full 1 x 8
        BLO     5f
        VST1.8  {d0}, [r11], r7
        SUB     r3, r3, r2
        BHI     0b

        $if DATATYPE == "QU8":
          VPOP    {d8-d14}
          ADD     sp, sp, 8                             // skip pad of 8
        $else:
          VPOP    {d8-d13}
          ADD     sp, sp, 16                            // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

        # Remainder- 1 to 7 bytes of A
        .p2align 3
4:
        AND     r5, r5, 7                               // kc remainder 1 to 7

        VLD1.8  {d0},  [r3], r5
        VLD1.8  {d8},  [r9]!

        ${XXTL} q0, d0
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[0]
        VMLAL.S16 q9, d9, d0[0]
        CMP     r5, 2
        BLO     3b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[1]
        VMLAL.S16 q9, d9, d0[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[2]
        VMLAL.S16 q9, d9, d0[2]
        CMP     r5, 4
        BLO     3b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d0[3]
        VMLAL.S16 q9, d9, d0[3]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[0]
        VMLAL.S16 q9, d9, d1[0]
        CMP     r5, 6
        BLO     3b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[1]
        VMLAL.S16 q9, d9, d1[1]
        BEQ     3b

        VLD1.8  {d8},  [r9]!
        $if DATATYPE == "QU8":
          VSUBL.U8 q4, d8, d14
        $else:
          VMOVL.S8 q4, d8
        VMLAL.S16 q8, d8, d1[2]
        VMLAL.S16 q9, d9, d1[2]
        B       3b

        # Store odd width
        .p2align 3
5:
        TST     r1, 4
        BEQ     6f
        VST1.32 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 4
6:
        TST     r1, 2
        BEQ     7f
        VST1.16 {d0[0]}, [r11]!
        VEXT.8  q0, q0, q0, 2
7:
        TST     r1, 1
        BEQ     8f
        VST1.8  {d0[0]}, [r11]
8:
        $if DATATYPE == "QU8":
          VPOP    {d8-d14}
          ADD     sp, sp, 8                             // skip pad of 8
        $else:
          VPOP    {d8-d13}
          ADD     sp, sp, 16                            // skip pad of 8 + d14
        ADD     sp, sp, 16
        POP     {r5, r7, r9, r11}
        BX      lr

END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif