/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16c4-minmax-rndnu-neon-mull-ld2r.c | 124 const int8x8_t va3c0 = vreinterpret_s8_s32(va3.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() local 129 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 137 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 145 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 153 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 161 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 169 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 177 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 185 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 329 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mull-dup.c | 124 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() local 129 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 137 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 145 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 153 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 161 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 169 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 177 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 185 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 329 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mull-ld1r.c | 128 const int8x8_t va3c0 = vreinterpret_s8_s32(va30); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() local 133 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 141 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 149 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 157 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 165 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 173 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 181 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 189 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 333 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() local [all …]
|
D | 4x8c4-minmax-rndnu-neon-mull-ld2r.c | 100 const int8x8_t va3c0 = vreinterpret_s8_s32(va3.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() local 105 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 113 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 121 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 129 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 213 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() local 214 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 216 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 218 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 220 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r()
|
D | 4x8c4-minmax-rndnu-neon-mull-dup.c | 100 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() local 105 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 113 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 121 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 129 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 213 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() local 214 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 216 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 218 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 220 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup()
|
D | 4x8c4-minmax-rndnu-neon-mull-ld1r.c | 104 const int8x8_t va3c0 = vreinterpret_s8_s32(va30); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() local 109 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 117 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 125 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 133 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 217 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() local 218 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 220 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 222 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 224 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r()
|
D | 4x16c4-minmax-rndnu-neon-mlal-dup.c | 380 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() local 385 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 393 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 401 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 409 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 417 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 425 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 433 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 441 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 585 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld1r.c | 392 const int8x8_t va3c0 = vreinterpret_s8_s32(va30); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() local 397 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 405 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 413 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 421 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 429 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 437 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 445 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 453 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 597 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld2r.c | 380 const int8x8_t va3c0 = vreinterpret_s8_s32(va3.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() local 385 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 393 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 401 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 409 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 417 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 425 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 433 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 441 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 585 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() local [all …]
|
D | 4x8c4-minmax-rndnu-neon-mlal-ld2r.c | 244 const int8x8_t va3c0 = vreinterpret_s8_s32(va3.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() local 249 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 257 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 265 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 273 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 357 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() local 358 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 360 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 362 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 364 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mull-ld2r.c | 111 const int8x8_t va3c0 = vreinterpret_s8_s16(va30.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() local 116 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 124 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 132 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 140 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 298 const int8x8_t va3c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() local 299 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 301 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 303 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 305 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mull-ld1r.c | 119 const int8x8_t va3c0 = vreinterpret_s8_s16(va30); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() local 124 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 132 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 140 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 148 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 306 const int8x8_t va3c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() local 307 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 309 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 311 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 313 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r()
|
D | 4x16c2-minmax-rndnu-neon-mull-dup.c | 107 const int8x8_t va3c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 112 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 120 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 128 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 136 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 294 const int8x8_t va3c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 295 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 297 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 299 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 301 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16c4-minmax-rndnu-neon-mull-ld2r.c | 141 const int8x8_t va3c0 = vreinterpret_s8_s32(va3.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() local 146 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 154 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 162 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 170 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 178 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 186 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 194 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 202 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() 346 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mull-ld1r.c | 145 const int8x8_t va3c0 = vreinterpret_s8_s32(va30); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() local 150 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 158 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 166 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 174 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 182 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 190 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 198 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 206 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() 350 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mull-dup.c | 141 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() local 146 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 154 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 162 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 170 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 178 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 186 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 194 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 202 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() 346 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup() local [all …]
|
D | 4x8c4-minmax-rndnu-neon-mull-ld2r.c | 117 const int8x8_t va3c0 = vreinterpret_s8_s32(va3.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() local 122 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 130 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 138 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 146 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 230 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() local 231 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 233 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 235 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r() 237 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r()
|
D | 4x8c4-minmax-rndnu-neon-mull-dup.c | 117 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() local 122 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 130 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 138 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 146 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 230 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() local 231 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 233 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 235 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup() 237 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup()
|
D | 4x8c4-minmax-rndnu-neon-mull-ld1r.c | 121 const int8x8_t va3c0 = vreinterpret_s8_s32(va30); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() local 126 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 134 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 142 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 150 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 234 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() local 235 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 237 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 239 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r() 241 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r()
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld1r.c | 409 const int8x8_t va3c0 = vreinterpret_s8_s32(va30); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() local 414 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 422 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 430 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 438 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 446 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 454 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 462 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 470 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 614 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mlal-dup.c | 397 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() local 402 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 410 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 418 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 426 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 434 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 442 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 450 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 458 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 602 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() local [all …]
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld2r.c | 397 const int8x8_t va3c0 = vreinterpret_s8_s32(va3.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() local 402 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 410 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 418 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 426 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 434 const int16x8_t vprod3x89c0 = vmull_s8(vb89c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 442 const int16x8_t vprod3xABc0 = vmull_s8(vbABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 450 const int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 458 const int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 602 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() local [all …]
|
D | 4x16c2-minmax-rndnu-neon-mull-dup.c | 124 const int8x8_t va3c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 129 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 137 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 145 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 153 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 311 const int8x8_t va3c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 312 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 314 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 316 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 318 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup()
|
D | 4x8c4-minmax-rndnu-neon-mlal-ld1r.c | 273 const int8x8_t va3c0 = vreinterpret_s8_s32(va30); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() local 278 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 286 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 294 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 302 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 386 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() local 387 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 389 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 391 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 393 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r()
|
D | 4x8c4-minmax-rndnu-neon-mlal-dup.c | 261 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() local 266 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 274 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 282 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 290 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 374 const int8x8_t va3c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va3), 0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() local 375 const int16x8_t vprod3x01c0 = vmull_s8(vb01c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 377 const int16x8_t vprod3x23c0 = vmull_s8(vb23c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 379 const int16x8_t vprod3x45c0 = vmull_s8(vb45c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 381 const int16x8_t vprod3x67c0 = vmull_s8(vb67c0, va3c0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup()
|