/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c4-minmax-rndnu-neon-mlal-ld1r.c | 220 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() local 225 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 235 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 245 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 255 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 265 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 275 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 285 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 295 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r()
|
D | 3x16c4-minmax-rndnu-neon-mlal-ld2r.c | 214 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() local 219 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 229 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 239 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 249 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 259 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 269 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 279 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 289 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r()
|
D | 3x16c4-minmax-rndnu-neon-mlal-dup.c | 214 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() local 219 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 229 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 239 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 249 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 259 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 269 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 279 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 289 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup()
|
D | 3x8c4-minmax-rndnu-neon-mlal-dup.c | 154 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() local 159 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() 169 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() 179 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() 189 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup()
|
D | 3x8c4-minmax-rndnu-neon-mlal-ld1r.c | 160 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() local 165 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() 175 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() 185 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() 195 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r()
|
D | 3x8c4-minmax-rndnu-neon-mlal-ld2r.c | 154 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() local 159 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() 169 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() 179 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() 189 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r()
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld1r.c | 266 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() local 273 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 286 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 299 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 312 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 325 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 338 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 351 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 364 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r()
|
D | 4x16c4-minmax-rndnu-neon-mlal-dup.c | 258 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() local 265 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 278 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 291 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 304 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 317 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 330 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 343 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 356 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup()
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld2r.c | 258 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() local 265 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 278 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 291 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 304 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 317 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 330 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 343 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 356 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r()
|
D | 4x8c4-minmax-rndnu-neon-mlal-ld1r.c | 190 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() local 197 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 210 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 223 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 236 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r()
|
D | 4x8c4-minmax-rndnu-neon-mlal-dup.c | 182 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() local 189 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 202 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 215 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 228 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup()
|
D | 4x8c4-minmax-rndnu-neon-mlal-ld2r.c | 182 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() local 189 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 202 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 215 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 228 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c4-minmax-rndnu-neon-mlal-ld2r.c | 199 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() local 204 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 214 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 224 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 234 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 244 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 254 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 264 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r() 274 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r()
|
D | 3x16c4-minmax-rndnu-neon-mlal-dup.c | 199 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() local 204 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 214 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 224 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 234 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 244 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 254 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 264 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup() 274 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup()
|
D | 3x16c4-minmax-rndnu-neon-mlal-ld1r.c | 205 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() local 210 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 220 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 230 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 240 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 250 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 260 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 270 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r() 280 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r()
|
D | 3x8c4-minmax-rndnu-neon-mlal-ld2r.c | 139 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() local 144 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() 154 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() 164 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r() 174 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r()
|
D | 3x8c4-minmax-rndnu-neon-mlal-ld1r.c | 145 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() local 150 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() 160 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() 170 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r() 180 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r()
|
D | 3x8c4-minmax-rndnu-neon-mlal-dup.c | 139 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() local 144 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() 154 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() 164 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup() 174 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup()
|
D | 4x16c4-minmax-rndnu-neon-mlal-dup.c | 241 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() local 248 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 261 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 274 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 287 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 300 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 313 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 326 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup() 339 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup()
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld1r.c | 249 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() local 256 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 269 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 282 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 295 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 308 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 321 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 334 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r() 347 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r()
|
D | 4x16c4-minmax-rndnu-neon-mlal-ld2r.c | 241 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() local 248 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 261 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 274 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 287 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 300 int16x8_t vprod2x89c1 = vmull_s8(vb89c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 313 int16x8_t vprod2xABc1 = vmull_s8(vbABc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 326 int16x8_t vprod2xCDc1 = vmull_s8(vbCDc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r() 339 int16x8_t vprod2xEFc1 = vmull_s8(vbEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r()
|
D | 4x8c4-minmax-rndnu-neon-mlal-ld2r.c | 165 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va2x0.val[1]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() local 172 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 185 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 198 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r() 211 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r()
|
D | 4x8c4-minmax-rndnu-neon-mlal-dup.c | 165 const int8x8_t va2c1x0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va2x0), 1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() local 172 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 185 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 198 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup() 211 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup()
|
D | 4x8c4-minmax-rndnu-neon-mlal-ld1r.c | 173 const int8x8_t va2c1x0 = vreinterpret_s8_s32(va21x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() local 180 int16x8_t vprod2x01c1 = vmull_s8(vb01c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 193 int16x8_t vprod2x23c1 = vmull_s8(vb23c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 206 int16x8_t vprod2x45c1 = vmull_s8(vb45c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r() 219 int16x8_t vprod2x67c1 = vmull_s8(vb67c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r()
|
D | 3x16c2-minmax-rndnu-neon-mlal-dup.c | 145 const int8x8_t va2c1x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() local 150 int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 160 int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 170 int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup() 180 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2c1x0); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup()
|