/external/XNNPACK/src/qu8-igemm/gen/ |
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 172 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 194 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 218 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 240 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 262 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 284 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 318 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 172 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 194 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 218 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 240 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 262 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 284 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 318 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 75 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 129 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 151 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 173 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 195 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 219 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 241 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 263 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 285 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 319 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16-minmax-rndnu-neon-mull-addw-dup.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() local 129 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 157 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 185 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 213 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 241 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 269 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 297 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 325 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 361 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane-prfm.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() local 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 171 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 193 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 219 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 241 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 263 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 285 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 319 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 171 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 193 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 217 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 239 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 261 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 283 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 317 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 171 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 193 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 217 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 239 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 261 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 283 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 317 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 172 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 194 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 218 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 240 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 262 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 284 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 318 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 154 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 176 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 200 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 222 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 244 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 266 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 300 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane-prfm.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() local 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 154 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 176 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 202 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 224 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 246 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 268 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 302 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-rndnu-neon-mull-addw-dup.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() local 112 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 140 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc1)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 168 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 196 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc3)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 224 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc4)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 252 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc5)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 280 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc6)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 308 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc7)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() 344 vacc3xCDEF = vaddw_s16(vacc3xCDEF, vget_high_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 154 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 176 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 200 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 222 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 244 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 266 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 300 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16c4-minmax-rndnu-neondot.c | 80 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() local 118 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() 134 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() 168 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() 190 vacc3xCDEF = vqshlq_s32(vacc3xCDEF, vright_pre_shift); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() 207 vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() 224 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() 235 …const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), v… in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot() 249 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot()
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 155 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 177 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 201 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 223 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 245 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 267 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 301 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
/external/XNNPACK/src/qu8-gemm/gen/ |
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 155 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 177 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 201 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 223 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 245 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 267 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 301 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 155 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 177 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 201 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 223 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 245 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 267 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 301 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 78 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 112 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 134 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 156 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 178 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 202 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 224 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 246 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 268 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 302 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
/external/XNNPACK/src/qc8-igemm/gen/ |
D | 4x16-minmax-fp32-neon-mlal-lane.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 171 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 193 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 217 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 239 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 261 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 283 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 317 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane-prfm.c | 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() local 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 171 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 193 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 219 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 241 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 263 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 285 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 319 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 172 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 194 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 218 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 240 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 262 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 284 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 318 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 74 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() local 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 172 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 194 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 220 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 242 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 264 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 286 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 320 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() [all …]
|
/external/XNNPACK/src/qc8-gemm/gen/ |
D | 4x16-minmax-fp32-neon-mlal-lane.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 154 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 176 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 200 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 222 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 244 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 266 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 300 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane-prfm.c | 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() local 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 154 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 176 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 202 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 224 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 246 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 268 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 302 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() local 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 155 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 177 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 203 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 225 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 247 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 269 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 303 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 77 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 155 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 177 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 201 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 223 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 245 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 267 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 301 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|