/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16-minmax-rndnu-neon-mlal-lane-prfm.c | matches in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm():
     87  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
     89  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     90  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     91  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     92  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     93  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
     94  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    239  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    243  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    244  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    [all …]
|
D | 3x16-minmax-rndnu-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane():
     87  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
     89  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     90  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     91  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     92  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     93  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
     94  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    237  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    241  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    242  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | matches in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm():
    129  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    131  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    132  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    133  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    134  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    135  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    136  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    137  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    138  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    139  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane():
    129  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    131  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    132  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    133  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    134  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    135  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    136  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    137  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    138  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    139  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | matches in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane():
    102  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    104  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    105  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    106  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    107  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    108  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    109  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    110  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    111  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    284  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | matches in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane():
    101  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    103  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    104  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    105  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    106  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    107  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    108  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    109  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    110  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    283  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    [all …]
|
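All of the QS8 matches above are instances of the same widening multiply-accumulate idiom: eight int8 weights for output columns 8..15 are sign-extended to int16 with vmovl_s8, and vmlal_lane_s16 then multiplies the low and high halves by one broadcast lane of the widened activations, accumulating into int32 lanes. The sketch below is a minimal, self-contained illustration of that idiom for a single row and a single activation lane; the helper name, argument layout, and scope are illustrative, not XNNPACK's actual code structure.

```c
#include <arm_neon.h>

// Hypothetical helper (not part of XNNPACK): one k-step of the pattern the
// listed kernels use for weight columns 8..15.  The real kernels unroll this
// over 8 activation lanes and up to 6 rows.
static inline void qs8_mac_step(
    const int8_t* w,          // 8 int8 weights for columns 8..15 (assumed layout)
    int16x4_t vxa0_low,       // widened activations for row 0, lanes 0..3
    int32x4_t* vacc0x89AB,    // row 0, output columns 8..11
    int32x4_t* vacc0xCDEF)    // row 0, output columns 12..15
{
  const int8x8_t vb89ABCDEF = vld1_s8(w);
  // Sign-extend the 8 int8 weights to int16 so the int16*int16 products fit.
  const int16x8_t vxb89ABCDEF = vmovl_s8(vb89ABCDEF);
  // Widening multiply-accumulate against activation lane 0: each int16
  // product is added into an int32 accumulator lane.
  *vacc0x89AB = vmlal_lane_s16(*vacc0x89AB, vget_low_s16(vxb89ABCDEF),  vxa0_low, 0);
  *vacc0xCDEF = vmlal_lane_s16(*vacc0xCDEF, vget_high_s16(vxb89ABCDEF), vxa0_low, 0);
}
```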
/external/XNNPACK/src/qc8-gemm/gen/ |
D | 3x16-minmax-fp32-neonv8-mlal-lane-prfm.c | matches in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm():
     88  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
     90  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     91  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     92  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     93  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     94  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
     95  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    240  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    244  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    245  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    [all …]
|
D | 3x16-minmax-fp32-neonv8-mlal-lane.c | matches in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane():
     88  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
     90  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     91  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     92  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     93  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
     94  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
     95  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    238  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    242  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    243  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | matches in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm():
    129  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    131  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    132  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    133  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    134  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    135  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    136  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    137  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    138  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    139  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | matches in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm():
    130  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    132  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    133  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    134  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    135  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    136  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    137  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    138  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    139  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    140  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | matches in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane():
    130  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    132  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    133  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    134  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    135  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    136  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    137  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    138  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    139  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    140  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | matches in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane():
    129  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    131  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    132  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    133  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    134  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    135  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    136  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    137  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    138  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    139  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
/external/XNNPACK/src/qc8-igemm/gen/ |
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | matches in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm():
    150  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    152  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    153  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    154  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    155  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    156  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    157  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    158  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    159  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    160  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | matches in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane():
    151  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    153  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    154  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    155  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    156  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    157  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    158  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    159  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    160  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    161  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | matches in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm():
    151  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    153  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    154  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    155  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    156  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    157  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    158  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    159  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    160  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    161  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | matches in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane():
    150  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    152  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    153  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    154  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    155  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    156  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    157  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    158  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    159  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    160  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | matches in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane():
    118  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    120  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    121  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    122  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    123  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    124  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    125  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    126  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    127  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    300  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | matches in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm():
    150  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    152  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    153  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    154  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    155  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    156  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    157  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    158  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    159  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    160  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | matches in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane():
    150  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    152  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    153  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    154  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    155  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    156  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    157  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    158  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    159  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    160  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | matches in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane():
    118  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    120  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    121  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    122  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    123  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    124  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    125  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    126  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    127  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    300  const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);  (local)
    [all …]
|
/external/XNNPACK/src/qu8-gemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | matches in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane():
    130  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    132  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    133  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    134  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    135  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    136  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    137  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    138  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    139  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    140  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | matches in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane():
    102  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    104  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    105  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    106  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    107  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    108  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    109  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    110  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    111  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    284  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | matches in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane():
    103  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    105  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    106  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    107  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    108  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    109  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    110  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    111  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    112  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    285  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    [all …]
|
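The QU8 entries above differ from the QS8/QC8 ones only in how the weights are widened: instead of a sign extension, vsubl_u8 subtracts the weight zero point while widening to uint16, and the result is reinterpreted as signed before the same vmlal_lane_s16 accumulation. A hedged sketch of that variant follows; the helper name, argument layout, and single-row scope are illustrative assumptions, not XNNPACK's actual structure.

```c
#include <arm_neon.h>

// Hypothetical helper (not part of XNNPACK): the QU8 widening step followed
// by the shared widening multiply-accumulate for weight columns 8..15.
static inline void qu8_mac_step(
    const uint8_t* w,           // 8 uint8 weights for columns 8..15 (assumed layout)
    uint8_t b_zero_point,       // weight zero point from the quantization params
    int16x4_t vxa0_low,         // widened activations for row 0, lanes 0..3
    int32x4_t* vacc0x89AB,      // row 0, output columns 8..11
    int32x4_t* vacc0xCDEF)      // row 0, output columns 12..15
{
  const uint8x8_t vb89ABCDEF = vld1_u8(w);
  const uint8x8_t vb_zero_point = vdup_n_u8(b_zero_point);
  // (w - zero_point) fits in 9 bits, so the widened difference is reinterpreted
  // as int16 and fed into the same signed accumulation path as the QS8 kernels.
  const int16x8_t vxb89ABCDEF =
      vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point));
  *vacc0x89AB = vmlal_lane_s16(*vacc0x89AB, vget_low_s16(vxb89ABCDEF),  vxa0_low, 0);
  *vacc0xCDEF = vmlal_lane_s16(*vacc0xCDEF, vget_high_s16(vxb89ABCDEF), vxa0_low, 0);
}
```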
/external/XNNPACK/src/qu8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | matches in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane():
    151  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    153  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    154  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    155  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    156  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    157  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    158  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    159  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    160  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    161  vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0);
    [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | matches in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane():
    119  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    121  vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    122  vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    123  vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    124  vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
    125  vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    126  vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
    127  vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    128  vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
    301  const int16x8_t vxb89ABCDEFc0 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc0, vb_zero_point));  (local)
    [all …]
|