/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16-minmax-rndnu-neon-mlal-lane-prfm.c | 105 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() local 107 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() 108 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() 109 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() 110 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() 111 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() 112 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() 258 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() local 262 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() 263 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm() [all …]
|
D | 3x16-minmax-rndnu-neon-mlal-lane.c | 105 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() local 107 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() 108 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() 109 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() 110 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() 111 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() 112 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() 256 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() local 260 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() 261 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | 159 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local 161 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 162 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 163 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 164 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 165 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 166 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 167 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 168 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 159 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 161 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 162 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 163 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 164 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 165 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 166 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 167 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 168 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 124 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 126 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 127 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 128 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 129 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 130 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 131 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 307 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 123 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 125 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 126 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 127 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 128 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 129 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 130 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 306 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local [all …]
|
/external/XNNPACK/src/qc8-gemm/gen/ |
D | 3x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 106 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() local 108 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() 109 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() 110 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() 111 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() 112 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() 113 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() 259 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() local 263 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() 264 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane_prfm() [all …]
|
D | 3x16-minmax-fp32-neonv8-mlal-lane.c | 106 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() local 108 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() 109 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() 110 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() 111 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() 112 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() 113 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() 257 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() local 261 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() 262 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_3x16__neonv8_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | 159 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local 161 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 162 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 163 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 164 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 165 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 166 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 167 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 168 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 160 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local 162 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 163 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 164 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 165 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 166 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 167 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 168 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 169 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | 160 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local 162 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 163 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 164 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 165 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 166 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 167 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 168 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 169 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | 159 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local 161 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 162 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 163 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 164 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 165 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 166 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 167 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 168 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() [all …]
|
/external/XNNPACK/src/qc8-igemm/gen/ |
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | 180 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local 182 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 183 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 184 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 185 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 186 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 187 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 188 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 189 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | 181 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local 183 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 184 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 185 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 186 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 187 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 188 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 189 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 190 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 181 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local 183 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 184 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 185 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 186 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 187 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 188 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 189 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 190 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | 180 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local 182 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 183 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 184 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 185 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 186 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 187 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 188 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 189 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 140 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 142 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 143 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 144 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 145 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 146 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 147 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 323 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | 180 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local 182 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 183 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 184 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 185 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 186 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 187 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 188 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 189 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 180 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 182 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 183 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 184 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 185 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 186 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 187 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 188 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 189 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 140 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 142 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 143 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 144 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 145 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 146 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 147 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 323 const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local [all …]
|
/external/XNNPACK/src/qu8-gemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 160 const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 162 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 163 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 164 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 165 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 166 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 167 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 168 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 169 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 124 const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 126 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 127 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 128 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 129 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 130 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 131 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 307 … const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 125 const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 127 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 128 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 129 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 130 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 131 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 132 vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 133 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 134 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 308 … const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local [all …]
|
/external/XNNPACK/src/qu8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 181 … const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 183 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 184 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 185 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 186 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 187 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 188 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 189 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 190 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 141 … const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 143 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 144 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 145 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 146 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 147 vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 148 … vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 324 … const int16x8_t vxb89ABCDEFc1 = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEFc1, vb_zero_point)); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local [all …]
|