/external/XNNPACK/src/qc8-igemm/gen/ |
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | 130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local 145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | 131 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local 146 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 147 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 161 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 162 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 177 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 192 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 206 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 131 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local 146 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 147 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 161 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 162 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 177 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 192 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 206 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | 130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local 145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x8-minmax-fp32-neonv8-mlal-lane.c | 119 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() local 134 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 135 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 149 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 150 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 164 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 165 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 179 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 180 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 196 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() [all …]
|
D | 6x8-minmax-fp32-neonv8-mlal-lane-prfm.c | 119 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() local 134 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 135 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 149 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 150 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 164 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 165 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 179 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 180 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 197 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() [all …]
|
D | 6x8-minmax-fp32-neon-mlal-lane.c | 118 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() local 133 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 134 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 148 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 149 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 163 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 164 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 178 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 179 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 195 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | 130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local 145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x8-minmax-rndnu-neon-mlal-lane-prfm.c | 118 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() local 133 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 134 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 148 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 149 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 163 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 164 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 178 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 179 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 196 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() [all …]
|
/external/XNNPACK/src/qu8-gemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 110 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4)); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 125 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 126 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 140 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 141 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 155 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 156 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 171 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 185 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
/external/XNNPACK/src/qu8-igemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 131 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4)); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 146 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 147 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 161 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 162 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 177 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 192 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 206 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x8-minmax-rndnu-neon-mlal-lane.c | 119 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4)); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() local 134 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 135 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 149 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 150 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 164 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 165 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 179 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 180 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 196 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() [all …]
|
/external/XNNPACK/src/qc8-gemm/gen/ |
D | 6x16-minmax-fp32-neon-mlal-lane-prfm.c | 109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local 124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() 184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 110 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local 125 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 126 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 140 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 141 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 155 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 156 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 171 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() 185 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-fp32-neonv8-mlal-lane.c | 110 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local 125 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 126 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 140 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 141 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 155 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 156 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 171 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() 185 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() [all …]
|
D | 6x16-minmax-fp32-neon-mlal-lane.c | 109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local 124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() 184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x8-minmax-fp32-neonv8-mlal-lane-prfm.c | 98 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() local 113 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 114 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 128 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 129 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 143 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 144 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 158 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 159 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() 176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() [all …]
|
D | 6x8-minmax-fp32-neonv8-mlal-lane.c | 98 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() local 113 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 114 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 128 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 129 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 143 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 144 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 158 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 159 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() 175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() [all …]
|
D | 6x8-minmax-fp32-neon-mlal-lane.c | 97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() local 112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() 174 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() [all …]
|
D | 6x8-minmax-fp32-neon-mlal-lane-prfm.c | 97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() local 112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() 175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 6x16-minmax-rndnu-neon-mlal-lane-prfm.c | 109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local 124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() 184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() [all …]
|
D | 6x16-minmax-rndnu-neon-mlal-lane.c | 109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local 124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() 184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() [all …]
|
D | 6x8-minmax-rndnu-neon-mlal-lane.c | 97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() local 112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() 174 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() [all …]
|
D | 6x8-minmax-rndnu-neon-mlal-lane-prfm.c | 97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() local 112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() 175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() [all …]
|