Home
last modified time | relevance | path

Searched refs:vxa4 (Results 1 – 25 of 28) sorted by relevance

12

/external/XNNPACK/src/qc8-igemm/gen/
D6x16-minmax-fp32-neon-mlal-lane-prfm.c130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local
145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
[all …]
D6x16-minmax-fp32-neonv8-mlal-lane.c131 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local
146 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
147 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
161 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
162 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
177 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
192 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
206 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
[all …]
D6x16-minmax-fp32-neonv8-mlal-lane-prfm.c131 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local
146 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
147 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
161 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
162 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
177 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
192 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
206 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
[all …]
D6x16-minmax-fp32-neon-mlal-lane.c130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local
145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
[all …]
D6x8-minmax-fp32-neonv8-mlal-lane.c119 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() local
134 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
135 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
149 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
150 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
164 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
165 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
179 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
180 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
196 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
[all …]
D6x8-minmax-fp32-neonv8-mlal-lane-prfm.c119 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() local
134 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
135 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
149 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
150 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
164 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
165 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
179 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
180 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
197 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
[all …]
D6x8-minmax-fp32-neon-mlal-lane.c118 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() local
133 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
134 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
148 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
149 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
163 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
164 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
178 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
179 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
195 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
[all …]
/external/XNNPACK/src/qs8-igemm/gen/
D6x16-minmax-rndnu-neon-mlal-lane-prfm.c130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local
145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
[all …]
D6x16-minmax-rndnu-neon-mlal-lane.c130 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local
145 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
146 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
160 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
161 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
176 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
190 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
191 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
205 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
[all …]
D6x8-minmax-rndnu-neon-mlal-lane-prfm.c118 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() local
133 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
134 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
148 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
149 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
163 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
164 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
178 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
179 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
196 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
[all …]
/external/XNNPACK/src/qu8-gemm/gen/
D6x16-minmax-rndnu-neon-mlal-lane.c110 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4)); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local
125 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
126 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
140 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
141 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
155 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
156 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
171 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
185 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
[all …]
/external/XNNPACK/src/qu8-igemm/gen/
D6x16-minmax-rndnu-neon-mlal-lane.c131 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4)); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local
146 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
147 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
161 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
162 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
177 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
191 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
192 … vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
206 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
[all …]
D6x8-minmax-rndnu-neon-mlal-lane.c119 const int16x8_t vxa4 = vreinterpretq_s16_u16(vmovl_u8(va4)); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() local
134 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
135 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
149 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
150 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
164 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
165 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
179 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
180 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
196 … vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
[all …]
/external/XNNPACK/src/qc8-gemm/gen/
D6x16-minmax-fp32-neon-mlal-lane-prfm.c109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm() local
124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane_prfm()
[all …]
D6x16-minmax-fp32-neonv8-mlal-lane-prfm.c110 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm() local
125 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
126 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
140 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
141 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
155 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
156 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
171 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
185 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane_prfm()
[all …]
D6x16-minmax-fp32-neonv8-mlal-lane.c110 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane() local
125 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
126 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
140 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
141 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
155 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
156 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
170 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
171 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
185 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neonv8_mlal_lane()
[all …]
D6x16-minmax-fp32-neon-mlal-lane.c109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane() local
124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x16__neon_mlal_lane()
[all …]
D6x8-minmax-fp32-neonv8-mlal-lane-prfm.c98 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm() local
113 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
114 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
128 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
129 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
143 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
144 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
158 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
159 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
176 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane_prfm()
[all …]
D6x8-minmax-fp32-neonv8-mlal-lane.c98 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane() local
113 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
114 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
128 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
129 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
143 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
144 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
158 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
159 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neonv8_mlal_lane()
[all …]
D6x8-minmax-fp32-neon-mlal-lane.c97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane() local
112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
174 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane()
[all …]
D6x8-minmax-fp32-neon-mlal-lane-prfm.c97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm() local
112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_6x8__neon_mlal_lane_prfm()
[all …]
/external/XNNPACK/src/qs8-gemm/gen/
D6x16-minmax-rndnu-neon-mlal-lane-prfm.c109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm() local
124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm()
[all …]
D6x16-minmax-rndnu-neon-mlal-lane.c109 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane() local
124 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
125 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
139 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
140 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
154 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
155 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
169 vacc4x89AB = vmlal_lane_s16(vacc4x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
170 vacc4xCDEF = vmlal_lane_s16(vacc4xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
184 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane()
[all …]
D6x8-minmax-rndnu-neon-mlal-lane.c97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane() local
112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
174 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane()
[all …]
D6x8-minmax-rndnu-neon-mlal-lane-prfm.c97 const int16x8_t vxa4 = vmovl_s8(va4); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm() local
112 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
113 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
127 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
128 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
142 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
143 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
157 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
158 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
175 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm()
[all …]

12