Home
last modified time | relevance | path

Searched refs:vxa3 (Results 1 – 25 of 208) sorted by relevance

123456789

/external/XNNPACK/src/qc8-igemm/gen/
D4x16-minmax-fp32-neon-mlal-lane.c104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local
115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-fp32-neon-mlal-lane-prfm.c104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() local
115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
[all …]
D4x16-minmax-fp32-neonv8-mlal-lane.c105 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local
116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
[all …]
D4x16-minmax-fp32-neonv8-mlal-lane-prfm.c105 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() local
116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
[all …]
D4x8-minmax-fp32-neonv8-mlal-lane.c97 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() local
108 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
109 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
119 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
120 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
130 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
131 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
141 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
142 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
154 … vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane()
[all …]
/external/XNNPACK/src/qu8-gemm/gen/
D4x16-minmax-fp32-neon-mlal-lane.c88 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local
99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-fp32-neonv8-mlal-lane.c89 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local
100 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
101 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
111 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
112 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
122 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
123 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
133 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
134 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
144 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
[all …]
D4x16-minmax-rndnu-neon-mlal-lane.c88 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local
99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
[all …]
/external/XNNPACK/src/qs8-igemm/gen/
D4x16-minmax-fp32-neon-mlal-lane.c104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local
115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-fp32-neonv8-mlal-lane.c105 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local
116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
[all …]
D4x16-minmax-rndnu-neon-mlal-lane-prfm.c104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() local
115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
[all …]
D4x16-minmax-rndnu-neon-mlal-lane.c104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local
115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
[all …]
D4x8-minmax-rndnu-neon-mlal-lane-prfm.c96 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() local
107 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
108 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
118 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
119 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
129 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
130 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
140 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
141 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
154 … vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm()
[all …]
/external/XNNPACK/src/qu8-igemm/gen/
D4x16-minmax-rndnu-neon-mlal-lane.c105 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local
116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-fp32-neon-mlal-lane.c105 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local
116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-fp32-neonv8-mlal-lane.c106 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local
117 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
118 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
128 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
129 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
139 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
140 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
150 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
151 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
161 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
[all …]
/external/XNNPACK/src/qs8-gemm/gen/
D4x16-minmax-fp32-neonv8-mlal-lane.c88 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local
99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
[all …]
D4x16-minmax-rndnu-neon-mlal-lane.c87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local
98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-fp32-neon-mlal-lane.c87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local
98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-rndnu-neon-mlal-lane-prfm.c87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() local
98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm()
[all …]
/external/XNNPACK/src/qc8-gemm/gen/
D4x16-minmax-fp32-neon-mlal-lane.c87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local
98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane()
[all …]
D4x16-minmax-fp32-neon-mlal-lane-prfm.c87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() local
98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm()
[all …]
D4x16-minmax-fp32-neonv8-mlal-lane-prfm.c88 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() local
99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm()
[all …]
D4x16-minmax-fp32-neonv8-mlal-lane.c88 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local
99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane()
[all …]
D4x8-minmax-fp32-neon-mlal-lane-prfm.c79 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() local
90 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
91 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
101 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
102 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
112 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
113 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
123 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
124 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm()
[all …]

123456789