/external/XNNPACK/src/qc8-igemm/gen/ |
D | 4x16-minmax-fp32-neon-mlal-lane.c | 104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane-prfm.c | 104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() local 115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 105 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 105 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() local 116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() [all …]
|
D | 4x8-minmax-fp32-neonv8-mlal-lane.c | 97 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() local 108 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 109 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 119 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 120 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 130 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 131 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 141 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 142 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() 154 … vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qc8_igemm_minmax_fp32_ukernel_4x8__neonv8_mlal_lane() [all …]
|
/external/XNNPACK/src/qu8-gemm/gen/ |
D | 4x16-minmax-fp32-neon-mlal-lane.c | 88 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 89 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 100 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 101 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 111 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 112 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 122 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 123 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 133 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 134 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 144 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 88 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16-minmax-fp32-neon-mlal-lane.c | 104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 105 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane-prfm.c | 104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() local 115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 104 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 126 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 127 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 138 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 148 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 149 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 159 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x8-minmax-rndnu-neon-mlal-lane-prfm.c | 96 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() local 107 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 108 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 118 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 119 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 129 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 130 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 140 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 141 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() 154 … vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm() [all …]
|
/external/XNNPACK/src/qu8-igemm/gen/ |
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 105 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 105 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 116 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 117 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 127 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 128 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 138 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 139 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 149 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 150 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 160 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 106 const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3)); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 117 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 118 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 128 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 129 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 139 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 140 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 150 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 151 … vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 161 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 88 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane.c | 87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() local 98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() 142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane.c | 87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-rndnu-neon-mlal-lane-prfm.c | 87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() local 98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() 142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
/external/XNNPACK/src/qc8-gemm/gen/ |
D | 4x16-minmax-fp32-neon-mlal-lane.c | 87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() local 98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() 142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-fp32-neon-mlal-lane-prfm.c | 87 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() local 98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 109 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 110 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 120 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 121 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 131 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 132 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() 142 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane-prfm.c | 88 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() local 99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() 143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane_prfm() [all …]
|
D | 4x16-minmax-fp32-neonv8-mlal-lane.c | 88 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() local 99 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 100 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 110 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 111 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 121 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 122 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 132 vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 133 vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() 143 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane() [all …]
|
D | 4x8-minmax-fp32-neon-mlal-lane-prfm.c | 79 const int16x8_t vxa3 = vmovl_s8(va3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() local 90 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 91 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 101 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 102 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 112 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 113 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 123 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 124 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() 137 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0); in xnn_qc8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane_prfm() [all …]
|