/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x8c8-minmax-neon-mlal-padal.c |
    114  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
    116  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    118  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    171  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
    173  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mlal-padal.c |
    142  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
    145  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    148  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    220  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
    223  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
    107  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
    109  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
    111  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
    134  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
    137  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    140  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
    170  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
    174  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    178  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    269  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
    273  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
    138  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    140  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    142  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    251  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    253  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c |
    131  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
    133  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    135  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
    161  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
    165  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    169  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
D | 2x8c8-minmax-neon-mull-padal.c |
    96  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal() local
    98  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c |
    174  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
    177  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    180  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    332  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
    335  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c |
    166  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
    169  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    172  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c |
    210  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
    214  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    218  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    413  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
    417  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
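Every MLAL hit above is an instance of the same NEON widening multiply-accumulate idiom: vmull_s8 produces eight 16-bit products, vmlal_s8 folds in the next eight input channels, and vpadalq_s16 pairwise-adds the 16-bit lanes into the 32-bit accumulator. A minimal sketch of that idiom follows; the vector names mirror the generated kernels, but the wrapper function itself is hypothetical, with the loads and K-loop that surround it in the real kernels omitted.

```c
#include <arm_neon.h>

// One unrolled K-step (16 int8 values) for the single accumulator
// vacc1x4 (row 1, output channel 4), as in the 2x8c8 MLAL kernels.
// va1x0/va1x1: two 8-byte slices of activation row 1.
// vb4x0/vb4x1: the matching 8-byte slices of weight column 4.
static inline int32x4_t qs8_mlal_padal_step(
    int32x4_t vacc1x4,
    int8x8_t va1x0, int8x8_t va1x1,
    int8x8_t vb4x0, int8x8_t vb4x1)
{
  // Widening 8-bit x 8-bit -> 16-bit multiply of the first 8 channels.
  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
  // Multiply-accumulate the next 8 channels into the same 16-bit lanes.
  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  // Pairwise-add the eight 16-bit products into four 32-bit accumulators.
  return vpadalq_s16(vacc1x4, vprod1x4);
}
```

Accumulating only two 8-bit products per 16-bit lane before widening to 32 bits is what keeps the intermediate within range for the value ranges these kernels assume.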
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x8c8-minmax-neon-mlal-padal.c |
    127  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
    129  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    131  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    184  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
    186  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
    120  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
    122  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
    124  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mlal-padal.c |
    157  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
    160  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    163  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    235  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
    238  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
    149  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
    152  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    155  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
    151  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    153  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    155  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
    264  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    266  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
    187  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
    191  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    195  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    286  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
    290  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
    178  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
    182  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    186  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c |
    144  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
    146  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
    148  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 2x8c8-minmax-neon-mull-padal.c |
    109  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal() local
    111  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c |
    189  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
    192  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    195  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    347  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
    350  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c |
    181  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
    184  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    187  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c |
    227  int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
    231  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    235  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    430  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
    434  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 3x8c8-minmax-neon-mull-padal.c |
    134  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal() local
    137  vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
|
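The remaining hits, in both the GEMM and IGEMM directories, reduce to two other shapes: the C16 kernels split one 16-byte register per operand into low/high halves, and the MULL-only kernels (and the KC-remainder path of the MLAL kernels, the second group of hits in each C8 entry above) use a single widening multiply. A sketch of both, again with hypothetical wrapper functions around the kernels' own statements:

```c
#include <arm_neon.h>

// C16 variant: 16 input channels per operand register, split into
// 8-byte halves, as in the *c16-minmax-neon-mlal-padal kernels above.
static inline int32x4_t qs8_c16_step(
    int32x4_t vacc1x4, int8x16_t va1, int8x16_t vb4)
{
  int16x8_t vprod1x4 = vmull_s8(vget_low_s8(vb4), vget_low_s8(va1));
  vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1));
  return vpadalq_s16(vacc1x4, vprod1x4);
}

// MULL-only variant: one widening multiply of 8 channels followed
// directly by the pairwise accumulate, with no vmlal_s8 step.
static inline int32x4_t qs8_mull_step(
    int32x4_t vacc1x4, int8x8_t va1, int8x8_t vb4)
{
  const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
  return vpadalq_s16(vacc1x4, vprod1x4);
}
```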