/external/XNNPACK/src/qu8-gemm/ |
D | 8x8-minmax-neon.c | 95 int32x4_t vacc6x4567 = vacc0x4567; in xnn_qu8_gemm_minmax_ukernel_8x8__neon() local 134 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa6), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 154 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa6), 1); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 174 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa6), 2); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 194 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa6), 3); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 214 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa6), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 234 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa6), 1); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 254 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa6), 2); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 274 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa6), 3); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 314 vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa6), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() [all …]
|
/external/XNNPACK/src/qu8-igemm/ |
D | 8x8-minmax-neon.c | 84 int32x4_t vacc6x4567 = vacc0x4567; in xnn_qu8_igemm_minmax_ukernel_8x8__neon() local 160 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 182 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 1); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 204 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 2); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 226 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 3); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 248 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 270 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 1); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 292 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 2); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 314 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_high_s16(vxa6), 3); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 356 … vacc6x4567 = vmlal_lane_s16(vacc6x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa6), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() [all …]
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 8x8s4-minmax-neonfma.c | 98 float32x4_t vacc6x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() local 131 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 160 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 189 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 218 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c3); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 253 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 274 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 292 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 300 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 344 vacc6x0123 = vacc6x4567; in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
|
D | 8x8s4-minmax-neon.c | 98 float32x4_t vacc6x4567 = vacc0x4567; in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() local 131 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 160 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 189 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 218 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c3); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 253 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 274 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 292 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 300 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 344 vacc6x0123 = vacc6x4567; in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 8x8s4inc-minmax-neon.c | 100 float32x4_t vacc6x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() local 133 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 162 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 191 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 220 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c3); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 255 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 276 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 294 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 302 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 346 vacc6x0123 = vacc6x4567; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
|
D | 8x8s4inc-minmax-neonfma.c | 100 float32x4_t vacc6x4567 = vld1q_f32(acc); acc += 4; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() local 133 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 162 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 191 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 220 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c3); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 255 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 276 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 294 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 302 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 346 vacc6x0123 = vacc6x4567; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 8x8s4-minmax-neonfma.c | 87 float32x4_t vacc6x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() local 164 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 193 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 222 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 251 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567c3); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 286 vacc6x4567 = vfmaq_f32(vacc6x4567, va6, vb4567); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 311 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 329 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 337 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 372 vacc6x0123 = vacc6x4567; in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
|
D | 8x8s4-minmax-neon.c | 87 float32x4_t vacc6x4567 = vacc0x4567; in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() local 164 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 193 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 222 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 251 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c3); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 286 vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 311 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 329 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 337 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 372 vacc6x0123 = vacc6x4567; in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
|
/external/XNNPACK/src/f32-ppmm/gen/ |
D | 8x8-minmax-neonfma.c | 79 float32x4_t vacc6x4567 = vacc0x4567; in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() local 106 vacc6x4567 = vfmaq_laneq_f32(vacc6x4567, vb4567, va4567, 2); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 132 vacc6x4567 = vfmaq_f32(vacc6x4567, va6666, vb4567); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 154 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 172 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 180 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 216 vacc6x0123 = vacc6x4567; in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma()
|
D | 8x8-minmax-neon.c | 79 float32x4_t vacc6x4567 = vacc0x4567; in xnn_f32_ppmm_minmax_ukernel_8x8__neon() local 105 vacc6x4567 = vmlaq_lane_f32(vacc6x4567, vb4567, vget_high_f32(va4567), 0); in xnn_f32_ppmm_minmax_ukernel_8x8__neon() 126 vacc6x4567 = vminq_f32(vacc6x4567, vmax); in xnn_f32_ppmm_minmax_ukernel_8x8__neon() 144 vacc6x4567 = vmaxq_f32(vacc6x4567, vmin); in xnn_f32_ppmm_minmax_ukernel_8x8__neon() 152 vst1q_f32(c6 + 4, vacc6x4567); in xnn_f32_ppmm_minmax_ukernel_8x8__neon() 188 vacc6x0123 = vacc6x4567; in xnn_f32_ppmm_minmax_ukernel_8x8__neon()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 8x8c4-minmax-neondot.c | 88 int32x4_t vacc6x4567 = vacc0x4567; in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() local 162 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 178 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 214 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 235 vacc6x4567 = vqrdmulhq_s32(vacc6x4567, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 254 vacc6x4567 = vsraq_n_s32(vacc6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 271 vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 283 …const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), v… in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot() 297 …x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_8x8c4__neondot()
|
D | 8x16c4-minmax-neondot.c | 100 int32x4_t vacc6x4567 = vacc0x4567; in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() local 194 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 226 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 280 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 317 vacc6x4567 = vqrdmulhq_s32(vacc6x4567, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 352 vacc6x4567 = vsraq_n_s32(vacc6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 385 vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift); in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 407 …const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), v… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot() 433 …x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_8x16c4__neondot()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 8x8c4-minmax-neondot.c | 101 int32x4_t vacc6x4567 = vacc0x4567; in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() local 139 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 155 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 191 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 213 const int32x4_t vproduct6x4567 = vqrdmulhq_n_s32(vacc6x4567, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 230 vacc6x4567 = vsraq_n_s32(vproduct6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 247 vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 259 …const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), v… in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot() 273 …x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_8x8c4__neondot()
|
D | 8x16c4-minmax-neondot.c | 113 int32x4_t vacc6x4567 = vacc0x4567; in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() local 171 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 203 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 257 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 295 const int32x4_t vproduct6x4567 = vqrdmulhq_n_s32(vacc6x4567, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 328 vacc6x4567 = vsraq_n_s32(vproduct6x4567, vbicq_s32(vacc6x4567, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 361 vacc6x4567 = vrshlq_s32(vacc6x4567, vright_shift); in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 383 …const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), v… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot() 409 …x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_8x16c4__neondot()
|