/external/XNNPACK/src/qs8-gemm/gen/
D | 1x16c8-minmax-neon-mull-padal.c |
    56   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local
   105   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
   126   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
   154   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()

D | 1x16c8-minmax-neon-mlal-padal.c |
    56   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local
   135   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
   194   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
   215   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
   243   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()

D | 1x16c16-minmax-neon-mlal-padal.c |
    56   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local
   121   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
   142   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
   170   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()

D | 2x16c8-minmax-neon-mull-padal.c |
    62   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local
    78   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
   153   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
   181   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
   221   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()

D | 2x16c8-minmax-neon-mlal-padal.c |
    62   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    78   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   197   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   292   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   320   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   360   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()

D | 2x16c16-minmax-neon-mlal-padal.c |
    62   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
    78   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
   182   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
   213   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
   253   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()

D | 3x16c8-minmax-neon-mull-padal.c |
    68   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local
    84   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
   100   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
   201   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
   236   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
   288   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()

D | 3x16c8-minmax-neon-mlal-padal.c |
    68   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
    84   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   100   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   259   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   390   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   425   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   477   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()

D | 4x16c8-minmax-neon-mull-padal.c |
    74   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local
    90   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
   106   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
   122   int32x4_t vacc3x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
   249   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
   291   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
   355   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()

D | 3x16c16-minmax-neon-mlal-padal.c |
    68   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
    84   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   100   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   243   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   284   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   336   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()

D | 4x16c16-minmax-neon-mlal-padal.c |
    74   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
    90   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   106   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   122   int32x4_t vacc3x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   304   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   355   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   419   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()

D | 4x16c8-minmax-neon-mlal-padal.c |
    74   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
    90   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   106   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   122   int32x4_t vacc3x12 = vacc0x12;   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   321   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   488   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   530   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   594   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()

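All of the qs8-gemm hits above are instances of one NEON accumulation idiom: vmull_s8 (or vmlal_s8 in the mlal variants) widens int8 products to int16, and vpadalq_s16 pairwise-accumulates them into a per-channel int32x4 such as vacc0x12. Below is a minimal sketch of that idiom for a single output channel, not XNNPACK code: qs8_dot_c8 is an illustrative name, and the loop assumes the reduction length k is a multiple of 8.

  #include <arm_neon.h>
  #include <stddef.h>
  #include <stdint.h>

  /* Illustrative sketch of the c8 MULL+PADAL pattern for one output
   * channel; assumes k is a multiple of 8. Not an XNNPACK function. */
  static int32_t qs8_dot_c8(const int8_t* a, const int8_t* b, size_t k) {
    int32x4_t vacc = vmovq_n_s32(0);             /* 4 partial int32 sums */
    for (; k >= 8; k -= 8) {
      const int8x8_t va = vld1_s8(a); a += 8;
      const int8x8_t vb = vld1_s8(b); b += 8;
      const int16x8_t vprod = vmull_s8(va, vb);  /* 8 widened int16 products */
      vacc = vpadalq_s16(vacc, vprod);           /* pairwise add into int32x4 */
    }
  #if defined(__aarch64__)
    return vaddvq_s32(vacc);                     /* horizontal sum (A64 only) */
  #else
    const int32x2_t vsum = vadd_s32(vget_low_s32(vacc), vget_high_s32(vacc));
    return vget_lane_s32(vpadd_s32(vsum, vsum), 0);
  #endif
  }

The mlal kernels unroll this loop by two and reuse loaded weight vectors with vmlal_s8 before the vpadalq_s16 step; the int32 accumulator keeps the int16 partial sums from overflowing.
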
/external/XNNPACK/src/qs8-igemm/gen/
D | 1x16c8-minmax-neon-mull-padal.c |
    59   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local
   116   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
   140   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
   168   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()

D | 1x16c8-minmax-neon-mlal-padal.c |
    59   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local
   146   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
   205   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
   229   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
   257   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()

D | 1x16c16-minmax-neon-mlal-padal.c |
    59   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local
   132   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
   156   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
   184   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()

D | 2x16c8-minmax-neon-mull-padal.c |
    63   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local
    79   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
   166   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
   197   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
   237   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()

D | 2x16c8-minmax-neon-mlal-padal.c |
    63   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
    79   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   210   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   305   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   336   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
   376   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()

D | 3x16c8-minmax-neon-mull-padal.c |
    67   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local
    83   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
    99   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
   216   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
   254   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
   306   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()

D | 2x16c16-minmax-neon-mlal-padal.c |
    63   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
    79   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
   195   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
   229   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
   269   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()

D | 4x16c8-minmax-neon-mull-padal.c |
    71   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local
    87   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
   103   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
   119   int32x4_t vacc3x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
   266   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
   311   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
   375   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()

D | 3x16c16-minmax-neon-mlal-padal.c |
    67   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local
    83   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    99   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   258   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   302   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
   354   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()

D | 3x16c8-minmax-neon-mlal-padal.c |
    67   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local
    83   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    99   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   274   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   405   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   443   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
   495   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()

D | 4x16c16-minmax-neon-mlal-padal.c |
    71   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local
    87   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   103   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   119   int32x4_t vacc3x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   321   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   375   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
   439   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()

D | 4x16c8-minmax-neon-mlal-padal.c |
    71   …int32x4_t vacc0x12 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local
    87   int32x4_t vacc1x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   103   int32x4_t vacc2x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   119   int32x4_t vacc3x12 = vacc0x12;   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   338   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   505   vacc0x12 = vpadalq_s16(vacc0x12, vprod0x12);   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   550   const int32x4_t vsum0xCD = vpaddq_s32(vacc0x12, vacc0x13);   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
   614   const int32x2_t vpsum0xC = vadd_s32(vget_low_s32(vacc0x12), vget_high_s32(vacc0x12));   in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()

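The vsum0xCD and vpsum0xC lines that recur in both directories are two halves of the same post-loop reduction: each of the 16 per-channel int32x4 accumulators is collapsed to a single int32 per channel, four channels at a time, using vpaddq_s32 on AArch64 and the vget_low_s32/vget_high_s32 + vadd_s32 + vpadd_s32 sequence on AArch32 (which lacks vpaddq_s32). A sketch of that step under illustrative names (reduce4 is not an XNNPACK function):

  #include <arm_neon.h>

  /* Illustrative sketch: collapse four per-channel int32x4 accumulators
   * (channels C..F) into one int32x4 holding one sum per channel. */
  static int32x4_t reduce4(int32x4_t vaccC, int32x4_t vaccD,
                           int32x4_t vaccE, int32x4_t vaccF) {
  #if defined(__aarch64__)
    const int32x4_t vsumCD = vpaddq_s32(vaccC, vaccD); /* {C0+C1, C2+C3, D0+D1, D2+D3} */
    const int32x4_t vsumEF = vpaddq_s32(vaccE, vaccF);
    return vpaddq_s32(vsumCD, vsumEF);                 /* {sumC, sumD, sumE, sumF} */
  #else
    /* AArch32 fallback, mirroring the vget_low/vget_high lines above. */
    const int32x2_t vpsumC = vadd_s32(vget_low_s32(vaccC), vget_high_s32(vaccC));
    const int32x2_t vpsumD = vadd_s32(vget_low_s32(vaccD), vget_high_s32(vaccD));
    const int32x2_t vpsumE = vadd_s32(vget_low_s32(vaccE), vget_high_s32(vaccE));
    const int32x2_t vpsumF = vadd_s32(vget_low_s32(vaccF), vget_high_s32(vaccF));
    const int32x2_t vsumCD = vpadd_s32(vpsumC, vpsumD); /* {sumC, sumD} */
    const int32x2_t vsumEF = vpadd_s32(vpsumE, vpsumF); /* {sumE, sumF} */
    return vcombine_s32(vsumCD, vsumEF);
  #endif
  }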