Cross-reference of the accumulator vacc0x10 across the generated QS8 GEMM/IGEMM NEON microkernels. For each file, the numbers are source line numbers within that file, every occurrence sits in the single ukernel function named on the first line, and "(local)" marks the local definition.

/external/XNNPACK/src/qs8-gemm/gen/
D | 1x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal():
     54  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     99  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    125  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    149  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

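Every entry above and below follows the same per-column pattern for vacc0x10: the definition loads the column's 32-bit bias into lane 0 of a zeroed vector, and the inner loop widens int8 products to int16 and pairwise-accumulates them into the int32 lanes. A minimal sketch of those two steps, assuming only arm_neon.h (helper names are illustrative, not from the source):

    #include <arm_neon.h>

    // Accumulator init, as in the listed definitions: the column's int32 bias
    // is loaded into lane 0 of a zero vector, so the final horizontal
    // reduction folds the bias in exactly once alongside the partial sums.
    static inline int32x4_t init_acc(const int32_t *w /* packed bias */) {
      return vld1q_lane_s32(w, vmovq_n_s32(0), 0);
    }

    // One MULL+PADAL step, as in the listed vpadalq_s16 lines: 8 signed bytes
    // of A against 8 signed bytes of B yield eight int16 products, which are
    // pairwise widened and accumulated into the four int32 lanes.
    static inline int32x4_t mull_padal_step(int32x4_t vacc0x10,
                                            int8x8_t va0, int8x8_t vb10) {
      const int16x8_t vprod0x10 = vmull_s8(vb10, va0);
      return vpadalq_s16(vacc0x10, vprod0x10);
    }
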
D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal():
     54  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    127  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    188  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    214  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    238  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 1x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal():
     54  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    115  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    141  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    165  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

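The mlal and c16 variants differ from the mull kernels only in the multiply stage: a second 8-byte half is fused with vmlal_s8 before the single widening accumulate, which halves the vpadalq_s16 count per 16 channels; the c8 mlal kernels do this on a k-loop unrolled by 16 and fall back to the plain mull step for an 8-wide remainder, which is why their listings show two vpadalq_s16 lines. A sketch under the same assumptions as above:

    #include <arm_neon.h>

    // One MLAL step over 16 channels: multiply the low 8-byte halves, fuse
    // the high halves with a multiply-accumulate, then widen once. Each int16
    // lane holds two int8*int8 products; that sum fits in int16 unless all
    // four bytes are -128 (2 * 16384 = 32768), a corner the operand range is
    // assumed to avoid here.
    static inline int32x4_t mlal_padal_step(int32x4_t vacc0x10,
                                            int8x16_t va0, int8x16_t vb10) {
      int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0));
      vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0));
      return vpadalq_s16(vacc0x10, vprod0x10);
    }
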
D | 2x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal():
     60  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     76  int32x4_t vacc1x10 = vacc0x10;
    143  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    180  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    216  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

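In the 2x/3x/4x listings, the copies near the top (vacc1x10, vacc2x10, vacc3x10) exist because the bias is per output column: each extra row's accumulator for column 10 starts from the same bias-initialized row-0 value. An illustrative sketch (helper name hypothetical):

    #include <arm_neon.h>

    // Sketch: the bias is per output column, so all row accumulators for
    // column 10 begin equal; only the A-side operands differ between rows.
    static inline void init_row_accumulators(int32x4_t vacc0x10,
                                             int32x4_t vacc_rows[3]) {
      vacc_rows[0] = vacc0x10;  // vacc1x10 in the generated code
      vacc_rows[1] = vacc0x10;  // vacc2x10
      vacc_rows[2] = vacc0x10;  // vacc3x10
    }
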
D | 2x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal():
     60  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     76  int32x4_t vacc1x10 = vacc0x10;
    183  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    282  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    319  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    355  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 2x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal():
     60  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     76  int32x4_t vacc1x10 = vacc0x10;
    170  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    212  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    248  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 3x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal():
     66  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     82  int32x4_t vacc1x10 = vacc0x10;
     98  int32x4_t vacc2x10 = vacc0x10;
    187  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    235  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    283  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal():
     66  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     82  int32x4_t vacc1x10 = vacc0x10;
     98  int32x4_t vacc2x10 = vacc0x10;
    239  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    376  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    424  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    472  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 4x16c8-minmax-neon-mull-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal():
     72  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     88  int32x4_t vacc1x10 = vacc0x10;
    104  int32x4_t vacc2x10 = vacc0x10;
    120  int32x4_t vacc3x10 = vacc0x10;
    231  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    290  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    350  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 3x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal():
     66  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     82  int32x4_t vacc1x10 = vacc0x10;
     98  int32x4_t vacc2x10 = vacc0x10;
    225  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    283  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    331  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 4x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal():
     72  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     88  int32x4_t vacc1x10 = vacc0x10;
    104  int32x4_t vacc2x10 = vacc0x10;
    120  int32x4_t vacc3x10 = vacc0x10;
    280  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    354  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    414  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 4x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal():
     72  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     88  int32x4_t vacc1x10 = vacc0x10;
    104  int32x4_t vacc2x10 = vacc0x10;
    120  int32x4_t vacc3x10 = vacc0x10;
    295  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    470  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    529  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    589  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

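The vsum0xAB lines in every listing are the first stage of the horizontal reduction: each column accumulator holds four partial sums, and on AArch64 vpaddq_s32 folds two neighboring columns at once. A sketch assuming AArch64 (vpaddq_s32 does not exist in AArch32 NEON):

    #include <arm_neon.h>

    #if defined(__aarch64__)
    // vpaddq_s32(a, b) = { a0+a1, a2+a3, b0+b1, b2+b3 }: lanes 0-1 of
    // vsum0xAB carry the pairwise sums of column 10 and lanes 2-3 those of
    // column 11. A further vpaddq_s32 against the neighboring vsum0xCD would
    // finish four column totals at once.
    static inline int32x4_t reduce_columns_AB(int32x4_t vacc0x10,
                                              int32x4_t vacc0x11) {
      return vpaddq_s32(vacc0x10, vacc0x11);
    }
    #endif
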
/external/XNNPACK/src/qs8-igemm/gen/
D | 1x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal():
     57  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    110  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    139  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    163  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal():
     57  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    138  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    199  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    228  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    252  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 1x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal():
     57  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    126  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    155  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    179  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 2x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal():
     61  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     77  int32x4_t vacc1x10 = vacc0x10;
    156  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    196  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    232  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 2x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal():
     61  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     77  int32x4_t vacc1x10 = vacc0x10;
    196  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    295  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    335  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    371  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 3x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal():
     65  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81  int32x4_t vacc1x10 = vacc0x10;
     97  int32x4_t vacc2x10 = vacc0x10;
    202  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    253  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    301  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 2x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal():
     61  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     77  int32x4_t vacc1x10 = vacc0x10;
    183  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    228  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    264  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 4x16c8-minmax-neon-mull-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal():
     69  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     85  int32x4_t vacc1x10 = vacc0x10;
    101  int32x4_t vacc2x10 = vacc0x10;
    117  int32x4_t vacc3x10 = vacc0x10;
    248  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    310  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    370  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 3x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal():
     65  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81  int32x4_t vacc1x10 = vacc0x10;
     97  int32x4_t vacc2x10 = vacc0x10;
    240  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    301  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    349  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal():
     65  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81  int32x4_t vacc1x10 = vacc0x10;
     97  int32x4_t vacc2x10 = vacc0x10;
    254  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    391  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    442  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    490  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 4x16c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal():
     69  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     85  int32x4_t vacc1x10 = vacc0x10;
    101  int32x4_t vacc2x10 = vacc0x10;
    117  int32x4_t vacc3x10 = vacc0x10;
    297  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    374  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    434  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

D | 4x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal():
     69  …int32x4_t vacc0x10 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     85  int32x4_t vacc1x10 = vacc0x10;
    101  int32x4_t vacc2x10 = vacc0x10;
    117  int32x4_t vacc3x10 = vacc0x10;
    312  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    487  vacc0x10 = vpadalq_s16(vacc0x10, vprod0x10);
    549  const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11);
    609  const int32x2_t vpsum0xA = vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));

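The vpsum0xA lines are the AArch32 side of the same reduction: without vpaddq_s32, each accumulator's low and high halves are added into a 64-bit vector first and vpadd_s32 finishes the horizontal sum. A sketch of one column's reduction under that assumption (helper name illustrative):

    #include <arm_neon.h>

    // AArch32-friendly reduction of one column accumulator: fold the 128-bit
    // vector to 64 bits (the listed vpsum0xA lines), then pairwise-add to
    // reach the single int32 column sum, bias included from the lane-0 init.
    static inline int32_t reduce_column_A(int32x4_t vacc0x10) {
      const int32x2_t vpsum0xA =
          vadd_s32(vget_low_s32(vacc0x10), vget_high_s32(vacc0x10));
      const int32x2_t vsum = vpadd_s32(vpsum0xA, vpsum0xA);
      return vget_lane_s32(vsum, 0);
    }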