/external/XNNPACK/src/qs8-gemm/gen/
D | 1x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
     59: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    114: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    127: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    157: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

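Note: every file in this listing opens with the same setup that line 59 above shows. Each of the 16 per-channel accumulators starts as a zeroed int32x4_t with that channel's bias loaded into lane 0, after which the packed-weights pointer w steps past the bias. A minimal sketch of that pattern, assuming the same w layout as the truncated line above (the helper name init_acc is illustrative, not from XNNPACK):

#include <arm_neon.h>
#include <stdint.h>

// Illustrative helper (not XNNPACK source): load one int32 bias into
// lane 0 of a zeroed accumulator and advance the packed-weights pointer.
static int32x4_t init_acc(const void** w) {
  const int32x4_t vacc = vld1q_lane_s32((const int32_t*) *w, vmovq_n_s32(0), 0);
  *w = (const void*) ((uintptr_t) *w + sizeof(int32_t));
  return vacc;
}

The "vacc1x15 = vacc0x15" lines in the multi-row kernels below then copy row 0's bias-initialized accumulator into the accumulators for the remaining output rows.
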
D | 1x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
     59: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    147: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    203: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    216: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    246: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 1x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
     59: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    130: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    143: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    173: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 2x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
     65: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81: int32x4_t vacc1x15 = vacc0x15;
    168: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    182: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    224: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 2x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
     65: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81: int32x4_t vacc1x15 = vacc0x15;
    218: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    307: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    321: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    363: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 2x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
     65: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81: int32x4_t vacc1x15 = vacc0x15;
    200: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    214: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    256: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 3x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
     71: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     87: int32x4_t vacc1x15 = vacc0x15;
    103: int32x4_t vacc2x15 = vacc0x15;
    222: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    237: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    291: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 3x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
     71: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     87: int32x4_t vacc1x15 = vacc0x15;
    103: int32x4_t vacc2x15 = vacc0x15;
    289: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    411: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    426: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    480: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 4x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
     77: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     93: int32x4_t vacc1x15 = vacc0x15;
    109: int32x4_t vacc2x15 = vacc0x15;
    125: int32x4_t vacc3x15 = vacc0x15;
    276: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    292: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    358: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 3x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
     71: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     87: int32x4_t vacc1x15 = vacc0x15;
    103: int32x4_t vacc2x15 = vacc0x15;
    270: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    285: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    339: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 4x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
     77: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     93: int32x4_t vacc1x15 = vacc0x15;
    109: int32x4_t vacc2x15 = vacc0x15;
    125: int32x4_t vacc3x15 = vacc0x15;
    340: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    356: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    422: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 4x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
     77: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     93: int32x4_t vacc1x15 = vacc0x15;
    109: int32x4_t vacc2x15 = vacc0x15;
    125: int32x4_t vacc3x15 = vacc0x15;
    360: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    515: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    531: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    597: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

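Note: the "vpadalq_s16(vacc0x15, vprod0x15)" lines that recur in every entry above are the MULL/MLAL + PADAL scheme these kernels are named for. int8 inputs are widened by multiplication into int16 products, and vpadalq_s16 pairwise-adds adjacent int16 lanes into the int32 accumulator, so each step folds 8 (c8) or 16 (c16) products per channel into 4 int32 lanes. A minimal sketch under those assumptions; the function and variable names here are illustrative, not the generated XNNPACK code:

#include <arm_neon.h>

// c8, MULL variant: one 8-wide int8 multiply per channel per step.
static int32x4_t acc_c8_mull(int32x4_t vacc, int8x8_t va, int8x8_t vb) {
  const int16x8_t vprod = vmull_s8(va, vb);  // 8 x int16 products
  return vpadalq_s16(vacc, vprod);           // pairwise add into 4 x int32
}

// c16, MLAL variant: a second multiply is fused into the int16 product
// before the same pairwise-accumulate step.
static int32x4_t acc_c16_mlal(int32x4_t vacc, int8x16_t va, int8x16_t vb) {
  int16x8_t vprod = vmull_s8(vget_low_s8(va), vget_low_s8(vb));
  vprod = vmlal_s8(vprod, vget_high_s8(va), vget_high_s8(vb));
  return vpadalq_s16(vacc, vprod);
}

The mlal files that show two vpadalq_s16 lines per accumulator (e.g. lines 360 and 515 in 4x16c8-minmax-neon-mlal-padal.c) unroll the K loop by two, accumulating twice per iteration.
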
/external/XNNPACK/src/qs8-igemm/gen/
D | 1x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
     62: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    125: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    141: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    171: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 1x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
     62: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    158: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    214: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    230: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    260: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 1x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
     62: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    141: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    157: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    187: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 2x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
     66: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     82: int32x4_t vacc1x15 = vacc0x15;
    181: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    198: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    240: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 2x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
     66: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     82: int32x4_t vacc1x15 = vacc0x15;
    231: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    320: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    337: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    379: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 3x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
     70: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     86: int32x4_t vacc1x15 = vacc0x15;
    102: int32x4_t vacc2x15 = vacc0x15;
    237: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    255: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    309: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 2x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
     66: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     82: int32x4_t vacc1x15 = vacc0x15;
    213: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    230: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    272: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 4x16c8-minmax-neon-mull-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
     74: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     90: int32x4_t vacc1x15 = vacc0x15;
    106: int32x4_t vacc2x15 = vacc0x15;
    122: int32x4_t vacc3x15 = vacc0x15;
    293: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    312: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    378: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 3x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
     70: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     86: int32x4_t vacc1x15 = vacc0x15;
    102: int32x4_t vacc2x15 = vacc0x15;
    285: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    303: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    357: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 3x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
     70: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     86: int32x4_t vacc1x15 = vacc0x15;
    102: int32x4_t vacc2x15 = vacc0x15;
    304: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    426: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    444: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    498: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 4x16c16-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
     74: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     90: int32x4_t vacc1x15 = vacc0x15;
    106: int32x4_t vacc2x15 = vacc0x15;
    122: int32x4_t vacc3x15 = vacc0x15;
    357: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    376: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    442: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

D | 4x16c8-minmax-neon-mlal-padal.c | all references in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
     74: …int32x4_t vacc0x15 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     90: int32x4_t vacc1x15 = vacc0x15;
    106: int32x4_t vacc2x15 = vacc0x15;
    122: int32x4_t vacc3x15 = vacc0x15;
    377: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    532: vacc0x15 = vpadalq_s16(vacc0x15, vprod0x15);
    551: const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    617: const int32x2_t vpsum0xF = vadd_s32(vget_low_s32(vacc0x15), vget_high_s32(vacc0x15));

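Note: the "vsum0xEF" and "vpsum0xF" lines near the end of each entry are the two reduction strategies for collapsing the 16 per-channel int32x4_t accumulators into per-channel sums. The vpaddq_s32 form is the AArch64 path (vpaddq_s32 is A64-only), while the vadd_s32-over-halves form feeds an AArch32 fallback that pairs channels with vpadd_s32. A hedged sketch of the two-channel step; the function names here are illustrative, not from the generated files:

#include <arm_neon.h>

#if defined(__aarch64__)
// AArch64: one vpaddq_s32 folds a pair of accumulators into
// {E0+E1, E2+E3, F0+F1, F2+F3}; repeating it over such results
// yields four full channel sums per int32x4_t.
static int32x4_t fold_pair(int32x4_t vaccE, int32x4_t vaccF) {
  return vpaddq_s32(vaccE, vaccF);
}
#else
// AArch32 fallback: sum each accumulator's low and high halves, then
// pair the two channels, giving {sum(vaccE), sum(vaccF)}.
static int32x2_t fold_pair(int32x4_t vaccE, int32x4_t vaccF) {
  const int32x2_t vpsumE = vadd_s32(vget_low_s32(vaccE), vget_high_s32(vaccE));
  const int32x2_t vpsumF = vadd_s32(vget_low_s32(vaccF), vget_high_s32(vaccF));
  return vpadd_s32(vpsumE, vpsumF);
}
#endif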