/external/XNNPACK/src/qs8-gemm/gen/ |
D | 1x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal():
     58  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    111  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    127  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    156  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
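Reading the first file's matches in order: line 58 seeds channel 14's int32x4 accumulator with its bias in lane 0 (the `local` declaration), line 111 is the per-k-step accumulate, and lines 127 and 156 belong to the final horizontal reduction. A minimal sketch of the c8 MULL+PADAL step follows; the standalone helper form is mine, not the kernel's:

#include <arm_neon.h>

// Sketch of the c8 MULL+PADAL step behind the matches above: 8 int8
// activations times 8 int8 weights widen to int16x8 via VMULL, and
// VPADAL folds adjacent int16 pairs into the four int32 lanes of
// channel 14's accumulator.
static inline int32x4_t qs8_c8_mull_step(int32x4_t vacc,
                                         const int8_t* a,   // 8 activations
                                         const int8_t* b) { // 8 weights
  const int16x8_t vprod = vmull_s8(vld1_s8(b), vld1_s8(a)); // s8*s8 -> s16
  return vpadalq_s16(vacc, vprod); // pairwise add s16 pairs into s32 lanes
}

Accumulating through int16 pairs this way keeps the inner loop in cheap 8-bit multiplies while the int32 lanes absorb the running sum.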
D | 1x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal():
     58  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    143  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    200  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    216  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    245  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
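The mlal variant hits vpadalq_s16 twice per accumulator (lines 143 and 200 above), consistent with a main loop that fuses two 8-wide k-slices into one int16x8 product via vmlal_s8, plus a single-slice remainder path. A hedged sketch of the fused step; the helper shape is illustrative:

#include <arm_neon.h>

// Sketch of the fused MLAL step (loop structure inferred from the
// kernel name and the duplicated vpadalq_s16 match). Two s8*s8
// products per int16 lane fit without overflow as long as one side
// avoids -128 (2 * 127 * 128 = 32512 <= INT16_MAX).
static inline int32x4_t qs8_c8_mlal_step(int32x4_t vacc,
                                         const int8_t* a,    // 16 activations
                                         const int8_t* b0,   // weights, k 0..7
                                         const int8_t* b1) { // weights, k 8..15
  int16x8_t vprod = vmull_s8(vld1_s8(b0), vld1_s8(a));
  vprod = vmlal_s8(vprod, vld1_s8(b1), vld1_s8(a + 8)); // fuse second slice
  return vpadalq_s16(vacc, vprod); // one PADAL per two k-slices
}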
D | 1x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal():
     58  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    127  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    143  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    172  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
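The c16 kernels consume 16 bytes of k per channel per iteration, which matches the single vpadalq_s16 per channel at line 127. A sketch, assuming the 16-byte load is split into low/high halves that are combined with vmull_s8 and vmlal_s8 before the accumulate:

#include <arm_neon.h>

// Sketch of the c16 step (helper form is illustrative): one full
// 16-byte load per side, low and high halves fused before a single
// pairwise accumulate into the int32x4 accumulator.
static inline int32x4_t qs8_c16_step(int32x4_t vacc,
                                     const int8_t* a,   // 16 activations
                                     const int8_t* b) { // 16 weights
  const int8x16_t va = vld1q_s8(a);
  const int8x16_t vb = vld1q_s8(b);
  int16x8_t vprod = vmull_s8(vget_low_s8(vb), vget_low_s8(va));
  vprod = vmlal_s8(vprod, vget_high_s8(vb), vget_high_s8(va));
  return vpadalq_s16(vacc, vprod);
}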
D | 2x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal():
     64  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     80  int32x4_t vacc1x14 = vacc0x14;
    163  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    182  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    223  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
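From the 2-row kernels onward, the setup matches show the bias being loaded once and shared: line 64 puts channel 14's int32 bias into lane 0 of a zeroed vector, and line 80 starts row 1 from a copy of row 0's accumulator. A sketch of that setup; the wrapper function is illustrative:

#include <arm_neon.h>
#include <stdint.h>

// Sketch of the shared accumulator setup. The bias for one output
// channel occupies lane 0 of a zeroed int32x4; the other lanes
// collect partial sums and everything is folded together at the end,
// so the bias is counted exactly once per row.
static void init_channel14_accumulators(const void** w_ptr,
                                        int32x4_t* vacc0x14,
                                        int32x4_t* vacc1x14) {
  const void* w = *w_ptr;
  *vacc0x14 = vld1q_lane_s32((const int32_t*) w, vmovq_n_s32(0), 0);
  w = (const void*) ((uintptr_t) w + sizeof(int32_t));
  *vacc1x14 = *vacc0x14; // row 1 starts from the same bias as row 0
  *w_ptr = w;
}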
D | 2x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal():
     64  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     80  int32x4_t vacc1x14 = vacc0x14;
    211  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    302  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    321  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    362  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 2x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal():
     64  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     80  int32x4_t vacc1x14 = vacc0x14;
    194  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    214  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    255  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 3x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal():
     70  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     86  int32x4_t vacc1x14 = vacc0x14;
    102  int32x4_t vacc2x14 = vacc0x14;
    215  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    237  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    290  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 3x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal():
     70  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     86  int32x4_t vacc1x14 = vacc0x14;
    102  int32x4_t vacc2x14 = vacc0x14;
    279  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    404  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    426  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    479  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 4x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal():
     76  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     92  int32x4_t vacc1x14 = vacc0x14;
    108  int32x4_t vacc2x14 = vacc0x14;
    124  int32x4_t vacc3x14 = vacc0x14;
    267  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    292  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    357  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 3x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal():
     70  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     86  int32x4_t vacc1x14 = vacc0x14;
    102  int32x4_t vacc2x14 = vacc0x14;
    261  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    285  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    338  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 4x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal():
     76  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     92  int32x4_t vacc1x14 = vacc0x14;
    108  int32x4_t vacc2x14 = vacc0x14;
    124  int32x4_t vacc3x14 = vacc0x14;
    328  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    356  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    421  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 4x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal():
     76  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     92  int32x4_t vacc1x14 = vacc0x14;
    108  int32x4_t vacc2x14 = vacc0x14;
    124  int32x4_t vacc3x14 = vacc0x14;
    347  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    506  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    531  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    596  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
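Every file above ends in the same two reduction shapes: vpaddq_s32 across the channel-14 and channel-15 accumulators (vsum0xEF), and vadd_s32 over vget_low_s32/vget_high_s32 of a single accumulator (vpsum0xE). vpaddq_s32 is an AArch64-only intrinsic, which would explain why both forms coexist: the second is the 32-bit ARM path. A sketch of the pair, assuming channels 14 and 15 (0xE/0xF in the variable names) reduce together; the helper is illustrative:

#include <arm_neon.h>

// Sketch of the tail reduction. Lane 0 of the result is channel 14's
// full dot product (bias included, since the bias sat in lane 0 of
// the accumulator), lane 1 is channel 15's.
static inline int32x2_t reduce_channels_0xEF(int32x4_t vacc14,
                                             int32x4_t vacc15) {
#if defined(__aarch64__)
  const int32x4_t vsumEF = vpaddq_s32(vacc14, vacc15); // pairwise across both
  return vpadd_s32(vget_low_s32(vsumEF), vget_high_s32(vsumEF));
#else
  const int32x2_t vpsumE = vadd_s32(vget_low_s32(vacc14), vget_high_s32(vacc14));
  const int32x2_t vpsumF = vadd_s32(vget_low_s32(vacc15), vget_high_s32(vacc15));
  return vpadd_s32(vpsumE, vpsumF); // lane0 = channel 14, lane1 = channel 15
#endif
}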
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal():
     61  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    122  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    141  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    170  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 1x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal():
     61  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    154  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    211  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    230  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    259  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 1x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal():
     61  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
    138  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    157  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    186  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 2x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal():
     65  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81  int32x4_t vacc1x14 = vacc0x14;
    176  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    198  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    239  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 2x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal():
     65  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81  int32x4_t vacc1x14 = vacc0x14;
    224  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    315  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    337  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    378  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 3x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal():
     69  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     85  int32x4_t vacc1x14 = vacc0x14;
    101  int32x4_t vacc2x14 = vacc0x14;
    230  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    255  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    308  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 2x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal():
     65  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     81  int32x4_t vacc1x14 = vacc0x14;
    207  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    230  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    271  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 4x16c8-minmax-neon-mull-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal():
     73  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     89  int32x4_t vacc1x14 = vacc0x14;
    105  int32x4_t vacc2x14 = vacc0x14;
    121  int32x4_t vacc3x14 = vacc0x14;
    284  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    312  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    377  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 3x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal():
     69  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     85  int32x4_t vacc1x14 = vacc0x14;
    101  int32x4_t vacc2x14 = vacc0x14;
    276  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    303  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    356  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 3x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal():
     69  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     85  int32x4_t vacc1x14 = vacc0x14;
    101  int32x4_t vacc2x14 = vacc0x14;
    294  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    419  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    444  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    497  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 4x16c16-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal():
     73  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     89  int32x4_t vacc1x14 = vacc0x14;
    105  int32x4_t vacc2x14 = vacc0x14;
    121  int32x4_t vacc3x14 = vacc0x14;
    345  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    376  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    441  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
D | 4x16c8-minmax-neon-mlal-padal.c | all matches in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal():
     73  …int32x4_t vacc0x14 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size…  (local)
     89  int32x4_t vacc1x14 = vacc0x14;
    105  int32x4_t vacc2x14 = vacc0x14;
    121  int32x4_t vacc3x14 = vacc0x14;
    364  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    523  vacc0x14 = vpadalq_s16(vacc0x14, vprod0x14);
    551  const int32x4_t vsum0xEF = vpaddq_s32(vacc0x14, vacc0x15);
    616  const int32x2_t vpsum0xE = vadd_s32(vget_low_s32(vacc0x14), vget_high_s32(vacc0x14));
|
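The qs8-igemm matches mirror the qs8-gemm ones almost line for line, shifted a few source lines down. The shift is consistent with an indirection prologue in which A rows are fetched through a pointer array (with a shared zero row standing in for padding) before the same MULL/MLAL + PADAL body runs. A hypothetical sketch of that outer loop; every name here is illustrative, not the kernels' actual interface:

#include <stddef.h>
#include <stdint.h>

// Illustrative igemm outer loop (shape inferred only from the
// gemm/igemm line-number shift, not copied from the kernels): one
// pointer per accumulated input row, rebased by a_offset unless it
// is the shared zero row used for padding.
static void igemm_indirection_loop(size_t p,                 // bytes of pointers left
                                   const int8_t** indirect_a,
                                   const int8_t* zero,
                                   size_t a_offset) {
  do {
    const int8_t* a0 = indirect_a[0];
    if (a0 != zero) {
      a0 = (const int8_t*) ((uintptr_t) a0 + a_offset); // rebase real rows only
    }
    indirect_a += 1;
    // ... per-channel accumulation over k, identical to the gemm body ...
    (void) a0;
    p -= sizeof(const int8_t*);
  } while (p != 0);
}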