/external/XNNPACK/src/bf16-gemm/gen/ |
D | 1x4c8-minmax-neonbf16-bfdot.c | 93 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot() local 97 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot()
|
D | 1x4c8-minmax-neonbf16-bfmlal.c | 102 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal() local 106 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal()
|
D | 1x4c8-minmax-neonfma-zip.c | 139 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip() local 143 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip()
|
D | 2x4c8-minmax-neonbf16-bfdot.c | 121 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot() local 128 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot()
|
D | 1x4c8-minmax-neonfma-shland.c | 139 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland() local 143 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland()
|
D | 2x4c8-minmax-neonbf16-bfmlal.c | 138 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal() local 145 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal()
|
D | 3x4c8-minmax-neonbf16-bfdot.c | 149 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot() local 159 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot()
|
D | 2x4c8-minmax-neonfma-shland.c | 185 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland() local 192 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland()
|
D | 2x4c8-minmax-neonfma-zip.c | 185 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip() local 192 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip()
|
D | 3x4c8-minmax-neonbf16-bfmlal.c | 174 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal() local 184 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal()
|
D | 4x4c8-minmax-neonbf16-bfdot.c | 177 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot() local 190 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot()
|
D | 3x4c8-minmax-neonfma-zip.c | 231 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip() local 241 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip()
|
D | 4x4c8-minmax-neonbf16-bfmlal.c | 210 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal() local 223 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal()
|
D | 5x4c8-minmax-neonbf16-bfdot.c | 205 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot() local 221 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot()
|
D | 3x4c8-minmax-neonfma-shland.c | 231 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland() local 241 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland()
|
D | 5x4c8-minmax-neonbf16-bfmlal.c | 246 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal() local 262 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal()
|
D | 4x4c8-minmax-neonfma-shland.c | 277 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland() local 290 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland()
|
D | 4x4c8-minmax-neonfma-zip.c | 277 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip() local 290 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip()
|
D | 5x4c8-minmax-neonfma-zip.c | 323 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip() local 339 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip()
|
D | 5x4c8-minmax-neonfma-shland.c | 323 const float32x2_t vsum0x1 = vadd_f32(vget_low_f32(vacc0x1), vget_high_f32(vacc0x1)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland() local 339 float32x4_t vacc0x0123 = vcombine_f32(vpadd_f32(vsum0x0, vsum0x1), vpadd_f32(vsum0x2, vsum0x3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland()
|