/external/XNNPACK/src/bf16-gemm/gen/
D | 4x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip():
    100  const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    114  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    115  vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    116  vacc2x3 = vfmaq_f32(vacc2x3, va2e, vb3e);
    117  vacc3x3 = vfmaq_f32(vacc3x3, va3e, vb3e);
    165  const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    213  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    214  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
    215  vacc2x3 = vfmaq_f32(vacc2x3, va2x3e, vb3e);
    216  vacc3x3 = vfmaq_f32(vacc3x3, va3x3e, vb3e);
|
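Both kernel variants widen bfloat16 values to float32 before the FMA: a bf16 value is exactly the top 16 bits of the corresponding f32 bit pattern, so moving each 16-bit element into the upper half of a 32-bit lane is a lossless conversion. The zip variant does this by interleaving a zero vector beneath the bf16 elements. A minimal sketch of that step, assuming AArch64 NEON on a little-endian target (the helper name is hypothetical, not from XNNPACK; vzip2q_u16 would presumably cover the other four elements):

    #include <arm_neon.h>

    /* Widen the low four bf16 elements of vb to f32: the lanes become
     * {0, vb[0], 0, vb[1], 0, vb[2], 0, vb[3]}, so each 32-bit lane
     * carries the bf16 bits in bits 16..31 and zeros below. */
    static inline float32x4_t bf16_low4_to_f32_zip(uint16x8_t vb) {
      const uint16x8_t vzero = vmovq_n_u16(0);
      return vreinterpretq_f32_u16(vzip1q_u16(vzero, vb));
    }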
D | 4x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland():
    100  const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    114  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    115  vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    116  vacc2x3 = vfmaq_f32(vacc2x3, va2e, vb3e);
    117  vacc3x3 = vfmaq_f32(vacc3x3, va3e, vb3e);
    165  const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    213  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    214  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
    215  vacc2x3 = vfmaq_f32(vacc2x3, va2x3e, vb3e);
    216  vacc3x3 = vfmaq_f32(vacc3x3, va3x3e, vb3e);
|
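The shland variant ("shl" + "and") reaches the same layout through integer reinterpretation: viewed as four 32-bit lanes, each lane holds two adjacent bf16 elements, so a left shift by 16 exposes the even-indexed element, and in the companion vb3o lines not shown in this excerpt an AND with 0xFFFF0000 presumably exposes the odd-indexed one. A sketch under the same assumptions (AArch64 NEON, little-endian; helper names hypothetical):

    #include <arm_neon.h>

    /* Even-indexed elements: each u32 lane is vb[2i] | vb[2i+1] << 16,
     * so shifting left by 16 leaves vb[2i] in the high half. */
    static inline float32x4_t bf16_even_to_f32_shl(uint16x8_t vb) {
      return vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb), 16));
    }

    /* Odd-indexed elements (assumed counterpart to the lines above):
     * masking keeps vb[2i+1] in the high half of each lane. */
    static inline float32x4_t bf16_odd_to_f32_and(uint16x8_t vb) {
      return vreinterpretq_f32_u32(
          vandq_u32(vreinterpretq_u32_u16(vb), vmovq_n_u32(0xFFFF0000u)));
    }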
D | 5x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip():
    112  const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    129  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    130  vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    131  vacc2x3 = vfmaq_f32(vacc2x3, va2e, vb3e);
    132  vacc3x3 = vfmaq_f32(vacc3x3, va3e, vb3e);
    133  vacc4x3 = vfmaq_f32(vacc4x3, va4e, vb3e);
    187  const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    246  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    247  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
    248  vacc2x3 = vfmaq_f32(vacc2x3, va2x3e, vb3e);
    [all …]
|
D | 5x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland():
    112  const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    129  vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    130  vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    131  vacc2x3 = vfmaq_f32(vacc2x3, va2e, vb3e);
    132  vacc3x3 = vfmaq_f32(vacc3x3, va3e, vb3e);
    133  vacc4x3 = vfmaq_f32(vacc4x3, va4e, vb3e);
    187  const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    246  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    247  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
    248  vacc2x3 = vfmaq_f32(vacc2x3, va2x3e, vb3e);
    [all …]
|
D | 3x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland():
    88   const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    99   vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    100  vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    101  vacc2x3 = vfmaq_f32(vacc2x3, va2e, vb3e);
    143  const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    180  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    181  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
    182  vacc2x3 = vfmaq_f32(vacc2x3, va2x3e, vb3e);
|
D | 3x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip():
    88   const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    99   vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    100  vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    101  vacc2x3 = vfmaq_f32(vacc2x3, va2e, vb3e);
    143  const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    180  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    181  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
    182  vacc2x3 = vfmaq_f32(vacc2x3, va2x3e, vb3e);
|
D | 2x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland():
    76   const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    84   vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    85   vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    121  const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    147  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    148  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
|
D | 2x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip():
    76   const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    84   vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    85   vacc1x3 = vfmaq_f32(vacc1x3, va1e, vb3e);
    121  const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    147  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
    148  vacc1x3 = vfmaq_f32(vacc1x3, va1x3e, vb3e);
|
D | 1x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip():
    64   const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    69   vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    99   const float32x4_t vb3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb3));   local
    114  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
|
D | 1x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland():
    64   const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    69   vacc0x3 = vfmaq_f32(vacc0x3, va0e, vb3e);
    99   const float32x4_t vb3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb3), 16));   local
    114  vacc0x3 = vfmaq_f32(vacc0x3, va0x3e, vb3e);
|
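Across all tile heights (1x4 through 5x4), the accumulation lines follow the same pattern: vacc{m}x3 gathers the lanewise products of row m's widened A vector (va{m}e, or va{m}x3e in the remainder path) with the widened B vector vb3e. The widening itself can be checked against a scalar model, since bf16 to f32 is pure bit placement; a minimal reference sketch (function name hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* bf16 -> f32: the bf16 bits occupy the top 16 bits of the float;
     * the low 16 bits are zero. */
    static inline float bf16_to_f32(uint16_t b) {
      const uint32_t bits = (uint32_t) b << 16;
      float f;
      memcpy(&f, &bits, sizeof f);
      return f;
    }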