/external/XNNPACK/src/bf16-gemm/gen/
D | 4x4c8-minmax-neonfma-zip.c  (all matches in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip())
    124  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    129  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    130  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    131  vacc2x0 = vfmaq_f32(vacc2x0, va2o, vb0o);
    132  vacc3x0 = vfmaq_f32(vacc3x0, va3o, vb0o);
    218  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    240  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    241  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
    242  vacc2x0 = vfmaq_f32(vacc2x0, va2x0o, vb0o);
    243  vacc3x0 = vfmaq_f32(vacc3x0, va3x0o, vb0o);
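In the -zip kernels, vb0o is one half of an eight-element bf16 vector of B widened to float32. A bf16 value is the upper 16 bits of a float32, so widening only has to place each 16-bit value in the top half of a 32-bit lane with zeros below it; the zip variant does this by interleaving with a zero vector. The sketch below is an illustration of that technique on AArch64, not the XNNPACK source; the helper name widen_bf16x8_zip and the result struct are hypothetical.

    #include <arm_neon.h>

    // Hypothetical helper: widen eight bf16 values (raw uint16_t bits) to two
    // float32x4_t halves by zipping with zero, as the "-zip" kernels do for vb0.
    typedef struct { float32x4_t lo; float32x4_t hi; } bf16x8_widened;

    static inline bf16x8_widened widen_bf16x8_zip(const uint16_t* p) {
      const uint16x8_t vb = vld1q_u16(p);        // eight packed bf16 values
      const uint16x8_t vzero = vmovq_n_u16(0);
      bf16x8_widened r;
      // Interleaving zero below each bf16 value makes every 32-bit lane equal to
      // (bf16 << 16), which is exactly the float32 bit pattern of that bf16.
      r.lo = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb));  // input lanes 0..3
      r.hi = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb));  // input lanes 4..7
      return r;
    }

The widened halves can then feed vfmaq_f32 directly, which is what the accumulator updates listed above do with vb0o.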
D | 4x4c8-minmax-neonfma-shland.c  (all matches in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland())
    124  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    129  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    130  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    131  vacc2x0 = vfmaq_f32(vacc2x0, va2o, vb0o);
    132  vacc3x0 = vfmaq_f32(vacc3x0, va3o, vb0o);
    218  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    240  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    241  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
    242  vacc2x0 = vfmaq_f32(vacc2x0, va2x0o, vb0o);
    243  vacc3x0 = vfmaq_f32(vacc3x0, va3x0o, vb0o);
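The -shland kernels compute the same widening without zips: the odd-indexed bf16 lanes already sit in the upper half of each 32-bit pair, so an AND is enough, while the even-indexed lanes are shifted left by 16. This assumes vmask selects the upper 16 bits of each 32-bit lane (0xFFFF0000), which is consistent with the AND producing a valid float32; the sketch below is illustrative, with hypothetical names, and is not the generated kernel.

    #include <stdint.h>
    #include <arm_neon.h>

    // Hypothetical helper: the shift-left ("sh") plus AND ("land") widening
    // pattern used by the "-shland" kernels.
    typedef struct { float32x4_t even; float32x4_t odd; } bf16x8_widened_shland;

    static inline bf16x8_widened_shland widen_bf16x8_shland(const uint16_t* p) {
      const uint16x8_t vb = vld1q_u16(p);
      const uint16x8_t vmask =
          vreinterpretq_u16_u32(vmovq_n_u32(UINT32_C(0xFFFF0000)));
      bf16x8_widened_shland r;
      // Even-indexed lanes (0, 2, 4, 6): shift each 32-bit pair left by 16 so the
      // low bf16 value lands in float32 position.
      r.even = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb), 16));
      // Odd-indexed lanes (1, 3, 5, 7): already in the high half; mask off the rest.
      r.odd = vreinterpretq_f32_u16(vandq_u16(vb, vmask));
      return r;
    }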
D | 5x4c8-minmax-neonfma-zip.c  (all matches in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip())
    141  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    146  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    147  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    148  vacc2x0 = vfmaq_f32(vacc2x0, va2o, vb0o);
    149  vacc3x0 = vfmaq_f32(vacc3x0, va3o, vb0o);
    150  vacc4x0 = vfmaq_f32(vacc4x0, va4o, vb0o);
    252  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    278  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    279  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
    280  vacc2x0 = vfmaq_f32(vacc2x0, va2x0o, vb0o);
    [all …]
D | 5x4c8-minmax-neonfma-shland.c  (all matches in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland())
    141  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    146  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    147  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    148  vacc2x0 = vfmaq_f32(vacc2x0, va2o, vb0o);
    149  vacc3x0 = vfmaq_f32(vacc3x0, va3o, vb0o);
    150  vacc4x0 = vfmaq_f32(vacc4x0, va4o, vb0o);
    252  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    278  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    279  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
    280  vacc2x0 = vfmaq_f32(vacc2x0, va2x0o, vb0o);
    [all …]
D | 3x4c8-minmax-neonfma-shland.c  (all matches in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland())
    107  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    112  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    113  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    114  vacc2x0 = vfmaq_f32(vacc2x0, va2o, vb0o);
    184  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    202  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    203  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
    204  vacc2x0 = vfmaq_f32(vacc2x0, va2x0o, vb0o);
D | 3x4c8-minmax-neonfma-zip.c  (all matches in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip())
    107  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    112  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    113  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    114  vacc2x0 = vfmaq_f32(vacc2x0, va2o, vb0o);
    184  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    202  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    203  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
    204  vacc2x0 = vfmaq_f32(vacc2x0, va2x0o, vb0o);
D | 2x4c8-minmax-neonfma-shland.c  (all matches in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland())
     90  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
     95  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
     96  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    150  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    164  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    165  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
D | 2x4c8-minmax-neonfma-zip.c  (all matches in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip())
     90  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
     95  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
     96  vacc1x0 = vfmaq_f32(vacc1x0, va1o, vb0o);
    150  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    164  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
    165  vacc1x0 = vfmaq_f32(vacc1x0, va1x0o, vb0o);
D | 1x4c8-minmax-neonfma-zip.c  (all matches in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip())
     73  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
     78  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    116  const float32x4_t vb0o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb0));  [local]
    126  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
D | 1x4c8-minmax-neonfma-shland.c  (all matches in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland())
     73  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
     78  vacc0x0 = vfmaq_f32(vacc0x0, va0o, vb0o);
    116  const float32x4_t vb0o = vreinterpretq_f32_u16(vandq_u16(vb0, vmask));  [local]
    126  vacc0x0 = vfmaq_f32(vacc0x0, va0x0o, vb0o);
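As a scalar reference for what either widening scheme produces per element: converting bf16 to float32 is just a 16-bit left shift of the raw bits. A minimal standalone check, illustrative and not part of XNNPACK:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // A bf16 value is the top 16 bits of a float32, so widening one element is a
    // bit shift followed by a bit-for-bit reinterpretation.
    static float bf16_to_f32(uint16_t bits) {
      const uint32_t u = (uint32_t) bits << 16;
      float f;
      memcpy(&f, &u, sizeof f);
      return f;
    }

    int main(void) {
      // 0x3F80 encodes 1.0f in bf16; 0xC048 encodes -3.125f.
      printf("%f %f\n", bf16_to_f32(0x3F80), bf16_to_f32(0xC048));
      return 0;
    }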