Occurrences of the local variable vb3o under /external/XNNPACK/src/bf16-gemm/gen/:

D | 4x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip():
    127  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    141  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    142  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    143  vacc2x3 = vfmaq_f32(vacc2x3, va2o, vb3o);
    144  vacc3x3 = vfmaq_f32(vacc3x3, va3o, vb3o);
    221  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    252  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    253  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);
    254  vacc2x3 = vfmaq_f32(vacc2x3, va2x3o, vb3o);
    255  vacc3x3 = vfmaq_f32(vacc3x3, va3x3o, vb3o);

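All of the *-zip kernels build vb3o the same way: a bf16 number is exactly the upper 16 bits of the corresponding f32, so interleaving a zero vector below the 16-bit elements reconstructs full 32-bit floats. A minimal sketch of the trick, assuming little-endian AArch64 with FMA; the helper name bf16x8_to_f32_zip is illustrative, not an XNNPACK symbol:

    #include <arm_neon.h>

    /* Widen 8 raw bf16 values to two float32x4_t vectors. Interleaving
       zeros below each 16-bit element turns every 32-bit lane into
       <bf16 bits : 16 zero bits>, i.e. the original f32 value with its
       low mantissa bits cleared. */
    static inline void bf16x8_to_f32_zip(uint16x8_t vb,
                                         float32x4_t* vlo, float32x4_t* vhi) {
      const uint16x8_t vzero = vdupq_n_u16(0);
      *vlo = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb));  /* elements 0..3 */
      *vhi = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb));  /* elements 4..7 */
    }

The vzip2q_u16(vzero, vb3) seen at lines 127 and 221 above is the *vhi half of this pairing.
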
D | 4x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland():
    127  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    141  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    142  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    143  vacc2x3 = vfmaq_f32(vacc2x3, va2o, vb3o);
    144  vacc3x3 = vfmaq_f32(vacc3x3, va3o, vb3o);
    221  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    252  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    253  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);
    254  vacc2x3 = vfmaq_f32(vacc2x3, va2x3o, vb3o);
    255  vacc3x3 = vfmaq_f32(vacc3x3, va3x3o, vb3o);

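The *-shland kernels ("sh"ift plus "and") get the same widening without zips. Viewed as four 32-bit lanes, a uint16x8_t of bf16 values holds one even/odd element pair per lane: a left shift by 16 widens the even elements, and the vandq_u16(vb3, vmask) seen above widens the odd ones by clearing the low half of each lane. A sketch under the same assumptions; only the vandq line is confirmed by this listing, and the 0xFFFF0000 mask value is inferred from the kernel-name convention:

    /* "shland": even-indexed bf16 elements sit in the low half of each
       32-bit lane, so shift them up by 16; odd-indexed elements already
       sit in the high half, so AND the low half to zero. Mask value is
       an assumption, not taken from this listing. */
    static inline void bf16x8_to_f32_shland(uint16x8_t vb,
                                            float32x4_t* ve, float32x4_t* vo) {
      const uint16x8_t vmask = vreinterpretq_u16_u32(vdupq_n_u32(0xFFFF0000u));
      *ve = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb), 16));
      *vo = vreinterpretq_f32_u16(vandq_u16(vb, vmask));
    }

Note the partition differs from the zip variant (even/odd position rather than low/high half); both work because the GEMM reduces over the whole k dimension, so element order within the accumulator lanes cancels out.
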
D | 5x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip():
    144  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    161  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    162  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    163  vacc2x3 = vfmaq_f32(vacc2x3, va2o, vb3o);
    164  vacc3x3 = vfmaq_f32(vacc3x3, va3o, vb3o);
    165  vacc4x3 = vfmaq_f32(vacc4x3, va4o, vb3o);
    255  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    293  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    294  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);
    295  vacc2x3 = vfmaq_f32(vacc2x3, va2x3o, vb3o);
    [all …]

D | 5x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland():
    144  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    161  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    162  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    163  vacc2x3 = vfmaq_f32(vacc2x3, va2o, vb3o);
    164  vacc3x3 = vfmaq_f32(vacc3x3, va3o, vb3o);
    165  vacc4x3 = vfmaq_f32(vacc4x3, va4o, vb3o);
    255  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    293  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    294  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);
    295  vacc2x3 = vfmaq_f32(vacc2x3, va2x3o, vb3o);
    [all …]

D | 3x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland():
    110  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    121  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    122  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    123  vacc2x3 = vfmaq_f32(vacc2x3, va2o, vb3o);
    187  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    211  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    212  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);
    213  vacc2x3 = vfmaq_f32(vacc2x3, va2x3o, vb3o);

D | 3x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip():
    110  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    121  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    122  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    123  vacc2x3 = vfmaq_f32(vacc2x3, va2o, vb3o);
    187  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    211  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    212  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);
    213  vacc2x3 = vfmaq_f32(vacc2x3, va2x3o, vb3o);

D | 2x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland():
    93   const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    101  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    102  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    153  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    170  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    171  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);

D | 2x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip():
    93   const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    101  vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    102  vacc1x3 = vfmaq_f32(vacc1x3, va1o, vb3o);
    153  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    170  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
    171  vacc1x3 = vfmaq_f32(vacc1x3, va1x3o, vb3o);

D | 1x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip():
    76   const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    81   vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    119  const float32x4_t vb3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb3));  (local)
    129  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);

D | 1x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland():
    76   const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    81   vacc0x3 = vfmaq_f32(vacc0x3, va0o, vb3o);
    119  const float32x4_t vb3o = vreinterpretq_f32_u16(vandq_u16(vb3, vmask));  (local)
    129  vacc0x3 = vfmaq_f32(vacc0x3, va0x3o, vb3o);
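
Every vacc line above follows the same accumulate pattern: f32 partial sums per output element, updated with vfmaq_f32 inside the k loop and reduced only once at the end (the va0x3o-style operands come from the masked k-remainder path). A minimal end-to-end sketch that ties the pieces together, using the hypothetical zip helper above; bf16_dot is likewise an illustrative name, and real kernels additionally handle remainders, multiple rows and columns, and min/max clamping:

    #include <stddef.h>
    #include <arm_neon.h>

    /* Dot product of n bf16 values (n a multiple of 8), with bf16 stored
       as raw uint16_t. Keeps two vector accumulators live and does a
       single horizontal reduction at the end, like the kernels above. */
    float bf16_dot(const uint16_t* a, const uint16_t* b, size_t n) {
      float32x4_t vacc_lo = vdupq_n_f32(0.0f);
      float32x4_t vacc_hi = vdupq_n_f32(0.0f);
      for (size_t i = 0; i < n; i += 8) {
        float32x4_t va_lo, va_hi, vb_lo, vb_hi;
        bf16x8_to_f32_zip(vld1q_u16(a + i), &va_lo, &va_hi);
        bf16x8_to_f32_zip(vld1q_u16(b + i), &vb_lo, &vb_hi);
        vacc_lo = vfmaq_f32(vacc_lo, va_lo, vb_lo);  /* acc += a * b */
        vacc_hi = vfmaq_f32(vacc_hi, va_hi, vb_hi);
      }
      return vaddvq_f32(vaddq_f32(vacc_lo, vacc_hi));  /* horizontal sum */
    }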