/external/XNNPACK/src/bf16-gemm/gen/

D  4x4c8-minmax-neonfma-zip.c  (all references in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip())
    126  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    137  vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    138  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    139  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    140  vacc3x2 = vfmaq_f32(vacc3x2, va3o, vb2o);
    220  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    248  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    249  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);
    250  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    251  vacc3x2 = vfmaq_f32(vacc3x2, va3x2o, vb2o);

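In the *-zip kernels above, vb2o is the upper half of a packed bfloat16 B vector widened to float32: interleaving a zero vector with the uint16 data places each bf16 value in the high 16 bits of a 32-bit lane, which is exactly its float32 bit pattern. A minimal standalone sketch of that trick (the helper name and test values are illustrative, not part of XNNPACK):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Widen the upper four bf16 lanes of a uint16x8_t to float32, the same way
// the *-zip kernels build vb2o: zipping with zero puts each bf16 value into
// the high 16 bits of a 32-bit lane, i.e. its float32 representation.
static float32x4_t widen_bf16_high_zip(uint16x8_t vb) {
  const uint16x8_t vzero = vmovq_n_u16(0);
  return vreinterpretq_f32_u16(vzip2q_u16(vzero, vb));
}

int main(void) {
  // bf16 bit patterns: the last four lanes are 1.0f, 2.0f, -3.0f, 0.5f.
  const uint16_t bf16[8] = {0, 0, 0, 0, 0x3F80, 0x4000, 0xC040, 0x3F00};
  float out[4];
  vst1q_f32(out, widen_bf16_high_zip(vld1q_u16(bf16)));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // expect 1 2 -3 0.5
  return 0;
}
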
D  4x4c8-minmax-neonfma-shland.c  (all references in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland())
    126  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    137  vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    138  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    139  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    140  vacc3x2 = vfmaq_f32(vacc3x2, va3o, vb2o);
    220  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    248  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    249  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);
    250  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    251  vacc3x2 = vfmaq_f32(vacc3x2, va3x2o, vb2o);

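The *-shland kernels reach the same float32 widening without a zero vector: a per-lane mask keeps the bf16 values that already sit in the upper 16 bits of each 32-bit lane, and the other values are recovered with a 16-bit left shift (the "shl" and "and" in the name). A small sketch, assuming vmask holds 0xFFFF0000 in every 32-bit lane (its definition is not shown in this listing):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Odd-indexed bf16 elements already occupy the high 16 bits of each 32-bit
// lane; masking off the low 16 bits leaves their float32 values in place.
// vmask = 0xFFFF0000 per lane is an assumption based on how vb2o is used.
static float32x4_t widen_bf16_odd_mask(uint16x8_t vb) {
  const uint16x8_t vmask =
      vreinterpretq_u16_u32(vmovq_n_u32(UINT32_C(0xFFFF0000)));
  return vreinterpretq_f32_u16(vandq_u16(vb, vmask));
}

// Even-indexed elements are shifted up into the high half instead
// (the "shl" counterpart); shown for contrast, not used for vb2o.
static float32x4_t widen_bf16_even_shift(uint16x8_t vb) {
  return vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb), 16));
}

int main(void) {
  // Packed bf16 values 1..8: even indices hold 1,3,5,7; odd indices 2,4,6,8.
  const uint16_t bf16[8] = {0x3F80, 0x4000, 0x4040, 0x4080,
                            0x40A0, 0x40C0, 0x40E0, 0x4100};
  const uint16x8_t vb = vld1q_u16(bf16);
  float odd[4], even[4];
  vst1q_f32(odd, widen_bf16_odd_mask(vb));
  vst1q_f32(even, widen_bf16_even_shift(vb));
  printf("odd:  %g %g %g %g\n", odd[0], odd[1], odd[2], odd[3]);     // 2 4 6 8
  printf("even: %g %g %g %g\n", even[0], even[1], even[2], even[3]); // 1 3 5 7
  return 0;
}
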
D  5x4c8-minmax-neonfma-zip.c  (all references in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip())
    143  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    156  vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    157  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    158  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    159  vacc3x2 = vfmaq_f32(vacc3x2, va3o, vb2o);
    160  vacc4x2 = vfmaq_f32(vacc4x2, va4o, vb2o);
    254  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    288  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    289  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);
    290  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    [all …]

D  5x4c8-minmax-neonfma-shland.c  (all references in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland())
    143  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    156  vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    157  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    158  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    159  vacc3x2 = vfmaq_f32(vacc3x2, va3o, vb2o);
    160  vacc4x2 = vfmaq_f32(vacc4x2, va4o, vb2o);
    254  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    288  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    289  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);
    290  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);
    [all …]

D  3x4c8-minmax-neonfma-shland.c  (all references in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland())
    109  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    118  vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    119  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    120  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    186  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    208  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    209  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);
    210  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);

D  3x4c8-minmax-neonfma-zip.c  (all references in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip())
    109  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    118  vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    119  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    120  vacc2x2 = vfmaq_f32(vacc2x2, va2o, vb2o);
    186  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    208  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    209  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);
    210  vacc2x2 = vfmaq_f32(vacc2x2, va2x2o, vb2o);

D  2x4c8-minmax-neonfma-shland.c  (all references in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland())
    92   const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    99   vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    100  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    152  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    168  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    169  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);

D  2x4c8-minmax-neonfma-zip.c  (all references in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip())
    92   const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    99   vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    100  vacc1x2 = vfmaq_f32(vacc1x2, va1o, vb2o);
    152  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    168  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);
    169  vacc1x2 = vfmaq_f32(vacc1x2, va1x2o, vb2o);

D  1x4c8-minmax-neonfma-zip.c  (all references in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip())
    75   const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    80   vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    118  const float32x4_t vb2o = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb2));  [local]
    128  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);

D  1x4c8-minmax-neonfma-shland.c  (all references in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland())
    75   const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    80   vacc0x2 = vfmaq_f32(vacc0x2, va0o, vb2o);
    118  const float32x4_t vb2o = vreinterpretq_f32_u16(vandq_u16(vb2, vmask));  [local]
    128  vacc0x2 = vfmaq_f32(vacc0x2, va0x2o, vb2o);

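Either way, the widened halves feed vfmaq_f32 chains like the vacc*x2 updates listed above: each output column keeps a 4-lane partial accumulator over the packed channel chunk. An illustrative 8-element bf16 dot product built from the same pieces (the helper name, test data, and the final vaddvq_f32 reduction are illustrative only; the real kernels combine their per-column accumulators differently):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Dot product of eight bf16 values: widen both halves of A and B to float32,
// fold them into a 4-lane partial accumulator with vfmaq_f32 (as vb2o is used
// in the kernels), then reduce horizontally.
static float bf16_dot8(const uint16_t *a, const uint16_t *b) {
  const uint16x8_t vzero = vmovq_n_u16(0);
  const uint16x8_t va = vld1q_u16(a);
  const uint16x8_t vb = vld1q_u16(b);
  const float32x4_t vae = vreinterpretq_f32_u16(vzip1q_u16(vzero, va));
  const float32x4_t vao = vreinterpretq_f32_u16(vzip2q_u16(vzero, va));
  const float32x4_t vbe = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb));
  const float32x4_t vbo = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb));
  float32x4_t vacc = vmovq_n_f32(0.0f);
  vacc = vfmaq_f32(vacc, vae, vbe);  // lower four products
  vacc = vfmaq_f32(vacc, vao, vbo);  // upper four products
  return vaddvq_f32(vacc);           // horizontal sum of the partials
}

int main(void) {
  // a = {1..8}, b = all 1.0 in bf16; the dot product should be 36.
  const uint16_t a[8] = {0x3F80, 0x4000, 0x4040, 0x4080,
                         0x40A0, 0x40C0, 0x40E0, 0x4100};
  const uint16_t b[8] = {0x3F80, 0x3F80, 0x3F80, 0x3F80,
                         0x3F80, 0x3F80, 0x3F80, 0x3F80};
  printf("%g\n", bf16_dot8(a, b));
  return 0;
}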