
Searched refs:vb2e (Results 1 – 10 of 10) sorted by relevance

/external/XNNPACK/src/bf16-gemm/gen/
4x4c8-minmax-neonfma-zip.c (in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip):
   99  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
  110  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
  111  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
  112  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
  113  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
  164  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
  209  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  210  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
  211  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
  212  vacc3x2 = vfmaq_f32(vacc3x2, va3x2e, vb2e);
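Note: the *-zip kernels widen bf16 to f32 by interleaving a zero vector below each bf16 halfword with vzip1q_u16, so every bf16 value lands in the top 16 bits of a 32-bit lane, which is exactly the bit pattern of the corresponding f32 (on little-endian AArch64). A minimal self-contained sketch of that trick; the helper name expand_bf16_zip_lo and the test values are illustrative, not part of XNNPACK:

#include <arm_neon.h>
#include <stdio.h>

// Widen the low 4 bf16 values of a uint16x8_t to f32, zip-style:
// interleaving zeros below each halfword puts the bf16 bits in the
// top 16 bits of each 32-bit lane, i.e. the f32 encoding of the value.
static float32x4_t expand_bf16_zip_lo(uint16x8_t vb) {
  const uint16x8_t vzero = vdupq_n_u16(0);
  return vreinterpretq_f32_u16(vzip1q_u16(vzero, vb));  // lanes: {0,b0,0,b1,0,b2,0,b3}
}

int main(void) {
  // bf16 bit patterns of {1.0f, 2.0f, -0.5f, 3.0f}: the top 16 bits of each f32.
  const uint16_t bf16[8] = {0x3F80, 0x4000, 0xBF00, 0x4040, 0, 0, 0, 0};
  float out[4];
  vst1q_f32(out, expand_bf16_zip_lo(vld1q_u16(bf16)));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 -0.5 3
  return 0;
}

The upper four values would come from vzip2q_u16 in the same way, which presumably is how the kernels produce the odd-numbered vb*o vectors alongside the vb*e ones shown here.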
4x4c8-minmax-neonfma-shland.c (in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland):
   99  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
  110  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
  111  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
  112  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
  113  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
  164  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
  209  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  210  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
  211  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
  212  vacc3x2 = vfmaq_f32(vacc3x2, va3x2e, vb2e);
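Note: the *-shland kernels get the same widening without a zero register. Reinterpreting the 8 bf16 halfwords as four 32-bit lanes, a left shift by 16 promotes the even-indexed bf16 values to f32, and (judging by the kernel name, shift + and) an AND with 0xFFFF0000 keeps the odd-indexed ones, which already sit in the high halfwords. A hedged, self-contained sketch with illustrative helper names, assuming little-endian AArch64:

#include <arm_neon.h>
#include <stdio.h>

// Even-indexed bf16 lanes (0,2,4,6): shift each 32-bit pair left by 16
// so the low halfword becomes the f32's high half, low bits zeroed.
static float32x4_t expand_bf16_even(uint16x8_t vb) {
  return vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb), 16));
}

// Odd-indexed bf16 lanes (1,3,5,7): already in the high halfword of each
// 32-bit pair; masking off the low half yields valid f32 bit patterns.
static float32x4_t expand_bf16_odd(uint16x8_t vb) {
  return vreinterpretq_f32_u32(
      vandq_u32(vreinterpretq_u32_u16(vb), vdupq_n_u32(0xFFFF0000u)));
}

int main(void) {
  // bf16 bit patterns of {1.0, 2.0, 4.0, 8.0, 0.25, 0.5, -1.0, -2.0}.
  const uint16_t bf16[8] = {0x3F80, 0x4000, 0x4080, 0x4100,
                            0x3E80, 0x3F00, 0xBF80, 0xC000};
  uint16x8_t vb = vld1q_u16(bf16);
  float e[4], o[4];
  vst1q_f32(e, expand_bf16_even(vb));
  vst1q_f32(o, expand_bf16_odd(vb));
  printf("even: %g %g %g %g\n", e[0], e[1], e[2], e[3]);  // 1 4 0.25 -1
  printf("odd:  %g %g %g %g\n", o[0], o[1], o[2], o[3]);  // 2 8 0.5 -2
  return 0;
}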
5x4c8-minmax-neonfma-zip.c (in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip):
  111  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
  124  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
  125  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
  126  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
  127  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
  128  vacc4x2 = vfmaq_f32(vacc4x2, va4e, vb2e);
  186  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
  241  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  242  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
  243  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
  [all …]
5x4c8-minmax-neonfma-shland.c (in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland):
  111  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
  124  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
  125  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
  126  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
  127  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
  128  vacc4x2 = vfmaq_f32(vacc4x2, va4e, vb2e);
  186  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
  241  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  242  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
  243  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
  [all …]
3x4c8-minmax-neonfma-shland.c (in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland):
   87  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
   96  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
   97  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
   98  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
  142  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
  177  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  178  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
  179  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
3x4c8-minmax-neonfma-zip.c (in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip):
   87  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
   96  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
   97  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
   98  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
  142  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
  177  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  178  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
  179  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
2x4c8-minmax-neonfma-shland.c (in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland):
   75  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
   82  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
   83  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
  120  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
  145  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  146  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
2x4c8-minmax-neonfma-zip.c (in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip):
   75  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
   82  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
   83  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
  120  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
  145  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
  146  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
1x4c8-minmax-neonfma-zip.c (in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip):
   63  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
   68  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
   98  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
  113  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
1x4c8-minmax-neonfma-shland.c (in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland):
   63  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
   68  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
   98  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
  113  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
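Note: every hit above is one fused multiply-add of expanded even (or odd) bf16 lanes into an f32 accumulator. A simplified, self-contained sketch of a single 8-element bf16 dot product in that style, shland-flavored; this is not the actual XNNPACK kernel, which tiles the same step across up to 5 rows and 4 columns and reduces four accumulators per output:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  // bf16 bit patterns: a = {1,2,3,4,5,6,7,8}, b = all ones.
  const uint16_t a[8] = {0x3F80, 0x4000, 0x4040, 0x4080,
                         0x40A0, 0x40C0, 0x40E0, 0x4100};
  const uint16_t b[8] = {0x3F80, 0x3F80, 0x3F80, 0x3F80,
                         0x3F80, 0x3F80, 0x3F80, 0x3F80};
  uint16x8_t va = vld1q_u16(a), vb = vld1q_u16(b);

  // Expand even lanes by shift, odd lanes by mask (assuming little-endian).
  uint32x4_t mask = vdupq_n_u32(0xFFFF0000u);
  float32x4_t vae = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(va), 16));
  float32x4_t vao = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_u16(va), mask));
  float32x4_t vbe = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb), 16));
  float32x4_t vbo = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_u16(vb), mask));

  // Two FMAs per 8 bf16 elements, exactly the accumulation shape above.
  float32x4_t vacc = vdupq_n_f32(0.0f);
  vacc = vfmaq_f32(vacc, vae, vbe);
  vacc = vfmaq_f32(vacc, vao, vbo);
  printf("dot = %g\n", vaddvq_f32(vacc));  // horizontal sum: 1+2+...+8 = 36
  return 0;
}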