/external/XNNPACK/src/bf16-gemm/gen/ |
D | 4x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip():
     99  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
    110  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
    111  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
    112  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    113  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
    164  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
    209  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    210  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
    211  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    212  vacc3x2 = vfmaq_f32(vacc3x2, va3x2e, vb2e);
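Every zip-variant match expands packed bfloat16 to float32 the same way: vzip1q_u16(vzero, vb2) interleaves a zero halfword below each bf16 halfword, so on a little-endian target each bf16 value lands in the upper 16 bits of a 32-bit lane, which is exactly the binary32 bit pattern of the same number. A minimal standalone sketch of the trick, assuming little-endian AArch64; the helper name bf16x8_to_f32x4x2_zip is hypothetical, and the vzip2q half for the odd ("o") registers is inferred from the vb2e/vb2o naming, not shown in these matches:

    #include <arm_neon.h>

    // Widen 8 packed bfloat16 values (carried in a uint16x8_t) to two
    // float32x4_t vectors. vzip1q/vzip2q interleave a zero below each
    // bf16 halfword, placing the bf16 bits in the top half of every
    // 32-bit lane -- the binary32 encoding of the same value.
    static inline void bf16x8_to_f32x4x2_zip(
        uint16x8_t vb, float32x4_t* lo, float32x4_t* hi)
    {
      const uint16x8_t vzero = vmovq_n_u16(0);
      *lo = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb));  // elements 0..3
      *hi = vreinterpretq_f32_u16(vzip2q_u16(vzero, vb));  // elements 4..7
    }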
|
D | 4x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland():
     99  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
    110  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
    111  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
    112  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    113  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
    164  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
    209  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    210  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
    211  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    212  vacc3x2 = vfmaq_f32(vacc3x2, va3x2e, vb2e);
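The shland variants reach the same layout without zip instructions: the uint16x8_t is reinterpreted as uint32x4_t and shifted left by 16, which promotes the even-indexed bf16 halfwords into the high half of each lane. The kernel name ("shl" + "and") suggests the companion odd-element expansion masks with 0xFFFF0000 instead, though only the shift appears in these vb2e matches. A hedged sketch under that assumption; the helper name bf16x8_to_f32x4x2_shland is hypothetical:

    #include <arm_neon.h>

    // "shl + and": view the 8 packed bf16 values as four 32-bit lanes,
    // each holding two halfwords. Shifting left by 16 moves the low
    // (even-indexed) halfword into the top half of the lane; masking
    // with 0xFFFF0000 keeps the high (odd-indexed) halfword in place.
    // Either way the lane then holds a valid binary32 bit pattern.
    static inline void bf16x8_to_f32x4x2_shland(
        uint16x8_t vb, float32x4_t* even, float32x4_t* odd)
    {
      const uint32x4_t vb32 = vreinterpretq_u32_u16(vb);
      const uint32x4_t vmask = vmovq_n_u32(0xFFFF0000u);
      *even = vreinterpretq_f32_u32(vshlq_n_u32(vb32, 16));   // elements 0,2,4,6
      *odd  = vreinterpretq_f32_u32(vandq_u32(vb32, vmask));  // elements 1,3,5,7
    }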
|
D | 5x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip():
    111  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
    124  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
    125  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
    126  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    127  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
    128  vacc4x2 = vfmaq_f32(vacc4x2, va4e, vb2e);
    186  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
    241  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    242  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
    243  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    [all …]
|
D | 5x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland():
    111  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
    124  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
    125  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
    126  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    127  vacc3x2 = vfmaq_f32(vacc3x2, va3e, vb2e);
    128  vacc4x2 = vfmaq_f32(vacc4x2, va4e, vb2e);
    186  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
    241  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    242  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
    243  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
    [all …]
|
D | 3x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland():
     87  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
     96  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
     97  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
     98  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    142  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
    177  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    178  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
    179  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
|
D | 3x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip():
     87  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
     96  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
     97  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
     98  vacc2x2 = vfmaq_f32(vacc2x2, va2e, vb2e);
    142  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
    177  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    178  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
    179  vacc2x2 = vfmaq_f32(vacc2x2, va2x2e, vb2e);
|
D | 2x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland():
     75  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
     82  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
     83  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
    120  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
    145  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    146  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
|
D | 2x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip():
     75  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
     82  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
     83  vacc1x2 = vfmaq_f32(vacc1x2, va1e, vb2e);
    120  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
    145  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
    146  vacc1x2 = vfmaq_f32(vacc1x2, va1x2e, vb2e);
|
D | 1x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip():
     63  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
     68  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
     98  const float32x4_t vb2e = vreinterpretq_f32_u16(vzip1q_u16(vzero, vb2));  (local)
    113  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
|
D | 1x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland():
     63  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
     68  vacc0x2 = vfmaq_f32(vacc0x2, va0e, vb2e);
     98  const float32x4_t vb2e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(vb2), 16));  (local)
    113  vacc0x2 = vfmaq_f32(vacc0x2, va0x2e, vb2e);
|