/external/XNNPACK/src/bf16-gemm/gen/ |
D | 3x4c8-minmax-neonbf16-bfmlal.c | 154 … const bfloat16x8_t va2x3 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal() local 155 vacc2x3 = vbfmlalbq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal() 156 vacc2x3 = vbfmlaltq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal()
|
D | 3x4c8-minmax-neonfma-shland.c | 156 const uint16x8_t va2x3 = vbicq_u16(va2, vm3); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland() local 169 … const float32x4_t va2x3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(va2x3), 16)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland() 200 const float32x4_t va2x3o = vreinterpretq_f32_u16(vandq_u16(va2x3, vmask)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland()
|
D | 3x4c8-minmax-neonfma-zip.c | 156 const uint16x8_t va2x3 = vbicq_u16(va2, vm3); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip() local 169 const float32x4_t va2x3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, va2x3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip() 200 const float32x4_t va2x3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, va2x3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip()
|
D | 4x4c8-minmax-neonbf16-bfmlal.c | 183 … const bfloat16x8_t va2x3 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal() local 184 vacc2x3 = vbfmlalbq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal() 185 vacc2x3 = vbfmlaltq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal()
|
D | 3x4c8-minmax-neonbf16-bfdot.c | 130 … const bfloat16x8_t va2x3 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm3)); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot() local 131 vacc2x3 = vbfdotq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot()
|
D | 4x4c8-minmax-neonfma-zip.c | 181 const uint16x8_t va2x3 = vbicq_u16(va2, vm3); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip() local 198 const float32x4_t va2x3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, va2x3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip() 237 const float32x4_t va2x3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, va2x3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip()
|
D | 4x4c8-minmax-neonfma-shland.c | 181 const uint16x8_t va2x3 = vbicq_u16(va2, vm3); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland() local 198 … const float32x4_t va2x3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(va2x3), 16)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland() 237 const float32x4_t va2x3o = vreinterpretq_f32_u16(vandq_u16(va2x3, vmask)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland()
|
D | 5x4c8-minmax-neonbf16-bfmlal.c | 212 … const bfloat16x8_t va2x3 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal() local 213 vacc2x3 = vbfmlalbq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal() 214 vacc2x3 = vbfmlaltq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal()
|
D | 4x4c8-minmax-neonbf16-bfdot.c | 152 … const bfloat16x8_t va2x3 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm3)); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot() local 153 vacc2x3 = vbfdotq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot()
|
D | 5x4c8-minmax-neonfma-zip.c | 206 const uint16x8_t va2x3 = vbicq_u16(va2, vm3); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip() local 227 const float32x4_t va2x3e = vreinterpretq_f32_u16(vzip1q_u16(vzero, va2x3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip() 274 const float32x4_t va2x3o = vreinterpretq_f32_u16(vzip2q_u16(vzero, va2x3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip()
|
D | 5x4c8-minmax-neonfma-shland.c | 206 const uint16x8_t va2x3 = vbicq_u16(va2, vm3); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland() local 227 … const float32x4_t va2x3e = vreinterpretq_f32_u32(vshlq_n_u32(vreinterpretq_u32_u16(va2x3), 16)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland() 274 const float32x4_t va2x3o = vreinterpretq_f32_u16(vandq_u16(va2x3, vmask)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland()
|
D | 5x4c8-minmax-neonbf16-bfdot.c | 174 … const bfloat16x8_t va2x3 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm3)); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot() local 175 vacc2x3 = vbfdotq_f32(vacc2x3, va2x3, vb3); in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot()
|