/external/XNNPACK/src/bf16-gemm/gen/
D | 3x4c8-minmax-neonbf16-bfdot.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot():
    176  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    183  vst1_bf16(c2, vout2x0123);
    195  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    199  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    204  vst1_lane_bf16(c2, vout2x0123, 0);
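All of the neonbf16 kernels in this directory share the same store pattern for row 2: the float32 accumulator vacc2x0123 is converted to bf16 with vcvt_bf16_f32 and written with vst1_bf16 (a full group of 4 output columns) or vst1_lane_bf16 (a single remaining column). Below is a minimal standalone sketch of that pattern, not XNNPACK code; the helper name is illustrative, and it assumes a compiler and target with the Arm BF16 extension (__ARM_FEATURE_BF16).

#include <arm_neon.h>

// Minimal sketch (not XNNPACK API): convert four f32 accumulator lanes to bf16
// and store either the whole group or one lane, as the neonbf16 kernels do.
static void store_row_bf16(bfloat16_t* c2, float32x4_t vacc2x0123, int full_group) {
  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  // f32 -> bf16 conversion
  if (full_group) {
    vst1_bf16(c2, vout2x0123);           // store all 4 output columns
  } else {
    vst1_lane_bf16(c2, vout2x0123, 0);   // store one remaining column
  }
}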
D | 3x4c8-minmax-neonbf16-bfmlal.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal():
    201  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    208  vst1_bf16(c2, vout2x0123);
    220  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    224  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    229  vst1_lane_bf16(c2, vout2x0123, 0);
D | 4x8c2-minmax-neonbf16-bfdot-lane-ld128.c | in xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128():
    270  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    284  vst1_bf16(c2, vout2x0123);
    301  vst1_bf16(c2, vout2x0123); c2 += 4;
    306  vout2x0123 = vout2x4567;
    312  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    317  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    323  vst1_lane_bf16(c2, vout2x0123, 0);
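The 8-column (…8c2…ld128) variants additionally handle a remainder of 4 or more columns by storing the low half and sliding the upper half (vout2x4567) down into vout2x0123, so the same sub-4-column tail handles what is left. A hedged sketch of that remainder store follows; vout2x0123 and vout2x4567 are assumed to hold output columns 0..3 and 4..7 of row 2, c2 the row pointer, and nc < 8 the remaining column count, as in the listing.

#include <arm_neon.h>
#include <stddef.h>

// Hedged sketch (not XNNPACK API) of the remainder store in the 8-column
// bf16 kernels; requires the Arm BF16 extension.
static void store_row2_tail(bfloat16_t* c2, size_t nc,
                            bfloat16x4_t vout2x0123, bfloat16x4_t vout2x4567) {
  if (nc & 4) {
    vst1_bf16(c2, vout2x0123); c2 += 4;  // columns 0..3
    vout2x0123 = vout2x4567;             // columns 4..7 become the new low half
  }
  if (nc & 2) {
    // Store two columns as one 32-bit lane, then rotate the leftovers into lane 0.
    vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    vout2x0123 = vreinterpret_bf16_u16(
        vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf16(vout2x0123), 2));
  }
  if (nc & 1) {
    vst1_lane_bf16(c2, vout2x0123, 0);   // last remaining column
  }
}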
D | 4x4c8-minmax-neonbf16-bfdot.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot():
    210  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    218  vst1_bf16(c2, vout2x0123);
    233  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    238  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    244  vst1_lane_bf16(c2, vout2x0123, 0);
D | 3x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland():
    258  uint16x4_t vout2x0123 = vshrn_n_u32(vreinterpretq_u32_f32(vacc2x0123), 16);  (local)
    265  vst1_u16(c2, vout2x0123);
    277  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    281  vout2x0123 = vext_u16(vout2x0123, vout2x0123, 2);
    286  vst1_lane_u16(c2, vout2x0123, 0);
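The neonfma variants (shland/zip) produce the same bf16 output without the BF16 extension: a bf16 value is the upper 16 bits of an IEEE float32, so vshrn_n_u32(vreinterpretq_u32_f32(…), 16) keeps just those bits in each lane, truncating rather than rounding to nearest. A small self-contained sketch of that conversion, with an illustrative helper name:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

// Hedged sketch: truncate four f32 lanes to their high 16 bits,
// i.e. the bf16 bit pattern, using only baseline NEON.
static uint16x4_t f32_to_bf16_bits_truncate(float32x4_t v) {
  return vshrn_n_u32(vreinterpretq_u32_f32(v), 16);
}

int main(void) {
  const float acc[4] = {1.0f, -2.5f, 3.14159f, 0.1f};
  uint16_t out[4];
  vst1_u16(out, f32_to_bf16_bits_truncate(vld1q_f32(acc)));
  for (int i = 0; i < 4; i++) {
    printf("%g -> 0x%04x\n", acc[i], (unsigned) out[i]);  // e.g. 1.0 -> 0x3f80
  }
  return 0;
}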
D | 3x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip():
    258  uint16x4_t vout2x0123 = vshrn_n_u32(vreinterpretq_u32_f32(vacc2x0123), 16);  (local)
    265  vst1_u16(c2, vout2x0123);
    277  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    281  vout2x0123 = vext_u16(vout2x0123, vout2x0123, 2);
    286  vst1_lane_u16(c2, vout2x0123, 0);
D | 4x4c8-minmax-neonbf16-bfmlal.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal():
    243  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    251  vst1_bf16(c2, vout2x0123);
    266  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    271  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    277  vst1_lane_bf16(c2, vout2x0123, 0);
D | 5x4c8-minmax-neonbf16-bfdot.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot():
    244  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    253  vst1_bf16(c2, vout2x0123);
    271  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    277  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    284  vst1_lane_bf16(c2, vout2x0123, 0);
D | 5x8c2-minmax-neonbf16-bfdot-lane-ld128.c | in xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128():
    312  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    328  vst1_bf16(c2, vout2x0123);
    349  vst1_bf16(c2, vout2x0123); c2 += 4;
    355  vout2x0123 = vout2x4567;
    362  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    368  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    375  vst1_lane_bf16(c2, vout2x0123, 0);
D | 6x8c2-minmax-neonbf16-bfdot-lane-ld128.c | in xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128():
    354  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    372  vst1_bf16(c2, vout2x0123);
    397  vst1_bf16(c2, vout2x0123); c2 += 4;
    404  vout2x0123 = vout2x4567;
    412  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    419  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    427  vst1_lane_bf16(c2, vout2x0123, 0);
D | 4x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip():
    310  uint16x4_t vout2x0123 = vshrn_n_u32(vreinterpretq_u32_f32(vacc2x0123), 16);  (local)
    318  vst1_u16(c2, vout2x0123);
    333  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    338  vout2x0123 = vext_u16(vout2x0123, vout2x0123, 2);
    344  vst1_lane_u16(c2, vout2x0123, 0);
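For fewer than 4 remaining columns, the neonfma kernels use the plain-NEON counterpart of the tail shown earlier: store two columns as a single 32-bit lane, rotate the vector with vext_u16 so the leftover columns reach lane 0, then store one 16-bit lane. A hedged sketch, assuming c2 and nc are the row pointer and remaining column count as in the fragments above:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

// Hedged sketch (not XNNPACK API) of the sub-4-column tail store in the
// neonfma kernels, operating on the uint16x4_t bf16 bit pattern.
static void store_tail_4cols(uint16_t* c2, size_t nc, uint16x4_t vout2x0123) {
  if (nc & 2) {
    // Columns 0..1 as one 32-bit lane, then rotate the remaining columns into lane 0.
    vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    vout2x0123 = vext_u16(vout2x0123, vout2x0123, 2);
  }
  if (nc & 1) {
    vst1_lane_u16(c2, vout2x0123, 0);  // last remaining column
  }
}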
D | 4x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland():
    310  uint16x4_t vout2x0123 = vshrn_n_u32(vreinterpretq_u32_f32(vacc2x0123), 16);  (local)
    318  vst1_u16(c2, vout2x0123);
    333  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    338  vout2x0123 = vext_u16(vout2x0123, vout2x0123, 2);
    344  vst1_lane_u16(c2, vout2x0123, 0);
D | 5x4c8-minmax-neonbf16-bfmlal.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal():
    285  bfloat16x4_t vout2x0123 = vcvt_bf16_f32(vacc2x0123);  (local)
    294  vst1_bf16(c2, vout2x0123);
    312  vst1_lane_u32((void*) c2, vreinterpret_u32_bf16(vout2x0123), 0); c2 += 2;
    318  …vout2x0123 = vreinterpret_bf16_u16(vext_u16(vreinterpret_u16_bf16(vout2x0123), vreinterpret_u16_bf…
    325  vst1_lane_bf16(c2, vout2x0123, 0);
D | 5x4c8-minmax-neonfma-zip.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip():
    362  uint16x4_t vout2x0123 = vshrn_n_u32(vreinterpretq_u32_f32(vacc2x0123), 16);  (local)
    371  vst1_u16(c2, vout2x0123);
    389  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    395  vout2x0123 = vext_u16(vout2x0123, vout2x0123, 2);
    402  vst1_lane_u16(c2, vout2x0123, 0);
D | 5x4c8-minmax-neonfma-shland.c | in xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland():
    362  uint16x4_t vout2x0123 = vshrn_n_u32(vreinterpretq_u32_f32(vacc2x0123), 16);  (local)
    371  vst1_u16(c2, vout2x0123);
    389  vst1_lane_u32((void*) c2, vreinterpret_u32_u16(vout2x0123), 0); c2 += 2;
    395  vout2x0123 = vext_u16(vout2x0123, vout2x0123, 2);
    402  vst1_lane_u16(c2, vout2x0123, 0);