Home
last modified time | relevance | path

Searched refs:vk34 (Results 1 – 25 of 92) sorted by relevance

1234

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
349 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
350 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
351 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
352 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
353 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
603 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
604 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
605 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
606 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
349 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
350 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
351 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
352 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
353 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
603 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
604 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
605 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
606 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D5x5p2-minmax-sse-5x4.c57 const __m128 vk34 = _mm_load1_ps(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
354 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
355 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
356 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
357 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
358 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
607 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
608 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
609 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
610 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
310 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
311 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
312 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
313 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
531 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
532 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
533 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
534 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
731 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
310 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
311 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
312 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
313 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
531 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
532 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
533 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
534 vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
731 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
310 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
311 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
312 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
313 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
527 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
528 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
529 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
530 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
723 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
310 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
311 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
312 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
313 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
527 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
528 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
529 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
530 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
723 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local
271 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
272 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
273 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
454 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
455 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
456 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
619 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
620 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
621 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
271 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
272 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
273 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
451 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
452 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
453 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
613 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
614 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
615 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
271 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
272 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
273 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
454 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
455 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
456 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
619 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
620 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
621 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
271 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
272 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
273 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
451 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
452 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
453 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
613 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
614 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
615 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
D5x5p2-minmax-sse-4x4.c57 const __m128 vk34 = _mm_load1_ps(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
312 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
313 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
314 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
315 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
526 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
527 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
528 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
529 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
720 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5p2-minmax-sse-4x4-acc2.c57 const __m128 vk34 = _mm_load1_ps(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
312 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
313 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
314 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
315 vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
530 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
531 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
532 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
533 vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi6x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
728 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
[all …]
D5x5p2-minmax-sse-3x4-acc2.c57 const __m128 vk34 = _mm_load1_ps(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
270 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
271 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
272 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
448 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
449 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
450 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
609 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
610 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
611 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
D5x5p2-minmax-sse-3x4.c57 const __m128 vk34 = _mm_load1_ps(weights + 20); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
270 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
271 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
272 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
445 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
446 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
447 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
603 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
604 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
605 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
D5x5p2-minmax-scalar-3x1.c54 const float vk34 = weights[20]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local
238 vo0p0 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
239 vo1p0 += vi4x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
240 vo2p0 += vi5x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local
232 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
233 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
375 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
376 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
503 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
504 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local
232 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
233 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
377 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
378 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
507 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
508 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local
232 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
233 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
377 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
378 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
507 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
508 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() local
232 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
233 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
379 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
380 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
511 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
512 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3() local
232 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
233 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
379 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
380 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
511 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
512 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c65 const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local
232 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
233 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
375 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
376 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
503 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
504 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
D5x5p2-minmax-scalar-2x1.c54 const float vk34 = weights[20]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local
200 vo0p0 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
201 vo1p0 += vi4x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
D5x5s2p2-minmax-scalar-3x1.c55 const float vk34 = weights[20]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local
271 vo0p0 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
272 vo1p0 += vi5x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
273 vo2p0 += vi7x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
D5x5s2p2-minmax-scalar-3x1-acc2.c55 const float vk34 = weights[20]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local
271 vo0p1 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
272 vo1p1 += vi5x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
273 vo2p1 += vi7x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()

1234