Home
last modified time | relevance | path

Searched refs:vk24 (Results 1 – 25 of 145) sorted by relevance

123456

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
343 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
344 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
345 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
346 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
347 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
597 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
598 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
599 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
600 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
343 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
344 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
345 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
346 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
347 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
597 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
598 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
599 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
600 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D5x5p2-minmax-sse-5x4.c52 const __m128 vk24 = _mm_load1_ps(weights + 15); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
349 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
350 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
351 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
352 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
353 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
602 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
603 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
604 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
605 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
305 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
306 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
307 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
308 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
526 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
527 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
528 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
529 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
726 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
305 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
306 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
307 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
308 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
526 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
527 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
528 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
529 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
726 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
305 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
306 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
307 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
308 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
522 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
523 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
524 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
525 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
718 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
305 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
306 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
307 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
308 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
522 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
523 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
524 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
525 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
718 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local
267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
450 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
451 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
452 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
615 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
616 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
617 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
447 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
448 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
449 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
609 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
610 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
611 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
450 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
451 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
452 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
615 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
616 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
617 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
447 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
448 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
449 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
609 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
610 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
611 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
D5x5p2-minmax-sse-4x4.c52 const __m128 vk24 = _mm_load1_ps(weights + 15); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
308 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
309 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
310 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
311 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
522 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
523 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
524 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
525 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
716 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5p2-minmax-sse-4x4-acc2.c52 const __m128 vk24 = _mm_load1_ps(weights + 15); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
308 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
309 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
310 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
311 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
526 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
527 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
528 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
529 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
724 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
[all …]
D5x5p2-minmax-sse-3x4-acc2.c52 const __m128 vk24 = _mm_load1_ps(weights + 15); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
267 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
268 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
269 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
445 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
446 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
447 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
606 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
607 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
608 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
D5x5p2-minmax-sse-3x4.c52 const __m128 vk24 = _mm_load1_ps(weights + 15); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
267 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
268 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
269 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
442 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
443 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
444 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
600 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
601 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
602 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
D5x5p2-minmax-scalar-3x1.c49 const float vk24 = weights[15]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local
235 vo0p0 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
236 vo1p0 += vi3x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
237 vo2p0 += vi4x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local
229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
230 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
372 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
373 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
500 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
501 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local
229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
230 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
374 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
375 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
504 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
505 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local
229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
230 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
374 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
375 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
504 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
505 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() local
229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
230 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
376 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
377 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
508 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
509 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3() local
229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
230 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
376 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
377 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
508 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
509 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c60 const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local
229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
230 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
372 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
373 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
500 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
501 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
D5x5p2-minmax-scalar-2x1.c49 const float vk24 = weights[15]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local
198 vo0p0 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
199 vo1p0 += vi3x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
D5x5s2p2-minmax-scalar-3x1.c50 const float vk24 = weights[15]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local
268 vo0p0 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
269 vo1p0 += vi4x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
270 vo2p0 += vi6x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
D5x5s2p2-minmax-scalar-3x1-acc2.c50 const float vk24 = weights[15]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local
268 vo0p0 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
269 vo1p0 += vi4x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
270 vo2p0 += vi6x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()

123456