/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 357 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 358 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 359 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 360 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 361 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 611 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 612 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 613 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 614 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 357 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 358 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 359 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 360 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 361 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi8x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 611 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 612 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 613 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 614 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 460 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 461 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 462 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 624 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 625 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 626 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 457 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 458 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 459 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 618 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 619 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 620 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 457 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 458 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 459 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 618 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 619 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 620 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 538 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 539 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 540 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 541 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 737 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 534 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 536 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 537 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 729 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 538 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 539 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 540 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 541 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 737 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 534 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 536 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 537 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 729 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-sse-5x4.c | 62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 359 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 360 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 361 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 362 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 363 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 612 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 613 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 614 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 615 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() [all …]
|
D | 5x5p2-minmax-sse-4x4.c | 62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 316 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 317 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 318 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 319 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 530 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 531 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 532 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 533 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 724 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() [all …]
|
D | 5x5p2-minmax-sse-4x4-acc2.c | 62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 316 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 317 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 318 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 319 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 534 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 535 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 536 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 537 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 732 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local 277 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 279 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 460 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 461 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 462 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 624 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 625 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() 626 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
|
D | 5x5p2-minmax-sse-3x4-acc2.c | 62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 273 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 274 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 275 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 451 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 452 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 453 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 612 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 613 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 614 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
|
D | 5x5p2-minmax-sse-3x4.c | 62 const __m128 vk44 = _mm_load1_ps(weights + 25); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 273 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 274 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 275 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 448 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 449 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 450 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 606 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 607 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 608 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
|
D | 5x5p2-minmax-scalar-3x1.c | 59 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 241 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 242 vo1p0 += vi5x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 243 vo2p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local 237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 382 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 383 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 511 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() 512 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local 237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 382 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 383 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 511 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() 512 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local 237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 380 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 381 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 507 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 508 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local 237 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 238 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 380 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 381 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 507 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 508 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c | 70 const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() local 237 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 238 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 384 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 385 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 515 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() 516 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
|
D | 5x5p2-minmax-scalar-2x1.c | 59 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 202 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 203 vo1p0 += vi5x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 59 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 241 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 242 vo1p0 += vi5x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 243 vo2p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 60 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 274 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 275 vo1p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 276 vo2p0 += vi8x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 60 const float vk44 = weights[25]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 274 vo0p0 += vi4x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 275 vo1p0 += vi6x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 276 vo2p0 += vi8x4 * vk44; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|