/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5s2p2-minmax-neon-3x4.c | 218 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() local 243 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 250 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 412 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() local 423 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 430 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
|
D | 5x5s2p2-minmax-neonfma-3x4-acc2.c | 218 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() local 243 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 250 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 415 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() local 426 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() 433 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
|
D | 5x5s2p2-minmax-neonfma-3x4.c | 218 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() local 243 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 250 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 412 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() local 423 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 430 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
|
D | 5x5s2p2-minmax-neon-3x4-acc2.c | 218 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() local 243 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 250 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 415 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() local 426 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() 433 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 278 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() local 340 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 347 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 520 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() local 531 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 538 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 278 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() local 340 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 347 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 517 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() local 528 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 278 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() local 340 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 347 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 517 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() local 528 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c | 278 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() local 340 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 347 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 520 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() local 531 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 538 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
|
D | 5x5s2p2-minmax-neon-2x4-acc2.c | 182 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc2() local 205 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc2() 333 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc2() local 346 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc2()
|
D | 5x5s2p2-minmax-neon-2x4.c | 182 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() local 205 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 331 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() local 344 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4()
|
D | 5x5s2p2-minmax-neonfma-2x4-acc3.c | 182 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc3() local 205 vo1p2 = vfmaq_lane_f32(vo1p2, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc3() 335 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc3() local 348 vo1p2 = vfmaq_lane_f32(vo1p2, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc3()
|
D | 5x5s2p2-minmax-neon-2x4-acc3.c | 182 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc3() local 205 vo1p2 = vmlaq_lane_f32(vo1p2, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc3() 335 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc3() local 348 vo1p2 = vmlaq_lane_f32(vo1p2, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc3()
|
D | 5x5s2p2-minmax-neonfma-2x4-acc2.c | 182 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc2() local 205 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc2() 333 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc2() local 346 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc2()
|
D | 5x5s2p2-minmax-neonfma-2x4.c | 182 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x8ACE9BDF.val[1], 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() local 205 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 331 const float32x4_t vi5x79BD = vextq_f32(vi5x1357, vi5x9BDF, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() local 344 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4()
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 258 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local 320 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 327 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 500 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local 511 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 518 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
|
D | 5x5s2p2-minmax-sse-3x4.c | 306 const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() local 348 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 353 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 526 const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() local 536 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 541 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c | 258 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local 320 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 327 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 497 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local 508 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 515 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c | 258 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local 320 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 327 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 497 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local 508 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 515 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 258 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local 320 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 327 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 500 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local 511 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 518 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
|
D | 5x5s2p2-minmax-sse-3x4-acc2.c | 306 const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() local 348 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 353 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 529 const __m128 vi5x79BD = _mm_move_ss(vi5xF9BD, vi5x7135); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() local 539 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x79BD, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 544 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c | 234 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() local 286 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 422 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() local 435 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c | 234 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() local 286 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 420 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() local 433 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c | 234 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() local 286 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 422 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() local 435 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c | 234 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() local 286 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 420 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() local 433 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4.c | 234 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() local 286 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 418 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() local 431 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
|