/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D | 3x3p1-minmax-neon-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4():
    149  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    170  vo5p0 = vmlaq_lane_f32(vo5p0, vi7x3456, vget_high_f32(vw4567), 1);
    290  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    311  vo5p0 = vmlaq_lane_f32(vo5p0, vi7x3456, vget_high_f32(vw4567), 1);

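For context on the idiom these hits show: vi7x3456 is the [3 4 5 6] window of input row 7, built with vextq_f32 from the two adjacent 4-float loads vi7x0123 and vi7x4567, and then multiplied by the tap in lane 3 of vw4567. Here vw4567 is assumed to hold taps {k10, k11, k12, k20} of the packed {bias, k00…k22} weight layout; the loadsplat variants below name that same tap vk20. The sketch below is a minimal illustration under those assumptions, not the kernel itself, and the neonfma file below differs only in using vfmaq_lane_f32 instead of vmlaq_lane_f32.

    #include <arm_neon.h>

    // Minimal sketch (hypothetical wrapper and buffer): build the [3 4 5 6] window
    // of input row 7 from two adjacent 4-float loads and accumulate the k20 tap
    // into vo5p0, mirroring the vextq_f32 / vmlaq_lane_f32 pair at lines 149/170.
    float32x4_t neon_accumulate_k20_tap(const float* i7, float32x4_t vw4567, float32x4_t vo5p0) {
      const float32x4_t vi7x0123 = vld1q_f32(i7);      // elements 0..3 of row 7
      const float32x4_t vi7x4567 = vld1q_f32(i7 + 4);  // elements 4..7 of row 7
      // Last element of vi7x0123 followed by the first three of vi7x4567: [3 4 5 6].
      const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
      // High half of vw4567, lane 1 -> k20 (assumed packed weight order).
      return vmlaq_lane_f32(vo5p0, vi7x3456, vget_high_f32(vw4567), 1);
      // The neonfma variant would use vfmaq_lane_f32 here instead.
    }
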
D | 3x3p1-minmax-ssse3-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4():
    163  …const __m128 vi7x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi7x4567), _mm_castps_si…  (local)
    182  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x3456, vk20));
    297  …const __m128 vi7x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi7x4567), _mm_castps_si…  (local)
    316  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x3456, vk20));

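The declarations at lines 163/297 are truncated by the index; they appear to be the usual SSSE3 palignr window construction. The sketch below is an assumption that the byte shift is 12, which is what a [3 4 5 6] window requires; the wrapper name is hypothetical.

    #include <tmmintrin.h>  // SSSE3

    // Sketch of the SSSE3 window construction the truncated hits appear to use.
    // _mm_alignr_epi8(hi, lo, 12) concatenates hi:lo and keeps bytes 12..27,
    // i.e. the last float of vi7x0123 followed by the first three of vi7x4567.
    static inline __m128 ssse3_window_3456(__m128 vi7x0123, __m128 vi7x4567) {
      return _mm_castsi128_ps(
          _mm_alignr_epi8(_mm_castps_si128(vi7x4567), _mm_castps_si128(vi7x0123), 12));
    }
    // The tap multiply at lines 182/316 is then plain SSE arithmetic:
    //   vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x3456, vk20));
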
D | 3x3p1-minmax-neonfma-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4():
    149  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    170  vo5p0 = vfmaq_lane_f32(vo5p0, vi7x3456, vget_high_f32(vw4567), 1);
    290  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    311  vo5p0 = vfmaq_lane_f32(vo5p0, vi7x3456, vget_high_f32(vw4567), 1);

D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4():
    170  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    189  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20));
    303  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    322  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20));

D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4():
    170  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    189  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20));
    303  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    322  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20));

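The WAsm SIMD loadsplat kernels express the same window as a cross-vector lane shuffle and keep the tap pre-broadcast in its own vector (vk20); note that the listed lines are identical in the arm and x86 loadsplat variants. A minimal sketch of the two listed calls wrapped together; the wrapper name is hypothetical, and how vk20 was broadcast (e.g. with wasm_v128_load32_splat) is an assumption.

    #include <wasm_simd128.h>

    // Sketch of the wasmsimd loadsplat idiom from the hits above: the [3 4 5 6]
    // window is a cross-vector shuffle, and vk20 is a tap that was broadcast once
    // up front rather than re-shuffled at every use.
    static inline v128_t wasm_accumulate_k20_tap(v128_t vo5p0, v128_t vi7x0123,
                                                 v128_t vi7x4567, v128_t vk20) {
      const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);
      return wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20));
    }
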
D | 5x5p2-minmax-neonfma-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4():
    163  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    188  vo4p0 = vfmaq_lane_f32(vo4p0, vi7x3456, vget_low_f32(vwGHIJ), 1);
    193  vo3p0 = vfmaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    418  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    443  vo4p0 = vfmaq_lane_f32(vo4p0, vi7x3456, vget_low_f32(vwGHIJ), 1);
    448  vo3p0 = vfmaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    666  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    691  vo4p0 = vfmaq_lane_f32(vo4p0, vi7x3456, vget_low_f32(vwGHIJ), 1);
    696  vo3p0 = vfmaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);

D | 5x5p2-minmax-neon-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4():
    163  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    188  vo4p0 = vmlaq_lane_f32(vo4p0, vi7x3456, vget_low_f32(vwGHIJ), 1);
    193  vo3p0 = vmlaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    418  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    443  vo4p0 = vmlaq_lane_f32(vo4p0, vi7x3456, vget_low_f32(vwGHIJ), 1);
    448  vo3p0 = vmlaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    666  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    691  vo4p0 = vmlaq_lane_f32(vo4p0, vi7x3456, vget_low_f32(vwGHIJ), 1);
    696  vo3p0 = vmlaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);

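In the 5x5p2 kernels each shifted window of input row 7 is reused for two output rows at once: output row 4 takes the filter-row-3 tap and output row 3 takes the filter-row-4 tap (the loadsplat variants below name these vk31 and vk41). The sketch below illustrates that reuse, assuming vwGHIJ and vwKLMN hold w[16..19] and w[20..23] of the packed {bias, k00…k44} weight layout; the wrapper is hypothetical.

    #include <arm_neon.h>

    // Sketch of the two-tap reuse from the 5x5p2 hits above: one [3 4 5 6] window
    // of input row 7 feeds filter row 3 of output row 4 and filter row 4 of
    // output row 3.
    void neon_accumulate_row7_col1_taps(float32x4_t vi7x0123, float32x4_t vi7x4567,
                                        float32x4_t vwGHIJ, float32x4_t vwKLMN,
                                        float32x4_t* vo4p0, float32x4_t* vo3p0) {
      const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
      // Low half of vwGHIJ, lane 1 -> k31; high half of vwKLMN, lane 0 -> k41
      // (assuming the packed {bias, k00..k44} weight order).
      *vo4p0 = vmlaq_lane_f32(*vo4p0, vi7x3456, vget_low_f32(vwGHIJ), 1);
      *vo3p0 = vmlaq_lane_f32(*vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    }
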
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4():
    192  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    217  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, vk31));
    222  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    446  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    471  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, vk31));
    476  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    693  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    718  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, vk31));
    723  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4():
    192  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    217  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, vk31));
    222  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    446  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    471  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, vk31));
    476  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    693  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    718  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, vk31));
    723  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));

D | 3x3p1-minmax-sse-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4():
    204  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    223  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x3456, vk20));
    388  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    407  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x3456, vk20));

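Plain SSE has no byte-wise alignr, so these kernels build the window from two rotated vectors and a scalar move: _mm_move_ss(vi7x7456, vi7x3012) takes lane 0 from the second operand (element 3) and lanes 1..3 from the first (elements 4..6). The rotations in the sketch below are an assumption about how vi7x7456 and vi7x3012 would be produced; they are not among the listed lines, and the wrapper name is hypothetical.

    #include <xmmintrin.h>  // SSE

    // Sketch of the SSE window construction behind the _mm_move_ss hits above.
    static inline __m128 sse_window_3456(__m128 vi7x0123, __m128 vi7x4567) {
      // Rotate each vector right by one lane: [0 1 2 3] -> [3 0 1 2], [4 5 6 7] -> [7 4 5 6].
      const __m128 vi7x3012 = _mm_shuffle_ps(vi7x0123, vi7x0123, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
      // Lane 0 from vi7x3012 (element 3), lanes 1..3 from vi7x7456 (elements 4..6).
      return _mm_move_ss(vi7x7456, vi7x3012);
    }
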
D | 5x5p2-minmax-sse-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4():
    200  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    222  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x3456, vk31));
    226  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
    453  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    475  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x3456, vk31));
    479  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
    700  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    722  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x3456, vk31));
    726  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));

D | 3x3p1-minmax-wasmsimd-arm-splat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4():
    152  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    173  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,…
    292  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    313  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,…

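The splat variants keep the weights in whole vectors and broadcast the needed tap at the point of use, which is what the truncated multiplier at lines 173/313 appears to be: a self-shuffle of vw4567 replicating lane 3. The sketch below assumes the truncated shuffle completes to (3, 3, 3, 3); the wrapper name is hypothetical.

    #include <wasm_simd128.h>

    // Sketch of the wasmsimd "splat" idiom: instead of a preloaded vk20, lane 3 of
    // vw4567 (assumed to be the k20 tap) is broadcast on the fly.
    static inline v128_t wasm_accumulate_k20_tap_splat(v128_t vo5p0, v128_t vi7x0123,
                                                       v128_t vi7x4567, v128_t vw4567) {
      const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);
      const v128_t vk20 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3);
      return wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20));
    }
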
D | 3x3p1-minmax-wasmsimd-x86-splat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4():
    152  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    173  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,…
    292  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    313  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,…

D | 5x5p2-minmax-wasmsimd-arm-splat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4():
    166  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    191  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,…
    196  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,…
    420  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    445  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,…
    450  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,…
    667  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    692  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,…
    697  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,…

D | 5x5p2-minmax-wasmsimd-x86-splat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4():
    166  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    191  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,…
    196  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,…
    420  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    445  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,…
    450  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,…
    667  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    692  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,…
    697  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,…

D | 5x5p2-minmax-neon-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2():
    148  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    173  vo3p1 = vmlaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
    370  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    395  vo3p1 = vmlaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
    586  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    611  vo3p1 = vmlaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);

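In the -acc2 files the listed lines differ from their plain 4x4 counterparts only in the destination accumulator (vo3p1 instead of vo3p0): each output row keeps two partial accumulators that are filled from alternating taps and, in the full kernels, summed before the output clamp, which shortens the dependency chain on the multiply-accumulate unit. The sketch below illustrates that pattern only; the wrapper, the zero-initialized accumulators, and the pre-broadcast taps vk40/vk41 and window vi7x2345 are hypothetical.

    #include <arm_neon.h>

    // Sketch of the two-accumulator (acc2) pattern: products alternate between
    // vo3p0 and vo3p1, and the two partial sums are combined at the end.
    float32x4_t neon_accumulate_acc2(float32x4_t vi7x2345, float32x4_t vi7x3456,
                                     float32x4_t vk40, float32x4_t vk41) {
      float32x4_t vo3p0 = vdupq_n_f32(0.0f);
      float32x4_t vo3p1 = vdupq_n_f32(0.0f);
      vo3p0 = vmlaq_f32(vo3p0, vi7x2345, vk40);  // one tap goes to accumulator 0
      vo3p1 = vmlaq_f32(vo3p1, vi7x3456, vk41);  // the next tap goes to accumulator 1
      return vaddq_f32(vo3p0, vo3p1);            // partial sums merged at the end
    }
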
D | 5x5p2-minmax-neon-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4():
    148  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    173  vo3p0 = vmlaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    366  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    391  vo3p0 = vmlaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    578  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    603  vo3p0 = vmlaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);

D | 5x5p2-minmax-neonfma-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4():
    148  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    173  vo3p0 = vfmaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    366  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    391  vo3p0 = vfmaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);
    578  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    603  vo3p0 = vfmaq_lane_f32(vo3p0, vi7x3456, vget_high_f32(vwKLMN), 0);

D | 5x5p2-minmax-neonfma-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2():
    148  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    173  vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
    370  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    395  vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
    586  const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);  (local)
    611  vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2():
    177  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    202  vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));
    398  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    423  vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));
    613  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    638  vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4():
    177  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    202  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    394  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    419  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    605  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    630  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));

D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2():
    177  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    202  vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));
    398  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    423  vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));
    613  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    638  vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));

D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4():
    177  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    202  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    394  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    419  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));
    605  const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);  (local)
    630  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x3456, vk41));

D | 5x5p2-minmax-sse-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4():
    183  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    204  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
    397  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    418  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
    606  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    627  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));

D | 5x5p2-minmax-sse-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2():
    183  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    204  vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));
    401  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    422  vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));
    614  const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);  (local)
    635  vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi7x3456, vk41));