Home
last modified time | relevance | path

Searched refs:vo0p0 (Results 1 – 25 of 337) sorted by relevance

12345678910>>...14

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neon-1x4.c78 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
86 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
88 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
90 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
92 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
102 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
104 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
106 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
108 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
[all …]
D5x5p2-minmax-neonfma-1x4.c78 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
86 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
88 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
90 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
92 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
102 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
104 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
106 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
108 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4.c81 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() local
89vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
91vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
93vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
95vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
97vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
105vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
107vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
109vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
111vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c107 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() local
115 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
117 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
119 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
123 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
133 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
135 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
137 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c107 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() local
115 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
117 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
119 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
123 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
133 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
135 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
137 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4.c81 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() local
89vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
91vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
93vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
95vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
97vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
105vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
107vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
109vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
111vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
[all …]
D5x5p2-minmax-sse-1x4.c102 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local
103 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
104 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
105 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
106 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
131 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
132 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
133 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
134 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
135 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
[all …]
D5x5s2p2-minmax-neon-1x4.c92 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4() local
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
96 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
98 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
100 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
102 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
104 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
106 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
108 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
110 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4()
[all …]
D5x5s2p2-minmax-neonfma-1x4.c92 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4() local
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
96 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
98 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
100 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
102 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
104 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
106 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
108 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
110 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4()
[all …]
D5x5s2p2-minmax-scalar-1x1.c123 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1() local
124 vo0p0 += vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
125 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
126 vo0p0 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
127 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
135 vo0p0 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
136 vo0p0 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
137 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
138 vo0p0 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
139 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1()
[all …]
D5x5p2-minmax-scalar-1x1.c111 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1() local
112 vo0p0 += vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
113 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
114 vo0p0 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
115 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
123 vo0p0 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
124 vo0p0 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
125 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
126 vo0p0 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
127 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4.c136 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4() local
138 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
140 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
142 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
144 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
146 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
148 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
150 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
152 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
154 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4.c136 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4() local
138 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
140 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
142 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
144 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
146 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
148 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
150 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
152 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
154 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-1x4.c116 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4() local
118vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
120vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
122vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
124vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
126vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
128vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
130vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
132vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
134vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-1x4.c116 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4() local
118vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
120vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
122vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
124vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
126vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
128vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
130vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
132vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
134vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4()
[all …]
D5x5p2-minmax-neon-2x4.c86 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local
96 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
99 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
102 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
105 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
108 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
118 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
121 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
124 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
127 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
[all …]
D5x5p2-minmax-neonfma-2x4.c86 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local
96 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
99 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
102 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
105 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
108 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
118 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
121 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
124 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
127 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c115 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local
125 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
128 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
134 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
137 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
147 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
150 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
153 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
156 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c115 v128_t vo0p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local
125 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
128 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
134 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
137 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
147 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
150 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
153 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
156 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-2x4.c89 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() local
99vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
102vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
105vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
108vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
111vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
121vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
124vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
127vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
130vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
[all …]
D5x5s2p2-minmax-sse-1x4.c132 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4() local
133 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
134 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
135 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
136 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
144 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
145 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
146 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
147 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
148 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4()
[all …]
D5x5p2-minmax-scalar-2x1.c122 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local
124 vo0p0 += vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
126 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
128 vo0p0 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
130 vo0p0 += vi4x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
140 vo0p0 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
142 vo0p0 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
144 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
146 vo0p0 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
148 vo0p0 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
[all …]
D5x5p2-minmax-neonfma-1x4-acc2.c78 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
88 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
90 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
104 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
108 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
123 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
127 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
131 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
141 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
[all …]
D5x5p2-minmax-neon-1x4-acc2.c78 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local
88 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
90 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
104 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
108 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
123 vo0p0 = vmlaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
127 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
131 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
141 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-2x4.c89 v128_t vo0p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() local
99vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
102vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
105vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
108vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
111vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
121vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
124vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
127vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
130vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
[all …]

12345678910>>...14