Home
last modified time | relevance | path

Searched refs:vo0p1 (Results 1 – 25 of 187) sorted by relevance

12345678

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neonfma-1x4-acc2.c86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
92 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
102 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
106 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
110 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
125 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
129 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
139 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
143 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
147 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
[all …]
D5x5p2-minmax-neon-1x4-acc2.c86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local
92 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
102 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
106 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
110 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
125 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
129 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
139 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
143 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
147 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() local
121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
135 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
139 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
158 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
168 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
176 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4-acc2.c89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() local
95vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
105vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
109vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
113vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
128vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
132vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
142vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
146vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
150vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() local
121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
135 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
139 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
158 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
168 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
176 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local
95vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
105vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
109vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
113vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
128vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
132vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
142vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
146vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
150vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
[all …]
D5x5s2p2-minmax-scalar-1x1-acc2.c124 float vo0p1 = vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2() local
126 vo0p1 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
135 vo0p1 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
137 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
139 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
148 vo0p1 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
150 vo0p1 += vi3x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
159 vo0p1 += vi0x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
161 vo0p1 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
163 vo0p1 += vi4x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2()
[all …]
D5x5p2-minmax-scalar-1x1-acc2.c112 float vo0p1 = vi1x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2() local
114 vo0p1 += vi3x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
123 vo0p1 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
125 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
127 vo0p1 += vi4x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
136 vo0p1 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
138 vo0p1 += vi3x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
147 vo0p1 += vi0x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
149 vo0p1 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
151 vo0p1 += vi4x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2()
[all …]
D5x5p2-minmax-sse-1x4-acc2.c103 __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local
105 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
131 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
133 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
135 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
160 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
162 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
171 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
173 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
175 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
[all …]
D5x5s2p2-minmax-neon-1x4-acc2.c94 float32x4_t vo0p1 = vmulq_lane_f32(vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2() local
100 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
104 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
108 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
112 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
127 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
131 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
152 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
156 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
160 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2()
[all …]
D5x5s2p2-minmax-neonfma-1x4-acc2.c94 float32x4_t vo0p1 = vmulq_lane_f32(vi0x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2() local
100 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
104 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
108 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
112 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
127 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
131 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
152 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
156 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
160 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2()
[all …]
D5x5p2-minmax-neonfma-1x4-acc3.c86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
90 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
102 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
108 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
125 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
131 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
143 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
160 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
166 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
170 vo0p0 = vaddq_f32(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
[all …]
D5x5p2-minmax-neon-1x4-acc3.c86 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local
90 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
102 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
108 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
125 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
131 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
143 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
160 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
166 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
170 vo0p0 = vaddq_f32(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c138 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2() local
144 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
148 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
152 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
156 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
171 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
175 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
217 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
221 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
225 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local
93vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
105vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
111vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
128vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
134vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
146vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
163vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
169vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
173 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local
119 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
137 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
160 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
189 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
195 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
199 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c115 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() local
119 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
131 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
137 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
154 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
160 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
172 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
189 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
195 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
199 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc2.c118 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2() local
124vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
128vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
132vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
136vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
151vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
155vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
197vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
201vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
205vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc2.c118 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2() local
124vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
128vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
132vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
136vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
151vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
155vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
197vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
201vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
205vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c138 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2() local
144 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
148 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
152 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
156 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
171 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
175 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
217 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
221 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
225 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-neonfma-2x4-acc2.c96 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local
105 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
118 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
124 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
130 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
149 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
155 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
168 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
174 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
180 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
[all …]
D5x5p2-minmax-neon-2x4-acc2.c96 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local
105 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
118 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
124 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
130 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
149 vo0p1 = vmlaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
155 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
168 vo0p1 = vmlaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
174 vo0p1 = vmlaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
180 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4-acc3.c89 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() local
93vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
105vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
111vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
128vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
134vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
146vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
163vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
169vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
173 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c125 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local
134 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
147 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
153 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
159 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
178 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
184 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
197 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
203 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
209 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c125 v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local
134 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
147 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
153 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
159 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
178 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
184 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
197 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
203 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
209 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
[all …]

12345678