Home
last modified time | relevance | path

Searched refs:vo2p1 (Results 1 – 23 of 23) sorted by relevance

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c137 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
149 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
165 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
173 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
204 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
212 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
236 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
244 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
[all …]
D5x5p2-minmax-neonfma-3x4-acc2.c108 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local
120 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
136 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
144 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
152 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
175 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
183 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
199 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
207 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
215 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
[all …]
D5x5p2-minmax-neon-3x4-acc2.c108 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local
120 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
136 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
144 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
152 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
175 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
183 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
199 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
207 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
215 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
[all …]
D5x5p2-minmax-scalar-3x1-acc2.c138 float vo2p1 = vi3x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local
144 vo2p1 += vi5x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
159 vo2p1 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
165 vo2p1 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
171 vo2p1 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
186 vo2p1 += vi3x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
192 vo2p1 += vi5x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
207 vo2p1 += vi2x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
213 vo2p1 += vi4x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
219 vo2p1 += vi6x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
[all …]
D5x5s2p2-minmax-scalar-3x1-acc2.c173 float vo2p1 = vi5x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local
179 vo2p1 += vi7x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
196 vo2p1 += vi4x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
202 vo2p1 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
208 vo2p1 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
225 vo2p1 += vi5x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
231 vo2p1 += vi7x2 * vk32; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
248 vo2p1 += vi4x3 * vk03; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
254 vo2p1 += vi6x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
260 vo2p1 += vi8x3 * vk43; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c137 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local
149 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
165 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
173 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
204 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
212 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
236 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
244 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
[all …]
D5x5p2-minmax-sse-3x4-acc2.c125 __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
131 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
169 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
175 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
181 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
218 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
224 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
239 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
245 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
251 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-3x4-acc2.c111 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2() local
123vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
139vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
147vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
155vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
178vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
186vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
202vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
210vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
218vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2()
[all …]
D5x5p2-minmax-neon-4x4-acc2.c118 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
133 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
152 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
162 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
172 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
199 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
209 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
228 vo2p1 = vmlaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
238 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
248 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-3x4-acc2.c111 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2() local
123vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
139vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
147vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
155vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
178vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
186vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
202vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
210vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
218vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2()
[all …]
D5x5p2-minmax-neonfma-4x4-acc2.c118 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
133 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
152 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
162 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
172 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
199 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
209 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
228 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
238 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
248 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
[all …]
D5x5s2p2-minmax-neonfma-3x4-acc2.c131 float32x4_t vo2p1 = vmulq_lane_f32(vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2() local
143 vo2p1 = vfmaq_lane_f32(vo2p1, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
151 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
159 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
167 vo2p1 = vfmaq_lane_f32(vo2p1, vi8x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
194 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
202 vo2p1 = vfmaq_lane_f32(vo2p1, vi7x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
239 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
247 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
255 vo2p1 = vfmaq_lane_f32(vo2p1, vi8x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2()
[all …]
D5x5s2p2-minmax-neon-3x4-acc2.c131 float32x4_t vo2p1 = vmulq_lane_f32(vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2() local
143 vo2p1 = vmlaq_lane_f32(vo2p1, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
151 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
159 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
167 vo2p1 = vmlaq_lane_f32(vo2p1, vi8x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
194 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x68AC, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
202 vo2p1 = vmlaq_lane_f32(vo2p1, vi7x68AC, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
239 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
247 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x79BD, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
255 vo2p1 = vmlaq_lane_f32(vo2p1, vi8x79BD, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c147 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
162 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
191 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
201 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
238 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
257 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c147 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
162 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
181 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
191 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
201 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
228 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
238 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
257 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c191 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() local
203 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
211 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
219 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
227 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
254 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
262 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
336 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
344 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
352 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c191 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, vk02); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() local
203 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
211 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
219 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
227 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
254 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
262 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
336 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
344 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
352 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
[all …]
D5x5p2-minmax-sse-4x4-acc2.c135 __m128 vo2p1 = _mm_mul_ps(vi3x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
143 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
187 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
195 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
203 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
246 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi3x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
254 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
272 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi2x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
280 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
288 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c171 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local
183vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
191vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
199vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
207vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
234vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
242vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
316vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
324vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
332vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c171 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local
183vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
191vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
199vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
207vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
234vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
242vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
316vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
324vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
332vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
[all …]
D5x5s2p2-minmax-sse-3x4-acc2.c186 __m128 vo2p1 = _mm_mul_ps(vi5x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() local
192 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
209 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
215 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
221 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi8x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
257 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x68AC, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
263 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi7x68AC, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
345 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
351 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x79BD, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
357 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi8x79BD, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-4x4-acc2.c121 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() local
136vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
155vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
165vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
175vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
202vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
212vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
231vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
241vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
251vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-4x4-acc2.c121 v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() local
136vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
155vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
165vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
175vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
202vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
212vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
231vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
241vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
251vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
[all …]