Searched refs:vi4x5678 (Results 1 – 25 of 123) sorted by relevance

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
5x5p2-minmax-sse-1x4-acc2.c
169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local
175 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
267 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local
273 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
279 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
354 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local
360 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
366 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
5x5p2-minmax-sse-1x4.c
169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local
175 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
266 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local
272 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
278 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
352 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local
358 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
364 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
5x5p2-minmax-sse-1x4-acc5.c
169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() local
175 vo0p4 = _mm_add_ps(vo0p4, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5()
181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5()
270 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() local
276 vo0p4 = _mm_add_ps(vo0p4, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5()
282 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5()
360 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() local
366 vo0p4 = _mm_add_ps(vo0p4, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5()
372 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5()
5x5p2-minmax-sse-1x4-acc4.c
169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() local
175 vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4()
181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4()
269 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() local
275 vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4()
281 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4()
358 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() local
364 vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4()
370 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4()
5x5p2-minmax-sse-1x4-acc3.c
169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local
175 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
268 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local
274 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
280 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
356 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local
362 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
368 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
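All five SSE 5x5p2 variants above build vi4x5678 and vi4x6789 the same way. Below is a minimal stand-alone sketch of those two shuffles; it assumes vi4x8567 holds {8, 5, 6, 7}, i.e. element 8 already moved into lane 0 of vi4x4567 (done here with _mm_move_ss, a preparation step that does not appear in the hits above).

    #include <stdio.h>
    #include <xmmintrin.h>  /* SSE: _mm_shuffle_ps, _mm_move_ss */

    int main(void) {
      /* Input row elements 4..7 and 8..11, as in the kernels. */
      const __m128 vi4x4567 = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
      const __m128 vi4x89AB = _mm_setr_ps(8.0f, 9.0f, 10.0f, 11.0f);

      /* Assumed preparation step: vi4x8567 = {8, 5, 6, 7}. */
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);

      /* Rotate lanes left by one: vi4x5678 = {5, 6, 7, 8}. */
      const __m128 vi4x5678 =
          _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));

      /* Combine with the next block to shift once more: vi4x6789 = {6, 7, 8, 9}. */
      const __m128 vi4x6789 =
          _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      float a[4], b[4];
      _mm_storeu_ps(a, vi4x5678);
      _mm_storeu_ps(b, vi4x6789);
      printf("vi4x5678 = %g %g %g %g\n", a[0], a[1], a[2], a[3]);
      printf("vi4x6789 = %g %g %g %g\n", b[0], b[1], b[2], b[3]);
      return 0;
    }

In the tail path of each kernel the second operand of the last shuffle is vzero instead of vi4x89AB, padding the window with zeros past the right edge of the row.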
3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c
185 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
292 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
304 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
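The wasmsimd loadsplat hits use the analogous lane shuffle. A small illustrative sketch follows (invented values; assumes any SIMD128-enabled toolchain, e.g. clang with --target=wasm32-wasi -msimd128).

    #include <stdio.h>
    #include <wasm_simd128.h>  /* WebAssembly SIMD128 intrinsics */

    int main(void) {
      const v128_t vi4x4567 = wasm_f32x4_make(4.0f, 5.0f, 6.0f, 7.0f);
      const v128_t vi4x89AB = wasm_f32x4_make(8.0f, 9.0f, 10.0f, 11.0f);

      /* Lanes 1..4 of the concatenated pair: vi4x5678 = {5, 6, 7, 8}. */
      const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);

      /* Multiply-accumulate against one broadcast tap, as in the hits above
         (vk02 here is just an invented constant). */
      const v128_t vk02 = wasm_f32x4_splat(0.5f);
      v128_t vo4p0 = wasm_f32x4_splat(0.0f);
      vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02));

      printf("lane 0 of vo4p0 = %g\n", wasm_f32x4_extract_lane(vo4p0, 0));
      return 0;
    }

In the remainder-column path the upper operand of the shuffle is vzero, zero-padding the window just like the SSE and NEON variants.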
3x3p1-minmax-neonfma-5x4.c
167 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
175 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
180 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
185 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
283 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
291 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
296 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
301 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
3x3p1-minmax-neon-5x4.c
167 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
175 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
180 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
185 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
283 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
291 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
296 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
301 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
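Both the neon and neonfma 3x3p1 kernels obtain the same window with a single vextq_f32. A stand-alone sketch with invented input values:

    #include <stdio.h>
    #include <arm_neon.h>  /* NEON: vld1q_f32, vextq_f32, vst1q_f32 */

    int main(void) {
      const float in[8] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
      const float32x4_t vi4x4567 = vld1q_f32(in);
      const float32x4_t vi4x89AB = vld1q_f32(in + 4);

      /* Four lanes starting at index 1 of the concatenation: {5, 6, 7, 8}. */
      const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);

      /* Right-edge form from the hits above: pad with zeros instead.
         (The kernels reuse the name vi4x5678; vi4x567z is illustrative.) */
      const float32x4_t vzero = vdupq_n_f32(0.0f);
      const float32x4_t vi4x567z = vextq_f32(vi4x4567, vzero, 1);

      float a[4], b[4];
      vst1q_f32(a, vi4x5678);
      vst1q_f32(b, vi4x567z);
      printf("vi4x5678 = %g %g %g %g\n", a[0], a[1], a[2], a[3]);
      printf("padded   = %g %g %g %g\n", b[0], b[1], b[2], b[3]);
      return 0;
    }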
5x5p2-minmax-sse-2x4.c
201 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() local
211 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4()
212 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4()
219 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4()
337 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() local
347 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4()
348 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4()
355 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4()
459 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() local
469 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4()
[all …]
5x5p2-minmax-neonfma-1x4-acc3.c
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
147 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
246 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
256 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
348 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
358 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
364 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
5x5p2-minmax-neon-1x4.c
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
147 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
244 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
254 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
344 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
354 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
360 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
5x5p2-minmax-neonfma-1x4-acc2.c
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
147 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
245 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
255 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
346 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
356 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
362 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
5x5p2-minmax-neon-1x4-acc2.c
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local
147 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
245 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local
255 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
346 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local
356 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
362 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
5x5p2-minmax-neon-1x4-acc3.c
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local
147 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
246 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local
256 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
348 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local
358 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
364 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
5x5p2-minmax-neonfma-1x4.c
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
147 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
244 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
254 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
344 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
354 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
360 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
5x5p2-minmax-sse-3x4-acc2.c
233 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
245 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
247 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
249 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
257 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
411 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
423 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
425 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
427 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
435 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
[all …]
5x5p2-minmax-sse-3x4.c
233 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
245 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
247 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
249 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
257 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
408 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
420 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
422 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
424 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
432 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
[all …]
3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c
185 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local
193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
292 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local
300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
304 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
3x3p1-minmax-ssse3-5x4.c
178 …const __m128 vi4x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x89AB), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local
186 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
190 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
194 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
287 … const __m128 vi4x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi4x4567), 4)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local
295 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
299 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
303 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
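The ssse3 variant performs the same shift in one instruction by treating the register pair as a single byte vector. A minimal sketch (compile with -mssse3):

    #include <stdio.h>
    #include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 */

    int main(void) {
      const __m128 vi4x4567 = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
      const __m128 vi4x89AB = _mm_setr_ps(8.0f, 9.0f, 10.0f, 11.0f);

      /* Concatenate (vi4x89AB : vi4x4567) and shift right by 4 bytes,
         i.e. by one float: vi4x5678 = {5, 6, 7, 8}. */
      const __m128 vi4x5678 = _mm_castsi128_ps(_mm_alignr_epi8(
          _mm_castps_si128(vi4x89AB), _mm_castps_si128(vi4x4567), 4));

      float a[4];
      _mm_storeu_ps(a, vi4x5678);
      printf("vi4x5678 = %g %g %g %g\n", a[0], a[1], a[2], a[3]);
      return 0;
    }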
3x3p1-minmax-sse-5x4.c
234 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local
244 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
248 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
252 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
387 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local
397 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
401 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
405 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
5x5p2-minmax-neonfma-2x4-acc2.c
165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local
178 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
180 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
311 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local
324 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
326 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
448 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local
461 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
463 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
470 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
5x5p2-minmax-neon-2x4.c
165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local
178 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
180 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
309 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local
322 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
324 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
444 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local
457 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
459 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
466 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
5x5p2-minmax-neonfma-2x4.c
165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local
178 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
180 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
309 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local
322 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
324 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
444 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local
457 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
459 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
466 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
5x5p2-minmax-neon-2x4-acc2.c
165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local
178 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
180 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
311 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local
324 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
326 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
448 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local
461 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
463 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
470 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
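The 5x5p2 NEON hits above pair the shifted window with per-tap multiply-accumulates, selecting each filter tap as a lane of a loaded weight vector. A small sketch of that lane selection (names mirror the listing, values are invented; the neonfma variants use vfmaq_lane_f32 in place of vmlaq_lane_f32):

    #include <stdio.h>
    #include <arm_neon.h>  /* NEON: vget_high_f32, vmlaq_lane_f32 */

    int main(void) {
      const float w[4] = {0.1f, 0.2f, 0.3f, 0.4f};  /* stands in for vwGHIJ */
      const float x[4] = {5.0f, 6.0f, 7.0f, 8.0f};  /* stands in for vi4x5678 */

      const float32x4_t vwGHIJ   = vld1q_f32(w);
      const float32x4_t vi4x5678 = vld1q_f32(x);

      /* vo1p0 += vi4x5678 * vwGHIJ[3]: lane 1 of the high half selects the tap. */
      float32x4_t vo1p0 = vdupq_n_f32(0.0f);
      vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);

      float a[4];
      vst1q_f32(a, vo1p0);
      printf("vo1p0 = %g %g %g %g\n", a[0], a[1], a[2], a[3]);
      return 0;
    }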
5x5p2-minmax-sse-2x4-acc3.c
201 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3() local
211 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3()
212 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3()
219 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3()
341 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3() local
351 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3()
352 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3()
359 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3()
467 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3() local
477 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3()
[all …]
