/external/XNNPACK/src/f32-dwconv2d-chw/gen/

D | 3x3p1-minmax-neon-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4():
    110: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    115: vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    117: vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);
    175: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    180: vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    182: vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);

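Note: the NEON rows above all build vi1x5678 the same way: vextq_f32(vi1x4567, vi1x89AB, 1) concatenates the two registers and keeps elements 1..4, i.e. the input row shifted left by one column, and vmlaq_lane_f32 then accumulates it against one 3x3 kernel tap held in a lane of the packed weight vector. A minimal stand-alone sketch of that step (illustrative only, not the generated kernel; the helper name and the assumption that tap k12 sits in lane 2 of vw4567 are mine):

#include <arm_neon.h>

// Hypothetical helper: accumulate one kernel tap against the row shifted
// left by one column, as in the vi1x5678 lines of the NEON kernels above.
static inline float32x4_t acc_shifted_tap_neon(
    float32x4_t vacc,       // running accumulator (vo0p0 / vo1p0 in the listing)
    float32x4_t vi1x4567,   // current 4 input columns of row 1
    float32x4_t vi1x89AB,   // next 4 input columns of row 1
    float32x4_t vw4567)     // packed weights; lane 2 assumed to hold tap k12
{
  // {i5, i6, i7, i8}: lanes 1..3 of vi1x4567 followed by lane 0 of vi1x89AB.
  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
  // vacc += vi1x5678 * vw4567[2]  (lane 0 of the high half is element 2).
  return vmlaq_lane_f32(vacc, vi1x5678, vget_high_f32(vw4567), 0);
}
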
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2():
    123: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    128: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    129: vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, vk12));
    182: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    187: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    188: vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, vk12));

D | 3x3p1-minmax-wasmsimd-arm-loadsplat-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4():
    123: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    128: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    129: vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12));
    180: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    185: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    186: vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12));

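Note: in the wasmsimd "loadsplat" rows the same one-column shift is done with wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4): indices 0..3 select lanes of the first operand and 4..7 lanes of the second, so the result is again {i5, i6, i7, i8}; the tap (vk02, vk12) has already been broadcast to all lanes. A hedged sketch under those assumptions (helper name is illustrative; build with clang for wasm32 with -msimd128):

#include <wasm_simd128.h>

// Hypothetical helper mirroring the shuffle + mul + add lines above.
static inline v128_t acc_shifted_tap_wasm(
    v128_t vacc,       // accumulator (vo0p0 / vo0p1 / vo1p0 in the listing)
    v128_t vi1x4567,   // current 4 input columns of row 1
    v128_t vi1x89AB,   // next 4 input columns of row 1
    v128_t vk12)       // one kernel tap, already splatted to all four lanes
{
  // {i5, i6, i7, i8}: lanes 1..3 of vi1x4567, then lane 0 of vi1x89AB.
  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
  return wasm_f32x4_add(vacc, wasm_f32x4_mul(vi1x5678, vk12));
}
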
D | 3x3p1-minmax-neonfma-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_2x4_acc2():
    110: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    115: vo1p0 = vfmaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    117: vo0p1 = vfmaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);
    177: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    182: vo1p0 = vfmaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    184: vo0p1 = vfmaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-neon-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4_acc2():
    110: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    115: vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    117: vo0p1 = vmlaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);
    177: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    182: vo1p0 = vmlaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    184: vo0p1 = vmlaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-neonfma-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_2x4():
    110: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    115: vo1p0 = vfmaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    117: vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);
    175: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    180: vo1p0 = vfmaq_lane_f32(vo1p0, vi1x5678, vget_high_f32(vw0123), 1);
    182: vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);

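Note: the neonfma rows differ from the plain neon rows only in the multiply-accumulate: vfmaq_lane_f32 (fused, one rounding) replaces vmlaq_lane_f32, with identical operand order. A minimal sketch of that one-line difference, under the same assumptions as the NEON sketch above (hypothetical helper; needs an FMA-capable NEON target, e.g. AArch64):

#include <arm_neon.h>

// Hypothetical FMA variant of acc_shifted_tap_neon.
static inline float32x4_t acc_shifted_tap_neonfma(
    float32x4_t vacc,       // running accumulator
    float32x4_t vi1x4567,   // current 4 input columns of row 1
    float32x4_t vi1x89AB,   // next 4 input columns of row 1
    float32x4_t vw4567)     // packed weights; lane 2 assumed to hold tap k12
{
  const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
  // Fused multiply-add: vacc + vi1x5678 * vw4567[2] with a single rounding.
  return vfmaq_lane_f32(vacc, vi1x5678, vget_high_f32(vw4567), 0);
}
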
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2():
    123: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    128: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    129: vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, vk12));
    182: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    187: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    188: vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, vk12));

D | 3x3p1-minmax-wasmsimd-x86-loadsplat-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4():
    123: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    128: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    129: vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12));
    180: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    185: vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02));
    186: vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk12));

D | 3x3p1-minmax-wasmsimd-arm-splat-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2():
    111: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    116: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    118: …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    177: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    182: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    184: …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…

D | 3x3p1-minmax-ssse3-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4():
    118: …const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x89AB), _mm_castps_si…  (local)
    123: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    124: vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk12));
    176: …const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi1x4567), 4));  (local)
    181: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    182: vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk12));

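Note: the ssse3 rows do the shift with byte-wise alignment: _mm_alignr_epi8(hi, lo, 4) concatenates hi:lo and shifts right by 4 bytes, yielding {lo[1], lo[2], lo[3], hi[0]}, wrapped in float<->integer casts. A hedged stand-alone sketch of that step (helper name is mine; vk12 assumed pre-broadcast, as in the loadsplat-style kernels):

#include <tmmintrin.h>  // SSSE3 (_mm_alignr_epi8); pulls in the SSE/SSE2 headers

// Hypothetical helper matching the vi1x5678 construction in the SSSE3 rows.
static inline __m128 acc_shifted_tap_ssse3(
    __m128 vacc,       // accumulator (vo0p0 / vo0p1 / vo1p0 in the listing)
    __m128 vi1x4567,   // current 4 input columns of row 1: {i4, i5, i6, i7}
    __m128 vi1x89AB,   // next 4 input columns of row 1
    __m128 vk12)       // one kernel tap, broadcast to all four lanes
{
  // Byte-wise alignr gives {i5, i6, i7, i8}, reinterpreted back as floats.
  const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(
      _mm_castps_si128(vi1x89AB), _mm_castps_si128(vi1x4567), 4));
  return _mm_add_ps(vacc, _mm_mul_ps(vi1x5678, vk12));
}
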
D | 3x3p1-minmax-wasmsimd-arm-splat-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4():
    111: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    116: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    118: …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    175: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    180: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    182: …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…

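Note: the wasmsimd "splat" rows differ from the "loadsplat" ones only in where the tap comes from: instead of a pre-broadcast vk12, the weight lane is replicated on the fly from the packed weight vector (presumably wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2); the calls are truncated with "…" in the listing, so the final index is an assumption). A sketch under that assumption:

#include <wasm_simd128.h>

// Hypothetical "splat" variant: broadcast the tap from the packed weights.
static inline v128_t acc_shifted_tap_wasm_splat(
    v128_t vacc,       // accumulator
    v128_t vi1x4567,   // current 4 input columns of row 1
    v128_t vi1x89AB,   // next 4 input columns of row 1
    v128_t vw4567)     // packed weights; lane 2 assumed to hold tap k12
{
  const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
  // Replicate weight lane 2 into all four lanes, then multiply-accumulate.
  const v128_t vk12 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2);
  return wasm_f32x4_add(vacc, wasm_f32x4_mul(vi1x5678, vk12));
}
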
D | 3x3p1-minmax-wasmsimd-x86-splat-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2():
    111: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    116: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    118: …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    177: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    182: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    184: …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…

D | 3x3p1-minmax-ssse3-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2():
    118: …const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x89AB), _mm_castps_si…  (local)
    123: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    124: vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
    178: …const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi1x4567), 4));  (local)
    183: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    184: vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));

D | 3x3p1-minmax-sse-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4():
    150: const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));  (local)
    157: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    158: vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk12));
    234: const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));  (local)
    241: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    242: vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk12));

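Note: the plain-SSE rows reach the same shifted window without alignr: a vector vi1x8567 is built earlier in the kernel (outside the lines shown; a _mm_move_ss of lane 0 of vi1x89AB into vi1x4567 is assumed here) and then rotated with _mm_shuffle_ps(..., _MM_SHUFFLE(0, 3, 2, 1)), giving {i5, i6, i7, i8}. A hedged sketch under those assumptions:

#include <xmmintrin.h>  // SSE

// Hypothetical helper; the vi1x8567 construction is an assumption -- only the
// rotate + multiply + add lines appear in the listing above.
static inline __m128 acc_shifted_tap_sse(
    __m128 vacc,       // accumulator (vo0p0 / vo0p1 / vo1p0 in the listing)
    __m128 vi1x4567,   // current 4 input columns of row 1: {i4, i5, i6, i7}
    __m128 vi1x89AB,   // next 4 input columns of row 1:    {i8, i9, iA, iB}
    __m128 vk12)       // one kernel tap, broadcast to all four lanes
{
  // Assumed: {i8, i5, i6, i7} -- lane 0 of vi1x89AB moved into vi1x4567.
  const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
  // Rotate lanes left by one: {i5, i6, i7, i8}.
  const __m128 vi1x5678 =
      _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
  return _mm_add_ps(vacc, _mm_mul_ps(vi1x5678, vk12));
}
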
D | 3x3p1-minmax-wasmsimd-x86-splat-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4():
    111: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    116: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    118: …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    175: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    180: …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    182: …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…

D | 3x3p1-minmax-neon-1x4-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3():
    92: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    97: vo0p1 = vmlaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);
    142: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    147: vo0p1 = vmlaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-neonfma-1x4-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3():
    92: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    97: vo0p1 = vfmaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);
    142: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    147: vo0p1 = vfmaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-neonfma-1x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2():
    92: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    97: vo0p1 = vfmaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);
    141: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    146: vo0p1 = vfmaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-neonfma-1x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4():
    92: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    97: vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);
    140: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    145: vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-neon-1x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4():
    92: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    97: vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);
    140: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    145: vo0p0 = vmlaq_lane_f32(vo0p0, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-neon-1x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2():
    92: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);  (local)
    97: vo0p1 = vmlaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);
    141: const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);  (local)
    146: vo0p1 = vmlaq_lane_f32(vo0p1, vi1x5678, vget_high_f32(vw4567), 0);

D | 3x3p1-minmax-sse-2x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2():
    150: const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));  (local)
    157: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    158: vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));
    236: const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));  (local)
    243: vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02));
    244: vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x5678, vk12));

D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3():
    93: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    98: …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    142: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    147: …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…

D | 3x3p1-minmax-wasmsimd-arm-splat-1x4-acc4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4():
    93: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    98: …vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    143: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    148: …vo0p3 = wasm_f32x4_add(vo0p3, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…

D | 3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3():
    104: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);  (local)
    108: vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, vk12));
    146: const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);  (local)
    150: vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x5678, vk12));