Home
last modified time | relevance | path

Searched refs:vi1x4567 (Results 1 – 25 of 332) sorted by relevance

Pages: 1 2 3 4 5 6 7 8 9 10 >> … 14

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D3x3p1-minmax-neon-1x4-acc3.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3() local
73 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
121 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
126 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
131 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
142 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3()
D3x3p1-minmax-neonfma-1x4-acc3.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3() local
73 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
121 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
126 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
131 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
142 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3()
D3x3p1-minmax-neonfma-1x4-acc2.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2() local
73 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
120 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
125 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
130 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
141 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2()
D3x3p1-minmax-neonfma-1x4.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4() local
73 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
119 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
124 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
129 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
140 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4()
D3x3p1-minmax-neon-1x4.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4() local
73 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
119 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
124 vo0p0 = vmlaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
129 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
140 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4()
D3x3p1-minmax-neon-1x4-acc2.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2() local
73 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
120 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
125 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
130 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
141 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2()
D3x3p1-minmax-wasmsimd-arm-splat-1x4-acc3.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3() local
74 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
121 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
126 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
131 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
142 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3()
D3x3p1-minmax-wasmsimd-arm-splat-1x4-acc4.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4() local
74 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
122 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
127 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
132 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
143 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4()
D3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c73 v128_t vi1x4567 = wasm_v128_load(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3() local
88 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
92 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
100 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
104 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
112 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
129 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
133 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
137 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
146 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3()
D3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c73 v128_t vi1x4567 = wasm_v128_load(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2() local
88 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
92 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
100 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
104 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
112 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
128 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
132 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
136 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
145 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2()
D3x3p1-minmax-wasmsimd-x86-loadsplat-1x4.c73 v128_t vi1x4567 = wasm_v128_load(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4() local
88 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
92 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
100 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
104 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
112 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
127 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
135 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
144 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4()
D3x3p1-minmax-wasmsimd-x86-splat-1x4-acc4.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4() local
74 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
122 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
127 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
132 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
143 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4()
D3x3p1-minmax-wasmsimd-arm-splat-1x4.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4() local
74 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
119 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
124 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
129 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
140 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4()
D3x3p1-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c73 v128_t vi1x4567 = wasm_v128_load(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2() local
88 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
92 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
100 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
104 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
112 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
128 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
132 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
136 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
145 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2()
D3x3p1-minmax-wasmsimd-x86-splat-1x4-acc2.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() local
74 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
120 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
125 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
130 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
141 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
D3x3p1-minmax-wasmsimd-arm-splat-1x4-acc2.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2() local
74 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
120 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
125 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
130 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
141 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2()
D3x3p1-minmax-wasmsimd-x86-splat-1x4-acc3.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3() local
74 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
121 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
126 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
131 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
142 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3()
D3x3p1-minmax-ssse3-1x4.c68 __m128 vi1x4567 = _mm_loadu_ps(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4() local
83 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
87 const __m128 vi1x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x4567), _mm_castps_si128(vi1x0123), 12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
95 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
99 const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x89AB), _mm_castps_si128(vi1x4567), 4)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
107 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
123 vi1x4567 = _mm_and_ps(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
127 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
131 const __m128 vi1x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi1x4567), _mm_castps_si128(vi1x0123), 12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
140 const __m128 vi1x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi1x4567), 4)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4()
D3x3p1-minmax-neonfma-1x4-acc4.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4() local
73 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
122 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
127 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
132 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
143 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c73 v128_t vi1x4567 = wasm_v128_load(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3() local
88 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
92 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
100 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
104 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
112 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
129 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
133 v128_t vo0p1 = wasm_f32x4_mul(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
137 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
146 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3()
D3x3p1-minmax-wasmsimd-arm-loadsplat-1x4.c73 v128_t vi1x4567 = wasm_v128_load(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4() local
88 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
92 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
100 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
104 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
112 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
127 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
131 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
135 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
144 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4()
D3x3p1-minmax-neon-1x4-acc4.c60 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4() local
73 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
78 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
88 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
92 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
102 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
122 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
127 float32x4_t vo0p1 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
132 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
143 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4()
D3x3p1-minmax-wasmsimd-x86-splat-1x4.c61 v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4() local
74 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
79 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
89 vi1x0123 = vi1x4567; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
93 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
103 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
119 vi1x4567 = wasm_v128_and(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
124 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1))); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
129 const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
140 const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4()
D3x3p1-minmax-sse-1x4-acc4.c71 __m128 vi1x4567 = _mm_loadu_ps(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4() local
91 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
96 __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
117 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
133 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
152 vi1x4567 = _mm_and_ps(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
158 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
163 __m128 vo0p1 = _mm_mul_ps(vi1x4567, vk11); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
181 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4()
D3x3p1-minmax-sse-1x4.c71 __m128 vi1x4567 = _mm_loadu_ps(i1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4() local
91 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()
96 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()
117 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()
133 vi1x4567 = vi1x89AB; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()
149 vi1x4567 = _mm_and_ps(vmask, vi1x4567); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()
155 const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()
160 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk11)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()
178 const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4()

Pages: 1 2 3 4 5 6 7 8 9 10 >> … 14