Home
Sort by: last modified time | relevance | path

Searched refs:vi4x4567 (Results 1 – 25 of 226) sorted by relevance

Pages: 1 2 3 4 5 6 7 8 9 10

/external/XNNPACK/src/f16-dwconv2d-chw/gen/
D5x5p2-minmax-neonfp16arith-1x4-acc4.c72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() local
92 vo0p2 = vfma_laneq_f16(vo0p2, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
202 vo0p2 = vfma_laneq_f16(vo0p2, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
208 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
[all …]
D5x5p2-minmax-neonfp16arith-1x4.c72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() local
92 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
199 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
205 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4()
[all …]
D5x5p2-minmax-neonfp16arith-1x4-acc2.c72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() local
92 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
200 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
206 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2()
[all …]
D5x5p2-minmax-neonfp16arith-1x4-acc3.c72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() local
92 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
201 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
207 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
[all …]
D5x5p2-minmax-neonfp16arith-1x4-acc5.c72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() local
92 vo0p1 = vfma_laneq_f16(vo0p1, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
203 vo0p1 = vfma_laneq_f16(vo0p1, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
209 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
[all …]
/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neonfma-1x4.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
201 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
[all …]
D5x5p2-minmax-neonfma-1x4-acc2.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
202 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
208 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
[all …]
D5x5p2-minmax-neonfma-1x4-acc3.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
203 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
209 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
[all …]
D5x5p2-minmax-neon-1x4-acc3.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
203 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
209 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
[all …]
D5x5p2-minmax-neon-1x4.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
201 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
[all …]
D5x5p2-minmax-neon-1x4-acc2.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local
94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
202 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
208 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local
121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
230 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4-acc3.c75 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() local
95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
101 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
121 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
122 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
138 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
158 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
159 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
203 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
209 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
235 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c75 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local
95 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
101 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
121 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
122 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
138 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
158 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
159 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
204 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
210 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() local
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
234 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() local
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
234 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() local
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
233 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() local
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
233 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
[all …]
D5x5p2-minmax-neon-1x4-acc4.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local
94 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
204 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
210 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c75 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local
95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
101 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
121 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
122 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
138 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
158 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
159 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
202 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
208 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc5.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() local
121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
231 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
237 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local
121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
230 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
[all …]
D5x5p2-minmax-neon-1x4-acc5.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() local
94 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
205 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
211 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
[all …]
D5x5p2-minmax-neonfma-1x4-acc5.c74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() local
94 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
205 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
211 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
[all …]

Pages: 1 2 3 4 5 6 7 8 9 10