
Searched refs:vi5x4567 (Results 1 – 25 of 178) sorted by relevance
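All of the hits below are in XNNPACK's generated 5x5, padding-2 ("5x5p2") depthwise-convolution CHW micro-kernels, where vi5x4567 is the vector register holding four consecutive elements of input row i5. The recurring pattern in the matches is: load the next four inputs (vi5x89AB), build the shifted windows x2345/x3456/x5678/x6789 from adjacent registers, multiply-accumulate each window against one kernel tap, then rotate the registers (vi5x0123 = vi5x4567; vi5x4567 = vi5x89AB;). The sketch below is not XNNPACK source; it is a minimal one-row, five-tap illustration of that pattern with the same f32 NEON intrinsics the hits use (AArch64 assumed, all names illustrative).

#include <arm_neon.h>
#include <assert.h>
#include <stddef.h>

/*
 * Minimal sketch of the sliding-window pattern shown in the f32 NEON hits,
 * reduced to one input row and one 1x5 kernel (NOT XNNPACK source).
 * `width` must be a positive multiple of 4; two columns of implicit zero
 * padding on each side mirror the "p2" kernels.
 */
static void dwconv1d_5tap_p2(const float* input, float* output, size_t width, const float k[5]) {
  assert(width >= 4 && width % 4 == 0);

  const float32x4_t vk0123 = vld1q_f32(k);   /* taps k0..k3 in one register    */
  const float32x4_t vk4 = vdupq_n_f32(k[4]); /* fifth tap broadcast separately */

  float32x4_t vi_x0123 = vdupq_n_f32(0.0f);  /* left padding starts as zeros   */
  float32x4_t vi_x4567 = vld1q_f32(input);   /* analogue of vi5x4567           */
  input += 4;

  for (size_t c = 0; c < width; c += 4) {
    /* Next block of four inputs, or zeros once we run past the right edge. */
    float32x4_t vi_x89AB = vdupq_n_f32(0.0f);
    if (c + 8 <= width) {
      vi_x89AB = vld1q_f32(input);
      input += 4;
    }

    /* Shifted windows, exactly as in the hits below. */
    const float32x4_t vi_x2345 = vextq_f32(vi_x0123, vi_x4567, 2);
    const float32x4_t vi_x3456 = vextq_f32(vi_x0123, vi_x4567, 3);
    const float32x4_t vi_x5678 = vextq_f32(vi_x4567, vi_x89AB, 1);
    const float32x4_t vi_x6789 = vextq_f32(vi_x4567, vi_x89AB, 2);

    /* One tap per window; the laneq forms pick the tap straight from vk0123. */
    float32x4_t vo = vmulq_laneq_f32(vi_x2345, vk0123, 0);
    vo = vfmaq_laneq_f32(vo, vi_x3456, vk0123, 1);
    vo = vfmaq_laneq_f32(vo, vi_x4567, vk0123, 2);
    vo = vfmaq_laneq_f32(vo, vi_x5678, vk0123, 3);
    vo = vfmaq_f32(vo, vi_x6789, vk4);

    vst1q_f32(output, vo);
    output += 4;

    /* Register rotation, matching "vi5x0123 = vi5x4567; vi5x4567 = vi5x89AB;". */
    vi_x0123 = vi_x4567;
    vi_x4567 = vi_x89AB;
  }
}

For orientation on the file names: the NxM suffix is the output tile each kernel computes per pass (e.g. 2x4 = two output rows, four output pixels), and accN splits each output into N partial accumulators (vo1p0, vo2p1, ...) to shorten the FMA dependency chain. The f16 variants keep the taps packed eight per register, so the tap is selected at the multiply, e.g. vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7), where that lane plays the same role as the vk42 / vget_high_f32(vwKLMN), 1 operands in the f32 hits.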


/external/XNNPACK/src/f16-dwconv2d-chw/gen/
5x5p2-minmax-neonfp16arith-2x4-acc2.c
80 float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2() local
107 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
114 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
141 const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
142 vi5x0123 = vi5x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
164 const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
191 const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
192 vi5x4567 = vi5x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
253 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
260 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc2()
[all …]
5x5p2-minmax-neonfp16arith-2x4.c
80 float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4() local
107 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
114 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
141 const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
142 vi5x0123 = vi5x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
164 const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
191 const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
192 vi5x4567 = vi5x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
251 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
258 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4()
[all …]
5x5p2-minmax-neonfp16arith-2x4-acc3.c
80 float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3() local
107 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
114 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
141 const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
142 vi5x0123 = vi5x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
164 const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
191 const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
192 vi5x4567 = vi5x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
255 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
262 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
[all …]
5x5p2-minmax-neonfp16arith-3x4.c
87 float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4() local
118 vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
121 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
129 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
162 const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
163 vi5x0123 = vi5x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
192 const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
225 const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
226 vi5x4567 = vi5x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
299 vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4()
[all …]
5x5p2-minmax-neonfp16arith-3x4-acc2.c
87 float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2() local
118 vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
121 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
129 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
162 const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
163 vi5x0123 = vi5x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
192 const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
225 const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
226 vi5x4567 = vi5x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
302 vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_3x4_acc2()
[all …]
5x5p2-minmax-neonfp16arith-4x4.c
94 float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4() local
127 vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
131 vo2p0 = vfma_laneq_f16(vo2p0, vi5x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
135 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
144 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
183 const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
184 vi5x0123 = vi5x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
220 const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
259 const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
260 vi5x4567 = vi5x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4()
[all …]
5x5p2-minmax-neonfp16arith-4x4-acc2.c
94 float16x4_t vi5x4567 = vld1_f16(i5); i5 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2() local
127 vo3p0 = vfma_laneq_f16(vo3p0, vi5x4567, vw89ABCDEF, 5); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
131 vo2p1 = vfma_laneq_f16(vo2p1, vi5x4567, vwGHIJKLMN, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
135 vo1p0 = vfma_laneq_f16(vo1p0, vi5x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
144 const float16x4_t vi5x3456 = vext_f16(vi5x0123, vi5x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
183 const float16x4_t vi5x2345 = vext_f16(vi5x0123, vi5x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
184 vi5x0123 = vi5x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
220 const float16x4_t vi5x5678 = vext_f16(vi5x4567, vi5x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
259 const float16x4_t vi5x6789 = vext_f16(vi5x4567, vi5x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
260 vi5x4567 = vi5x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
[all …]
/external/XNNPACK/src/f32-dwconv2d-chw/gen/
5x5p2-minmax-neon-2x4-acc2.c
82 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local
109 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
116 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
143 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
144 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
166 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
193 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
194 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
255 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
262 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
[all …]
5x5p2-minmax-neonfma-2x4.c
82 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local
109 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
116 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
143 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
144 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
166 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
193 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
194 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
253 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
260 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
[all …]
5x5p2-minmax-neon-2x4.c
82 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local
109 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
116 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
143 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
144 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
166 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
193 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
194 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
253 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
260 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
[all …]
5x5p2-minmax-neonfma-2x4-acc2.c
82 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local
109 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
116 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
143 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
144 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
166 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
193 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
194 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
255 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
262 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
[all …]
5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c
109 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local
136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
143 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
170 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
171 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
193 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
220 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
221 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
279 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
286 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4()
[all …]
5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c
109 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2() local
136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
143 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
170 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
171 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
193 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
220 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
221 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
281 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
288 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2()
[all …]
5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c
109 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2() local
136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
143 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
170 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
171 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
193 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
220 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
221 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
281 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
288 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2()
[all …]
5x5p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c
109 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3() local
136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
143 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
170 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
171 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
193 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
220 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
221 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
283 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
290 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3()
[all …]
5x5p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c
109 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3() local
136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
143 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
170 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
171 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
193 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
220 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
221 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
283 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
290 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3()
[all …]
5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c
109 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local
136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
143 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
170 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
171 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
193 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
220 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
221 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
279 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
286 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4()
[all …]
5x5p2-minmax-neon-2x4-acc3.c
82 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3() local
109 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
116 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
143 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
144 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
166 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
193 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
194 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
257 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
264 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3()
[all …]
5x5p2-minmax-neonfma-2x4-acc3.c
82 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3() local
109 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
116 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
143 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
144 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
166 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
193 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
194 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
257 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
264 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3()
[all …]
5x5p2-minmax-wasmsimd-arm-splat-2x4.c
83 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() local
110 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
117 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
144 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
145 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
167 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
194 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
195 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
253 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
260 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4()
[all …]
5x5p2-minmax-wasmsimd-x86-splat-2x4.c
83 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() local
110 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
117 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
144 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
145 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
167 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
194 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
195 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
253 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
260 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4()
[all …]
5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c
116 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2() local
147 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
150 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
158 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
191 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
192 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
221 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
254 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
255 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
330 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2()
[all …]
5x5p2-minmax-neon-3x4.c
89 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
120 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
123 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
131 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
164 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
165 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
194 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
227 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
228 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
301 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
[all …]
5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c
116 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
147 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
150 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
158 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
191 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
192 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
221 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
254 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
255 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
327 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
[all …]
5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c
116 v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
147 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
150 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
158 const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
191 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
192 vi5x0123 = vi5x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
221 const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
254 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
255 vi5x4567 = vi5x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
330 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
[all …]
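The wasmsimd hits above follow the same structure as the NEON ones; the shifted windows just come from wasm_v32x4_shuffle instead of vextq_f32, and the loadsplat/splat variants differ only in whether each tap is pre-broadcast once (vk42, vk32, ...) or re-splatted from a packed weight vector at the point of use (wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)). A small illustration of just that correspondence, again not XNNPACK source (assumes a wasm32 target built with -msimd128; names are illustrative):

#include <wasm_simd128.h>

/* Window construction as in the wasmsimd hits: shuffling two adjacent 4-lane
   registers is the WAsm counterpart of NEON's vextq_f32. */
static v128_t window_3456(v128_t vi5x0123, v128_t vi5x4567) {
  return wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6);  /* like vextq_f32(a, b, 3) */
}

static v128_t window_5678(v128_t vi5x4567, v128_t vi5x89AB) {
  return wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  /* like vextq_f32(a, b, 1) */
}

/* One tap of the accumulation, mirroring
   "vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));".
   Core WAsm SIMD128 has no fused multiply-add, hence the separate mul and add. */
static v128_t accumulate_tap(v128_t vo, v128_t vi, v128_t vk) {
  return wasm_f32x4_add(vo, wasm_f32x4_mul(vi, vk));
}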
