/external/XNNPACK/src/f16-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neonfp16arith-1x4-acc4.c | 72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() local 92 vo0p2 = vfma_laneq_f16(vo0p2, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 202 vo0p2 = vfma_laneq_f16(vo0p2, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() 208 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4() [all …]
|
D | 5x5p2-minmax-neonfp16arith-1x4.c | 72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() local 92 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 199 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() 205 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4() [all …]
|
D | 5x5p2-minmax-neonfp16arith-1x4-acc2.c | 72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() local 92 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 200 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() 206 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfp16arith-1x4-acc3.c | 72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() local 92 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 201 vo0p0 = vfma_laneq_f16(vo0p0, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() 207 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neonfp16arith-1x4-acc5.c | 72 float16x4_t vi4x4567 = vld1_f16(i4); i4 += 4; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() local 92 vo0p1 = vfma_laneq_f16(vo0p1, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 98 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 118 const float16x4_t vi4x2345 = vext_f16(vi4x0123, vi4x4567, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 119 vi4x0123 = vi4x4567; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 135 const float16x4_t vi4x5678 = vext_f16(vi4x4567, vi4x89AB, 1); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 155 const float16x4_t vi4x6789 = vext_f16(vi4x4567, vi4x89AB, 2); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 156 vi4x4567 = vi4x89AB; in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 203 vo0p1 = vfma_laneq_f16(vo0p1, vi4x4567, vwGHIJKLMN, 7); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() 209 const float16x4_t vi4x3456 = vext_f16(vi4x0123, vi4x4567, 3); in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5() [all …]
|
/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neonfma-1x4.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 201 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc2.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 202 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 208 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc3.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 94 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 203 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 209 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc3.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 203 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 209 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neon-1x4.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 201 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc2.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 94 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 202 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 208 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local 121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 230 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4-acc3.c | 75 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() local 95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 101 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 121 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 122 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 138 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 158 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 159 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 203 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 209 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local 121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 229 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 235 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c | 75 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local 95 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 101 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 121 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 122 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 138 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 158 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 159 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 204 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 210 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() local 121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 234 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() local 121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 234 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() local 121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 233 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() local 121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 233 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc4.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local 94 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 204 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 210 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c | 75 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local 95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 101 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 121 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 122 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 138 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 158 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 159 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 202 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 208 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc5.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() local 121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 231 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 237 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c | 101 v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local 121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 127 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 147 const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 148 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 164 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 184 const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 185 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 230 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc5.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() local 94 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 205 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 211 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc5.c | 74 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() local 94 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 120 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 121 vi4x0123 = vi4x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 157 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 158 vi4x4567 = vi4x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 205 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 211 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() [all …]
|