/external/XNNPACK/src/f32-conv-hwc/gen/ |
D | 3x3s2p0p1c3x8-neonfma-2x1.c | 87 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() local 102 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 104 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 117 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 119 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 126 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 128 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 141 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 143 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 150 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() [all …]
|
D | 3x3s2p0p1c3x8-neon-2x1.c | 85 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() local 100 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 102 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 115 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 117 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 124 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 126 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 139 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 141 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 148 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() [all …]
|
D | 3x3s2p0p1c3x8-neonfma-2x2.c | 87 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() local 113 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 115 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 138 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 140 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 152 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 154 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 177 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 179 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 198 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() [all …]
|
D | 3x3s2p0p1c3x8-neon-2x2.c | 85 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() local 111 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 113 vo1x0c4567 = vmlaq_lane_f32(vo1x0c4567, vk00c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 136 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 138 vo0x0c4567 = vmlaq_lane_f32(vo0x0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 150 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 152 vo1x0c4567 = vmlaq_lane_f32(vo1x0c4567, vk00c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 175 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 177 vo0x0c4567 = vmlaq_lane_f32(vo0x0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 196 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() [all …]
|
D | 3x3s2p0p1c3x4-neon-2x1.c | 85 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() local 97 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 106 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 112 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 121 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 127 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 136 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 142 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 151 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() 245 vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1() [all …]
|
D | 3x3s2p0p1c3x4-neonfma-2x1.c | 87 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() local 99 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 108 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 114 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 123 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 129 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 138 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 144 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 153 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() 247 vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1() [all …]
|
D | 3x3s2p1c3x8-neon-2x1.c | 85 float32x4_t vi2x0 = vmovq_n_f32(0.0f); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() local 100 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 102 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 115 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 117 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 124 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 126 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 139 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 141 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 148 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() [all …]
|
D | 3x3s2p1c3x8-neonfma-2x1.c | 87 float32x4_t vi2x0 = vmovq_n_f32(0.0f); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() local 102 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 104 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 117 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 119 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 126 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 128 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 141 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 143 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 150 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() [all …]
|
D | 3x3s2p0p1c3x4-neon-2x2.c | 85 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() local 106 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 121 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 130 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 145 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 161 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 176 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 185 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 200 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 335 vi2x0 = vi2x3; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() [all …]
|
D | 3x3s2p0p1c3x4-neonfma-2x2.c | 87 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() local 108 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 123 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 132 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 147 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 163 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 178 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 187 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 202 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 337 vi2x0 = vi2x3; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() [all …]
|
D | 3x3s2p1c3x8-neonfma-2x2.c | 87 float32x4_t vi2x0 = vmovq_n_f32(0.0f); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() local 113 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 115 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 138 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 140 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 159 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 161 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk00c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 184 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 186 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk20c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 198 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() [all …]
|
D | 3x3s2p1c3x8-neon-2x2.c | 85 float32x4_t vi2x0 = vmovq_n_f32(0.0f); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() local 111 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 113 vo1x0c4567 = vmlaq_lane_f32(vo1x0c4567, vk00c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 136 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 138 vo0x0c4567 = vmlaq_lane_f32(vo0x0c4567, vk20c0x4567, vget_low_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 157 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 159 vo1x0c4567 = vmlaq_lane_f32(vo1x0c4567, vk00c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 182 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 184 vo0x0c4567 = vmlaq_lane_f32(vo0x0c4567, vk20c1x4567, vget_high_f32(vi2x0), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 196 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() [all …]
|
/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-scalar-3x1.c | 71 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 91 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 93 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 95 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 101 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 149 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 151 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 153 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1()
|
D | 3x3p1-minmax-scalar-2x1-acc2.c | 65 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() local 83 float vo1p1 = vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 84 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 89 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 129 float vo1p1 = vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 130 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2()
|
D | 3x3p1-minmax-scalar-2x1.c | 65 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() local 83 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 84 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 89 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 127 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 128 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1()
|
D | 3x3p1-minmax-scalar-4x1.c | 77 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 100 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 103 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 106 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 113 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 172 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 175 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 178 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3s2p1-minmax-scalar-2x1-acc2.c | 76 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() local 89 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 92 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 115 vi2x0 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 148 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 151 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2()
|
D | 3x3s2p1-minmax-scalar-2x1.c | 76 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() local 89 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 92 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 115 vi2x0 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 146 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 149 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1()
|
D | 3x3p1-minmax-scalar-5x1.c | 83 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 109 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 113 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 117 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 125 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 195 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 199 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 203 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
|
D | 5x5p2-minmax-scalar-3x1.c | 92 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 135 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 137 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 139 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 151 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 262 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 264 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 266 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 278 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 367 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() [all …]
|
D | 5x5p2-minmax-scalar-2x1.c | 86 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 125 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 126 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 135 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 220 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 221 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 230 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 299 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 300 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 3x3p1-minmax-scalar-1x1-acc2.c | 59 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() local 73 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 77 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 104 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2()
|
D | 3x3p1-minmax-scalar-1x1.c | 59 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() local 73 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 77 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 103 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1()
|
D | 3x3p1-minmax-scalar-1x1-acc3.c | 59 float vi2x0 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() local 73 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 77 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 105 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3()
|
/external/XNNPACK/src/f32-conv-hwc2chw/ |
D | 3x3s2p1c3x4-wasmsimd-2x2.c | 99 v128_t vi2x0 = vzero; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() local 120 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 133 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 148 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi2x0, vi2x0, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 161 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi2x0, vi2x0, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 169 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi2x0, vi2x0, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 182 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi2x0, vi2x0, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 322 vi2x0 = vi2x3; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 368 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 385 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x0, vi2x0, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() [all …]
|