/external/XNNPACK/src/f32-dwconv-spchw/ |
D | 3x3p1-neonfma.c | 60 float32x4_t vi2x4567 = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() local 78 vo4567p00 = vfmaq_lane_f32(vo4567p00, vi2x4567, vw89, 0); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 81 vo4567p01 = vfmaq_laneq_f32(vo4567p01, vi2x4567, vw4567, 1); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 84 vo4567p02 = vfmaq_laneq_f32(vo4567p02, vi2x4567, vw0123, 2); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 90 const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 108 vi2x0123 = vi2x4567; in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 114 const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 132 vi2x4567 = vi2x89AB; in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 161 vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567))); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() 167 vo4567p00 = vfmaq_lane_f32(vo4567p00, vi2x4567, vw89, 0); in xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma() [all …]
|
D | 3x3p1-sse.c | 65 __m128 vi2x4567 = _mm_loadu_ps(i2); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() local 87 const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() 91 __m128 vo4567p2 = _mm_mul_ps(vi2x4567, vk21); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() 113 const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() 128 vi2x4567 = vi2x89AB; in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() 147 vi2x4567 = _mm_and_ps(vmask, vi2x4567); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() 154 const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() 158 __m128 vo4567p2 = _mm_mul_ps(vi2x4567, vk21); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse() 177 const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero); in xnn_f32_dwconv_spchw_ukernel_3x3p1__sse()
|
D | 5x5p2-neonfma.c | 66 float32x4_t vi2x4567 = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() local 88 vo4567p20 = vfmaq_laneq_f32(vo4567p20, vi2x4567, vw0123, 3); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 91 vo4567p10 = vfmaq_laneq_f32(vo4567p10, vi2x4567, vw89AB, 0); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 94 vo4567p00 = vfmaq_laneq_f32(vo4567p00, vi2x4567, vwCDEF, 1); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 109 const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 137 const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 165 vi2x0123 = vi2x4567; in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 173 const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 201 const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2); in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() 229 vi2x4567 = vi2x89AB; in xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma() [all …]
|
D | 3x3s2p1-neonfma.c | 56 …const float32x4_t vi2x4567 = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stri… in xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma() local 66 const float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB); in xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma() 67 const float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB); in xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma() 109 const float32x4_t vi2x4567 = vld1q_f32(i2); in xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma() local 119 …reinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vuzp1q_f32(vi2x4567, vi2x89AB)))); in xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma() 120 …reinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vuzp2q_f32(vi2x4567, vi2x89AB)))); in xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma()
|
D | 5x5s2p2-neonfma.c | 61 float32x4_t vi2x4567 = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride); in xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma() local 97 float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB); in xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma() 98 float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB); in xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma() 177 vi2x4567 = vi2xCDEF; in xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma() 249 float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB); in xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma() 250 float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB); in xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma() 341 vi2x4567 = vi2xCDEF; in xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma()
|
/external/XNNPACK/src/f32-dwconv/gen/ |
D | up8x4-psimd.c | 69 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4); in xnn_f32_dwconv_ukernel_up8x4__psimd() local 75 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x4__psimd()
|
D | up8x4-psimd-acc2.c | 69 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4); in xnn_f32_dwconv_ukernel_up8x4__psimd_acc2() local 75 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x4__psimd_acc2()
|
D | up8x4-sse.c | 69 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4); in xnn_f32_dwconv_ukernel_up8x4__sse() local 75 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567)); in xnn_f32_dwconv_ukernel_up8x4__sse()
|
D | up8x4-sse-acc2.c | 69 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4); in xnn_f32_dwconv_ukernel_up8x4__sse_acc2() local 75 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567)); in xnn_f32_dwconv_ukernel_up8x4__sse_acc2()
|
D | up8x9-psimd.c | 79 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4); in xnn_f32_dwconv_ukernel_up8x9__psimd() local 85 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x9__psimd()
|
D | up8x9-neon-acc2.c | 75 const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() local 79 vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2()
|
D | up8x9-neonfma-acc2.c | 75 const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() local 79 vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2()
|
D | up8x9-sse-acc2.c | 79 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() local 85 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2()
|
D | up8x9-neon.c | 75 const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; in xnn_f32_dwconv_ukernel_up8x9__neon() local 79 vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x9__neon()
|
D | up8x9-sse.c | 79 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4); in xnn_f32_dwconv_ukernel_up8x9__sse() local 85 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567)); in xnn_f32_dwconv_ukernel_up8x9__sse()
|
D | up8x9-psimd-acc2.c | 79 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() local 85 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2()
|
D | up8x9-neonfma.c | 75 const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4; in xnn_f32_dwconv_ukernel_up8x9__neonfma() local 79 vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x9__neonfma()
|
D | up8x25-psimd.c | 111 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4); in xnn_f32_dwconv_ukernel_up8x25__psimd() local 117 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x25__psimd()
|
D | up8x25-psimd-acc2.c | 111 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() local 117 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2()
|
D | up8x25-sse.c | 111 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4); in xnn_f32_dwconv_ukernel_up8x25__sse() local 117 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567)); in xnn_f32_dwconv_ukernel_up8x25__sse()
|
D | up8x25-sse-acc2.c | 111 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() local 117 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2()
|