/external/XNNPACK/src/f32-dwconv/gen/ |
D | up4x25-psimd.c | 88 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up4x25__psimd() local 95 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 101 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 107 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 113 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 119 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 125 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 131 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 137 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 143 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() [all …]
|
D | up4x25-sse.c | 88 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up4x25__sse() local 95 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 101 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 107 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 113 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 119 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 125 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 131 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 137 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 143 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() [all …]
|
D | up8x25-psimd.c | 88 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up8x25__psimd() local 98 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 107 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 116 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 125 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 134 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 143 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 152 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 161 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 170 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() [all …]
|
D | up8x25-sse.c | 88 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up8x25__sse() local 98 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 107 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 116 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 125 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 134 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 143 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 152 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 161 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 170 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() [all …]
|
D | up8x9-psimd.c | 56 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up8x9__psimd() local 66 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 75 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 84 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 93 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 102 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 111 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 120 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 129 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 138 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() [all …]
|
D | up8x9-neon.c | 56 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neon() local 64 vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 71 vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 78 vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 85 vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 92 vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 99 vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 106 vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 113 vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 120 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() [all …]
|
D | up8x9-sse.c | 56 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up8x9__sse() local 66 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 75 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 84 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 93 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 102 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 111 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 120 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 129 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 138 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() [all …]
|
D | up8x9-neonfma.c | 56 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neonfma() local 64 vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 71 vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 78 vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 85 vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 92 vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 99 vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 106 vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 113 vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 120 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() [all …]
|
D | up4x9-neon.c | 56 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neon() local 61 vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 65 vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 69 vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 73 vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 77 vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 81 vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 85 vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 89 vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 93 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() [all …]
|
D | up4x9-neonfma.c | 56 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neonfma() local 61 vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 65 vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 69 vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 73 vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 77 vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 81 vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 85 vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 89 vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 93 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() [all …]
|
D | up4x9-sse.c | 56 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up4x9__sse() local 63 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 69 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 75 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 81 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 87 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 93 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 99 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 105 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 111 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() [all …]
|
D | up4x9-psimd.c | 56 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up4x9__psimd() local 63 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 69 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 75 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 81 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 87 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 93 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 99 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 105 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 111 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() [all …]
|
D | up4x25-psimd-acc2.c | 88 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() local 95 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 107 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 119 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 131 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 143 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 155 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 167 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 179 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 191 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() [all …]
|
D | up4x25-sse-acc2.c | 88 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() local 95 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 107 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 119 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 131 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 143 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 155 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 167 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 179 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 191 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() [all …]
|
D | up8x4-psimd.c | 46 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up8x4__psimd() local 56 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x4__psimd() 65 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up8x4__psimd() 74 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x4__psimd() 83 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up8x4__psimd() 89 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up8x4__psimd() 99 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up8x4__psimd() local 105 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x4__psimd() 111 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up8x4__psimd() 117 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x4__psimd() [all …]
|
D | up8x25-psimd-acc2.c | 88 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() local 98 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 116 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 134 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 152 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 170 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 188 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 206 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 224 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 242 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() [all …]
|
D | up8x4-sse.c | 46 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up8x4__sse() local 56 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up8x4__sse() 65 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123)); in xnn_f32_dwconv_ukernel_up8x4__sse() 74 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up8x4__sse() 83 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123)); in xnn_f32_dwconv_ukernel_up8x4__sse() 89 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up8x4__sse() 99 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up8x4__sse() local 105 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up8x4__sse() 111 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123)); in xnn_f32_dwconv_ukernel_up8x4__sse() 117 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up8x4__sse() [all …]
|
D | up8x25-sse-acc2.c | 88 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() local 98 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 116 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 134 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 152 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 170 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 188 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 206 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 224 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 242 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() [all …]
|
D | up8x9-neon-acc2.c | 56 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() local 64 vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 78 vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 92 vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 106 vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 120 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 124 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 127 float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 136 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() local 141 vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() [all …]
|
D | up8x9-neonfma-acc2.c | 56 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() local 64 vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 78 vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 92 vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 106 vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 120 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 124 vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 127 float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 136 float32x4_t vacc0123p0 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() local 141 vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() [all …]
|
D | up8x9-sse-acc2.c | 56 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() local 66 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 84 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 102 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 120 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 138 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 144 vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 147 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 157 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() local 163 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() [all …]
|
D | up8x9-psimd-acc2.c | 56 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() local 66 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 84 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 102 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 120 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 138 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 144 vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 147 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 157 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() local 163 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() [all …]
|
D | up4x9-psimd-acc2.c | 56 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() local 63 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 75 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 87 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 99 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 111 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 116 vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 118 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 125 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() local 129 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() [all …]
|
D | up4x9-sse-acc2.c | 56 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() local 63 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 75 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 87 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 99 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 111 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 116 vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 118 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 125 __m128 vacc0123p0 = _mm_load_ps(w); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() local 129 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() [all …]
|
D | up4x4-psimd.c | 46 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up4x4__psimd() local 53 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x4__psimd() 59 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up4x4__psimd() 65 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x4__psimd() 71 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123); in xnn_f32_dwconv_ukernel_up4x4__psimd() 76 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin); in xnn_f32_dwconv_ukernel_up4x4__psimd() 83 psimd_f32 vacc0123p0 = psimd_load_f32(w); in xnn_f32_dwconv_ukernel_up4x4__psimd() local 87 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123); in xnn_f32_dwconv_ukernel_up4x4__psimd() 91 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123); in xnn_f32_dwconv_ukernel_up4x4__psimd() 95 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123); in xnn_f32_dwconv_ukernel_up4x4__psimd() [all …]
|