/external/XNNPACK/src/f32-dwconv/gen/ |
D | up8x9-psimd.c | 136 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__psimd() local 138 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 207 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__psimd() local 208 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd() 255 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__psimd() local 256 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd()
|
D | up8x9-neon-acc2.c | 118 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() local 120 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 172 const float32x4_t vk8x0123 = vld1q_f32(w + 68); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() local 173 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() 220 const float32x4_t vk8x0123 = vld1q_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2() local 221 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon_acc2()
|
D | up8x9-neonfma-acc2.c | 118 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() local 120 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 172 const float32x4_t vk8x0123 = vld1q_f32(w + 68); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() local 173 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() 220 const float32x4_t vk8x0123 = vld1q_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2() local 221 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2()
|
D | up8x9-sse-acc2.c | 136 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() local 138 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 210 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() local 211 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() 260 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2() local 261 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse_acc2()
|
D | up8x9-neon.c | 118 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neon() local 120 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 169 const float32x4_t vk8x0123 = vld1q_f32(w + 68); in xnn_f32_dwconv_ukernel_up8x9__neon() local 170 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon() 215 const float32x4_t vk8x0123 = vld1q_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__neon() local 216 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neon()
|
D | up8x9-sse.c | 136 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x9__sse() local 138 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 207 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x9__sse() local 208 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse() 255 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x9__sse() local 256 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x9__sse()
|
D | up8x9-psimd-acc2.c | 136 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() local 138 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 210 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() local 211 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() 260 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2() local 261 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__psimd_acc2()
|
D | up8x9-neonfma.c | 118 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up8x9__neonfma() local 120 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 169 const float32x4_t vk8x0123 = vld1q_f32(w + 68); in xnn_f32_dwconv_ukernel_up8x9__neonfma() local 170 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma() 215 const float32x4_t vk8x0123 = vld1q_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x9__neonfma() local 216 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x9__neonfma()
|
D | up4x9-psimd-acc2.c | 110 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() local 111 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() 160 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2() local 161 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd_acc2()
|
D | up4x9-sse-acc2.c | 110 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() local 111 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() 160 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2() local 161 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse_acc2()
|
D | up4x9-neon.c | 92 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neon() local 93 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neon() 138 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neon() local 139 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neon()
|
D | up4x9-neonfma.c | 92 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neonfma() local 93 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma() 138 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neonfma() local 139 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma()
|
D | up4x9-sse.c | 110 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x9__sse() local 111 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse() 158 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x9__sse() local 159 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x9__sse()
|
D | up4x9-psimd.c | 110 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x9__psimd() local 111 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd() 158 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x9__psimd() local 159 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__psimd()
|
D | up4x9-neonfma-acc2.c | 92 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2() local 93 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2() 140 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2() local 141 vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2()
|
D | up4x9-neon-acc2.c | 92 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neon_acc2() local 93 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neon_acc2() 140 const float32x4_t vk8x0123 = vld1q_f32(w); w += 4; in xnn_f32_dwconv_ukernel_up4x9__neon_acc2() local 141 vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x9__neon_acc2()
|
D | up4x25-psimd-acc2.c | 142 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() local 143 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() 288 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2() local 289 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd_acc2()
|
D | up4x25-psimd.c | 142 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x25__psimd() local 143 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd() 286 const psimd_f32 vk8x0123 = psimd_load_f32(w + 36); in xnn_f32_dwconv_ukernel_up4x25__psimd() local 287 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up4x25__psimd()
|
D | up4x25-sse.c | 142 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x25__sse() local 143 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse() 286 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x25__sse() local 287 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse()
|
D | up4x25-sse-acc2.c | 142 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() local 143 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() 288 const __m128 vk8x0123 = _mm_load_ps(w + 36); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2() local 289 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up4x25__sse_acc2()
|
D | up8x25-psimd.c | 168 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x25__psimd() local 170 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 383 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x25__psimd() local 384 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd() 527 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x25__psimd() local 528 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd()
|
D | up8x25-psimd-acc2.c | 168 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() local 170 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 386 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() local 387 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() 532 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2() local 533 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123); in xnn_f32_dwconv_ukernel_up8x25__psimd_acc2()
|
D | up8x25-sse.c | 168 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x25__sse() local 170 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 383 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x25__sse() local 384 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse() 527 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x25__sse() local 528 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse()
|
D | up8x25-sse-acc2.c | 168 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() local 170 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 386 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() local 387 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() 532 const __m128 vk8x0123 = _mm_load_ps(w + 72); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2() local 533 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123)); in xnn_f32_dwconv_ukernel_up8x25__sse_acc2()
|