/external/XNNPACK/src/f32-ppmm/gen/ |
D | 8x8-minmax-neonfma.c | 88 const float32x4_t vb0123 = vld1q_f32(w); w += 4; in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() local 92 vacc0x0123 = vfmaq_laneq_f32(vacc0x0123, vb0123, va0123, 0); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 93 vacc1x0123 = vfmaq_laneq_f32(vacc1x0123, vb0123, va0123, 1); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 94 vacc2x0123 = vfmaq_laneq_f32(vacc2x0123, vb0123, va0123, 2); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 95 vacc3x0123 = vfmaq_laneq_f32(vacc3x0123, vb0123, va0123, 3); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 96 vacc4x0123 = vfmaq_laneq_f32(vacc4x0123, vb0123, va4567, 0); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 97 vacc5x0123 = vfmaq_laneq_f32(vacc5x0123, vb0123, va4567, 1); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 98 vacc6x0123 = vfmaq_laneq_f32(vacc6x0123, vb0123, va4567, 2); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 99 vacc7x0123 = vfmaq_laneq_f32(vacc7x0123, vb0123, va4567, 3); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() 118 vacc0x0123 = vfmaq_f32(vacc0x0123, va0000, vb0123); in xnn_f32_ppmm_minmax_ukernel_8x8__neonfma() [all …]
|
D | 4x8-minmax-neonfma.c | 63 const float32x4_t vb0123 = vld1q_f32(w); w += 4; in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() local 67 vacc0x0123 = vfmaq_laneq_f32(vacc0x0123, vb0123, va0123, 0); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() 68 vacc1x0123 = vfmaq_laneq_f32(vacc1x0123, vb0123, va0123, 1); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() 69 vacc2x0123 = vfmaq_laneq_f32(vacc2x0123, vb0123, va0123, 2); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() 70 vacc3x0123 = vfmaq_laneq_f32(vacc3x0123, vb0123, va0123, 3); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() 81 vacc0x0123 = vfmaq_f32(vacc0x0123, va0000, vb0123); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() 82 vacc1x0123 = vfmaq_f32(vacc1x0123, va1111, vb0123); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() 83 vacc2x0123 = vfmaq_f32(vacc2x0123, va2222, vb0123); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma() 84 vacc3x0123 = vfmaq_f32(vacc3x0123, va3333, vb0123); in xnn_f32_ppmm_minmax_ukernel_4x8__neonfma()
|
/external/XNNPACK/src/f32-vbinary/gen/ |
D | vmin-sse-x8.c | 38 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmin_ukernel__sse_x8() local 42 __m128 vy0123 = _mm_min_ps(va0123, vb0123); in xnn_f32_vmin_ukernel__sse_x8() 55 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmin_ukernel__sse_x8() local 58 __m128 vy0123 = _mm_min_ps(va0123, vb0123); in xnn_f32_vmin_ukernel__sse_x8() 64 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmin_ukernel__sse_x8() local 66 __m128 vy0123 = _mm_min_ps(va0123, vb0123); in xnn_f32_vmin_ukernel__sse_x8()
|
D | vmax-sse-x8.c | 38 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmax_ukernel__sse_x8() local 42 __m128 vy0123 = _mm_max_ps(va0123, vb0123); in xnn_f32_vmax_ukernel__sse_x8() 55 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmax_ukernel__sse_x8() local 58 __m128 vy0123 = _mm_max_ps(va0123, vb0123); in xnn_f32_vmax_ukernel__sse_x8() 64 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmax_ukernel__sse_x8() local 66 __m128 vy0123 = _mm_max_ps(va0123, vb0123); in xnn_f32_vmax_ukernel__sse_x8()
|
D | vmin-neon-x8.c | 34 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmin_ukernel__neon_x8() local 38 float32x4_t vy0123 = vminq_f32(va0123, vb0123); in xnn_f32_vmin_ukernel__neon_x8() 48 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmin_ukernel__neon_x8() local 50 float32x4_t vy0123 = vminq_f32(va0123, vb0123); in xnn_f32_vmin_ukernel__neon_x8() 55 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vmin_ukernel__neon_x8() local 57 float32x4_t vy0123 = vminq_f32(va0123, vb0123); in xnn_f32_vmin_ukernel__neon_x8()
|
D | vmax-neon-x8.c | 34 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmax_ukernel__neon_x8() local 38 float32x4_t vy0123 = vmaxq_f32(va0123, vb0123); in xnn_f32_vmax_ukernel__neon_x8() 48 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmax_ukernel__neon_x8() local 50 float32x4_t vy0123 = vmaxq_f32(va0123, vb0123); in xnn_f32_vmax_ukernel__neon_x8() 55 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vmax_ukernel__neon_x8() local 57 float32x4_t vy0123 = vmaxq_f32(va0123, vb0123); in xnn_f32_vmax_ukernel__neon_x8()
|
D | vsqrdiff-sse-x8.c | 38 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsqrdiff_ukernel__sse_x8() local 42 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__sse_x8() 57 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsqrdiff_ukernel__sse_x8() local 60 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__sse_x8() 67 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsqrdiff_ukernel__sse_x8() local 69 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__sse_x8()
|
D | vsqrdiff-neon-x8.c | 34 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vsqrdiff_ukernel__neon_x8() local 38 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__neon_x8() 50 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vsqrdiff_ukernel__neon_x8() local 52 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__neon_x8() 58 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vsqrdiff_ukernel__neon_x8() local 60 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__neon_x8()
|
D | vmul-minmax-neon-x8.c | 36 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmul_minmax_ukernel__neon_x8() local 40 float32x4_t vy0123 = vmulq_f32(va0123, vb0123); in xnn_f32_vmul_minmax_ukernel__neon_x8() 55 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmul_minmax_ukernel__neon_x8() local 57 float32x4_t vy0123 = vmulq_f32(va0123, vb0123); in xnn_f32_vmul_minmax_ukernel__neon_x8() 64 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vmul_minmax_ukernel__neon_x8() local 66 float32x4_t vy0123 = vmulq_f32(va0123, vb0123); in xnn_f32_vmul_minmax_ukernel__neon_x8()
|
D | vadd-minmax-sse-x8.c | 40 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vadd_minmax_ukernel__sse_x8() local 44 __m128 vy0123 = _mm_add_ps(va0123, vb0123); in xnn_f32_vadd_minmax_ukernel__sse_x8() 62 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vadd_minmax_ukernel__sse_x8() local 65 __m128 vy0123 = _mm_add_ps(va0123, vb0123); in xnn_f32_vadd_minmax_ukernel__sse_x8() 73 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vadd_minmax_ukernel__sse_x8() local 75 __m128 vy0123 = _mm_add_ps(va0123, vb0123); in xnn_f32_vadd_minmax_ukernel__sse_x8()
|
D | vdiv-minmax-neon-x8.c | 36 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vdiv_minmax_ukernel__neon_x8() local 40 float32x4_t vy0123 = vdivq_f32(va0123, vb0123); in xnn_f32_vdiv_minmax_ukernel__neon_x8() 55 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vdiv_minmax_ukernel__neon_x8() local 57 float32x4_t vy0123 = vdivq_f32(va0123, vb0123); in xnn_f32_vdiv_minmax_ukernel__neon_x8() 64 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vdiv_minmax_ukernel__neon_x8() local 66 float32x4_t vy0123 = vdivq_f32(va0123, vb0123); in xnn_f32_vdiv_minmax_ukernel__neon_x8()
|
D | vadd-minmax-neon-x8.c | 36 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vadd_minmax_ukernel__neon_x8() local 40 float32x4_t vy0123 = vaddq_f32(va0123, vb0123); in xnn_f32_vadd_minmax_ukernel__neon_x8() 55 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vadd_minmax_ukernel__neon_x8() local 57 float32x4_t vy0123 = vaddq_f32(va0123, vb0123); in xnn_f32_vadd_minmax_ukernel__neon_x8() 64 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vadd_minmax_ukernel__neon_x8() local 66 float32x4_t vy0123 = vaddq_f32(va0123, vb0123); in xnn_f32_vadd_minmax_ukernel__neon_x8()
|
D | vmul-minmax-sse-x8.c | 40 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmul_minmax_ukernel__sse_x8() local 44 __m128 vy0123 = _mm_mul_ps(va0123, vb0123); in xnn_f32_vmul_minmax_ukernel__sse_x8() 62 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmul_minmax_ukernel__sse_x8() local 65 __m128 vy0123 = _mm_mul_ps(va0123, vb0123); in xnn_f32_vmul_minmax_ukernel__sse_x8() 73 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmul_minmax_ukernel__sse_x8() local 75 __m128 vy0123 = _mm_mul_ps(va0123, vb0123); in xnn_f32_vmul_minmax_ukernel__sse_x8()
|
D | vdiv-minmax-sse-x8.c | 40 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vdiv_minmax_ukernel__sse_x8() local 44 __m128 vy0123 = _mm_div_ps(va0123, vb0123); in xnn_f32_vdiv_minmax_ukernel__sse_x8() 62 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vdiv_minmax_ukernel__sse_x8() local 65 __m128 vy0123 = _mm_div_ps(va0123, vb0123); in xnn_f32_vdiv_minmax_ukernel__sse_x8() 73 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vdiv_minmax_ukernel__sse_x8() local 75 __m128 vy0123 = _mm_div_ps(va0123, vb0123); in xnn_f32_vdiv_minmax_ukernel__sse_x8()
|
D | vsub-minmax-sse-x8.c | 40 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsub_minmax_ukernel__sse_x8() local 44 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__sse_x8() 62 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsub_minmax_ukernel__sse_x8() local 65 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__sse_x8() 73 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsub_minmax_ukernel__sse_x8() local 75 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__sse_x8()
|
D | vsub-minmax-neon-x8.c | 36 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vsub_minmax_ukernel__neon_x8() local 40 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__neon_x8() 55 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vsub_minmax_ukernel__neon_x8() local 57 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__neon_x8() 64 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vsub_minmax_ukernel__neon_x8() local 66 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__neon_x8()
|
D | vmax-neon-x4.c | 34 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmax_ukernel__neon_x4() local 36 float32x4_t vy0123 = vmaxq_f32(va0123, vb0123); in xnn_f32_vmax_ukernel__neon_x4() 44 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vmax_ukernel__neon_x4() local 46 float32x4_t vy0123 = vmaxq_f32(va0123, vb0123); in xnn_f32_vmax_ukernel__neon_x4()
|
D | vmax-sse-x4.c | 37 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmax_ukernel__sse_x4() local 40 __m128 vy0123 = _mm_max_ps(va0123, vb0123); in xnn_f32_vmax_ukernel__sse_x4() 49 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmax_ukernel__sse_x4() local 51 __m128 vy0123 = _mm_max_ps(va0123, vb0123); in xnn_f32_vmax_ukernel__sse_x4()
|
D | vmin-neon-x4.c | 34 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vmin_ukernel__neon_x4() local 36 float32x4_t vy0123 = vminq_f32(va0123, vb0123); in xnn_f32_vmin_ukernel__neon_x4() 44 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vmin_ukernel__neon_x4() local 46 float32x4_t vy0123 = vminq_f32(va0123, vb0123); in xnn_f32_vmin_ukernel__neon_x4()
|
D | vmin-sse-x4.c | 37 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmin_ukernel__sse_x4() local 40 __m128 vy0123 = _mm_min_ps(va0123, vb0123); in xnn_f32_vmin_ukernel__sse_x4() 49 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vmin_ukernel__sse_x4() local 51 __m128 vy0123 = _mm_min_ps(va0123, vb0123); in xnn_f32_vmin_ukernel__sse_x4()
|
D | vsqrdiff-sse-x4.c | 37 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsqrdiff_ukernel__sse_x4() local 40 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__sse_x4() 50 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsqrdiff_ukernel__sse_x4() local 52 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__sse_x4()
|
D | vsqrdiff-neon-x4.c | 34 const float32x4_t vb0123 = vld1q_f32(b); b += 4; in xnn_f32_vsqrdiff_ukernel__neon_x4() local 36 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__neon_x4() 45 const float32x4_t vb0123 = vld1q_f32(b); in xnn_f32_vsqrdiff_ukernel__neon_x4() local 47 float32x4_t vy0123 = vsubq_f32(va0123, vb0123); in xnn_f32_vsqrdiff_ukernel__neon_x4()
|
D | vsub-minmax-sse-x4.c | 39 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsub_minmax_ukernel__sse_x4() local 42 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__sse_x4() 54 const __m128 vb0123 = _mm_loadu_ps(b); in xnn_f32_vsub_minmax_ukernel__sse_x4() local 56 __m128 vy0123 = _mm_sub_ps(va0123, vb0123); in xnn_f32_vsub_minmax_ukernel__sse_x4()
|
/external/XNNPACK/src/f32-ibilinear/gen/ |
D | neonfma-c8.c | 62 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() local 67 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_ibilinear_ukernel__neonfma_c8() local 72 const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); in xnn_f32_ibilinear_ukernel__neonfma_c8() 97 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() local 100 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_ibilinear_ukernel__neonfma_c8() local 103 const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); in xnn_f32_ibilinear_ukernel__neonfma_c8() 125 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c8() local 128 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_ibilinear_ukernel__neonfma_c8() local 131 const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); in xnn_f32_ibilinear_ukernel__neonfma_c8()
|
D | neonfma-c4.c | 56 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c4() local 59 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_ibilinear_ukernel__neonfma_c4() local 62 const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); in xnn_f32_ibilinear_ukernel__neonfma_c4() 83 const float32x4_t vb0123 = vfmaq_f32(vbl0123, vbd0123, valphah); in xnn_f32_ibilinear_ukernel__neonfma_c4() local 86 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_ibilinear_ukernel__neonfma_c4() local 89 const float32x4_t vd0123 = vsubq_f32(vb0123, vt0123); in xnn_f32_ibilinear_ukernel__neonfma_c4()
|