/external/XNNPACK/src/f32-bilinear/gen/ |
D | neon-c8.c | 38 const float32x2_t valphahv = vld1_f32(weights); weights += 2; in xnn_f32_bilinear_ukernel__neon_c8() local 56 const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c8() 57 const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c8() 58 const float32x4_t vt4567 = vmlaq_lane_f32(vtl4567, vtd4567, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c8() 59 const float32x4_t vb4567 = vmlaq_lane_f32(vbl4567, vbd4567, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c8() 64 const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); in xnn_f32_bilinear_ukernel__neon_c8() 65 const float32x4_t vo4567 = vmlaq_lane_f32(vt4567, vd4567, valphahv, 1); in xnn_f32_bilinear_ukernel__neon_c8() 79 const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c8() 80 const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c8() 84 const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); in xnn_f32_bilinear_ukernel__neon_c8() [all …]
|
D | neonfma-c8.c | 38 const float32x2_t valphahv = vld1_f32(weights); weights += 2; in xnn_f32_bilinear_ukernel__neonfma_c8() local 40 const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c8() 41 const float32x4_t valphav = vdupq_lane_f32(valphahv, 1); in xnn_f32_bilinear_ukernel__neonfma_c8() 66 const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c8() 67 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c8() 68 const float32x4_t vt4567 = vfmaq_lane_f32(vtl4567, vtd4567, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c8() 69 const float32x4_t vb4567 = vfmaq_lane_f32(vbl4567, vbd4567, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c8() 79 const float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); in xnn_f32_bilinear_ukernel__neonfma_c8() 80 const float32x4_t vo4567 = vfmaq_lane_f32(vt4567, vd4567, valphahv, 1); in xnn_f32_bilinear_ukernel__neonfma_c8() 99 const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c8() [all …]
|
D | neonfma-c4.c | 38 const float32x2_t valphahv = vld1_f32(weights); weights += 2; in xnn_f32_bilinear_ukernel__neonfma_c4() local 40 const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c4() 41 const float32x4_t valphav = vdupq_lane_f32(valphahv, 1); in xnn_f32_bilinear_ukernel__neonfma_c4() 58 const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c4() 59 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c4() 67 const float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); in xnn_f32_bilinear_ukernel__neonfma_c4() 85 const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c4() 86 const float32x4_t vb0123 = vfmaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neonfma_c4() 94 float32x4_t vo0123 = vfmaq_lane_f32(vt0123, vd0123, valphahv, 1); in xnn_f32_bilinear_ukernel__neonfma_c4()
|
D | sse-c4.c | 37 __m128 valphahv = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) weights); in xnn_f32_bilinear_ukernel__sse_c4() local 38 valphahv = _mm_unpacklo_ps(valphahv, valphahv); in xnn_f32_bilinear_ukernel__sse_c4() 39 const __m128 valphah = _mm_movelh_ps(valphahv, valphahv); in xnn_f32_bilinear_ukernel__sse_c4() 40 const __m128 valphav = _mm_movehl_ps(valphahv, valphahv); in xnn_f32_bilinear_ukernel__sse_c4()
|
D | neon-c4.c | 38 const float32x2_t valphahv = vld1_f32(weights); weights += 2; in xnn_f32_bilinear_ukernel__neon_c4() local 50 const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c4() 51 const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c4() 55 const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); in xnn_f32_bilinear_ukernel__neon_c4() 68 const float32x4_t vt0123 = vmlaq_lane_f32(vtl0123, vtd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c4() 69 const float32x4_t vb0123 = vmlaq_lane_f32(vbl0123, vbd0123, valphahv, 0); in xnn_f32_bilinear_ukernel__neon_c4() 73 const float32x4_t vo0123 = vmlaq_lane_f32(vt0123, vd0123, valphahv, 1); in xnn_f32_bilinear_ukernel__neon_c4()
|
D | sse-c8.c | 37 __m128 valphahv = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) weights); in xnn_f32_bilinear_ukernel__sse_c8() local 38 valphahv = _mm_unpacklo_ps(valphahv, valphahv); in xnn_f32_bilinear_ukernel__sse_c8() 39 const __m128 valphah = _mm_movelh_ps(valphahv, valphahv); in xnn_f32_bilinear_ukernel__sse_c8() 40 const __m128 valphav = _mm_movehl_ps(valphahv, valphahv); in xnn_f32_bilinear_ukernel__sse_c8()
|
/external/XNNPACK/src/f32-bilinear/ |
D | neon.c.in | 39 const float32x2_t valphahv = vld1_f32(weights); weights += 2; 42 const float32x4_t valphah = vdupq_lane_f32(valphahv, 0); 43 const float32x4_t valphav = vdupq_lane_f32(valphahv, 1); 65 …onst float32x4_t vt${ABC[C:C+4]} = vfmaq_lane_f32(vtl${ABC[C:C+4]}, vtd${ABC[C:C+4]}, valphahv, 0); 66 …onst float32x4_t vb${ABC[C:C+4]} = vfmaq_lane_f32(vbl${ABC[C:C+4]}, vbd${ABC[C:C+4]}, valphahv, 0); 70 …onst float32x4_t vt${ABC[C:C+4]} = vmlaq_lane_f32(vtl${ABC[C:C+4]}, vtd${ABC[C:C+4]}, valphahv, 0); 71 …onst float32x4_t vb${ABC[C:C+4]} = vmlaq_lane_f32(vbl${ABC[C:C+4]}, vbd${ABC[C:C+4]}, valphahv, 0); 82 … const float32x4_t vo${ABC[C:C+4]} = vfmaq_lane_f32(vt${ABC[C:C+4]}, vd${ABC[C:C+4]}, valphahv, 1); 86 … const float32x4_t vo${ABC[C:C+4]} = vmlaq_lane_f32(vt${ABC[C:C+4]}, vd${ABC[C:C+4]}, valphahv, 1); 106 const float32x4_t vt0123 = vfmaq_lane_f32(vtl0123, vtd0123, valphahv, 0); [all …]
|
D | sse.c.in | 37 __m128 valphahv = _mm_loadl_pi(_mm_undefined_ps(), (const __m64*) weights); 38 valphahv = _mm_unpacklo_ps(valphahv, valphahv); 39 const __m128 valphah = _mm_movelh_ps(valphahv, valphahv); 40 const __m128 valphav = _mm_movehl_ps(valphahv, valphahv);
|