/external/XNNPACK/src/f32-igemm/gen/ |
D | 6x8-neonfma-lane-ld128.c | 127 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 128 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 129 vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 130 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 131 vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, vget_low_f32(va4), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 132 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 133 vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 134 vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 135 vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() 136 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128() [all …]
|
D | 6x8-neon-lane-ld128.c | 127 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 128 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 129 vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 130 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 131 vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, vget_low_f32(va4), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 132 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 133 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 134 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 135 vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() 136 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128() [all …]
|
D | 4x8-neon-lane-ld128.c | 103 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 104 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 105 vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 106 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 107 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 108 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 109 vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 110 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 115 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() 116 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_4x8__neon_lane_ld128() [all …]
|
D | 4x8-neonfma-lane-ld128.c | 103 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 104 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 105 vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 106 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 107 vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 108 vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 109 vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 110 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 115 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() 116 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128() [all …]
|
D | 6x8-neonfma-dup-ld128.c | 127 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 128 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 129 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 130 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 131 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 132 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 149 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 150 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 151 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() 152 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128() [all …]
|
D | 6x8-neon-dup-ld128.c | 127 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 128 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 129 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 130 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 131 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 132 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 149 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 150 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 151 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() 152 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_igemm_ukernel_6x8__neon_dup_ld128() [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 6x8-neonfma-lane-ld128.c | 101 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 102 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 103 vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 104 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 105 vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, vget_low_f32(va4), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 106 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 107 vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 108 vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 109 vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() 110 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128() [all …]
|
D | 6x8-neon-lane-ld128.c | 101 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 102 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 103 vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 104 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 105 vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, vget_low_f32(va4), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 106 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 107 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 108 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 109 vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() 110 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128() [all …]
|
D | 4x8-neonfma-lane-ld128.c | 83 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 84 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 85 vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 86 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 87 vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 88 vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 89 vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 90 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 95 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() 96 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128() [all …]
|
D | 4x8-neon-lane-ld128.c | 83 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 84 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 85 vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 86 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 87 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 88 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 89 vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 90 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 95 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() 96 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128() [all …]
|
D | 6x8-neon-dup-ld128.c | 101 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 102 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 103 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 104 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 105 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 106 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 123 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 124 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 125 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() 126 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128() [all …]
|
D | 6x8-neonfma-dup-ld128.c | 101 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 102 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 103 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 104 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 105 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 106 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 123 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 124 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 125 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() 126 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128() [all …]
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 6x8-neonfma-lane-ld128.c | 99 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 100 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 101 vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 102 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 103 vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, vget_low_f32(va4), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 104 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 105 vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 106 vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 107 vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() 108 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128() [all …]
|
D | 6x8-neon-lane-ld128.c | 99 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 100 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 101 vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 102 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 103 vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, vget_low_f32(va4), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 104 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 105 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 106 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 107 vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() 108 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128() [all …]
|
D | 4x8-neon-lane-ld128.c | 81 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 82 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 83 vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 84 vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 85 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 86 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 87 vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 88 vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 93 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() 94 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_4x8__neon_lane_ld128() [all …]
|
D | 4x8-neonfma-lane-ld128.c | 81 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 82 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 83 vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 84 vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 85 vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 86 vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 87 vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 88 vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 93 vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() 94 vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128() [all …]
|
D | 6x8-neonfma-dup-ld128.c | 99 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 100 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 101 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 102 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 103 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 104 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 121 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 122 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 123 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() 124 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128() [all …]
|
D | 6x8-neon-dup-ld128.c | 99 const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 100 const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 101 const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 102 const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 103 const float32x4_t va4c0 = vdupq_lane_f32(vget_low_f32(va4), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 104 const float32x4_t va5c0 = vdupq_lane_f32(vget_low_f32(va5), 0); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 121 const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 122 const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 123 const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() 124 const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1); in xnn_f32_gemm_ukernel_6x8__neon_dup_ld128() [all …]
|
/external/XNNPACK/src/f32-spmm/gen/ |
D | 4x4-neonfma.c | 127 float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x4__neonfma() 128 float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x4__neonfma() 129 float32x2_t vout01c2 = vmin_f32(vacc01c2, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x4__neonfma() 130 float32x2_t vout01c3 = vmin_f32(vacc01c3, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x4__neonfma() 132 vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x4__neonfma() 133 vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x4__neonfma() 134 vout01c2 = vmax_f32(vout01c2, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x4__neonfma() 135 vout01c3 = vmax_f32(vout01c3, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x4__neonfma() 159 float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x4__neonfma() 160 vout01 = vmax_f32(vout01, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x4__neonfma() [all …]
|
D | 8x4-neonfma.c | 223 float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_8x4__neonfma() 224 float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_8x4__neonfma() 225 float32x2_t vout01c2 = vmin_f32(vacc01c2, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_8x4__neonfma() 226 float32x2_t vout01c3 = vmin_f32(vacc01c3, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_8x4__neonfma() 228 vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_8x4__neonfma() 229 vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_8x4__neonfma() 230 vout01c2 = vmax_f32(vout01c2, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_8x4__neonfma() 231 vout01c3 = vmax_f32(vout01c3, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_8x4__neonfma() 255 float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_8x4__neonfma() 256 vout01 = vmax_f32(vout01, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_8x4__neonfma() [all …]
|
D | 4x2-neonfma.c | 113 float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x2__neonfma() 114 float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x2__neonfma() 116 vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x2__neonfma() 117 vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x2__neonfma() 139 float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x2__neonfma() 140 vout01 = vmax_f32(vout01, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x2__neonfma() 171 float32x2_t vout0c0 = vmin_f32(vacc0c0, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x2__neonfma() 172 float32x2_t vout0c1 = vmin_f32(vacc0c1, vget_low_f32(vmax)); in xnn_f32_spmm_ukernel_4x2__neonfma() 174 vout0c0 = vmax_f32(vout0c0, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x2__neonfma() 175 vout0c1 = vmax_f32(vout0c1, vget_low_f32(vmin)); in xnn_f32_spmm_ukernel_4x2__neonfma() [all …]
|
/external/XNNPACK/src/f32-ppmm/gen/ |
D | 8x8-neon.c | 91 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123, vget_low_f32(va0123), 0); in xnn_f32_ppmm_ukernel_8x8__neon() 92 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123, vget_low_f32(va0123), 1); in xnn_f32_ppmm_ukernel_8x8__neon() 95 vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123, vget_low_f32(va4567), 0); in xnn_f32_ppmm_ukernel_8x8__neon() 96 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123, vget_low_f32(va4567), 1); in xnn_f32_ppmm_ukernel_8x8__neon() 99 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567, vget_low_f32(va0123), 0); in xnn_f32_ppmm_ukernel_8x8__neon() 100 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567, vget_low_f32(va0123), 1); in xnn_f32_ppmm_ukernel_8x8__neon() 103 vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567, vget_low_f32(va4567), 0); in xnn_f32_ppmm_ukernel_8x8__neon() 104 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567, vget_low_f32(va4567), 1); in xnn_f32_ppmm_ukernel_8x8__neon() 196 float32x2_t vacc7x01 = vget_low_f32(vacc7x0123); in xnn_f32_ppmm_ukernel_8x8__neon() 197 float32x2_t vacc6x01 = vget_low_f32(vacc6x0123); in xnn_f32_ppmm_ukernel_8x8__neon() [all …]
|
D | 4x8-neon.c | 66 vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123, vget_low_f32(va0123), 0); in xnn_f32_ppmm_ukernel_4x8__neon() 67 vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123, vget_low_f32(va0123), 1); in xnn_f32_ppmm_ukernel_4x8__neon() 70 vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567, vget_low_f32(va0123), 0); in xnn_f32_ppmm_ukernel_4x8__neon() 71 vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567, vget_low_f32(va0123), 1); in xnn_f32_ppmm_ukernel_4x8__neon() 127 float32x2_t vacc3x01 = vget_low_f32(vacc3x0123); in xnn_f32_ppmm_ukernel_4x8__neon() 128 float32x2_t vacc2x01 = vget_low_f32(vacc2x0123); in xnn_f32_ppmm_ukernel_4x8__neon() 129 float32x2_t vacc1x01 = vget_low_f32(vacc1x0123); in xnn_f32_ppmm_ukernel_4x8__neon() 130 float32x2_t vacc0x01 = vget_low_f32(vacc0x0123); in xnn_f32_ppmm_ukernel_4x8__neon()
|
/external/XNNPACK/src/f32-gavgpool-spchw/ |
D | neon-x4.c | 78 const float32x4_t vsum01 = vcombine_f32(vadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0)), in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 79 vadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1))); in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 80 const float32x4_t vsum23 = vcombine_f32(vadd_f32(vget_low_f32(vsum2), vget_high_f32(vsum2)), in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 81 vadd_f32(vget_low_f32(vsum3), vget_high_f32(vsum3))); in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 82 const float32x4_t vsum = vcombine_f32(vpadd_f32(vget_low_f32(vsum01), vget_high_f32(vsum01)), in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 83 vpadd_f32(vget_low_f32(vsum23), vget_high_f32(vsum23))); in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 114 float32x2_t vsum = vadd_f32(vget_low_f32(vsum0), vget_high_f32(vsum0)); in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 117 float32x2_t vout = vmul_f32(vsum, vget_low_f32(vmultiplier)); in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 119 vout = vmax_f32(vout, vget_low_f32(voutput_min)); in xnn_f32_gavgpool_spchw_ukernel__neon_x4() 120 vout = vmin_f32(vout, vget_low_f32(voutput_max)); in xnn_f32_gavgpool_spchw_ukernel__neon_x4()
|
/external/webrtc/webrtc/modules/audio_processing/aec/ |
D | aec_rdft_neon.c | 34 float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); in cft1st_128_neon() 36 float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v)); in cft1st_128_neon() 63 a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v)); in cft1st_128_neon() 64 a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); in cft1st_128_neon() 115 vst1_f32(&a[j + 0], vget_low_f32(xx0)); in cftmdl_128_neon() 117 vst1_f32(&a[j + 16], vget_low_f32(xx1)); in cftmdl_128_neon() 122 vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add)); in cftmdl_128_neon() 123 vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub)); in cftmdl_128_neon() 124 vst1_f32(&a[j + 40], vget_low_f32(yy4)); in cftmdl_128_neon() 169 vst1_f32(&a[j + 0], vget_low_f32(xx)); in cftmdl_128_neon() [all …]
|