/external/XNNPACK/src/f32-gemm/gen-inc/
D | 6x8s4-psimd.c   (in xnn_f32_gemminc_ukernel_6x8s4__psimd())
    100  psimd_f32 va5 = psimd_load_f32(a5);  (local)
    112  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c0);
    118  vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c0);
    126  va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0);
    133  va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 });
    144  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c1);
    150  vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c1);
    158  va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0);
    165  va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 });
    176  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c2);
    [all …]
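All of the s4 kernels indexed here follow the same pattern for row 5 of the tile: load four A elements into va5, multiply-accumulate them against a group of pre-packed B columns, rotate va5 left by one lane, and repeat for the next column group. A minimal sketch of that pattern using Clang/GCC vector extensions instead of XNNPACK's psimd.h wrappers; the helper names are illustrative, not XNNPACK's:

    /* Sketch only: assumes B is pre-packed so each rotation of the A register
     * lines its elements up with the matching B rows. */
    typedef float v4f __attribute__((vector_size(16)));
    typedef int   v4i __attribute__((vector_size(16)));

    static inline v4f rotl1(v4f v) {
    #if defined(__clang__)
      return __builtin_shufflevector(v, v, 1, 2, 3, 0);     /* Clang builtin */
    #else
      return __builtin_shuffle(v, v, (v4i) { 1, 2, 3, 0 }); /* GCC builtin */
    #endif
    }

    /* One 4-element K block of one A row against one group of four B vectors. */
    static v4f s4_row_block(v4f acc, v4f va, const v4f vb_c[4]) {
      acc += va * vb_c[0];   /* stands in for psimd_qfma_f32 */
      va = rotl1(va);
      acc += va * vb_c[1];
      va = rotl1(va);
      acc += va * vb_c[2];
      va = rotl1(va);
      acc += va * vb_c[3];
      return acc;
    }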
D | 6x8s4-neonfma.c   (in xnn_f32_gemminc_ukernel_6x8s4__neonfma())
     95  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    106  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
    112  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
    119  va5 = vextq_f32(va5, va5, 1);
    129  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
    135  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
    142  va5 = vextq_f32(va5, va5, 1);
    152  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    158  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
    165  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 6x8s4-neon.c   (in xnn_f32_gemminc_ukernel_6x8s4__neon())
     95  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    106  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    112  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0);
    119  va5 = vextq_f32(va5, va5, 1);
    129  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    135  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1);
    142  va5 = vextq_f32(va5, va5, 1);
    152  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    158  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2);
    165  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 8x8s4-neon.c   (in xnn_f32_gemminc_ukernel_8x8s4__neon())
    111  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    124  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    132  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0);
    141  va5 = vextq_f32(va5, va5, 1);
    153  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    161  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1);
    170  va5 = vextq_f32(va5, va5, 1);
    182  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    190  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2);
    199  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 8x8s4-neonfma.c   (in xnn_f32_gemminc_ukernel_8x8s4__neonfma())
    111  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    124  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
    132  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
    141  va5 = vextq_f32(va5, va5, 1);
    153  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
    161  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
    170  va5 = vextq_f32(va5, va5, 1);
    182  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    190  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
    199  va5 = vextq_f32(va5, va5, 1);
    [all …]
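In the NEON s4 kernels above, the rotation is a single vextq_f32 and the accumulation is vfmaq_f32 (vmlaq_f32 in the non-FMA variants). A minimal sketch for one A row and one pair of 8-column accumulators, assuming an ARM target with NEON FMA; the function and parameter names are illustrative, not XNNPACK's:

    #include <arm_neon.h>

    /* Sketch only: vb0123c[c] / vb4567c[c] are the four pre-packed B column
     * groups for output columns 0..3 and 4..7 of this row. */
    static void s4_step(float32x4_t va,
                        float32x4_t *acc0123, float32x4_t *acc4567,
                        const float32x4_t vb0123c[4],
                        const float32x4_t vb4567c[4]) {
      for (int c = 0; c < 4; c++) {
        *acc0123 = vfmaq_f32(*acc0123, va, vb0123c[c]);
        *acc4567 = vfmaq_f32(*acc4567, va, vb4567c[c]);
        va = vextq_f32(va, va, 1);   /* rotate A left by one lane */
      }
    }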
D | 6x8-neonfma-lane-ld128.c   (in xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128())
     95  const float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    106  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    112  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0);
    122  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    128  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1);
    138  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    144  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0);
    154  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    160  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1);
    169  const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;  (local)
    [all …]
D | 6x8-neon-lane-ld128.c   (in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128())
     95  const float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    106  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    112  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0);
    122  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    128  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1);
    138  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    144  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0);
    154  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    160  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1);
    169  const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;  (local)
    [all …]
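The lane-ld128 kernels above avoid rotating the A register: they load four A elements and broadcast one lane into each multiply via vget_low_f32/vget_high_f32 plus vfmaq_lane_f32 (vmlaq_lane_f32 in the non-FMA variant). A small sketch of that step for a single 4-wide accumulator, again with illustrative names and assuming NEON FMA:

    #include <arm_neon.h>

    /* Sketch only: va holds a[k..k+3]; vb_c[r] is the packed B row k+r for
     * one group of four output columns. */
    static float32x4_t lane_ld128_row(float32x4_t acc, float32x4_t va,
                                      const float32x4_t vb_c[4]) {
      acc = vfmaq_lane_f32(acc, vb_c[0], vget_low_f32(va), 0);   /* += a[k+0] * B */
      acc = vfmaq_lane_f32(acc, vb_c[1], vget_low_f32(va), 1);   /* += a[k+1] * B */
      acc = vfmaq_lane_f32(acc, vb_c[2], vget_high_f32(va), 0);  /* += a[k+2] * B */
      acc = vfmaq_lane_f32(acc, vb_c[3], vget_high_f32(va), 1);  /* += a[k+3] * B */
      return acc;
    }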
D | 6x8-neon-lane-ld64.c   (in xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64())
     95  const float32x2_t va5 = vld1_f32(a5); a5 += 2;  (local)
    105  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
    111  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
    120  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
    126  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
    134  const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;  (local)
    144  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
    150  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
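The ld64 variant above loads only two A elements per main-loop iteration into a float32x2_t and lane-broadcasts each of them; the second local declaration comes from the leftover-K path, which broadcasts a single remaining A element with vld1q_dup_f32 and uses a full vmlaq_f32. A sketch of both steps, with illustrative names rather than XNNPACK's:

    #include <arm_neon.h>

    /* Main-loop step: two A elements against two packed B rows. */
    static float32x4_t lane_ld64_row(float32x4_t acc, const float *a,
                                     float32x4_t vb_c0, float32x4_t vb_c1) {
      const float32x2_t va = vld1_f32(a);        /* loads a[0], a[1] */
      acc = vmlaq_lane_f32(acc, vb_c0, va, 0);   /* += a[0] * B row 0 */
      acc = vmlaq_lane_f32(acc, vb_c1, va, 1);   /* += a[1] * B row 1 */
      return acc;
    }

    /* Leftover-K step: one remaining A element, broadcast across all lanes. */
    static float32x4_t lane_ld64_remainder(float32x4_t acc, const float *a,
                                           float32x4_t vb) {
      const float32x4_t va = vld1q_dup_f32(a);
      return vmlaq_f32(acc, va, vb);
    }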
/external/XNNPACK/src/f32-gemm/gen/
D | 6x8s4-psimd.c   (in xnn_f32_gemm_ukernel_6x8s4__psimd())
     98  psimd_f32 va5 = psimd_load_f32(a5);  (local)
    110  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c0);
    116  vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c0);
    124  va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0);
    131  va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 });
    142  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c1);
    148  vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c1);
    156  va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0);
    163  va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 });
    174  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c2);
    [all …]
D | 6x8s4-neon.c   (in xnn_f32_gemm_ukernel_6x8s4__neon())
     93  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    104  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    110  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0);
    117  va5 = vextq_f32(va5, va5, 1);
    127  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    133  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1);
    140  va5 = vextq_f32(va5, va5, 1);
    150  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    156  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2);
    163  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 6x8s4-neonfma.c   (in xnn_f32_gemm_ukernel_6x8s4__neonfma())
     93  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    104  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
    110  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
    117  va5 = vextq_f32(va5, va5, 1);
    127  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
    133  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
    140  va5 = vextq_f32(va5, va5, 1);
    150  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    156  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
    163  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 8x8s4-neonfma.c   (in xnn_f32_gemm_ukernel_8x8s4__neonfma())
    109  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    122  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
    130  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
    139  va5 = vextq_f32(va5, va5, 1);
    151  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
    159  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
    168  va5 = vextq_f32(va5, va5, 1);
    180  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    188  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
    197  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 8x8s4-neon.c   (in xnn_f32_gemm_ukernel_8x8s4__neon())
    109  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    122  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    130  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0);
    139  va5 = vextq_f32(va5, va5, 1);
    151  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    159  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1);
    168  va5 = vextq_f32(va5, va5, 1);
    180  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    188  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2);
    197  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 6x8-neonfma-lane-ld128.c   (in xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128())
     93  const float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    104  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    110  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0);
    120  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    126  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1);
    136  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    142  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0);
    152  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    158  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1);
    167  const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;  (local)
    [all …]
D | 6x8-neon-lane-ld128.c   (in xnn_f32_gemm_ukernel_6x8__neon_lane_ld128())
     93  const float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    104  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    110  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0);
    120  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    126  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1);
    136  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    142  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0);
    152  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    158  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1);
    167  const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;  (local)
    [all …]
/external/XNNPACK/src/f32-igemm/gen/
D | 6x8s4-psimd.c   (in xnn_f32_igemm_ukernel_6x8s4__psimd())
    126  psimd_f32 va5 = psimd_load_f32(a5);  (local)
    138  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c0);
    144  vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c0);
    152  va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0);
    159  va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 });
    170  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c1);
    176  vacc5x4567 = psimd_qfma_f32(vacc5x4567, va5, vb4567c1);
    184  va5 = __builtin_shufflevector(va5, va5, 1, 2, 3, 0);
    191  va5 = __builtin_shuffle(va5, va5, (psimd_s32) { 1, 2, 3, 0 });
    202  vacc5x0123 = psimd_qfma_f32(vacc5x0123, va5, vb0123c2);
    [all …]
D | 6x8s4-neonfma.c   (in xnn_f32_igemm_ukernel_6x8s4__neonfma())
    120  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    131  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
    137  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
    144  va5 = vextq_f32(va5, va5, 1);
    154  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
    160  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
    167  va5 = vextq_f32(va5, va5, 1);
    177  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    183  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
    190  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 6x8s4-neon.c   (in xnn_f32_igemm_ukernel_6x8s4__neon())
    120  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    131  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    137  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0);
    144  va5 = vextq_f32(va5, va5, 1);
    154  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    160  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1);
    167  va5 = vextq_f32(va5, va5, 1);
    177  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    183  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2);
    190  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 8x8s4-neon.c   (in xnn_f32_igemm_ukernel_8x8s4__neon())
    142  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    155  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
    163  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0);
    172  va5 = vextq_f32(va5, va5, 1);
    184  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
    192  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1);
    201  va5 = vextq_f32(va5, va5, 1);
    213  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
    221  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2);
    230  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 8x8s4-neonfma.c   (in xnn_f32_igemm_ukernel_8x8s4__neonfma())
    142  float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    155  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0);
    163  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0);
    172  va5 = vextq_f32(va5, va5, 1);
    184  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1);
    192  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1);
    201  va5 = vextq_f32(va5, va5, 1);
    213  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2);
    221  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2);
    230  va5 = vextq_f32(va5, va5, 1);
    [all …]
D | 6x8-neonfma-lane-ld128.c   (in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128())
    121  const float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    132  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    138  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0);
    148  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    154  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1);
    164  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    170  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0);
    180  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    186  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1);
    195  const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;  (local)
    [all …]
D | 6x8-neon-lane-ld128.c   (in xnn_f32_igemm_ukernel_6x8__neon_lane_ld128())
    121  const float32x4_t va5 = vld1q_f32(a5); a5 += 4;  (local)
    132  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0);
    138  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0);
    148  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1);
    154  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1);
    164  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0);
    170  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0);
    180  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1);
    186  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1);
    195  const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;  (local)
    [all …]
D | 6x8-neon-lane-ld64.c   (in xnn_f32_igemm_ukernel_6x8__neon_lane_ld64())
    121  const float32x2_t va5 = vld1_f32(a5); a5 += 2;  (local)
    131  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
    137  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
    146  vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
    152  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
    161  const float32x4_t va5 = vld1q_dup_f32(a5);  (local)
    171  vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
    177  vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
D | 6x8-neonfma-lane-ld64.c   (in xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64())
    121  const float32x2_t va5 = vld1_f32(a5); a5 += 2;  (local)
    131  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
    137  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
    146  vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
    152  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
    161  const float32x4_t va5 = vld1q_dup_f32(a5);  (local)
    171  vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
    177  vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
/external/XNNPACK/src/f16-gemm/gen/
D | 6x8-neonfp16arith-ld64.c   (in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64())
     87  const float16x4_t va5 = vld1_f16(a5); a5 += 4;  (local)
     97  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    104  const float16x8_t va5c0 = vdupq_lane_f16(va5, 0);
    121  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    128  const float16x8_t va5c1 = vdupq_lane_f16(va5, 1);
    145  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    152  const float16x8_t va5c2 = vdupq_lane_f16(va5, 2);
    169  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    176  const float16x8_t va5c3 = vdupq_lane_f16(va5, 3);
    195  const float16x8_t va5 = vld1q_dup_f16(a5); a5 += 1;  (local)
    [all …]
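The f16 kernel applies the same lane-broadcast idea in half precision: one form multiplies with vfmaq_lane_f16 directly, while the other (the va5c0..va5c3 lines) first broadcasts a lane with vdupq_lane_f16 and then, presumably, performs a full-vector FMA on a path for targets without the lane intrinsic. A hedged sketch of both forms, assuming an AArch64 target with ARMv8.2-A FP16 arithmetic; names are illustrative, not XNNPACK's:

    #include <arm_neon.h>

    #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    /* Direct lane form: acc += va[lane] * vb, in FP16 throughout. */
    static float16x8_t f16_lane_step(float16x8_t acc, float16x4_t va,
                                     float16x8_t vb) {
      return vfmaq_lane_f16(acc, vb, va, 0);
    }

    /* Broadcast-then-FMA form: duplicate one A lane, then a plain FMA. */
    static float16x8_t f16_dup_step(float16x8_t acc, float16x4_t va,
                                    float16x8_t vb) {
      const float16x8_t va_c0 = vdupq_lane_f16(va, 0);
      return vfmaq_f16(acc, va_c0, vb);
    }
    #endif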