/external/XNNPACK/src/f32-igemm/gen/
D | 6x8-minmax-neonfma-lane-ld64.c | xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64()
    124  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    132  vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    133  vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    134  vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    135  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    136  vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
    137  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
|
D | 6x8-minmax-neon-lane-ld64.c | xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64()
    124  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    132  vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    133  vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    134  vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    135  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    136  vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
    137  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
|
D | 6x8-minmax-neonfma-dup-ld64.c | xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64()
    124  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    138  vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
    139  vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
    140  vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
    141  vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
    142  vacc4x4567 = vfmaq_f32(vacc4x4567, va4c0, vb4567c0);
    143  vacc5x4567 = vfmaq_f32(vacc5x4567, va5c0, vb4567c0);
|
D | 6x8-minmax-neon-dup-ld64.c | xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64()
    124  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    138  vacc0x4567 = vmlaq_f32(vacc0x4567, va0c0, vb4567c0);
    139  vacc1x4567 = vmlaq_f32(vacc1x4567, va1c0, vb4567c0);
    140  vacc2x4567 = vmlaq_f32(vacc2x4567, va2c0, vb4567c0);
    141  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0);
    142  vacc4x4567 = vmlaq_f32(vacc4x4567, va4c0, vb4567c0);
    143  vacc5x4567 = vmlaq_f32(vacc5x4567, va5c0, vb4567c0);
|
D | 4x8-minmax-neon-dup-ld64.c | xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64()
    100  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    110  vacc0x4567 = vmlaq_f32(vacc0x4567, va0c0, vb4567c0);
    111  vacc1x4567 = vmlaq_f32(vacc1x4567, va1c0, vb4567c0);
    112  vacc2x4567 = vmlaq_f32(vacc2x4567, va2c0, vb4567c0);
    113  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0);
|
D | 4x8-minmax-neonfma-dup-ld64.c | xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64()
    100  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    110  vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
    111  vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
    112  vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
    113  vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
|
D | 4x8-minmax-neon-lane-ld64.c | xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64()
    100  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    106  vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    107  vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    108  vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    109  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
|
D | 4x8-minmax-neonfma-lane-ld64.c | xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64()
    100  const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    106  vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    107  vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    108  vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    109  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
|
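The eight igemm entries above split into two broadcast styles for the same rank-1 update of the 4..7 column block: "lane" kernels feed lane 0 of the 2-element A vector straight into the multiply-accumulate, while "dup" kernels first splat that lane into a full vector (the vaNc0 temporaries). A minimal sketch of the two forms, assuming only <arm_neon.h>; the helper names are illustrative, not XNNPACK code:

    #include <arm_neon.h>

    /* "lane" form: the lane index is encoded in the instruction itself. */
    static float32x4_t step_lane(float32x4_t acc, float32x4_t vb4567, float32x2_t va) {
      return vfmaq_lane_f32(acc, vb4567, va, 0);   /* acc += vb4567 * va[0] */
    }

    /* "dup" form: broadcast the lane first, then use a plain vector FMA. */
    static float32x4_t step_dup(float32x4_t acc, float32x4_t vb4567, float32x2_t va) {
      const float32x4_t va_c0 = vdupq_lane_f32(va, 0);   /* splat lane 0 into all 4 lanes */
      return vfmaq_f32(acc, va_c0, vb4567);              /* acc += va_c0 * vb4567 */
    }

Both forms compute the same values; the two spellings exist because they schedule differently across microarchitectures.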
/external/XNNPACK/src/f32-gemm/gen/
D | 6x8-minmax-neonfma-lane-ld64.c | xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64()
    96   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    104  vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    105  vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    106  vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    107  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    108  vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
    109  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
|
D | 6x8-minmax-neon-lane-ld64.c | xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld64()
    96   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    104  vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    105  vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    106  vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    107  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    108  vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
    109  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
|
D | 5x8-minmax-neonfma-lane-ld64.c | xnn_f32_gemm_minmax_ukernel_5x8__neonfma_lane_ld64()
    87   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    94   vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    95   vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    96   vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    97   vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    98   vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
|
D | 5x8-minmax-neon-lane-ld64.c | xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64()
    87   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    94   vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    95   vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    96   vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    97   vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    98   vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
|
D | 6x8-minmax-neon-dup-ld64.c | xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64()
    96   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    110  vacc0x4567 = vmlaq_f32(vacc0x4567, va0c0, vb4567c0);
    111  vacc1x4567 = vmlaq_f32(vacc1x4567, va1c0, vb4567c0);
    112  vacc2x4567 = vmlaq_f32(vacc2x4567, va2c0, vb4567c0);
    113  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0);
    114  vacc4x4567 = vmlaq_f32(vacc4x4567, va4c0, vb4567c0);
    115  vacc5x4567 = vmlaq_f32(vacc5x4567, va5c0, vb4567c0);
|
D | 6x8-minmax-neonfma-dup-ld64.c | xnn_f32_gemm_minmax_ukernel_6x8__neonfma_dup_ld64()
    96   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    110  vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
    111  vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
    112  vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
    113  vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
    114  vacc4x4567 = vfmaq_f32(vacc4x4567, va4c0, vb4567c0);
    115  vacc5x4567 = vfmaq_f32(vacc5x4567, va5c0, vb4567c0);
|
D | 4x8-minmax-neon-lane-ld64.c | xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64()
    78   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    84   vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    85   vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    86   vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    87   vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
|
D | 4x8-minmax-neonfma-lane-ld64.c | xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld64()
    78   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    84   vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    85   vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    86   vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    87   vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
|
D | 4x8-minmax-neonfma-dup-ld64.c | xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64()
    78   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    88   vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
    89   vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
    90   vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
    91   vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
|
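Within each shape, the neon and neonfma files differ only in the accumulate intrinsic: vmlaq_* lowers to a multiply-accumulate whose product may be rounded separately from the add (VMLA), while vfmaq_* maps to a fused multiply-add with a single rounding (VFMA). A sketch of the two spellings of one step, again with illustrative helper names rather than XNNPACK code:

    #include <arm_neon.h>

    /* Plain NEON: acc + (vb * va[0]), product rounded before the add. */
    static float32x4_t step_neon(float32x4_t acc, float32x4_t vb, float32x2_t va) {
      return vmlaq_lane_f32(acc, vb, va, 0);
    }

    /* NEON with FMA: fma(vb, va[0], acc), one fused operation. */
    static float32x4_t step_neonfma(float32x4_t acc, float32x4_t vb, float32x2_t va) {
      return vfmaq_lane_f32(acc, vb, va, 0);
    }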
/external/XNNPACK/src/f32-gemm/gen-inc/
D | 6x8inc-minmax-neon-lane-ld64.c | xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld64()
    98   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    106  vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    107  vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    108  vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    109  vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    110  vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
    111  vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
|
D | 6x8inc-minmax-neonfma-lane-ld64.c | xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld64()
    98   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    106  vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    107  vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    108  vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    109  vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    110  vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
    111  vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
|
D | 5x8inc-minmax-neon-lane-ld64.c | xnn_f32_gemminc_minmax_ukernel_5x8__neon_lane_ld64()
    89   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    96   vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    97   vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    98   vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    99   vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    100  vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
|
D | 5x8inc-minmax-neonfma-lane-ld64.c | xnn_f32_gemminc_minmax_ukernel_5x8__neonfma_lane_ld64()
    89   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    96   vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    97   vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    98   vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    99   vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
    100  vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
|
D | 6x8inc-minmax-neon-dup-ld64.c | xnn_f32_gemminc_minmax_ukernel_6x8__neon_dup_ld64()
    98   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    112  vacc0x4567 = vmlaq_f32(vacc0x4567, va0c0, vb4567c0);
    113  vacc1x4567 = vmlaq_f32(vacc1x4567, va1c0, vb4567c0);
    114  vacc2x4567 = vmlaq_f32(vacc2x4567, va2c0, vb4567c0);
    115  vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0);
    116  vacc4x4567 = vmlaq_f32(vacc4x4567, va4c0, vb4567c0);
    117  vacc5x4567 = vmlaq_f32(vacc5x4567, va5c0, vb4567c0);
|
D | 6x8inc-minmax-neonfma-dup-ld64.c | xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_dup_ld64()
    98   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    112  vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
    113  vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
    114  vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
    115  vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
    116  vacc4x4567 = vfmaq_f32(vacc4x4567, va4c0, vb4567c0);
    117  vacc5x4567 = vfmaq_f32(vacc5x4567, va5c0, vb4567c0);
|
D | 4x8inc-minmax-neonfma-lane-ld64.c | xnn_f32_gemminc_minmax_ukernel_4x8__neonfma_lane_ld64()
    80   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    86   vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
    87   vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
    88   vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
    89   vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
|
D | 4x8inc-minmax-neon-dup-ld64.c | xnn_f32_gemminc_minmax_ukernel_4x8__neon_dup_ld64()
    80   const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;   (local)
    90   vacc0x4567 = vmlaq_f32(vacc0x4567, va0c0, vb4567c0);
    91   vacc1x4567 = vmlaq_f32(vacc1x4567, va1c0, vb4567c0);
    92   vacc2x4567 = vmlaq_f32(vacc2x4567, va2c0, vb4567c0);
    93   vacc3x4567 = vmlaq_f32(vacc3x4567, va3c0, vb4567c0);
|
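All of the gemm, gemminc, and igemm variants listed here wrap the vb4567c0 load in the same inner-loop skeleton: walk a packed weight pointer w, load the 0..3 and 4..7 column halves of B, and accumulate into per-row vectors. A reduced sketch with one row (MR = 1) and one A element per k-step, assuming the bias-then-weights packing of w that the loads above imply; the function and parameter names are illustrative, not XNNPACK's:

    #include <arm_neon.h>
    #include <stddef.h>

    void gemm_1x8_sketch(size_t kc, const float* a, const float* w, float* c) {
      float32x4_t vacc0123 = vld1q_f32(w); w += 4;   /* packed bias, columns 0..3 */
      float32x4_t vacc4567 = vld1q_f32(w); w += 4;   /* packed bias, columns 4..7 */
      for (size_t k = 0; k < kc; k++) {
        const float32x4_t va = vdupq_n_f32(a[k]);          /* broadcast one A element */
        const float32x4_t vb0123 = vld1q_f32(w); w += 4;   /* packed B, columns 0..3 */
        const float32x4_t vb4567 = vld1q_f32(w); w += 4;   /* packed B, columns 4..7 */
        vacc0123 = vfmaq_f32(vacc0123, va, vb0123);
        vacc4567 = vfmaq_f32(vacc4567, va, vb4567);
      }
      vst1q_f32(c, vacc0123);       /* minmax clamping of the real kernels omitted */
      vst1q_f32(c + 4, vacc4567);
    }

The real kernels unroll this over 4, 5, or 6 rows and, in the ld64 variants, load two A elements at a time with vld1_f32, which is where the c0/c1 column suffixes in the listing come from.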