/external/XNNPACK/src/f16-gemm/gen/ |
D | 8x8-neonfp16arith-ld64.c | 195 …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() local 198 vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 199 vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 200 vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 201 vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 202 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 203 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 204 vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 205 vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() 216 vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3); in xnn_f16_gemm_ukernel_8x8__neonfp16arith_ld64() [all …]
|
D | 6x8-neonfp16arith-ld64.c | 161 …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() local 164 vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 165 vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 166 vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 167 vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 168 vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 169 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 178 vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 179 vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() 180 vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3); in xnn_f16_gemm_ukernel_6x8__neonfp16arith_ld64() [all …]
|
D | 4x8-neonfp16arith-ld64.c | 127 …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() local 130 vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() 131 vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() 132 vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() 133 vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() 140 vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() 141 vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() 142 vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64() 143 vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3); in xnn_f16_gemm_ukernel_4x8__neonfp16arith_ld64()
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 4x16s4-fma3-broadcast.c | 135 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() local 138 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 139 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 140 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast() 141 vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567); in xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast()
|
D | 5x16s4-fma3-broadcast.c | 154 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() local 157 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 158 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 159 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 160 vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast() 161 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast()
|
D | 3x16s4-fma3-broadcast.c | 116 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() local 119 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 120 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast() 121 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4-fma3-broadcast.c | 78 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast() local 81 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 4x16s4-fma3-broadcast.c | 133 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() local 136 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 137 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 138 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast() 139 vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567); in xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast()
|
D | 5x16s4-fma3-broadcast.c | 152 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() local 155 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 156 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 157 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 158 vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast() 159 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast()
|
D | 3x16s4-fma3-broadcast.c | 114 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() local 117 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 118 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast() 119 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4-fma3-broadcast.c | 76 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast() local 79 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x16s4-fma3-broadcast.c | 177 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() local 180 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 181 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 182 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 183 vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast() 184 vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567); in xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast()
|
D | 4x16s4-fma3-broadcast.c | 155 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() local 158 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 159 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 160 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast() 161 vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567); in xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16s4-fma3-broadcast.c | 133 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() local 136 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 137 vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast() 138 vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567); in xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4-fma3-broadcast.c | 89 const __m256 vb01234567c3 = _mm256_load_ps(w + 48); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast() local 92 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567); in xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast()
|
/external/XNNPACK/src/q8-gemm/ |
D | 4x8-neon.c | 114 const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8); in xnn_q8_gemm_ukernel_4x8__neon() local 115 const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); in xnn_q8_gemm_ukernel_4x8__neon() 225 const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8); in xnn_q8_gemm_ukernel_4x8__neon() local 226 … const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); in xnn_q8_gemm_ukernel_4x8__neon()
|
D | 8x8-neon.c | 178 const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8); in xnn_q8_gemm_ukernel_8x8__neon() local 179 const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); in xnn_q8_gemm_ukernel_8x8__neon() 361 const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((uintptr_t) w + 8); in xnn_q8_gemm_ukernel_8x8__neon() local 362 … const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point)); in xnn_q8_gemm_ukernel_8x8__neon()
|