/external/XNNPACK/src/f32-dwconv/gen/ |
D | up16x9-minmax-fma3-acc2.c | 165 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local 167 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 239 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local 240 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 293 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local 294 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
|
D | up16x9-minmax-fma3.c | 165 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() local 167 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 236 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() local 237 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 288 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() local 289 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3()
|
D | up16x9-minmax-avx.c | 165 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() local 167 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 236 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() local 237 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 288 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() local 289 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx()
|
D | up16x9-minmax-avx-acc2.c | 165 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() local 167 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() 239 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() local 240 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() 293 const __m256 vk8x01234567 = _mm256_load_ps(w + 144); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() local 294 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2()
|
D | up8x9-minmax-fma3.c | 139 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() local 140 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 191 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() local 192 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3()
|
D | up8x9-minmax-avx.c | 139 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() local 140 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 191 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() local 192 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx()
|
D | up8x9-minmax-fma3-acc2.c | 139 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() local 140 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 193 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() local 194 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2()
|
D | up8x9-minmax-avx-acc2.c | 139 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2() local 140 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2() 193 const __m256 vk8x01234567 = _mm256_load_ps(w + 72); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2() local 194 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2()
|
/external/XNNPACK/src/f16-dwconv/gen/ |
D | up16x9-minmax-neonfp16arith-acc2.c | 149 const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local 151 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 203 const float16x8_t vk8x01234567 = vld1q_f16(w + 136); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local 204 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 251 const float16x8_t vk8x01234567 = vld1q_f16(w + 144); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local 252 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2()
|
D | up16x9-minmax-neonfp16arith.c | 149 const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() local 151 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 200 const float16x8_t vk8x01234567 = vld1q_f16(w + 136); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() local 201 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 246 const float16x8_t vk8x01234567 = vld1q_f16(w + 144); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() local 247 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith()
|
D | up8x9-minmax-neonfp16arith.c | 123 const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() local 124 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 169 const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() local 170 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith()
|
D | up8x9-minmax-neonfp16arith-acc2.c | 123 const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2() local 124 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2() 171 const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2() local 172 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2()
|
D | up16x9-minmax-fma3.c | 168 const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 144))); in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() local 170 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc0… in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() 239 const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144))); in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() local 240 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc0… in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() 299 const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 144))); in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() local 300 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc0… in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3()
|
D | up8x9-minmax-fma3-acc2.c | 142 const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72))); in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2() local 143 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc0… in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2() 204 const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 72))); in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2() local 205 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc0… in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2()
|
D | up8x9-minmax-fma3.c | 142 const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 72))); in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3() local 143 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc0… in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3() 202 const __m256 vk8x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 72))); in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3() local 203 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc0… in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3()
|
D | up32x9-minmax-neonfp16arith-acc2.c | 201 const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() local 205 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() 267 const float16x8_t vk8x01234567 = vld1q_f16(w + 280); in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() local 268 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() 315 const float16x8_t vk8x01234567 = vld1q_f16(w + 288); in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() local 316 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2()
|
/external/XNNPACK/src/qs8-dwconv/gen/ |
D | up8x9-minmax-fp32-neonv8-mul16.c | 141 … const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local 143 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 144 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 221 const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 64))); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local 223 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 224 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
|
D | up8x9-minmax-rndnu-neon-mul16.c | 142 … const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local 144 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() 145 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() 222 const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 64))); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local 224 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() 225 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
|
D | up8x9-minmax-fp32-neon-mul16.c | 141 … const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local 143 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 144 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 222 const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 64))); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local 224 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 225 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
|
/external/XNNPACK/src/qc8-dwconv/gen/ |
D | up8x9-minmax-fp32-neonv8-mul16.c | 140 … const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local 142 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 143 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 223 const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 64))); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local 225 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 226 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
|
D | up8x9-minmax-fp32-neon-mul16.c | 140 … const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local 142 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 143 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 224 const int16x8_t vk8x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 64))); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local 226 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 227 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
|
D | up8x9-minmax-fp32-sse2-mul16-add16.c | 181 …const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t)… in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16() local 185 … const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16() 314 …const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t)… in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16() local 317 … const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16()
|
/external/XNNPACK/src/qu8-dwconv/gen/ |
D | up8x9-minmax-fp32-neonv8-mul16.c | 142 …const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(w), vkernel_zero_point)); w … in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local 144 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 145 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 222 …const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) ((const uint8_… in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local 224 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() 225 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
|
D | up8x9-minmax-rndnu-neon-mul16.c | 143 …const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(w), vkernel_zero_point)); w … in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local 145 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() 146 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() 223 …const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) ((const uint8_… in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local 225 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() 226 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
|
D | up8x9-minmax-fp32-neon-mul16.c | 142 …const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(w), vkernel_zero_point)); w … in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local 144 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 145 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 223 …const int16x8_t vk8x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) ((const uint8_… in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local 225 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() 226 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
|