/external/XNNPACK/src/f32-dwconv/gen/ |
D | up16x9-minmax-fma3-acc2.c |
    129  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local
    131  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
    215  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local
    216  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
    277  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local
    278  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2()
|
D | up16x9-minmax-fma3.c |
    129  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() local
    131  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3()
    212  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() local
    213  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3()
    272  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() local
    273  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3()
|
D | up16x9-minmax-avx.c |
    129  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() local
    131  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx()
    212  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() local
    213  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx()
    272  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() local
    273  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx()
|
D | up16x9-minmax-avx-acc2.c |
    129  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() local
    131  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2()
    215  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() local
    216  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2()
    277  const __m256 vk4x01234567 = _mm256_load_ps(w + 80);  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2() local
    278  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up16x9__avx_acc2()
|
D | up8x9-minmax-fma3.c |
    115  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() local
    116  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3()
    175  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() local
    176  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3()
|
D | up8x9-minmax-avx.c |
    115  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() local
    116  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx()
    175  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() local
    176  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx()
|
D | up8x9-minmax-fma3-acc2.c |
    115  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() local
    116  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2()
    177  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() local
    178  vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0);  in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2()
|
D | up8x9-minmax-avx-acc2.c |
    115  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2() local
    116  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2()
    177  const __m256 vk4x01234567 = _mm256_load_ps(w + 40);  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2() local
    178  vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));  in xnn_f32_dwconv_minmax_ukernel_up8x9__avx_acc2()
|
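Every f32 match above is the same tap-4 idiom: load the eight packed weights for kernel tap 4 at a fixed offset from w (80 floats into the up16x9 weight block, 40 floats into the up8x9 block), then fold them into the per-channel accumulator, either with a fused _mm256_fmadd_ps (fma3 variants) or with _mm256_mul_ps plus _mm256_add_ps (avx variants). A minimal sketch of that idiom for the up8x9 case follows; the helper name accumulate_tap4_f32 is illustrative and not part of XNNPACK.

    #include <immintrin.h>

    /* Illustrative sketch only, not the generated XNNPACK code. Accumulates kernel
       tap 4 for 8 output channels in the up8x9 layout: 8 bias floats plus 4 earlier
       taps of 8 weights each put tap 4 at offset 40 from the packed weight pointer. */
    static inline __m256 accumulate_tap4_f32(const float* w, const float* i4, __m256 vacc) {
      const __m256 vi4 = _mm256_loadu_ps(i4);     /* input row 4, 8 channels */
      const __m256 vk4 = _mm256_load_ps(w + 40);  /* tap-4 weights (32-byte-aligned packed block) */
    #if defined(__FMA__)
      return _mm256_fmadd_ps(vi4, vk4, vacc);               /* fma3 variants: fused multiply-add */
    #else
      return _mm256_add_ps(vacc, _mm256_mul_ps(vi4, vk4));  /* avx variants: separate mul + add */
    #endif
    }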
/external/XNNPACK/src/f16-dwconv/gen/ |
D | up16x9-minmax-neonfp16arith-acc2.c |
    121  const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local
    123  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2()
    187  const float16x8_t vk4x01234567 = vld1q_f16(w + 72);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local
    188  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2()
    235  const float16x8_t vk4x01234567 = vld1q_f16(w + 80);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local
    236  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2()
|
D | up16x9-minmax-neonfp16arith.c |
    121  const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() local
    123  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith()
    184  const float16x8_t vk4x01234567 = vld1q_f16(w + 72);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() local
    185  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith()
    230  const float16x8_t vk4x01234567 = vld1q_f16(w + 80);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() local
    231  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith()
|
D | up8x9-minmax-neonfp16arith.c |
    107  const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() local
    108  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith()
    153  const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() local
    154  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith()
|
D | up8x9-minmax-neonfp16arith-acc2.c |
    107  const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2() local
    108  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2()
    155  const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2() local
    156  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2()
|
D | up16x9-minmax-fma3.c |
    132  const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 80)));  in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() local
    134  …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc0…  in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3()
    215  const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));  in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() local
    216  …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc0…  in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3()
    279  const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 80)));  in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3() local
    280  …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc0…  in xnn_f16_dwconv_minmax_ukernel_up16x9__fma3()
|
D | up8x9-minmax-fma3-acc2.c |
    118  const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2() local
    119  …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc0…  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2()
    184  const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 40)));  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2() local
    185  …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc0…  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3_acc2()
|
D | up8x9-minmax-fma3.c |
    118  const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 40)));  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3() local
    119  …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc0…  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3()
    182  const __m256 vk4x01234567 = _mm256_cvtph_ps(_mm_load_si128((const __m128i*) (w + 40)));  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3() local
    183  …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc0…  in xnn_f16_dwconv_minmax_ukernel_up8x9__fma3()
|
D | up32x9-minmax-neonfp16arith-acc2.c |
    149  const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;  in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() local
    153  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2()
    251  const float16x8_t vk4x01234567 = vld1q_f16(w + 152);  in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() local
    252  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2()
    299  const float16x8_t vk4x01234567 = vld1q_f16(w + 160);  in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2() local
    300  vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);  in xnn_f16_dwconv_minmax_ukernel_up32x9__neonfp16arith_acc2()
|
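The f16 kernels repeat the same tap-4 step in half precision. The neonfp16arith variants load eight fp16 weights with vld1q_f16 (sequentially in the main loop, at fixed offsets such as w + 72 or w + 80 in the remainder paths) and accumulate with vfmaq_f16; the fma3 variants widen the fp16 weights to fp32 with _mm256_cvtph_ps, accumulate with _mm256_fmadd_ps, and, as the truncated lines suggest, round the accumulator back through fp16 after each tap. A minimal sketch of the NEON half of that pattern; the helper name accumulate_tap4_f16 is illustrative and not part of XNNPACK.

    #include <arm_neon.h>  /* requires a target with ARMv8.2-A FP16 arithmetic (+fp16) */

    /* Illustrative sketch only. One tap-4 step of an 8-channel fp16 depthwise
       accumulation: the main loop reads the weights sequentially and advances w,
       while the remainder paths in the listing use fixed offsets instead. */
    static inline float16x8_t accumulate_tap4_f16(const float16_t** w, const float16_t* i4,
                                                  float16x8_t vacc) {
      const float16x8_t vi4 = vld1q_f16(i4);
      const float16x8_t vk4 = vld1q_f16(*w);
      *w += 8;
      return vfmaq_f16(vacc, vi4, vk4);  /* vacc += vi4 * vk4, fused, entirely in fp16 */
    }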
/external/XNNPACK/src/qs8-dwconv/gen/ |
D | up8x9-minmax-fp32-neonv8-mul16.c |
    117  … const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8);  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local
    119  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    120  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    201  const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 32)));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local
    203  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    204  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
|
D | up8x9-minmax-rndnu-neon-mul16.c |
    118  … const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8);  in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local
    120  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
    121  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
    202  const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 32)));  in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local
    204  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
    205  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
|
D | up8x9-minmax-fp32-neon-mul16.c |
    117  … const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8);  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local
    119  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    120  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    202  const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 32)));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local
    204  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    205  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
|
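All three qs8 mul16 kernels use the same widening multiply-accumulate: the eight int8 weights for tap 4 are loaded with vld1_s8, widened to int16 with vmovl_s8, and folded into two int32 accumulators with vmlal_s16 on the low and high halves. A minimal sketch under those assumptions; accumulate_tap4_qs8 is an illustrative name, and the input row is widened inline here for brevity.

    #include <arm_neon.h>

    /* Illustrative sketch only. Tap-4 step of the qs8 mul16 kernels: int8 weights
       (and, here, the input row) are widened to int16, then multiply-accumulated
       into two int32x4 accumulators covering channels 0-3 and 4-7. */
    static inline void accumulate_tap4_qs8(const int8_t** w, const int8_t* i4,
                                           int32x4_t* vacc0123, int32x4_t* vacc4567) {
      const int16x8_t vi4 = vmovl_s8(vld1_s8(i4));
      const int16x8_t vk4 = vmovl_s8(vld1_s8(*w));
      *w += 8;
      *vacc0123 = vmlal_s16(*vacc0123, vget_low_s16(vi4), vget_low_s16(vk4));
      *vacc4567 = vmlal_s16(*vacc4567, vget_high_s16(vi4), vget_high_s16(vk4));
    }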
/external/XNNPACK/src/qc8-dwconv/gen/ |
D | up8x9-minmax-fp32-neonv8-mul16.c |
    116  … const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8);  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local
    118  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    119  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    203  const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 32)));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local
    205  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    206  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
|
D | up8x9-minmax-fp32-neon-mul16.c |
    116  … const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8(w)); w = (const void*) ((const int8_t*) w + 8);  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local
    118  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    119  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    204  const int16x8_t vk4x01234567 = vmovl_s8(vld1_s8((const void*) ((const int8_t*) w + 32)));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local
    206  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    207  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
|
D | up8x9-minmax-fp32-sse2-mul16-add16.c |
    135  …const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t)…  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16() local
    139  … const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16()
    272  …const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t)…  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16() local
    275  … const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);  in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16()
|
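The qc8 NEON matches follow the qs8 pattern above (the per-channel requantization that distinguishes qc8 happens outside these matched lines). The sse2-mul16-add16 entry is the distinctive one: lacking pmovsxbw, it sign-extends the eight int8 weights by unpacking each byte against itself and arithmetic-shifting each 16-bit lane right by 8. A standalone sketch of just that trick; the helper name is illustrative and not part of XNNPACK.

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>

    /* Illustrative sketch only. Interleaving a byte vector with itself puts each
       int8 value in both halves of a 16-bit lane; an arithmetic right shift by 8
       then leaves the sign-extended int16 value, which is what SSE4.1's pmovsxbw
       would produce directly. */
    static inline __m128i sign_extend_s8_to_s16_sse2(const int8_t* p) {
      const __m128i v8 = _mm_loadl_epi64((const __m128i*) p);  /* 8 int8 values in the low 64 bits */
      return _mm_srai_epi16(_mm_unpacklo_epi8(v8, v8), 8);
    }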
/external/XNNPACK/src/qu8-dwconv/gen/ |
D | up8x9-minmax-fp32-neonv8-mul16.c |
    118  …const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(w), vkernel_zero_point)); w …  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local
    120  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    121  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    202  …const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) ((const uint8_…  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16() local
    204  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
    205  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neonv8_mul16()
|
D | up8x9-minmax-rndnu-neon-mul16.c |
    119  …const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(w), vkernel_zero_point)); w …  in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local
    121  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
    122  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
    203  …const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) ((const uint8_…  in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16() local
    205  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
    206  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16()
|
D | up8x9-minmax-fp32-neon-mul16.c |
    118  …const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(w), vkernel_zero_point)); w …  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local
    120  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    121  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    203  …const int16x8_t vk4x01234567 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8((const void*) ((const uint8_…  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16() local
    205  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
    206  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));  in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__neon_mul16()
|
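The qu8 kernels differ from the qs8 ones only in how the weights are widened: vsubl_u8 subtracts the kernel zero point while widening the uint8 weights to 16 bits, and the result is reinterpreted as signed int16 so the same vmlal_s16 accumulation applies. A minimal sketch under those assumptions; accumulate_tap4_qu8 is an illustrative name and the already-widened input row is passed in.

    #include <arm_neon.h>
    #include <stdint.h>

    /* Illustrative sketch only. Tap-4 step of the qu8 mul16 kernels: vsubl_u8
       widens the uint8 weights while subtracting the kernel zero point; the
       (possibly wrapped) uint16 difference reinterpreted as int16 is the correct
       signed value, so the accumulation matches the qs8 path. */
    static inline void accumulate_tap4_qu8(const uint8_t** w, const int16x8_t vi4,
                                           const uint8x8_t vkernel_zero_point,
                                           int32x4_t* vacc0123, int32x4_t* vacc4567) {
      const int16x8_t vk4 = vreinterpretq_s16_u16(vsubl_u8(vld1_u8(*w), vkernel_zero_point));
      *w += 8;
      *vacc0123 = vmlal_s16(*vacc0123, vget_low_s16(vi4), vget_low_s16(vk4));
      *vacc4567 = vmlal_s16(*vacc4567, vget_high_s16(vi4), vget_high_s16(vk4));
    }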