/external/XNNPACK/src/f32-dwconv/gen/ |
D | up8x25-minmax-avx.c | 252 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() local 256 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 390 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() local 392 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx()
|
D | up8x25-minmax-fma3-acc2.c | 252 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() local 256 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 392 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() local 394 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2()
|
D | up8x25-minmax-fma3.c | 252 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() local 256 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 390 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() local 392 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3()
|
D | up16x25-minmax-avx.c | 295 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() local 301 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 493 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() local 497 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 631 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() local 633 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx()
|
D | up16x25-minmax-fma3-acc2.c | 295 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() local 301 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 496 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() local 500 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 636 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() local 638 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2()
|
D | up16x25-minmax-fma3.c | 295 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() local 301 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 493 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() local 497 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 631 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() local 633 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3()
|
D | up8x25-minmax-avx-acc2.c | 252 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() local 256 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 392 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() local 394 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2()
|
D | up16x25-minmax-avx-acc2.c | 295 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() local 301 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 496 const __m256 vi14x01234567 = _mm256_loadu_ps(i14); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() local 500 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 636 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() local 638 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2()
|
/external/XNNPACK/src/qs8-dwconv/gen/ |
D | up8x25-minmax-fp32-neon-mul16.c | 256 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); i14 += 8; in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() local 259 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 260 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 427 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() local 430 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 431 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16()
|
D | up8x25-minmax-rndnu-neon-mul16.c | 257 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); i14 += 8; in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() local 260 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() 261 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() 427 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() local 430 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() 431 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16()
|
D | up8x25-minmax-fp32-neonv8-mul16.c | 256 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); i14 += 8; in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() local 259 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 260 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 426 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() local 429 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 430 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16()
|
D | up8x25-minmax-fp32-sse2-mul16-add16.c | 329 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16() local 333 … const __m128i vxi14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi14x01234567, vi14x01234567), 8); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16() 638 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16() local 641 … const __m128i vxi14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi14x01234567, vi14x01234567), 8); in xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16()
|
/external/XNNPACK/src/qc8-dwconv/gen/ |
D | up8x25-minmax-fp32-neonv8-mul16.c | 255 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); i14 += 8; in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() local 258 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 259 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 428 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() local 431 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 432 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16()
|
D | up8x25-minmax-fp32-neon-mul16.c | 255 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); i14 += 8; in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() local 258 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 259 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 429 const int16x8_t vi14x01234567 = vmovl_s8(vld1_s8(i14)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() local 432 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 433 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16()
|
/external/XNNPACK/src/f16-dwconv/gen/ |
D | up8x25-minmax-neonfp16arith.c | 226 const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8; in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() local 228 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 336 const float16x8_t vi14x01234567 = vld1q_f16(i14); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() local 338 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith()
|
D | up8x25-minmax-neonfp16arith-acc2.c | 226 const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8; in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() local 228 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 338 const float16x8_t vi14x01234567 = vld1q_f16(i14); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() local 340 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2()
|
D | up16x25-minmax-neonfp16arith-acc2.c | 269 const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8; in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() local 273 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 418 const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8; in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() local 420 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 530 const float16x8_t vi14x01234567 = vld1q_f16(i14); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() local 532 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2()
|
D | up16x25-minmax-neonfp16arith.c | 269 const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8; in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() local 273 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 415 const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8; in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() local 417 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 525 const float16x8_t vi14x01234567 = vld1q_f16(i14); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() local 527 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith()
|
D | up8x25-minmax-fma3-acc2.c | 255 const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14)); in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2() local 259 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vac… in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2() 408 const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14)); in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2() local 411 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vac… in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2()
|
D | up8x25-minmax-fma3.c | 255 const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14)); in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3() local 259 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vac… in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3() 406 const __m256 vi14x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i14)); in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3() local 409 …vacc01234567p0 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_fmadd_ps(vi14x01234567, vk14x01234567, vac… in xnn_f16_dwconv_minmax_ukernel_up8x25__fma3()
|
/external/XNNPACK/src/qu8-dwconv/gen/ |
D | up8x25-minmax-fp32-neonv8-mul16.c | 257 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); i14 += 8; in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() local 260 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 261 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 427 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() local 430 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16() 431 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mul16()
|
D | up8x25-minmax-fp32-neon-mul16.c | 257 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); i14 += 8; in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() local 260 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 261 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 428 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() local 431 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16() 432 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__neon_mul16()
|
D | up8x25-minmax-rndnu-neon-mul8.c | 287 const uint8x8_t vi14x01234567 = vld1_u8(i14); i14 += 8; in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8() local 290 vprod01234567 = vmull_u8(vi14x01234567, vk14x01234567); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8() 291 vsum01234567 = vaddw_u8(vsum01234567, vi14x01234567); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8() 479 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8() local 482 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8() 483 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8()
|
D | up8x25-minmax-fp32-wasmsimd-mul16.c | 307 const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16() local 309 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16() 312 vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16() 582 const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16() local 584 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16() 586 vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567); in xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16()
|
D | up8x25-minmax-rndnu-neon-mul16.c | 258 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); i14 += 8; in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() local 261 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() 262 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() 428 const int16x8_t vi14x01234567 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(i14))); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() local 431 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi14x01234567), vget_low_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16() 432 vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi14x01234567), vget_high_s16(vk14x01234567)); in xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16()
|